Coverage Report

Created: 2026-05-30 06:26

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/openvswitch/lib/dpif-netdev.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2009-2014, 2016-2018 Nicira, Inc.
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at:
7
 *
8
 *     http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16
17
#include <config.h>
18
#include "dpif-netdev.h"
19
20
#include <ctype.h>
21
#include <errno.h>
22
#include <fcntl.h>
23
#include <inttypes.h>
24
#include <net/if.h>
25
#include <sys/types.h>
26
#include <netinet/in.h>
27
#include <stdint.h>
28
#include <stdlib.h>
29
#include <string.h>
30
#include <sys/ioctl.h>
31
#include <sys/socket.h>
32
#include <sys/stat.h>
33
#include <unistd.h>
34
35
#include "bitmap.h"
36
#include "ccmap.h"
37
#include "cmap.h"
38
#include "conntrack.h"
39
#include "conntrack-tp.h"
40
#include "coverage.h"
41
#include "ct-dpif.h"
42
#include "csum.h"
43
#include "dp-packet.h"
44
#include "dpif.h"
45
#include "dpif-netdev-dfc.h"
46
#include "dpif-netdev-dpcls.h"
47
#include "dpif-netdev-flow.h"
48
#include "dpif-netdev-perf.h"
49
#include "dpif-netdev-thread.h"
50
#include "dpif-offload.h"
51
#include "dpif-provider.h"
52
#include "dummy.h"
53
#include "fat-rwlock.h"
54
#include "flow.h"
55
#include "hmapx.h"
56
#include "id-fpool.h"
57
#include "id-pool.h"
58
#include "ipf.h"
59
#include "mov-avg.h"
60
#include "mpsc-queue.h"
61
#include "netdev.h"
62
#include "netdev-provider.h"
63
#include "netdev-vport.h"
64
#include "netlink.h"
65
#include "odp-execute.h"
66
#include "odp-util.h"
67
#include "openvswitch/dynamic-string.h"
68
#include "openvswitch/list.h"
69
#include "openvswitch/match.h"
70
#include "openvswitch/ofp-parse.h"
71
#include "openvswitch/ofp-print.h"
72
#include "openvswitch/ofpbuf.h"
73
#include "openvswitch/shash.h"
74
#include "openvswitch/vlog.h"
75
#include "ovs-numa.h"
76
#include "ovs-rcu.h"
77
#include "packets.h"
78
#include "openvswitch/poll-loop.h"
79
#include "pvector.h"
80
#include "random.h"
81
#include "seq.h"
82
#include "smap.h"
83
#include "sset.h"
84
#include "timeval.h"
85
#include "tnl-neigh-cache.h"
86
#include "tnl-ports.h"
87
#include "unixctl.h"
88
#include "util.h"
89
#include "uuid.h"
90
91
VLOG_DEFINE_THIS_MODULE(dpif_netdev);
92
93
/* Auto Load Balancing Defaults */
94
0
#define ALB_IMPROVEMENT_THRESHOLD    25
95
0
#define ALB_LOAD_THRESHOLD           95
96
0
#define ALB_REBALANCE_INTERVAL       1     /* 1 Min */
97
0
#define MAX_ALB_REBALANCE_INTERVAL   20000 /* 20000 Min */
98
0
#define MIN_TO_MSEC                  60000
99
100
#define FLOW_DUMP_MAX_BATCH 50
101
/* Use per thread recirc_depth to prevent recirculation loop. */
102
0
#define MAX_RECIRC_DEPTH 8
103
DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
104
105
/* Use instant packet send by default. */
106
0
#define DEFAULT_TX_FLUSH_INTERVAL 0
107
108
/* Configuration parameters. */
109
enum { MAX_METERS = 1 << 18 };  /* Maximum number of meters. */
110
enum { MAX_BANDS = 8 };         /* Maximum number of bands / meter. */
111
112
COVERAGE_DEFINE(datapath_drop_meter);
113
COVERAGE_DEFINE(datapath_drop_upcall_error);
114
COVERAGE_DEFINE(datapath_drop_lock_error);
115
COVERAGE_DEFINE(datapath_drop_userspace_action_error);
116
COVERAGE_DEFINE(datapath_drop_tunnel_push_error);
117
COVERAGE_DEFINE(datapath_drop_tunnel_pop_error);
118
COVERAGE_DEFINE(datapath_drop_recirc_error);
119
COVERAGE_DEFINE(datapath_drop_invalid_port);
120
COVERAGE_DEFINE(datapath_drop_invalid_bond);
121
COVERAGE_DEFINE(datapath_drop_invalid_tnl_port);
122
COVERAGE_DEFINE(datapath_drop_rx_invalid_packet);
123
COVERAGE_DEFINE(datapath_drop_hw_post_process);
124
COVERAGE_DEFINE(datapath_drop_hw_post_process_consumed);
125
126
/* Protects against changes to 'dp_netdevs'. */
127
static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
128
129
/* Contains all 'struct dp_netdev's. */
130
static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
131
    = SHASH_INITIALIZER(&dp_netdevs);
132
133
static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
134
135
0
#define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
136
0
                                     | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
137
0
                                     | CS_SRC_NAT | CS_DST_NAT)
138
0
#define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
139
140
static struct odp_support dp_netdev_support = {
141
    .max_vlan_headers = SIZE_MAX,
142
    .max_mpls_depth = SIZE_MAX,
143
    .recirc = true,
144
    .ct_state = true,
145
    .ct_zone = true,
146
    .ct_mark = true,
147
    .ct_label = true,
148
    .ct_state_nat = true,
149
    .ct_orig_tuple = true,
150
    .ct_orig_tuple6 = true,
151
};
152
153

154
/* Simple non-wildcarding single-priority classifier. */
155
156
/* Time in microseconds between successive optimizations of the dpcls
157
 * subtable vector */
158
0
#define DPCLS_OPTIMIZATION_INTERVAL 1000000LL
159
160
/* Time in microseconds of the interval in which rxq processing cycles used
161
 * in rxq to pmd assignments is measured and stored. */
162
0
#define PMD_INTERVAL_LEN 5000000LL
163
/* For converting PMD_INTERVAL_LEN to secs. */
164
0
#define INTERVAL_USEC_TO_SEC 1000000LL
165
166
/* Number of intervals for which cycles are stored
167
 * and used during rxq to pmd assignment. */
168
0
#define PMD_INTERVAL_MAX 12
169
170
/* Time in microseconds to try RCU quiescing. */
171
0
#define PMD_RCU_QUIESCE_INTERVAL 10000LL
172
173
/* Timer resolution for PMD threads in nanoseconds. */
174
0
#define PMD_TIMER_RES_NS 1000
175
176
/* Number of pkts Rx on an interface that will stop pmd thread sleeping. */
177
0
#define PMD_SLEEP_THRESH (NETDEV_MAX_BURST / 2)
178
/* Time in uS to increment a pmd thread sleep time. */
179
0
#define PMD_SLEEP_INC_US 1
180
181
struct pmd_sleep {
182
    unsigned core_id;
183
    uint64_t max_sleep;
184
};
185
186
struct dpcls {
187
    struct cmap_node node;      /* Within dp_netdev_pmd_thread.classifiers */
188
    odp_port_t in_port;
189
    struct cmap subtables_map;
190
    struct pvector subtables;
191
};
192
193
/* Data structure to keep packet order till fastpath processing. */
194
struct dp_packet_flow_map {
195
    struct dp_packet *packet;
196
    struct dp_netdev_flow *flow;
197
    uint16_t tcp_flags;
198
};
199
200
static void dpcls_init(struct dpcls *);
201
static void dpcls_destroy(struct dpcls *);
202
static void dpcls_sort_subtable_vector(struct dpcls *);
203
static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
204
                         const struct netdev_flow_key *mask);
205
static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
206
207
/* Set of supported meter flags */
208
#define DP_SUPPORTED_METER_FLAGS_MASK \
209
0
    (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)
210
211
/* Set of supported meter band types */
212
#define DP_SUPPORTED_METER_BAND_TYPES           \
213
0
    ( 1 << OFPMBT13_DROP )
214
215
struct dp_meter_band {
216
    uint32_t rate;
217
    uint32_t burst_size;
218
    atomic_uint64_t bucket;          /* In 1/1000 packets for PKTPS,
219
                                      * or in bits for KBPS. */
220
    atomic_uint64_t packet_count;
221
    atomic_uint64_t byte_count;
222
};
223
224
struct dp_meter {
225
    struct cmap_node node;
226
    uint32_t id;
227
    uint16_t flags;
228
    uint16_t n_bands;
229
    uint32_t max_delta_t;
230
    atomic_uint64_t used;  /* Time of a last use in milliseconds. */
231
    atomic_uint64_t packet_count;
232
    atomic_uint64_t byte_count;
233
    struct dp_meter_band bands[];
234
};
235
236
struct pmd_auto_lb {
237
    bool do_dry_run;
238
    bool recheck_config;
239
    bool is_enabled;            /* Current status of Auto load balancing. */
240
    uint64_t rebalance_intvl;
241
    uint64_t rebalance_poll_timer;
242
    uint8_t rebalance_improve_thresh;
243
    atomic_uint8_t rebalance_load_thresh;
244
};
245
246
enum sched_assignment_type {
247
    SCHED_ROUNDROBIN,
248
    SCHED_CYCLES, /* Default.*/
249
    SCHED_GROUP
250
};
251
252
/* Datapath based on the network device interface from netdev.h.
253
 *
254
 *
255
 * Thread-safety
256
 * =============
257
 *
258
 * Some members, marked 'const', are immutable.  Accessing other members
259
 * requires synchronization, as noted in more detail below.
260
 *
261
 * Acquisition order is, from outermost to innermost:
262
 *
263
 *    dp_netdev_mutex (global)
264
 *    port_rwlock
265
 *    bond_mutex
266
 *    non_pmd_mutex
267
 */
268
struct dp_netdev {
269
    const struct dpif_class *const class;
270
    const char *const name;
271
    const char *const full_name;
272
    struct ovs_refcount ref_cnt;
273
    atomic_flag destroyed;
274
275
    /* Ports.
276
     *
277
     * Any lookup into 'ports' or any access to the dp_netdev_ports found
278
     * through 'ports' requires taking 'port_rwlock'. */
279
    struct ovs_rwlock port_rwlock;
280
    struct hmap ports;
281
    struct seq *port_seq;       /* Incremented whenever a port changes. */
282
283
    /* The time that a packet can wait in output batch for sending. */
284
    atomic_uint32_t tx_flush_interval;
285
286
    /* Meters. */
287
    struct ovs_mutex meters_lock;
288
    struct cmap meters;
289
290
    /* Probability of EMC insertions is a factor of 'emc_insert_min'.*/
291
    atomic_uint32_t emc_insert_min;
292
    /* Enable collection of PMD performance metrics. */
293
    atomic_bool pmd_perf_metrics;
294
    /* Default max load based sleep request. */
295
    uint64_t pmd_max_sleep_default;
296
    /* Enable the SMC cache from ovsdb config */
297
    atomic_bool smc_enable_db;
298
299
    /* Protects access to ofproto-dpif-upcall interface during revalidator
300
     * thread synchronization. */
301
    struct fat_rwlock upcall_rwlock;
302
    upcall_callback *upcall_cb;  /* Callback function for executing upcalls. */
303
    void *upcall_aux;
304
305
    /* Callback function for notifying the purging of dp flows (during
306
     * resetting pmd deletion). */
307
    dp_purge_callback *dp_purge_cb;
308
    void *dp_purge_aux;
309
310
    /* Stores all 'struct dp_netdev_pmd_thread's. */
311
    struct cmap poll_threads;
312
    /* id pool for per thread static_tx_qid. */
313
    struct id_pool *tx_qid_pool;
314
    struct ovs_mutex tx_qid_pool_mutex;
315
    /* Rxq to pmd assignment type. */
316
    enum sched_assignment_type pmd_rxq_assign_type;
317
    bool pmd_iso;
318
319
    /* Protects the access of the 'struct dp_netdev_pmd_thread'
320
     * instance for non-pmd thread. */
321
    struct ovs_mutex non_pmd_mutex;
322
323
    /* Each pmd thread will store its pointer to
324
     * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
325
    ovsthread_key_t per_pmd_key;
326
327
    struct seq *reconfigure_seq;
328
    uint64_t last_reconfigure_seq;
329
    struct ovsthread_once once_set_config;
330
331
    /* Cpu mask for pin of pmd threads. */
332
    char *pmd_cmask;
333
334
    /* PMD max load based sleep request user string. */
335
    char *max_sleep_list;
336
337
    uint64_t last_tnl_conf_seq;
338
339
    struct conntrack *conntrack;
340
    struct pmd_auto_lb pmd_alb;
341
342
    /* Bonds. */
343
    struct ovs_mutex bond_mutex; /* Protects updates of 'tx_bonds'. */
344
    struct cmap tx_bonds; /* Contains 'struct tx_bond'. */
345
};
346
347
static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
348
                                                    odp_port_t)
349
    OVS_REQ_RDLOCK(dp->port_rwlock);
350
351
enum rxq_cycles_counter_type {
352
    RXQ_CYCLES_PROC_CURR,       /* Cycles spent successfully polling and
353
                                   processing packets during the current
354
                                   interval. */
355
    RXQ_CYCLES_PROC_HIST,       /* Total cycles of all intervals that are used
356
                                   during rxq to pmd assignment. */
357
    RXQ_N_CYCLES
358
};
359
360
0
#define XPS_TIMEOUT 500000LL    /* In microseconds. */
361
362
/* Contained by struct dp_netdev_port's 'rxqs' member.  */
363
struct dp_netdev_rxq {
364
    struct dp_netdev_port *port;
365
    struct netdev_rxq *rx;
366
    unsigned core_id;                  /* Core to which this queue should be
367
                                          pinned. OVS_CORE_UNSPEC if the
368
                                          queue doesn't need to be pinned to a
369
                                          particular core. */
370
    atomic_count intrvl_idx;           /* Write index for 'cycles_intrvl'. */
371
    struct dp_netdev_pmd_thread *pmd;  /* pmd thread that polls this queue. */
372
    bool is_vhost;                     /* Is rxq of a vhost port. */
373
374
    /* Counters of cycles spent successfully polling and processing pkts. */
375
    atomic_ullong cycles[RXQ_N_CYCLES];
376
    /* We store PMD_INTERVAL_MAX intervals of data for an rxq and then
377
       sum them to yield the cycles used for an rxq. */
378
    atomic_ullong cycles_intrvl[PMD_INTERVAL_MAX];
379
};
380
381
enum txq_req_mode {
382
    TXQ_REQ_MODE_THREAD,
383
    TXQ_REQ_MODE_HASH,
384
};
385
386
enum txq_mode {
387
    TXQ_MODE_STATIC,
388
    TXQ_MODE_XPS,
389
    TXQ_MODE_XPS_HASH,
390
};
391
392
/* A port in a netdev-based datapath. */
393
struct dp_netdev_port {
394
    odp_port_t port_no;
395
    enum txq_mode txq_mode;     /* static, XPS, XPS_HASH. */
396
    bool need_reconfigure;      /* True if we should reconfigure netdev. */
397
    struct netdev *netdev;
398
    struct hmap_node node;      /* Node in dp_netdev's 'ports'. */
399
    struct netdev_saved_flags *sf;
400
    struct dp_netdev_rxq *rxqs;
401
    unsigned n_rxq;             /* Number of elements in 'rxqs' */
402
    unsigned *txq_used;         /* Number of threads that use each tx queue. */
403
    struct ovs_mutex txq_used_mutex;
404
    bool emc_enabled;           /* If true EMC will be used. */
405
    char *type;                 /* Port type as requested by user. */
406
    char *rxq_affinity_list;    /* Requested affinity of rx queues. */
407
    enum txq_req_mode txq_requested_mode;
408
};
409
410
static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
411
static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
412
                                         struct flow *, bool);
413
414
struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
415
                                                   size_t);
416
struct dp_netdev_actions *dp_netdev_flow_get_actions(
417
    const struct dp_netdev_flow *);
418
static void dp_netdev_actions_free(struct dp_netdev_actions *);
419
420
struct polled_queue {
421
    struct dp_netdev_rxq *rxq;
422
    odp_port_t port_no;
423
    bool emc_enabled;
424
    bool rxq_enabled;
425
    uint64_t change_seq;
426
};
427
428
/* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
429
struct rxq_poll {
430
    struct dp_netdev_rxq *rxq;  /* Rx queue this entry refers to; its 'rx'
                                 * member supplies the name and queue id used
                                 * by compare_poll_list(). */
431
    struct hmap_node node;      /* Hash map linkage; sorted_poll_list() walks
                                 * 'pmd->poll_list' through this member. */
432
};
433
434
/* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
435
 * 'tnl_port_cache' or 'tx_ports'. */
436
struct tx_port {
437
    struct dp_netdev_port *port;
438
    int qid;
439
    long long last_used;
440
    struct hmap_node node;
441
    long long flush_time;
442
    struct dp_packet_batch output_pkts;
443
    struct dp_packet_batch *txq_pkts; /* Only for hash mode. */
444
    struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
445
};
446
447
/* Contained by struct tx_bond 'member_buckets'. */
448
struct member_entry {
449
    odp_port_t member_id;
450
    atomic_ullong n_packets;
451
    atomic_ullong n_bytes;
452
};
453
454
/* Contained by struct dp_netdev_pmd_thread's 'tx_bonds'. */
455
struct tx_bond {
456
    struct cmap_node node;
457
    uint32_t bond_id;
458
    struct member_entry member_buckets[BOND_BUCKETS];
459
};
460
461
/* Interface to netdev-based datapath. */
462
struct dpif_netdev {
463
    struct dpif dpif;           /* Embedded generic dpif; dpif_netdev_cast()
                                 * recovers the enclosing struct from a
                                 * pointer to this member. */
464
    struct dp_netdev *dp;       /* Datapath handed back by get_dp_netdev(). */
465
    uint64_t last_port_seq;     /* NOTE(review): presumably the last observed
                                 * value of the datapath's 'port_seq' --
                                 * confirm against the users of this field. */
466
};
467
468
static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
469
                              struct dp_netdev_port **portp)
470
    OVS_REQ_RDLOCK(dp->port_rwlock);
471
static int get_port_by_name(struct dp_netdev *dp, const char *devname,
472
                            struct dp_netdev_port **portp)
473
    OVS_REQ_RDLOCK(dp->port_rwlock);
474
static void dp_netdev_free(struct dp_netdev *)
475
    OVS_REQUIRES(dp_netdev_mutex);
476
static int do_add_port(struct dp_netdev *dp, const char *devname,
477
                       const char *type, odp_port_t port_no)
478
    OVS_REQ_WRLOCK(dp->port_rwlock);
479
static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
480
    OVS_REQ_WRLOCK(dp->port_rwlock);
481
static int dpif_netdev_open(const struct dpif_class *, const char *name,
482
                            bool create, struct dpif **);
483
static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
484
                                      struct dp_packet_batch *,
485
                                      bool should_steal,
486
                                      const struct flow *flow,
487
                                      const struct nlattr *actions,
488
                                      size_t actions_len);
489
static void dp_netdev_input(struct dp_netdev_pmd_thread *,
490
                            struct dp_packet_batch *, odp_port_t port_no);
491
static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
492
                                  struct dp_packet_batch *);
493
494
static void dp_netdev_disable_upcall(struct dp_netdev *);
495
static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
496
static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
497
                                    struct dp_netdev *dp, unsigned core_id,
498
                                    int numa_id);
499
static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
500
static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
501
    OVS_REQ_WRLOCK(dp->port_rwlock);
502
503
static void *pmd_thread_main(void *);
504
static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
505
                                                      unsigned core_id);
506
static struct dp_netdev_pmd_thread *
507
dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
508
static void dp_netdev_del_pmd(struct dp_netdev *dp,
509
                              struct dp_netdev_pmd_thread *pmd);
510
static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
511
static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
512
static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
513
                                         struct dp_netdev_port *port)
514
    OVS_REQUIRES(pmd->port_mutex);
515
static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
516
                                           struct tx_port *tx)
517
    OVS_REQUIRES(pmd->port_mutex);
518
static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
519
                                     struct dp_netdev_rxq *rxq)
520
    OVS_REQUIRES(pmd->port_mutex);
521
static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
522
                                       struct rxq_poll *poll)
523
    OVS_REQUIRES(pmd->port_mutex);
524
static int
525
dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
526
                                   bool force);
527
static void dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
528
                                         struct tx_bond *bond, bool update)
529
    OVS_EXCLUDED(pmd->bond_mutex);
530
static void dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
531
                                           uint32_t bond_id)
532
    OVS_EXCLUDED(pmd->bond_mutex);
533
534
static void reconfigure_datapath(struct dp_netdev *dp)
535
    OVS_REQ_RDLOCK(dp->port_rwlock);
536
static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
537
static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
538
static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
539
static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
540
    OVS_REQUIRES(pmd->port_mutex);
541
static inline void
542
dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
543
                           struct polled_queue *poll_list, int poll_cnt);
544
static void
545
dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
546
                         enum rxq_cycles_counter_type type,
547
                         unsigned long long cycles);
548
static uint64_t
549
dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
550
                         enum rxq_cycles_counter_type type);
551
static void
552
dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
553
                           unsigned long long cycles);
554
static uint64_t
555
dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
556
static uint64_t
557
get_interval_values(atomic_ullong *source, atomic_count *cur_idx,
558
                    int num_to_read);
559
static void
560
dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
561
                               bool purge);
562
static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
563
                                      struct tx_port *tx);
564
static inline struct dpcls *dp_netdev_pmd_lookup_dpcls(
565
    struct dp_netdev_pmd_thread *pmd, odp_port_t in_port);
566
567
static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
568
static inline bool
569
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);
570
571
static void dp_netdev_simple_match_insert(struct dp_netdev_pmd_thread *pmd,
572
                                          struct dp_netdev_flow *flow)
573
    OVS_REQUIRES(pmd->flow_mutex);
574
static void dp_netdev_simple_match_remove(struct dp_netdev_pmd_thread *pmd,
575
                                          struct dp_netdev_flow *flow)
576
    OVS_REQUIRES(pmd->flow_mutex);
577
578
static bool dp_netdev_flow_is_simple_match(const struct match *);
579
580
/* Updates the time in PMD threads context and should be called in three cases:
581
 *
582
 *     1. PMD structure initialization:
583
 *         - dp_netdev_configure_pmd()
584
 *
585
 *     2. Before processing of the new packet batch:
586
 *         - dpif_netdev_execute()
587
 *         - dp_netdev_process_rxq_port()
588
 *
589
 *     3. At least once per polling iteration in main polling threads if no
590
 *        packets received on current iteration:
591
 *         - dpif_netdev_run()
592
 *         - pmd_thread_main()
593
 *
594
 * 'pmd->ctx.now' should be used without update in all other cases if possible.
595
 */
596
static inline void
597
pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
598
0
{
599
0
    pmd->ctx.now = time_usec();
600
0
}
601
602
/* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
603
bool
604
dpif_is_netdev(const struct dpif *dpif)
605
0
{
606
0
    return dpif->dpif_class->open == dpif_netdev_open;
607
0
}
608
609
static struct dpif_netdev *
610
dpif_netdev_cast(const struct dpif *dpif)
611
0
{
612
0
    ovs_assert(dpif_is_netdev(dpif));
613
0
    return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
614
0
}
615
616
static struct dp_netdev *
617
get_dp_netdev(const struct dpif *dpif)
618
0
{
619
0
    return dpif_netdev_cast(dpif)->dp;
620
0
}
621

622
enum pmd_info_type {
623
    PMD_INFO_SHOW_STATS,  /* Show how cpu cycles are spent. */
624
    PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
625
    PMD_INFO_SHOW_RXQ,    /* Show poll lists of pmd threads. */
626
    PMD_INFO_PERF_SHOW,   /* Show pmd performance details. */
627
    PMD_INFO_SLEEP_SHOW,  /* Show max sleep configuration details. */
628
};
629
630
static void
631
format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
632
0
{
633
0
    ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
634
0
                        ? "main thread" : "pmd thread");
635
0
    if (pmd->numa_id != OVS_NUMA_UNSPEC) {
636
0
        ds_put_format(reply, " numa_id %d", pmd->numa_id);
637
0
    }
638
0
    if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
639
0
        ds_put_format(reply, " core_id %u", pmd->core_id);
640
0
    }
641
0
    ds_put_cstr(reply, ":\n");
642
0
}
643
644
static void
645
pmd_info_show_stats(struct ds *reply,
646
                    struct dp_netdev_pmd_thread *pmd)
647
0
{
648
0
    uint64_t stats[PMD_N_STATS];
649
0
    uint64_t total_cycles, total_packets;
650
0
    double passes_per_pkt = 0;
651
0
    double lookups_per_hit = 0;
652
0
    double packets_per_batch = 0;
653
654
0
    pmd_perf_read_counters(&pmd->perf_stats, stats);
655
0
    total_cycles = stats[PMD_CYCLES_ITER_IDLE]
656
0
                         + stats[PMD_CYCLES_ITER_BUSY];
657
0
    total_packets = stats[PMD_STAT_RECV];
658
659
0
    format_pmd_thread(reply, pmd);
660
661
0
    if (total_packets > 0) {
662
0
        passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC])
663
0
                            / (double) total_packets;
664
0
    }
665
0
    if (stats[PMD_STAT_MASKED_HIT] > 0) {
666
0
        lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP]
667
0
                            / (double) stats[PMD_STAT_MASKED_HIT];
668
0
    }
669
0
    if (stats[PMD_STAT_SENT_BATCHES] > 0) {
670
0
        packets_per_batch = stats[PMD_STAT_SENT_PKTS]
671
0
                            / (double) stats[PMD_STAT_SENT_BATCHES];
672
0
    }
673
674
0
    ds_put_format(reply,
675
0
                  "  packets received: %"PRIu64"\n"
676
0
                  "  packet recirculations: %"PRIu64"\n"
677
0
                  "  avg. datapath passes per packet: %.02f\n"
678
0
                  "  phwol hits: %"PRIu64"\n"
679
0
                  "  simple match hits: %"PRIu64"\n"
680
0
                  "  emc hits: %"PRIu64"\n"
681
0
                  "  smc hits: %"PRIu64"\n"
682
0
                  "  megaflow hits: %"PRIu64"\n"
683
0
                  "  avg. subtable lookups per megaflow hit: %.02f\n"
684
0
                  "  miss with success upcall: %"PRIu64"\n"
685
0
                  "  miss with failed upcall: %"PRIu64"\n"
686
0
                  "  avg. packets per output batch: %.02f\n",
687
0
                  total_packets, stats[PMD_STAT_RECIRC],
688
0
                  passes_per_pkt, stats[PMD_STAT_PHWOL_HIT],
689
0
                  stats[PMD_STAT_SIMPLE_HIT],
690
0
                  stats[PMD_STAT_EXACT_HIT],
691
0
                  stats[PMD_STAT_SMC_HIT],
692
0
                  stats[PMD_STAT_MASKED_HIT],
693
0
                  lookups_per_hit, stats[PMD_STAT_MISS], stats[PMD_STAT_LOST],
694
0
                  packets_per_batch);
695
696
0
    if (total_cycles == 0) {
697
0
        return;
698
0
    }
699
700
0
    ds_put_format(reply,
701
0
                  "  idle cycles: %"PRIu64" (%.02f%%)\n"
702
0
                  "  processing cycles: %"PRIu64" (%.02f%%)\n",
703
0
                  stats[PMD_CYCLES_ITER_IDLE],
704
0
                  stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100,
705
0
                  stats[PMD_CYCLES_ITER_BUSY],
706
0
                  stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100);
707
708
0
    if (total_packets == 0) {
709
0
        return;
710
0
    }
711
712
0
    ds_put_format(reply,
713
0
                  "  avg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n",
714
0
                  total_cycles / (double) total_packets,
715
0
                  total_cycles, total_packets);
716
717
0
    ds_put_format(reply,
718
0
                  "  avg processing cycles per packet: "
719
0
                  "%.02f (%"PRIu64"/%"PRIu64")\n",
720
0
                  stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets,
721
0
                  stats[PMD_CYCLES_ITER_BUSY], total_packets);
722
0
}
723
724
static void
725
pmd_info_show_perf(struct ds *reply,
726
                   struct dp_netdev_pmd_thread *pmd,
727
                   struct pmd_perf_params *par)
728
0
{
729
0
    if (pmd->core_id != NON_PMD_CORE_ID) {
730
0
        char *time_str =
731
0
                xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true);
732
0
        long long now = time_msec();
733
0
        double duration = (now - pmd->perf_stats.start_ms) / 1000.0;
734
735
0
        ds_put_cstr(reply, "\n");
736
0
        ds_put_format(reply, "Time: %s\n", time_str);
737
0
        ds_put_format(reply, "Measurement duration: %.3f s\n", duration);
738
0
        ds_put_cstr(reply, "\n");
739
0
        format_pmd_thread(reply, pmd);
740
0
        ds_put_cstr(reply, "\n");
741
0
        pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration);
742
0
        if (pmd_perf_metrics_enabled(pmd)) {
743
            /* Prevent parallel clearing of perf metrics. */
744
0
            ovs_mutex_lock(&pmd->perf_stats.clear_mutex);
745
0
            if (par->histograms) {
746
0
                ds_put_cstr(reply, "\n");
747
0
                pmd_perf_format_histograms(reply, &pmd->perf_stats);
748
0
            }
749
0
            if (par->iter_hist_len > 0) {
750
0
                ds_put_cstr(reply, "\n");
751
0
                pmd_perf_format_iteration_history(reply, &pmd->perf_stats,
752
0
                        par->iter_hist_len);
753
0
            }
754
0
            if (par->ms_hist_len > 0) {
755
0
                ds_put_cstr(reply, "\n");
756
0
                pmd_perf_format_ms_history(reply, &pmd->perf_stats,
757
0
                        par->ms_hist_len);
758
0
            }
759
0
            ovs_mutex_unlock(&pmd->perf_stats.clear_mutex);
760
0
        }
761
0
        free(time_str);
762
0
    }
763
0
}
764
765
static int
766
compare_poll_list(const void *a_, const void *b_)
767
0
{
768
0
    const struct rxq_poll *a = a_;
769
0
    const struct rxq_poll *b = b_;
770
771
0
    const char *namea = netdev_rxq_get_name(a->rxq->rx);
772
0
    const char *nameb = netdev_rxq_get_name(b->rxq->rx);
773
774
0
    int cmp = strcmp(namea, nameb);
775
0
    if (!cmp) {
776
0
        return netdev_rxq_get_queue_id(a->rxq->rx)
777
0
               - netdev_rxq_get_queue_id(b->rxq->rx);
778
0
    } else {
779
0
        return cmp;
780
0
    }
781
0
}
782
783
static void
784
sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
785
                 size_t *n)
786
    OVS_REQUIRES(pmd->port_mutex)
787
0
{
788
0
    struct rxq_poll *ret, *poll;
789
0
    size_t i;
790
791
0
    *n = hmap_count(&pmd->poll_list);
792
0
    if (!*n) {
793
0
        ret = NULL;
794
0
    } else {
795
0
        ret = xcalloc(*n, sizeof *ret);
796
0
        i = 0;
797
0
        HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
798
0
            ret[i] = *poll;
799
0
            i++;
800
0
        }
801
0
        ovs_assert(i == *n);
802
0
        qsort(ret, *n, sizeof *ret, compare_poll_list);
803
0
    }
804
805
0
    *list = ret;
806
0
}
807
808
static void
809
pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd,
810
                  int secs)
811
0
{
812
0
    if (pmd->core_id != NON_PMD_CORE_ID) {
813
0
        struct rxq_poll *list;
814
0
        size_t n_rxq;
815
0
        uint64_t total_pmd_cycles = 0;
816
0
        uint64_t busy_pmd_cycles = 0;
817
0
        uint64_t total_rxq_proc_cycles = 0;
818
0
        unsigned int intervals;
819
820
0
        ds_put_format(reply,
821
0
                      "pmd thread numa_id %d core_id %u:\n  isolated : %s\n",
822
0
                      pmd->numa_id, pmd->core_id, (pmd->isolated)
823
0
                                                  ? "true" : "false");
824
825
0
        ovs_mutex_lock(&pmd->port_mutex);
826
0
        sorted_poll_list(pmd, &list, &n_rxq);
827
828
        /* Get the total pmd cycles for an interval. */
829
0
        atomic_read_relaxed(&pmd->intrvl_cycles, &total_pmd_cycles);
830
        /* Calculate how many intervals are to be used. */
831
0
        intervals = DIV_ROUND_UP(secs,
832
0
                                 PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC);
833
        /* Estimate the cycles to cover all intervals. */
834
0
        total_pmd_cycles *= intervals;
835
0
        busy_pmd_cycles = get_interval_values(pmd->busy_cycles_intrvl,
836
0
                                              &pmd->intrvl_idx,
837
0
                                              intervals);
838
0
        if (busy_pmd_cycles > total_pmd_cycles) {
839
0
            busy_pmd_cycles = total_pmd_cycles;
840
0
        }
841
842
0
        for (int i = 0; i < n_rxq; i++) {
843
0
            struct dp_netdev_rxq *rxq = list[i].rxq;
844
0
            const char *name = netdev_rxq_get_name(rxq->rx);
845
0
            uint64_t rxq_proc_cycles = 0;
846
847
0
            rxq_proc_cycles = get_interval_values(rxq->cycles_intrvl,
848
0
                                                  &rxq->intrvl_idx,
849
0
                                                  intervals);
850
0
            total_rxq_proc_cycles += rxq_proc_cycles;
851
0
            ds_put_format(reply, "  port: %-16s  queue-id: %2d", name,
852
0
                          netdev_rxq_get_queue_id(list[i].rxq->rx));
853
0
            ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx)
854
0
                                        ? "(enabled) " : "(disabled)");
855
0
            ds_put_format(reply, "  pmd usage: ");
856
0
            if (total_pmd_cycles) {
857
0
                ds_put_format(reply, "%2.0f %%",
858
0
                              (double) (rxq_proc_cycles * 100) /
859
0
                              total_pmd_cycles);
860
0
            } else {
861
0
                ds_put_format(reply, "%s", "NOT AVAIL");
862
0
            }
863
0
            ds_put_cstr(reply, "\n");
864
0
        }
865
866
0
        if (n_rxq > 0) {
867
0
            ds_put_cstr(reply, "  overhead: ");
868
0
            if (total_pmd_cycles) {
869
0
                uint64_t overhead_cycles = 0;
870
871
0
                if (total_rxq_proc_cycles < busy_pmd_cycles) {
872
0
                    overhead_cycles = busy_pmd_cycles - total_rxq_proc_cycles;
873
0
                }
874
875
0
                ds_put_format(reply, "%2.0f %%",
876
0
                              (double) (overhead_cycles * 100) /
877
0
                              total_pmd_cycles);
878
0
            } else {
879
0
                ds_put_cstr(reply, "NOT AVAIL");
880
0
            }
881
0
            ds_put_cstr(reply, "\n");
882
0
        }
883
884
0
        ovs_mutex_unlock(&pmd->port_mutex);
885
0
        free(list);
886
0
    }
887
0
}
888
889
static int
890
compare_poll_thread_list(const void *a_, const void *b_)
891
0
{
892
0
    const struct dp_netdev_pmd_thread *a, *b;
893
894
0
    a = *(struct dp_netdev_pmd_thread **)a_;
895
0
    b = *(struct dp_netdev_pmd_thread **)b_;
896
897
0
    if (a->core_id < b->core_id) {
898
0
        return -1;
899
0
    }
900
0
    if (a->core_id > b->core_id) {
901
0
        return 1;
902
0
    }
903
0
    return 0;
904
0
}
905
906
/* Create a sorted list of pmd's from the dp->poll_threads cmap. We can use
907
 * this list, as long as we do not go to quiescent state. */
908
static void
909
sorted_poll_thread_list(struct dp_netdev *dp,
910
                        struct dp_netdev_pmd_thread ***list,
911
                        size_t *n)
912
0
{
913
0
    struct dp_netdev_pmd_thread *pmd;
914
0
    struct dp_netdev_pmd_thread **pmd_list;
915
0
    size_t k = 0, n_pmds;
916
917
0
    n_pmds = cmap_count(&dp->poll_threads);
918
0
    pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
919
920
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
921
0
        if (k >= n_pmds) {
922
0
            break;
923
0
        }
924
0
        pmd_list[k++] = pmd;
925
0
    }
926
927
0
    qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list);
928
929
0
    *list = pmd_list;
930
0
    *n = k;
931
0
}
932
933
static void
934
dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
935
                          const char *argv[], void *aux OVS_UNUSED)
936
0
{
937
0
    struct ds reply = DS_EMPTY_INITIALIZER;
938
0
    struct dp_netdev *dp = NULL;
939
940
0
    ovs_mutex_lock(&dp_netdev_mutex);
941
942
0
    if (argc == 2) {
943
0
        dp = shash_find_data(&dp_netdevs, argv[1]);
944
0
    } else if (shash_count(&dp_netdevs) == 1) {
945
        /* There's only one datapath */
946
0
        dp = shash_first(&dp_netdevs)->data;
947
0
    }
948
949
0
    if (!dp) {
950
0
        ovs_mutex_unlock(&dp_netdev_mutex);
951
0
        unixctl_command_reply_error(conn,
952
0
                                    "please specify an existing datapath");
953
0
        return;
954
0
    }
955
956
0
    dp_netdev_request_reconfigure(dp);
957
0
    ovs_mutex_unlock(&dp_netdev_mutex);
958
0
    ds_put_cstr(&reply, "pmd rxq rebalance requested.\n");
959
0
    unixctl_command_reply(conn, ds_cstr(&reply));
960
0
    ds_destroy(&reply);
961
0
}
962
963
static void
964
pmd_info_show_sleep(struct ds *reply, unsigned core_id, int numa_id,
965
                    uint64_t pmd_max_sleep)
966
0
{
967
0
    if (core_id == NON_PMD_CORE_ID) {
968
0
        return;
969
0
    }
970
0
    ds_put_format(reply,
971
0
                  "pmd thread numa_id %d core_id %d:\n"
972
0
                  "  max sleep: %4"PRIu64" us\n",
973
0
                  numa_id, core_id, pmd_max_sleep);
974
0
}
975
976
static void
977
dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
978
                     void *aux)
979
0
{
980
0
    struct ds reply = DS_EMPTY_INITIALIZER;
981
0
    struct dp_netdev_pmd_thread **pmd_list;
982
0
    struct dp_netdev *dp = NULL;
983
0
    enum pmd_info_type type = *(enum pmd_info_type *) aux;
984
0
    unsigned int core_id;
985
0
    bool filter_on_pmd = false;
986
0
    size_t n;
987
0
    unsigned int secs = 0;
988
0
    unsigned long long max_secs = (PMD_INTERVAL_LEN * PMD_INTERVAL_MAX)
989
0
                                      / INTERVAL_USEC_TO_SEC;
990
0
    bool show_header = true;
991
0
    uint64_t max_sleep;
992
993
0
    ovs_mutex_lock(&dp_netdev_mutex);
994
995
0
    while (argc > 1) {
996
0
        if (!strcmp(argv[1], "-pmd") && argc > 2) {
997
0
            if (str_to_uint(argv[2], 10, &core_id)) {
998
0
                filter_on_pmd = true;
999
0
            }
1000
0
            argc -= 2;
1001
0
            argv += 2;
1002
0
        } else if (type == PMD_INFO_SHOW_RXQ &&
1003
0
                       !strcmp(argv[1], "-secs") &&
1004
0
                       argc > 2) {
1005
0
            if (!str_to_uint(argv[2], 10, &secs)) {
1006
0
                secs = max_secs;
1007
0
            }
1008
0
            argc -= 2;
1009
0
            argv += 2;
1010
0
        } else {
1011
0
            dp = shash_find_data(&dp_netdevs, argv[1]);
1012
0
            argc -= 1;
1013
0
            argv += 1;
1014
0
        }
1015
0
    }
1016
1017
0
    if (!dp) {
1018
0
        if (shash_count(&dp_netdevs) == 1) {
1019
            /* There's only one datapath */
1020
0
            dp = shash_first(&dp_netdevs)->data;
1021
0
        } else {
1022
0
            ovs_mutex_unlock(&dp_netdev_mutex);
1023
0
            unixctl_command_reply_error(conn,
1024
0
                                        "please specify an existing datapath");
1025
0
            return;
1026
0
        }
1027
0
    }
1028
1029
0
    sorted_poll_thread_list(dp, &pmd_list, &n);
1030
0
    for (size_t i = 0; i < n; i++) {
1031
0
        struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1032
0
        if (!pmd) {
1033
0
            break;
1034
0
        }
1035
0
        if (filter_on_pmd && pmd->core_id != core_id) {
1036
0
            continue;
1037
0
        }
1038
0
        if (type == PMD_INFO_SHOW_RXQ) {
1039
0
            if (show_header) {
1040
0
                if (!secs || secs > max_secs) {
1041
0
                    secs = max_secs;
1042
0
                } else {
1043
0
                    secs = ROUND_UP(secs,
1044
0
                                    PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC);
1045
0
                }
1046
0
                ds_put_format(&reply, "Displaying last %u seconds "
1047
0
                              "pmd usage %%\n", secs);
1048
0
                show_header = false;
1049
0
            }
1050
0
            pmd_info_show_rxq(&reply, pmd, secs);
1051
0
        } else if (type == PMD_INFO_CLEAR_STATS) {
1052
0
            pmd_perf_stats_clear(&pmd->perf_stats);
1053
0
        } else if (type == PMD_INFO_SHOW_STATS) {
1054
0
            pmd_info_show_stats(&reply, pmd);
1055
0
        } else if (type == PMD_INFO_PERF_SHOW) {
1056
0
            pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux);
1057
0
        } else if (type == PMD_INFO_SLEEP_SHOW) {
1058
0
            if (show_header) {
1059
0
                ds_put_format(&reply, "Default max sleep: %4"PRIu64" us\n",
1060
0
                              dp->pmd_max_sleep_default);
1061
0
                show_header = false;
1062
0
            }
1063
0
            atomic_read_relaxed(&pmd->max_sleep, &max_sleep);
1064
0
            pmd_info_show_sleep(&reply, pmd->core_id, pmd->numa_id,
1065
0
                                max_sleep);
1066
0
        }
1067
0
    }
1068
0
    free(pmd_list);
1069
1070
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1071
1072
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1073
0
    ds_destroy(&reply);
1074
0
}
1075
1076
static void
1077
pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
1078
                          const char *argv[],
1079
                          void *aux OVS_UNUSED)
1080
0
{
1081
0
    struct pmd_perf_params par;
1082
0
    long int it_hist = 0, ms_hist = 0;
1083
0
    par.histograms = true;
1084
1085
0
    while (argc > 1) {
1086
0
        if (!strcmp(argv[1], "-nh")) {
1087
0
            par.histograms = false;
1088
0
            argc -= 1;
1089
0
            argv += 1;
1090
0
        } else if (!strcmp(argv[1], "-it") && argc > 2) {
1091
0
            it_hist = strtol(argv[2], NULL, 10);
1092
0
            if (it_hist < 0) {
1093
0
                it_hist = 0;
1094
0
            } else if (it_hist > HISTORY_LEN) {
1095
0
                it_hist = HISTORY_LEN;
1096
0
            }
1097
0
            argc -= 2;
1098
0
            argv += 2;
1099
0
        } else if (!strcmp(argv[1], "-ms") && argc > 2) {
1100
0
            ms_hist = strtol(argv[2], NULL, 10);
1101
0
            if (ms_hist < 0) {
1102
0
                ms_hist = 0;
1103
0
            } else if (ms_hist > HISTORY_LEN) {
1104
0
                ms_hist = HISTORY_LEN;
1105
0
            }
1106
0
            argc -= 2;
1107
0
            argv += 2;
1108
0
        } else {
1109
0
            break;
1110
0
        }
1111
0
    }
1112
0
    par.iter_hist_len = it_hist;
1113
0
    par.ms_hist_len = ms_hist;
1114
0
    par.command_type = PMD_INFO_PERF_SHOW;
1115
0
    dpif_netdev_pmd_info(conn, argc, argv, &par);
1116
0
}
1117
1118
static void
1119
dpif_netdev_bond_show(struct unixctl_conn *conn, int argc,
1120
                      const char *argv[], void *aux OVS_UNUSED)
1121
0
{
1122
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1123
0
    struct dp_netdev *dp = NULL;
1124
1125
0
    ovs_mutex_lock(&dp_netdev_mutex);
1126
0
    if (argc == 2) {
1127
0
        dp = shash_find_data(&dp_netdevs, argv[1]);
1128
0
    } else if (shash_count(&dp_netdevs) == 1) {
1129
        /* There's only one datapath. */
1130
0
        dp = shash_first(&dp_netdevs)->data;
1131
0
    }
1132
0
    if (!dp) {
1133
0
        ovs_mutex_unlock(&dp_netdev_mutex);
1134
0
        unixctl_command_reply_error(conn,
1135
0
                                    "please specify an existing datapath");
1136
0
        return;
1137
0
    }
1138
1139
0
    if (cmap_count(&dp->tx_bonds) > 0) {
1140
0
        struct tx_bond *dp_bond_entry;
1141
1142
0
        ds_put_cstr(&reply, "Bonds:\n");
1143
0
        CMAP_FOR_EACH (dp_bond_entry, node, &dp->tx_bonds) {
1144
0
            ds_put_format(&reply, "  bond-id %"PRIu32":\n",
1145
0
                          dp_bond_entry->bond_id);
1146
0
            for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
1147
0
                uint32_t member_id = odp_to_u32(
1148
0
                    dp_bond_entry->member_buckets[bucket].member_id);
1149
0
                ds_put_format(&reply,
1150
0
                              "    bucket %d - member %"PRIu32"\n",
1151
0
                              bucket, member_id);
1152
0
            }
1153
0
        }
1154
0
    }
1155
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1156
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1157
0
    ds_destroy(&reply);
1158
0
}
1159
1160

1161
static int
1162
dpif_netdev_init(void)
1163
0
{
1164
0
    static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
1165
0
                              clear_aux = PMD_INFO_CLEAR_STATS,
1166
0
                              poll_aux = PMD_INFO_SHOW_RXQ,
1167
0
                              sleep_aux = PMD_INFO_SLEEP_SHOW;
1168
1169
0
    unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]",
1170
0
                             0, 3, dpif_netdev_pmd_info,
1171
0
                             (void *)&show_aux);
1172
0
    unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
1173
0
                             0, 3, dpif_netdev_pmd_info,
1174
0
                             (void *)&clear_aux);
1175
0
    unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] "
1176
0
                             "[-secs secs] [dp]",
1177
0
                             0, 5, dpif_netdev_pmd_info,
1178
0
                             (void *)&poll_aux);
1179
0
    unixctl_command_register("dpif-netdev/pmd-sleep-show", "[dp]",
1180
0
                             0, 1, dpif_netdev_pmd_info,
1181
0
                             (void *)&sleep_aux);
1182
0
    unixctl_command_register("dpif-netdev/pmd-perf-show",
1183
0
                             "[-nh] [-it iter-history-len]"
1184
0
                             " [-ms ms-history-len]"
1185
0
                             " [-pmd core] [dp]",
1186
0
                             0, 8, pmd_perf_show_cmd,
1187
0
                             NULL);
1188
0
    unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
1189
0
                             0, 1, dpif_netdev_pmd_rebalance,
1190
0
                             NULL);
1191
0
    unixctl_command_register("dpif-netdev/pmd-perf-log-set",
1192
0
                             "on|off [-b before] [-a after] [-e|-ne] "
1193
0
                             "[-us usec] [-q qlen]",
1194
0
                             0, 10, pmd_perf_log_set_cmd,
1195
0
                             NULL);
1196
0
    unixctl_command_register("dpif-netdev/bond-show", "[dp]",
1197
0
                             0, 1, dpif_netdev_bond_show,
1198
0
                             NULL);
1199
0
    return 0;
1200
0
}
1201
1202
static int
1203
dpif_netdev_enumerate(struct sset *all_dps,
1204
                      const struct dpif_class *dpif_class)
1205
0
{
1206
0
    struct shash_node *node;
1207
1208
0
    ovs_mutex_lock(&dp_netdev_mutex);
1209
0
    SHASH_FOR_EACH(node, &dp_netdevs) {
1210
0
        struct dp_netdev *dp = node->data;
1211
0
        if (dpif_class != dp->class) {
1212
            /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
1213
             * If the class doesn't match, skip this dpif. */
1214
0
             continue;
1215
0
        }
1216
0
        sset_add(all_dps, node->name);
1217
0
    }
1218
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1219
1220
0
    return 0;
1221
0
}
1222
1223
static bool
1224
dpif_netdev_class_is_dummy(const struct dpif_class *class)
1225
0
{
1226
0
    return class != &dpif_netdev_class;
1227
0
}
1228
1229
/* Returns the type string to open a port of the given 'type' with.  Every
 * type except "internal" is returned unchanged; "internal" becomes
 * "dummy-internal" for dummy classes and "tap" otherwise. */
static const char *
dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
{
    if (strcmp(type, "internal")) {
        return type;
    }
    if (dpif_netdev_class_is_dummy(class)) {
        return "dummy-internal";
    }
    return "tap";
}
1236
1237
static struct dpif *
1238
create_dpif_netdev(struct dp_netdev *dp)
1239
0
{
1240
0
    uint16_t netflow_id = hash_string(dp->name, 0);
1241
0
    struct dpif_netdev *dpif;
1242
1243
0
    ovs_refcount_ref(&dp->ref_cnt);
1244
1245
0
    dpif = xmalloc(sizeof *dpif);
1246
0
    dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
1247
0
    dpif->dp = dp;
1248
0
    dpif->last_port_seq = seq_read(dp->port_seq);
1249
1250
0
    return &dpif->dpif;
1251
0
}
1252
1253
/* Choose an unused, non-zero port number and return it on success.
1254
 * Return ODPP_NONE on failure. */
1255
static odp_port_t
1256
choose_port(struct dp_netdev *dp, const char *name)
1257
    OVS_REQ_RDLOCK(dp->port_rwlock)
1258
0
{
1259
0
    uint32_t port_no;
1260
1261
0
    if (dp->class != &dpif_netdev_class) {
1262
0
        const char *p;
1263
0
        int start_no = 0;
1264
1265
        /* If the port name begins with "br", start the number search at
1266
         * 100 to make writing tests easier. */
1267
0
        if (!strncmp(name, "br", 2)) {
1268
0
            start_no = 100;
1269
0
        }
1270
1271
        /* If the port name contains a number, try to assign that port number.
1272
         * This can make writing unit tests easier because port numbers are
1273
         * predictable. */
1274
0
        for (p = name; *p != '\0'; p++) {
1275
0
            if (isdigit((unsigned char) *p)) {
1276
0
                port_no = start_no + strtol(p, NULL, 10);
1277
0
                if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
1278
0
                    && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1279
0
                    return u32_to_odp(port_no);
1280
0
                }
1281
0
                break;
1282
0
            }
1283
0
        }
1284
0
    }
1285
1286
0
    for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
1287
0
        if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1288
0
            return u32_to_odp(port_no);
1289
0
        }
1290
0
    }
1291
1292
0
    return ODPP_NONE;
1293
0
}
1294
1295
/* Returns the hash under which the meter 'meter_id' is stored, which is the
 * id itself.
 *
 * In the ofproto-dpif layer, the id-pool allocates meter ids in order
 * (e.g. 1, 2, ... N), which already gives a good hash distribution, so the
 * ids are used directly instead of a hash_xxx function for performance. */
static uint32_t
dp_meter_hash(uint32_t meter_id)
{
    uint32_t hash = meter_id;

    return hash;
}
1304
1305
static void
1306
dp_netdev_meter_destroy(struct dp_netdev *dp)
1307
0
{
1308
0
    struct dp_meter *m;
1309
1310
0
    ovs_mutex_lock(&dp->meters_lock);
1311
0
    CMAP_FOR_EACH (m, node, &dp->meters) {
1312
0
        cmap_remove(&dp->meters, &m->node, dp_meter_hash(m->id));
1313
0
        ovsrcu_postpone(free, m);
1314
0
    }
1315
1316
0
    cmap_destroy(&dp->meters);
1317
0
    ovs_mutex_unlock(&dp->meters_lock);
1318
0
    ovs_mutex_destroy(&dp->meters_lock);
1319
0
}
1320
1321
static struct dp_meter *
1322
dp_meter_lookup(struct cmap *meters, uint32_t meter_id)
1323
0
{
1324
0
    uint32_t hash = dp_meter_hash(meter_id);
1325
0
    struct dp_meter *m;
1326
1327
0
    CMAP_FOR_EACH_WITH_HASH (m, node, hash, meters) {
1328
0
        if (m->id == meter_id) {
1329
0
            return m;
1330
0
        }
1331
0
    }
1332
1333
0
    return NULL;
1334
0
}
1335
1336
static void
1337
dp_meter_detach_free(struct cmap *meters, uint32_t meter_id)
1338
0
{
1339
0
    struct dp_meter *m = dp_meter_lookup(meters, meter_id);
1340
1341
0
    if (m) {
1342
0
        cmap_remove(meters, &m->node, dp_meter_hash(meter_id));
1343
0
        ovsrcu_postpone(free, m);
1344
0
    }
1345
0
}
1346
1347
static void
1348
dp_meter_attach(struct cmap *meters, struct dp_meter *meter)
1349
0
{
1350
0
    cmap_insert(meters, &meter->node, dp_meter_hash(meter->id));
1351
0
}
1352
1353
static int
1354
create_dp_netdev(const char *name, const struct dpif_class *class,
1355
                 struct dp_netdev **dpp)
1356
    OVS_REQUIRES(dp_netdev_mutex)
1357
0
{
1358
0
    static struct ovsthread_once tsc_freq_check = OVSTHREAD_ONCE_INITIALIZER;
1359
0
    struct dp_netdev *dp;
1360
0
    int error;
1361
1362
    /* Avoid estimating TSC frequency for dummy datapath to not slow down
1363
     * unit tests. */
1364
0
    if (!dpif_netdev_class_is_dummy(class)
1365
0
        && ovsthread_once_start(&tsc_freq_check)) {
1366
0
        pmd_perf_estimate_tsc_frequency();
1367
0
        ovsthread_once_done(&tsc_freq_check);
1368
0
    }
1369
1370
0
    dp = xzalloc(sizeof *dp);
1371
0
    shash_add(&dp_netdevs, name, dp);
1372
1373
0
    *CONST_CAST(const struct dpif_class **, &dp->class) = class;
1374
0
    *CONST_CAST(const char **, &dp->name) = xstrdup(name);
1375
0
    *CONST_CAST(const char **, &dp->full_name) = xasprintf("%s@%s",
1376
0
                                                           class->type, name);
1377
0
    ovs_refcount_init(&dp->ref_cnt);
1378
0
    atomic_flag_clear(&dp->destroyed);
1379
1380
0
    ovs_rwlock_init(&dp->port_rwlock);
1381
0
    hmap_init(&dp->ports);
1382
0
    dp->port_seq = seq_create();
1383
0
    ovs_mutex_init(&dp->bond_mutex);
1384
0
    cmap_init(&dp->tx_bonds);
1385
1386
0
    fat_rwlock_init(&dp->upcall_rwlock);
1387
1388
0
    dp->reconfigure_seq = seq_create();
1389
0
    dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
1390
0
    dp->once_set_config = (struct ovsthread_once) OVSTHREAD_ONCE_INITIALIZER;
1391
1392
    /* Init meter resources. */
1393
0
    cmap_init(&dp->meters);
1394
0
    ovs_mutex_init(&dp->meters_lock);
1395
1396
    /* Disable upcalls by default. */
1397
0
    dp_netdev_disable_upcall(dp);
1398
0
    dp->upcall_aux = NULL;
1399
0
    dp->upcall_cb = NULL;
1400
1401
0
    dp->conntrack = conntrack_init();
1402
1403
0
    atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
1404
0
    atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
1405
1406
0
    cmap_init(&dp->poll_threads);
1407
0
    dp->pmd_rxq_assign_type = SCHED_CYCLES;
1408
1409
0
    ovs_mutex_init(&dp->tx_qid_pool_mutex);
1410
    /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
1411
0
    dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
1412
1413
0
    ovs_mutex_init_recursive(&dp->non_pmd_mutex);
1414
0
    ovsthread_key_create(&dp->per_pmd_key, NULL);
1415
1416
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
1417
    /* non-PMD will be created before all other threads and will
1418
     * allocate static_tx_qid = 0. */
1419
0
    dp_netdev_set_nonpmd(dp);
1420
1421
0
    error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
1422
0
                                                             "internal"),
1423
0
                        ODPP_LOCAL);
1424
0
    ovs_rwlock_unlock(&dp->port_rwlock);
1425
0
    if (error) {
1426
0
        dp_netdev_free(dp);
1427
0
        return error;
1428
0
    }
1429
1430
0
    dp->max_sleep_list = NULL;
1431
1432
0
    dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
1433
0
    *dpp = dp;
1434
0
    return 0;
1435
0
}
1436
1437
static void
1438
dp_netdev_request_reconfigure(struct dp_netdev *dp)
1439
0
{
1440
0
    seq_change(dp->reconfigure_seq);
1441
0
}
1442
1443
static bool
1444
dp_netdev_is_reconf_required(struct dp_netdev *dp)
1445
0
{
1446
0
    return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1447
0
}
1448
1449
static int
1450
dpif_netdev_open(const struct dpif_class *class, const char *name,
1451
                 bool create, struct dpif **dpifp)
1452
0
{
1453
0
    struct dp_netdev *dp;
1454
0
    int error;
1455
1456
0
    ovs_mutex_lock(&dp_netdev_mutex);
1457
0
    dp = shash_find_data(&dp_netdevs, name);
1458
0
    if (!dp) {
1459
0
        error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
1460
0
    } else {
1461
0
        error = (dp->class != class ? EINVAL
1462
0
                 : create ? EEXIST
1463
0
                 : 0);
1464
0
    }
1465
0
    if (!error) {
1466
0
        *dpifp = create_dpif_netdev(dp);
1467
0
    }
1468
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1469
1470
0
    return error;
1471
0
}
1472
1473
static void
1474
dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1475
    OVS_NO_THREAD_SAFETY_ANALYSIS
1476
0
{
1477
    /* Check that upcalls are disabled, i.e. that the rwlock is taken */
1478
0
    ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1479
1480
    /* Before freeing a lock we should release it */
1481
0
    fat_rwlock_unlock(&dp->upcall_rwlock);
1482
0
    fat_rwlock_destroy(&dp->upcall_rwlock);
1483
0
}
1484
1485
/* Returns the hash of 'bond_id', computed by hash_int() with basis 0. */
static uint32_t
hash_bond_id(uint32_t bond_id)
{
    uint32_t hash = hash_int(bond_id, 0);

    return hash;
}
1490
1491
/* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1492
 * through the 'dp_netdevs' shash while freeing 'dp'. */
1493
static void
1494
dp_netdev_free(struct dp_netdev *dp)
1495
    OVS_REQUIRES(dp_netdev_mutex)
1496
0
{
1497
0
    struct dp_netdev_port *port;
1498
0
    struct tx_bond *bond;
1499
1500
0
    shash_find_and_delete(&dp_netdevs, dp->name);
1501
1502
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
1503
0
    HMAP_FOR_EACH_SAFE (port, node, &dp->ports) {
1504
0
        do_del_port(dp, port);
1505
0
    }
1506
0
    ovs_rwlock_unlock(&dp->port_rwlock);
1507
1508
0
    ovs_mutex_lock(&dp->bond_mutex);
1509
0
    CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
1510
0
        cmap_remove(&dp->tx_bonds, &bond->node, hash_bond_id(bond->bond_id));
1511
0
        ovsrcu_postpone(free, bond);
1512
0
    }
1513
0
    ovs_mutex_unlock(&dp->bond_mutex);
1514
1515
0
    dp_netdev_destroy_all_pmds(dp, true);
1516
0
    cmap_destroy(&dp->poll_threads);
1517
1518
0
    ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
1519
0
    id_pool_destroy(dp->tx_qid_pool);
1520
1521
0
    ovs_mutex_destroy(&dp->non_pmd_mutex);
1522
0
    ovsthread_key_delete(dp->per_pmd_key);
1523
1524
0
    conntrack_destroy(dp->conntrack);
1525
1526
1527
0
    seq_destroy(dp->reconfigure_seq);
1528
0
    ovsthread_once_destroy(&dp->once_set_config);
1529
1530
0
    seq_destroy(dp->port_seq);
1531
0
    hmap_destroy(&dp->ports);
1532
0
    ovs_rwlock_destroy(&dp->port_rwlock);
1533
1534
0
    cmap_destroy(&dp->tx_bonds);
1535
0
    ovs_mutex_destroy(&dp->bond_mutex);
1536
1537
    /* Upcalls must be disabled at this point */
1538
0
    dp_netdev_destroy_upcall_lock(dp);
1539
1540
0
    dp_netdev_meter_destroy(dp);
1541
1542
0
    free(dp->max_sleep_list);
1543
0
    free(dp->pmd_cmask);
1544
0
    free(CONST_CAST(char *, dp->name));
1545
0
    free(CONST_CAST(char *, dp->full_name));
1546
0
    free(dp);
1547
0
}
1548
1549
static void
1550
dp_netdev_unref(struct dp_netdev *dp)
1551
0
{
1552
0
    if (dp) {
1553
        /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1554
         * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1555
0
        ovs_mutex_lock(&dp_netdev_mutex);
1556
0
        if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1557
0
            dp_netdev_free(dp);
1558
0
        }
1559
0
        ovs_mutex_unlock(&dp_netdev_mutex);
1560
0
    }
1561
0
}
1562
1563
/* Closes 'dpif': drops its reference to the underlying datapath and frees the
 * dpif itself. */
static void
dpif_netdev_close(struct dpif *dpif)
{
    dp_netdev_unref(get_dp_netdev(dpif));
    free(dpif);
}
1571
1572
static int
1573
dpif_netdev_destroy(struct dpif *dpif)
1574
0
{
1575
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
1576
1577
0
    if (!atomic_flag_test_and_set(&dp->destroyed)) {
1578
0
        if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1579
            /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1580
0
            OVS_NOT_REACHED();
1581
0
        }
1582
0
    }
1583
1584
0
    return 0;
1585
0
}
1586
1587
/* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
1588
 * load/store semantics.  While the increment is not atomic, the load and
1589
 * store operations are, making it impossible to read inconsistent values.
1590
 *
1591
 * This is used to update thread local stats counters. */
1592
static void
1593
non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
1594
0
{
1595
0
    unsigned long long tmp;
1596
1597
0
    atomic_read_relaxed(var, &tmp);
1598
0
    tmp += n;
1599
0
    atomic_store_relaxed(var, tmp);
1600
0
}
1601
1602
static int
1603
dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
1604
0
{
1605
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
1606
0
    struct dp_netdev_pmd_thread *pmd;
1607
0
    uint64_t pmd_stats[PMD_N_STATS];
1608
1609
0
    stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
1610
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
1611
0
        stats->n_flows += cmap_count(&pmd->flow_table);
1612
0
        pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
1613
0
        stats->n_hit += pmd_stats[PMD_STAT_PHWOL_HIT];
1614
0
        stats->n_hit += pmd_stats[PMD_STAT_SIMPLE_HIT];
1615
0
        stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
1616
0
        stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT];
1617
0
        stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
1618
0
        stats->n_missed += pmd_stats[PMD_STAT_MISS];
1619
0
        stats->n_lost += pmd_stats[PMD_STAT_LOST];
1620
0
    }
1621
0
    stats->n_masks = UINT32_MAX;
1622
0
    stats->n_mask_hit = UINT64_MAX;
1623
0
    stats->n_cache_hit = UINT64_MAX;
1624
1625
0
    return 0;
1626
0
}
1627
1628
static void
1629
dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
1630
0
{
1631
0
    if (pmd->core_id == NON_PMD_CORE_ID) {
1632
0
        ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
1633
0
        ovs_mutex_lock(&pmd->port_mutex);
1634
0
        pmd_load_cached_ports(pmd);
1635
0
        ovs_mutex_unlock(&pmd->port_mutex);
1636
0
        ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
1637
0
        return;
1638
0
    }
1639
1640
0
    seq_change(pmd->reload_seq);
1641
0
    atomic_store_explicit(&pmd->reload, true, memory_order_release);
1642
0
}
1643
1644
static uint32_t
1645
hash_port_no(odp_port_t port_no)
1646
0
{
1647
0
    return hash_int(odp_to_u32(port_no), 0);
1648
0
}
1649
1650
static int
1651
port_create(const char *devname, const char *type,
1652
            odp_port_t port_no, struct dp_netdev_port **portp)
1653
0
{
1654
0
    struct dp_netdev_port *port;
1655
0
    enum netdev_flags flags;
1656
0
    struct netdev *netdev;
1657
0
    int error;
1658
1659
0
    *portp = NULL;
1660
1661
    /* Open and validate network device. */
1662
0
    error = netdev_open(devname, type, &netdev);
1663
0
    if (error) {
1664
0
        return error;
1665
0
    }
1666
    /* XXX reject non-Ethernet devices */
1667
1668
0
    netdev_get_flags(netdev, &flags);
1669
0
    if (flags & NETDEV_LOOPBACK) {
1670
0
        VLOG_ERR("%s: cannot add a loopback device", devname);
1671
0
        error = EINVAL;
1672
0
        goto out;
1673
0
    }
1674
1675
0
    port = xzalloc(sizeof *port);
1676
0
    port->port_no = port_no;
1677
0
    port->netdev = netdev;
1678
0
    port->type = xstrdup(type);
1679
0
    port->sf = NULL;
1680
0
    port->emc_enabled = true;
1681
0
    port->need_reconfigure = true;
1682
0
    ovs_mutex_init(&port->txq_used_mutex);
1683
1684
0
    *portp = port;
1685
1686
0
    return 0;
1687
1688
0
out:
1689
0
    netdev_close(netdev);
1690
0
    return error;
1691
0
}
1692
1693
static int
1694
do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
1695
            odp_port_t port_no)
1696
    OVS_REQ_WRLOCK(dp->port_rwlock)
1697
0
{
1698
0
    struct netdev_saved_flags *sf;
1699
0
    struct dp_netdev_port *port;
1700
0
    int error;
1701
1702
    /* Reject devices already in 'dp'. */
1703
0
    if (!get_port_by_name(dp, devname, &port)) {
1704
0
        return EEXIST;
1705
0
    }
1706
1707
0
    error = port_create(devname, type, port_no, &port);
1708
0
    if (error) {
1709
0
        return error;
1710
0
    }
1711
1712
0
    hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
1713
0
    seq_change(dp->port_seq);
1714
1715
0
    reconfigure_datapath(dp);
1716
1717
    /* Check that port was successfully configured. */
1718
0
    if (!dp_netdev_lookup_port(dp, port_no)) {
1719
0
        return EINVAL;
1720
0
    }
1721
1722
    /* Updating device flags triggers an if_notifier, which triggers a bridge
1723
     * reconfiguration and another attempt to add this port, leading to an
1724
     * infinite loop if the device is configured incorrectly and cannot be
1725
     * added.  Setting the promisc mode after a successful reconfiguration,
1726
     * since we already know that the device is somehow properly configured. */
1727
0
    error = netdev_turn_flags_on(port->netdev, NETDEV_PROMISC, &sf);
1728
0
    if (error) {
1729
0
        VLOG_ERR("%s: cannot set promisc flag", devname);
1730
0
        do_del_port(dp, port);
1731
0
        return error;
1732
0
    }
1733
0
    port->sf = sf;
1734
1735
0
    return 0;
1736
0
}
1737
1738
static int
1739
dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
1740
                     odp_port_t *port_nop)
1741
0
{
1742
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
1743
0
    char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
1744
0
    const char *dpif_port;
1745
0
    odp_port_t port_no;
1746
0
    int error;
1747
1748
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
1749
0
    dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
1750
0
    if (*port_nop != ODPP_NONE) {
1751
0
        port_no = *port_nop;
1752
0
        error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
1753
0
    } else {
1754
0
        port_no = choose_port(dp, dpif_port);
1755
0
        error = port_no == ODPP_NONE ? EFBIG : 0;
1756
0
    }
1757
0
    if (!error) {
1758
0
        *port_nop = port_no;
1759
0
        error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
1760
0
    }
1761
0
    ovs_rwlock_unlock(&dp->port_rwlock);
1762
1763
0
    return error;
1764
0
}
1765
1766
static int
1767
dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
1768
0
{
1769
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
1770
0
    int error;
1771
1772
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
1773
0
    if (port_no == ODPP_LOCAL) {
1774
0
        error = EINVAL;
1775
0
    } else {
1776
0
        struct dp_netdev_port *port;
1777
1778
0
        error = get_port_by_number(dp, port_no, &port);
1779
0
        if (!error) {
1780
0
            do_del_port(dp, port);
1781
0
        }
1782
0
    }
1783
0
    ovs_rwlock_unlock(&dp->port_rwlock);
1784
1785
0
    return error;
1786
0
}
1787
1788
static bool
1789
is_valid_port_number(odp_port_t port_no)
1790
0
{
1791
0
    return port_no != ODPP_NONE;
1792
0
}
1793
1794
static struct dp_netdev_port *
1795
dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
1796
    OVS_REQ_RDLOCK(dp->port_rwlock)
1797
0
{
1798
0
    struct dp_netdev_port *port;
1799
1800
0
    HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
1801
0
        if (port->port_no == port_no) {
1802
0
            return port;
1803
0
        }
1804
0
    }
1805
0
    return NULL;
1806
0
}
1807
1808
static int
1809
get_port_by_number(struct dp_netdev *dp,
1810
                   odp_port_t port_no, struct dp_netdev_port **portp)
1811
    OVS_REQ_RDLOCK(dp->port_rwlock)
1812
0
{
1813
0
    if (!is_valid_port_number(port_no)) {
1814
0
        *portp = NULL;
1815
0
        return EINVAL;
1816
0
    } else {
1817
0
        *portp = dp_netdev_lookup_port(dp, port_no);
1818
0
        return *portp ? 0 : ENODEV;
1819
0
    }
1820
0
}
1821
1822
static void
1823
port_destroy(struct dp_netdev_port *port)
1824
0
{
1825
0
    if (!port) {
1826
0
        return;
1827
0
    }
1828
1829
0
    netdev_close(port->netdev);
1830
0
    netdev_restore_flags(port->sf);
1831
1832
0
    for (unsigned i = 0; i < port->n_rxq; i++) {
1833
0
        netdev_rxq_close(port->rxqs[i].rx);
1834
0
    }
1835
0
    ovs_mutex_destroy(&port->txq_used_mutex);
1836
0
    free(port->rxq_affinity_list);
1837
0
    free(port->txq_used);
1838
0
    free(port->rxqs);
1839
0
    free(port->type);
1840
0
    free(port);
1841
0
}
1842
1843
static int
1844
get_port_by_name(struct dp_netdev *dp,
1845
                 const char *devname, struct dp_netdev_port **portp)
1846
    OVS_REQ_RDLOCK(dp->port_rwlock)
1847
0
{
1848
0
    struct dp_netdev_port *port;
1849
1850
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
1851
0
        if (!strcmp(netdev_get_name(port->netdev), devname)) {
1852
0
            *portp = port;
1853
0
            return 0;
1854
0
        }
1855
0
    }
1856
1857
    /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a non
1858
     * existing port. */
1859
0
    return ENODEV;
1860
0
}
1861
1862
/* Returns 'true' if there is a port with pmd netdev. */
1863
static bool
1864
has_pmd_port(struct dp_netdev *dp)
1865
    OVS_REQ_RDLOCK(dp->port_rwlock)
1866
0
{
1867
0
    struct dp_netdev_port *port;
1868
1869
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
1870
0
        if (netdev_is_pmd(port->netdev)) {
1871
0
            return true;
1872
0
        }
1873
0
    }
1874
1875
0
    return false;
1876
0
}
1877
1878
static void
1879
do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
1880
    OVS_REQ_WRLOCK(dp->port_rwlock)
1881
0
{
1882
0
    hmap_remove(&dp->ports, &port->node);
1883
0
    seq_change(dp->port_seq);
1884
1885
0
    reconfigure_datapath(dp);
1886
0
    port_destroy(port);
1887
0
}
1888
1889
static void
1890
answer_port_query(const struct dp_netdev_port *port,
1891
                  struct dpif_port *dpif_port)
1892
0
{
1893
0
    dpif_port->name = xstrdup(netdev_get_name(port->netdev));
1894
0
    dpif_port->type = xstrdup(port->type);
1895
0
    dpif_port->port_no = port->port_no;
1896
0
}
1897
1898
static int
1899
dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
1900
                                 struct dpif_port *dpif_port)
1901
0
{
1902
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
1903
0
    struct dp_netdev_port *port;
1904
0
    int error;
1905
1906
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
1907
0
    error = get_port_by_number(dp, port_no, &port);
1908
0
    if (!error && dpif_port) {
1909
0
        answer_port_query(port, dpif_port);
1910
0
    }
1911
0
    ovs_rwlock_unlock(&dp->port_rwlock);
1912
1913
0
    return error;
1914
0
}
1915
1916
static int
1917
dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
1918
                               struct dpif_port *dpif_port)
1919
0
{
1920
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
1921
0
    struct dp_netdev_port *port;
1922
0
    int error;
1923
1924
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
1925
0
    error = get_port_by_name(dp, devname, &port);
1926
0
    if (!error && dpif_port) {
1927
0
        answer_port_query(port, dpif_port);
1928
0
    }
1929
0
    ovs_rwlock_unlock(&dp->port_rwlock);
1930
1931
0
    return error;
1932
0
}
1933
1934
static void
1935
dp_netdev_flow_free(struct dp_netdev_flow *flow)
1936
0
{
1937
0
    dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
1938
0
    free(flow->dp_extra_info);
1939
0
    free(flow);
1940
0
}
1941
1942
void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
1943
0
{
1944
0
    if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
1945
0
        ovsrcu_postpone(dp_netdev_flow_free, flow);
1946
0
    }
1947
0
}
1948
1949
static inline struct dpcls *
1950
dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
1951
                           odp_port_t in_port)
1952
0
{
1953
0
    struct dpcls *cls;
1954
0
    uint32_t hash = hash_port_no(in_port);
1955
0
    CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
1956
0
        if (cls->in_port == in_port) {
1957
            /* Port classifier exists already */
1958
0
            return cls;
1959
0
        }
1960
0
    }
1961
0
    return NULL;
1962
0
}
1963
1964
static inline struct dpcls *
1965
dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
1966
                         odp_port_t in_port)
1967
    OVS_REQUIRES(pmd->flow_mutex)
1968
0
{
1969
0
    struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
1970
1971
0
    if (!cls) {
1972
0
        uint32_t hash = hash_port_no(in_port);
1973
1974
        /* Create new classifier for in_port */
1975
0
        cls = xmalloc(sizeof(*cls));
1976
0
        dpcls_init(cls);
1977
0
        cls->in_port = in_port;
1978
0
        cmap_insert(&pmd->classifiers, &cls->node, hash);
1979
0
        VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
1980
0
    }
1981
0
    return cls;
1982
0
}
1983
1984
static void
1985
log_netdev_flow_change(const struct dp_netdev_flow *flow,
1986
                       const struct match *match,
1987
                       const struct dp_netdev_actions *old_actions,
1988
                       const struct nlattr *actions,
1989
                       size_t actions_len)
1990
0
{
1991
0
    struct ds ds = DS_EMPTY_INITIALIZER;
1992
0
    struct ofpbuf key_buf, mask_buf;
1993
0
    struct odp_flow_key_parms odp_parms = {
1994
0
        .flow = &match->flow,
1995
0
        .mask = &match->wc.masks,
1996
0
        .support = dp_netdev_support,
1997
0
    };
1998
1999
0
    if (OVS_LIKELY(VLOG_DROP_DBG((&upcall_rl)))) {
2000
0
        return;
2001
0
    }
2002
2003
0
    ofpbuf_init(&key_buf, 0);
2004
0
    ofpbuf_init(&mask_buf, 0);
2005
2006
0
    odp_flow_key_from_flow(&odp_parms, &key_buf);
2007
0
    odp_parms.key_buf = &key_buf;
2008
0
    odp_flow_key_from_mask(&odp_parms, &mask_buf);
2009
2010
0
    if (old_actions) {
2011
0
        ds_put_cstr(&ds, "flow_mod: ");
2012
0
    } else {
2013
0
        ds_put_cstr(&ds, "flow_add: ");
2014
0
    }
2015
0
    odp_format_ufid(&flow->ufid, &ds);
2016
0
    ds_put_cstr(&ds, " mega_");
2017
0
    odp_format_ufid(&flow->mega_ufid, &ds);
2018
0
    ds_put_cstr(&ds, " ");
2019
0
    odp_flow_format(key_buf.data, key_buf.size,
2020
0
                    mask_buf.data, mask_buf.size,
2021
0
                    NULL, &ds, false, true);
2022
0
    if (old_actions) {
2023
0
        ds_put_cstr(&ds, ", old_actions:");
2024
0
        format_odp_actions(&ds, old_actions->actions, old_actions->size,
2025
0
                           NULL);
2026
0
    }
2027
0
    ds_put_cstr(&ds, ", actions:");
2028
0
    format_odp_actions(&ds, actions, actions_len, NULL);
2029
2030
0
    VLOG_DBG("%s", ds_cstr(&ds));
2031
2032
0
    ofpbuf_uninit(&key_buf);
2033
0
    ofpbuf_uninit(&mask_buf);
2034
2035
    /* Add a printout of the actual match installed. */
2036
0
    struct match m;
2037
0
    ds_clear(&ds);
2038
0
    ds_put_cstr(&ds, "flow match: ");
2039
0
    miniflow_expand(&flow->cr.flow.mf, &m.flow);
2040
0
    miniflow_expand(&flow->cr.mask->mf, &m.wc.masks);
2041
0
    memset(&m.tun_md, 0, sizeof m.tun_md);
2042
0
    match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY);
2043
2044
0
    VLOG_DBG("%s", ds_cstr(&ds));
2045
2046
0
    ds_destroy(&ds);
2047
0
}
2048
2049
/* Offloaded flows can be handled asynchronously, so we do not always know
2050
 * whether a specific flow is offloaded or not.  It might still be pending;
2051
 * in fact, multiple modifications can be pending, and the actual offload
2052
 * state depends on the completion of each modification.
2053
 *
2054
 * To correctly determine whether a flow is offloaded when it is being
2055
 * destroyed (and therefore requires cleanup), we must ensure that all
2056
 * operations have completed.  To achieve this, we track the number of
2057
 * outstanding offloaded flow modifications. */
2058
static bool
2059
offload_queue_inc(struct dp_netdev_flow *flow)
2060
0
{
2061
0
    int current;
2062
2063
0
    while (true) {
2064
0
        atomic_read(&flow->offload_queue_depth, &current);
2065
0
        if (current < 0) {
2066
            /* We are cleaning up, so no longer enqueue operations. */
2067
0
            return false;
2068
0
        }
2069
2070
        /* Here we try to atomically increase the value.  If we do not succeed,
2071
         * someone else has modified it, and we need to check again for a
2072
         * current negative value. */
2073
0
        if (atomic_compare_exchange_strong(&flow->offload_queue_depth,
2074
0
                                           &current, current + 1)) {
2075
0
            return true;
2076
0
        }
2077
0
    }
2078
0
}
2079
2080
static bool
2081
offload_queue_dec(struct dp_netdev_flow *flow)
2082
0
{
2083
0
    int old;
2084
2085
0
    atomic_sub(&flow->offload_queue_depth, 1, &old);
2086
0
    ovs_assert(old >= 1);
2087
2088
0
    if (old == 1) {
2089
        /* Note that this only indicates that the queue might be empty. */
2090
0
        return true;
2091
0
    }
2092
0
    return false;
2093
0
}
2094
2095
static bool
2096
offload_queue_complete(struct dp_netdev_flow *flow)
2097
0
{
2098
    /* This function returns false if the queue is still in use.
2099
     * If the queue is empty, it will attempt to atomically mark it as
2100
     * 'not in use' by making the queue depth negative.  This prevents
2101
     * other flow operations from being added.  If successful, it returns
2102
     * true. */
2103
0
     int expected_val = 0;
2104
2105
0
    return atomic_compare_exchange_strong(&flow->offload_queue_depth,
2106
0
                                          &expected_val, -1);
2107
0
}
2108
2109
static void
2110
offload_flow_reference_unreference_cb(unsigned pmd_id OVS_UNUSED,
2111
                                      void *flow_reference_)
2112
0
{
2113
0
    struct dp_netdev_flow *flow_reference = flow_reference_;
2114
2115
0
    if (flow_reference) {
2116
0
        flow_reference->offloaded = false;
2117
0
        dp_netdev_flow_unref(flow_reference);
2118
0
    }
2119
0
}
2120
2121
static void
2122
offload_flow_del_resume(struct dp_netdev_flow *flow_reference,
2123
                        int error)
2124
0
{
2125
0
    if (error == EINPROGRESS) {
2126
0
        return;
2127
0
    }
2128
2129
0
    if (error) {
2130
0
        odp_port_t in_port = flow_reference->flow.in_port.odp_port;
2131
2132
0
        VLOG_DBG(
2133
0
            "Failed removing offload flow ufid " UUID_FMT " from port %d: %d",
2134
0
            UUID_ARGS((struct uuid *)&flow_reference->mega_ufid), in_port,
2135
0
            error);
2136
0
    } else {
2137
        /* Release because we successfully removed the reference. */
2138
0
        dp_netdev_flow_unref(flow_reference);
2139
0
    }
2140
2141
    /* Release as we took a reference in offload_flow_del(). */
2142
0
    dp_netdev_flow_unref(flow_reference);
2143
0
}
2144
2145
static void
2146
offload_flow_del_resume_cb(void *aux OVS_UNUSED,
2147
                           struct dpif_flow_stats *stats OVS_UNUSED,
2148
                           unsigned pmd_id OVS_UNUSED,
2149
                           void *flow_reference,
2150
                           void *previous_flow_reference OVS_UNUSED, int error)
2151
0
{
2152
0
    offload_flow_del_resume(flow_reference, error);
2153
0
}
2154
2155
static void
2156
offload_flow_del(struct dp_netdev *dp, unsigned pmd_id,
2157
                 struct dp_netdev_flow *flow)
2158
0
{
2159
0
    odp_port_t in_port = flow->flow.in_port.odp_port;
2160
0
    struct dpif_offload_flow_del del = {
2161
0
        .in_port = in_port,
2162
0
        .pmd_id = pmd_id,
2163
0
        .ufid = CONST_CAST(ovs_u128 *, &flow->mega_ufid),
2164
0
        .flow_reference = flow,
2165
0
        .stats = NULL,
2166
0
        .cb_data = { .callback = offload_flow_del_resume_cb },
2167
0
    };
2168
0
    int error;
2169
2170
0
    if (!dpif_offload_enabled()) {
2171
0
        return;
2172
0
    }
2173
2174
    /* This offload flow delete is only called when the actual flow is
2175
     * destructed.  However, we can only trust the state of flow->offloaded
2176
     * if no more flow_put operations are pending.  Below, we check whether
2177
     * the queue can be marked as complete, and then determine if we need
2178
     * to schedule a removal.  If not, the delete will be rescheduled later
2179
     * in the last offload_flow_put_resume_cb() callback. */
2180
0
    ovs_assert(flow->dead);
2181
0
    if (!offload_queue_complete(flow) || !flow->offloaded) {
2182
0
        return;
2183
0
    }
2184
2185
0
    flow->offloaded = false;
2186
0
    dp_netdev_flow_ref(flow);
2187
2188
    /* It's the responsibility of the offload provider to remove the
2189
     * actual rule from hardware only if none of the other PMD threads
2190
     * have the rule installed in hardware. */
2191
0
    error = dpif_offload_datapath_flow_del(dp->full_name, &del);
2192
0
    offload_flow_del_resume(flow, error);
2193
0
}
2194
2195
static void
2196
dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
2197
                          struct dp_netdev_flow *flow)
2198
    OVS_REQUIRES(pmd->flow_mutex)
2199
0
{
2200
0
    struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
2201
0
    struct dpcls *cls;
2202
0
    odp_port_t in_port = flow->flow.in_port.odp_port;
2203
2204
0
    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2205
0
    ovs_assert(cls != NULL);
2206
0
    dpcls_remove(cls, &flow->cr);
2207
0
    dp_netdev_simple_match_remove(pmd, flow);
2208
0
    cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
2209
0
    ccmap_dec(&pmd->n_flows, odp_to_u32(in_port));
2210
0
    flow->dead = true;
2211
0
    offload_flow_del(pmd->dp, pmd->core_id, flow);
2212
2213
0
    dp_netdev_flow_unref(flow);
2214
0
}
2215
2216
static void
2217
dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
2218
0
{
2219
0
    struct dp_netdev_flow *netdev_flow;
2220
2221
0
    ovs_mutex_lock(&pmd->flow_mutex);
2222
0
    CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
2223
0
        dp_netdev_pmd_remove_flow(pmd, netdev_flow);
2224
0
    }
2225
0
    ovs_mutex_unlock(&pmd->flow_mutex);
2226
0
}
2227
2228
static int
2229
dpif_netdev_flow_flush(struct dpif *dpif)
2230
0
{
2231
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2232
0
    struct dp_netdev_pmd_thread *pmd;
2233
2234
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2235
0
        dp_netdev_pmd_flow_flush(pmd);
2236
0
    }
2237
2238
0
    return 0;
2239
0
}
2240
2241
struct dp_netdev_port_state {
2242
    struct hmap_position position;
2243
    char *name;
2244
};
2245
2246
static int
2247
dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
2248
0
{
2249
0
    *statep = xzalloc(sizeof(struct dp_netdev_port_state));
2250
0
    return 0;
2251
0
}
2252
2253
static int
2254
dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
2255
                           struct dpif_port *dpif_port)
2256
0
{
2257
0
    struct dp_netdev_port_state *state = state_;
2258
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2259
0
    struct hmap_node *node;
2260
0
    int retval;
2261
2262
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
2263
0
    node = hmap_at_position(&dp->ports, &state->position);
2264
0
    if (node) {
2265
0
        struct dp_netdev_port *port;
2266
2267
0
        port = CONTAINER_OF(node, struct dp_netdev_port, node);
2268
2269
0
        free(state->name);
2270
0
        state->name = xstrdup(netdev_get_name(port->netdev));
2271
0
        dpif_port->name = state->name;
2272
0
        dpif_port->type = port->type;
2273
0
        dpif_port->port_no = port->port_no;
2274
2275
0
        retval = 0;
2276
0
    } else {
2277
0
        retval = EOF;
2278
0
    }
2279
0
    ovs_rwlock_unlock(&dp->port_rwlock);
2280
2281
0
    return retval;
2282
0
}
2283
2284
static int
2285
dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
2286
0
{
2287
0
    struct dp_netdev_port_state *state = state_;
2288
0
    free(state->name);
2289
0
    free(state);
2290
0
    return 0;
2291
0
}
2292
2293
static int
2294
dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
2295
0
{
2296
0
    struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2297
0
    uint64_t new_port_seq;
2298
0
    int error;
2299
2300
0
    new_port_seq = seq_read(dpif->dp->port_seq);
2301
0
    if (dpif->last_port_seq != new_port_seq) {
2302
0
        dpif->last_port_seq = new_port_seq;
2303
0
        error = ENOBUFS;
2304
0
    } else {
2305
0
        error = EAGAIN;
2306
0
    }
2307
2308
0
    return error;
2309
0
}
2310
2311
static void
2312
dpif_netdev_port_poll_wait(const struct dpif *dpif_)
2313
0
{
2314
0
    struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2315
2316
0
    seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
2317
0
}
2318
2319
static struct dp_netdev_flow *
2320
dp_netdev_flow_cast(const struct dpcls_rule *cr)
2321
0
{
2322
0
    return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
2323
0
}
2324
2325
static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
2326
0
{
2327
0
    return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
2328
0
}
2329
2330
/* netdev_flow_key utilities.
2331
 *
2332
 * netdev_flow_key is basically a miniflow.  We use these functions
2333
 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
2334
 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
2335
 *
2336
 * - Since we are dealing exclusively with miniflows created by
2337
 *   miniflow_extract(), if the map is different the miniflow is different.
2338
 *   Therefore we can be faster by comparing the map and the miniflow in a
2339
 *   single memcmp().
2340
 * - These functions can be inlined by the compiler. */
2341
2342
static inline bool
2343
netdev_flow_key_equal(const struct netdev_flow_key *a,
2344
                      const struct netdev_flow_key *b)
2345
0
{
2346
    /* 'b->len' may be not set yet. */
2347
0
    return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
2348
0
}
2349
2350
static inline void
2351
netdev_flow_key_clone(struct netdev_flow_key *dst,
2352
                      const struct netdev_flow_key *src)
2353
0
{
2354
0
    memcpy(dst, src,
2355
0
           offsetof(struct netdev_flow_key, mf) + src->len);
2356
0
}
2357
2358
/* Initialize a netdev_flow_key 'mask' from 'match'. */
2359
static inline void
2360
netdev_flow_mask_init(struct netdev_flow_key *mask,
2361
                      const struct match *match)
2362
0
{
2363
0
    uint64_t *dst = miniflow_values(&mask->mf);
2364
0
    struct flowmap fmap;
2365
0
    uint32_t hash = 0;
2366
0
    size_t idx;
2367
2368
    /* Only check masks that make sense for the flow. */
2369
0
    flow_wc_map(&match->flow, &fmap);
2370
0
    flowmap_init(&mask->mf.map);
2371
2372
0
    FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
2373
0
        uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
2374
2375
0
        if (mask_u64) {
2376
0
            flowmap_set(&mask->mf.map, idx, 1);
2377
0
            *dst++ = mask_u64;
2378
0
            hash = hash_add64(hash, mask_u64);
2379
0
        }
2380
0
    }
2381
2382
0
    map_t map;
2383
2384
0
    FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
2385
0
        hash = hash_add64(hash, map);
2386
0
    }
2387
2388
0
    size_t n = dst - miniflow_get_values(&mask->mf);
2389
2390
0
    mask->hash = hash_finish(hash, n * 8);
2391
0
    mask->len = netdev_flow_key_size(n);
2392
0
}
2393
2394
/* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
2395
static inline void
2396
netdev_flow_key_init_masked(struct netdev_flow_key *dst,
2397
                            const struct flow *flow,
2398
                            const struct netdev_flow_key *mask)
2399
0
{
2400
0
    uint64_t *dst_u64 = miniflow_values(&dst->mf);
2401
0
    const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
2402
0
    uint32_t hash = 0;
2403
0
    uint64_t value;
2404
2405
0
    dst->len = mask->len;
2406
0
    dst->mf = mask->mf;   /* Copy maps. */
2407
2408
0
    FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
2409
0
        *dst_u64 = value & *mask_u64++;
2410
0
        hash = hash_add64(hash, *dst_u64++);
2411
0
    }
2412
0
    dst->hash = hash_finish(hash,
2413
0
                            (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
2414
0
}
2415
2416
/* Initializes 'key' as a copy of 'flow'. */
2417
static inline void
2418
netdev_flow_key_init(struct netdev_flow_key *key,
2419
                     const struct flow *flow)
2420
0
{
2421
0
    uint32_t hash = 0;
2422
0
    uint64_t value;
2423
2424
0
    miniflow_map_init(&key->mf, flow);
2425
0
    miniflow_init(&key->mf, flow);
2426
2427
0
    size_t n = miniflow_n_values(&key->mf);
2428
2429
0
    FLOW_FOR_EACH_IN_MAPS (value, flow, key->mf.map) {
2430
0
        hash = hash_add64(hash, value);
2431
0
    }
2432
2433
0
    key->hash = hash_finish(hash, n * 8);
2434
0
    key->len = netdev_flow_key_size(n);
2435
0
}
2436
2437
static inline void
2438
emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
2439
                 const struct netdev_flow_key *key)
2440
0
{
2441
0
    if (ce->flow != flow) {
2442
0
        if (ce->flow) {
2443
0
            dp_netdev_flow_unref(ce->flow);
2444
0
        }
2445
2446
0
        if (dp_netdev_flow_ref(flow)) {
2447
0
            ce->flow = flow;
2448
0
        } else {
2449
0
            ce->flow = NULL;
2450
0
        }
2451
0
    }
2452
0
    if (key) {
2453
0
        netdev_flow_key_clone(&ce->key, key);
2454
0
    }
2455
0
}
2456
2457
static inline void
2458
emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
2459
           struct dp_netdev_flow *flow)
2460
0
{
2461
0
    struct emc_entry *to_be_replaced = NULL;
2462
0
    struct emc_entry *current_entry;
2463
2464
0
    EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2465
0
        if (netdev_flow_key_equal(&current_entry->key, key)) {
2466
            /* We found the entry with the 'mf' miniflow */
2467
0
            emc_change_entry(current_entry, flow, NULL);
2468
0
            return;
2469
0
        }
2470
2471
        /* Replacement policy: put the flow in an empty (not alive) entry, or
2472
         * in the first entry where it can be */
2473
0
        if (!to_be_replaced
2474
0
            || (emc_entry_alive(to_be_replaced)
2475
0
                && !emc_entry_alive(current_entry))
2476
0
            || current_entry->key.hash < to_be_replaced->key.hash) {
2477
0
            to_be_replaced = current_entry;
2478
0
        }
2479
0
    }
2480
    /* We didn't find the miniflow in the cache.
2481
     * The 'to_be_replaced' entry is where the new flow will be stored */
2482
2483
0
    emc_change_entry(to_be_replaced, flow, key);
2484
0
}
2485
2486
static inline void
2487
emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
2488
                         const struct netdev_flow_key *key,
2489
                         struct dp_netdev_flow *flow)
2490
0
{
2491
    /* Insert an entry into the EMC based on probability value 'min'. By
2492
     * default the value is UINT32_MAX / 100 which yields an insertion
2493
     * probability of 1/100 ie. 1% */
2494
2495
0
    uint32_t min = pmd->ctx.emc_insert_min;
2496
2497
0
    if (min && random_uint32() <= min) {
2498
0
        emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
2499
0
    }
2500
0
}
2501
2502
static inline const struct cmap_node *
2503
smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash)
2504
0
{
2505
0
    struct smc_cache *cache = &(pmd->flow_cache).smc_cache;
2506
0
    struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK];
2507
0
    uint16_t sig = hash >> 16;
2508
0
    uint16_t index = UINT16_MAX;
2509
2510
0
    for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2511
0
        if (bucket->sig[i] == sig) {
2512
0
            index = bucket->flow_idx[i];
2513
0
            break;
2514
0
        }
2515
0
    }
2516
0
    if (index != UINT16_MAX) {
2517
0
        return cmap_find_by_index(&pmd->flow_table, index);
2518
0
    }
2519
0
    return NULL;
2520
0
}
2521
2522
/* Insert the flow_table index into SMC. Insertion may fail when 1) SMC is
2523
 * turned off, 2) the flow_table index is larger than uint16_t can handle.
2524
 * If there is already an SMC entry having same signature, the index will be
2525
 * updated. If there is no existing entry, but an empty entry is available,
2526
 * the empty entry will be taken. If no empty entry or existing same signature,
2527
 * a random entry from the hashed bucket will be picked. */
2528
static inline void
2529
smc_insert(struct dp_netdev_pmd_thread *pmd,
2530
           const struct netdev_flow_key *key,
2531
           uint32_t hash)
2532
0
{
2533
0
    struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache;
2534
0
    struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK];
2535
0
    uint16_t index;
2536
0
    uint32_t cmap_index;
2537
0
    int i;
2538
2539
0
    if (!pmd->ctx.smc_enable_db) {
2540
0
        return;
2541
0
    }
2542
2543
0
    cmap_index = cmap_find_index(&pmd->flow_table, hash);
2544
0
    index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index;
2545
2546
    /* If the index is larger than SMC can handle (uint16_t), we don't
2547
     * insert */
2548
0
    if (index == UINT16_MAX) {
2549
0
        return;
2550
0
    }
2551
2552
    /* If an entry with same signature already exists, update the index */
2553
0
    uint16_t sig = key->hash >> 16;
2554
0
    for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2555
0
        if (bucket->sig[i] == sig) {
2556
0
            bucket->flow_idx[i] = index;
2557
0
            return;
2558
0
        }
2559
0
    }
2560
    /* If there is an empty entry, occupy it. */
2561
0
    for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2562
0
        if (bucket->flow_idx[i] == UINT16_MAX) {
2563
0
            bucket->sig[i] = sig;
2564
0
            bucket->flow_idx[i] = index;
2565
0
            return;
2566
0
        }
2567
0
    }
2568
    /* Otherwise, pick a random entry. */
2569
0
    i = random_uint32() % SMC_ENTRY_PER_BUCKET;
2570
0
    bucket->sig[i] = sig;
2571
0
    bucket->flow_idx[i] = index;
2572
0
}
2573
2574
inline void
2575
emc_probabilistic_insert_batch(struct dp_netdev_pmd_thread *pmd,
2576
                               const struct netdev_flow_key *keys,
2577
                               struct dpcls_rule **rules,
2578
                               uint32_t emc_insert_mask)
2579
0
{
2580
0
    while (emc_insert_mask) {
2581
0
        uint32_t i = raw_ctz(emc_insert_mask);
2582
0
        emc_insert_mask &= emc_insert_mask - 1;
2583
        /* Get the require parameters for EMC/SMC from the rule */
2584
0
        struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
2585
        /* Insert the key into EMC/SMC. */
2586
0
        emc_probabilistic_insert(pmd, &keys[i], flow);
2587
0
    }
2588
0
}
2589
2590
inline void
2591
smc_insert_batch(struct dp_netdev_pmd_thread *pmd,
2592
                 const struct netdev_flow_key *keys,
2593
                 struct dpcls_rule **rules,
2594
                 uint32_t smc_insert_mask)
2595
0
{
2596
0
    while (smc_insert_mask) {
2597
0
        uint32_t i = raw_ctz(smc_insert_mask);
2598
0
        smc_insert_mask &= smc_insert_mask - 1;
2599
        /* Get the require parameters for EMC/SMC from the rule */
2600
0
        struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
2601
0
        uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
2602
        /* Insert the key into EMC/SMC. */
2603
0
        smc_insert(pmd, &keys[i], hash);
2604
0
    }
2605
0
}
2606
2607
static struct dp_netdev_flow *
2608
dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
2609
                          const struct netdev_flow_key *key,
2610
                          int *lookup_num_p)
2611
0
{
2612
0
    struct dpcls *cls;
2613
0
    struct dpcls_rule *rule = NULL;
2614
0
    odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
2615
0
                                                     in_port.odp_port));
2616
0
    struct dp_netdev_flow *netdev_flow = NULL;
2617
2618
0
    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2619
0
    if (OVS_LIKELY(cls)) {
2620
0
        dpcls_lookup(cls, &key, &rule, 1, lookup_num_p);
2621
0
        netdev_flow = dp_netdev_flow_cast(rule);
2622
0
    }
2623
0
    return netdev_flow;
2624
0
}
2625
2626
static struct dp_netdev_flow *
2627
dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
2628
                        const ovs_u128 *ufidp, const struct nlattr *key,
2629
                        size_t key_len)
2630
0
{
2631
0
    struct dp_netdev_flow *netdev_flow;
2632
0
    struct flow flow;
2633
0
    ovs_u128 ufid;
2634
2635
    /* If a UFID is not provided, determine one based on the key. */
2636
0
    if (!ufidp && key && key_len
2637
0
        && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) {
2638
0
        odp_flow_key_hash(&flow, sizeof flow, &ufid);
2639
0
        ufidp = &ufid;
2640
0
    }
2641
2642
0
    if (ufidp) {
2643
0
        CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
2644
0
                                 &pmd->flow_table) {
2645
0
            if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
2646
0
                return netdev_flow;
2647
0
            }
2648
0
        }
2649
0
    }
2650
2651
0
    return NULL;
2652
0
}
2653
2654
static void
2655
get_dpif_flow_status(const struct dp_netdev *dp,
2656
                     const struct dp_netdev_flow *netdev_flow_,
2657
                     struct dpif_flow_stats *stats,
2658
                     struct dpif_flow_attrs *attrs)
2659
0
{
2660
0
    struct dpif_flow_stats offload_stats;
2661
0
    struct dpif_flow_attrs offload_attrs;
2662
0
    struct dp_netdev_flow *netdev_flow;
2663
0
    unsigned long long n;
2664
0
    long long used;
2665
0
    uint16_t flags;
2666
2667
0
    netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
2668
2669
0
    atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
2670
0
    stats->n_packets = n;
2671
0
    atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
2672
0
    stats->n_bytes = n;
2673
0
    atomic_read_relaxed(&netdev_flow->stats.used, &used);
2674
0
    stats->used = used;
2675
0
    atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
2676
0
    stats->tcp_flags = flags;
2677
2678
0
    if (dpif_offload_datapath_flow_stats(dp->full_name,
2679
0
                                         netdev_flow->flow.in_port.odp_port,
2680
0
                                         &netdev_flow->mega_ufid,
2681
0
                                         &offload_stats, &offload_attrs)) {
2682
0
        stats->n_packets += offload_stats.n_packets;
2683
0
        stats->n_bytes += offload_stats.n_bytes;
2684
0
        stats->used = MAX(stats->used, offload_stats.used);
2685
0
        stats->tcp_flags |= offload_stats.tcp_flags;
2686
0
        if (attrs) {
2687
0
            attrs->offloaded = offload_attrs.offloaded;
2688
0
            attrs->dp_layer = offload_attrs.dp_layer;
2689
0
        }
2690
0
    } else if (attrs) {
2691
0
        attrs->offloaded = false;
2692
0
        attrs->dp_layer = "ovs";
2693
0
    }
2694
0
}
2695
2696
/* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
2697
 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
2698
 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
2699
 * protect them. */
2700
static void
2701
dp_netdev_flow_to_dpif_flow(const struct dp_netdev *dp,
2702
                            const struct dp_netdev_flow *netdev_flow,
2703
                            struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
2704
                            struct dpif_flow *flow, bool terse)
2705
0
{
2706
0
    if (terse) {
2707
0
        memset(flow, 0, sizeof *flow);
2708
0
    } else {
2709
0
        struct flow_wildcards wc;
2710
0
        struct dp_netdev_actions *actions;
2711
0
        size_t offset;
2712
0
        struct odp_flow_key_parms odp_parms = {
2713
0
            .flow = &netdev_flow->flow,
2714
0
            .mask = &wc.masks,
2715
0
            .support = dp_netdev_support,
2716
0
        };
2717
2718
0
        miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
2719
        /* in_port is exact matched, but we have left it out from the mask for
2720
         * optimnization reasons. Add in_port back to the mask. */
2721
0
        wc.masks.in_port.odp_port = ODPP_NONE;
2722
2723
        /* Key */
2724
0
        offset = key_buf->size;
2725
0
        flow->key = ofpbuf_tail(key_buf);
2726
0
        odp_flow_key_from_flow(&odp_parms, key_buf);
2727
0
        flow->key_len = key_buf->size - offset;
2728
2729
        /* Mask */
2730
0
        offset = mask_buf->size;
2731
0
        flow->mask = ofpbuf_tail(mask_buf);
2732
0
        odp_parms.key_buf = key_buf;
2733
0
        odp_flow_key_from_mask(&odp_parms, mask_buf);
2734
0
        flow->mask_len = mask_buf->size - offset;
2735
2736
        /* Actions */
2737
0
        actions = dp_netdev_flow_get_actions(netdev_flow);
2738
0
        flow->actions = actions->actions;
2739
0
        flow->actions_len = actions->size;
2740
0
    }
2741
2742
0
    flow->ufid = netdev_flow->ufid;
2743
0
    flow->ufid_present = true;
2744
0
    flow->pmd_id = netdev_flow->pmd_id;
2745
2746
0
    get_dpif_flow_status(dp, netdev_flow, &flow->stats, &flow->attrs);
2747
0
    flow->attrs.dp_extra_info = netdev_flow->dp_extra_info;
2748
0
}
2749
2750
static int
2751
dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
2752
                              const struct nlattr *mask_key,
2753
                              uint32_t mask_key_len, const struct flow *flow,
2754
                              struct flow_wildcards *wc, bool probe)
2755
0
{
2756
0
    enum odp_key_fitness fitness;
2757
2758
0
    fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow, NULL);
2759
0
    if (fitness) {
2760
0
        if (!probe) {
2761
            /* This should not happen: it indicates that
2762
             * odp_flow_key_from_mask() and odp_flow_key_to_mask()
2763
             * disagree on the acceptable form of a mask.  Log the problem
2764
             * as an error, with enough details to enable debugging. */
2765
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2766
2767
0
            if (!VLOG_DROP_ERR(&rl)) {
2768
0
                struct ds s;
2769
2770
0
                ds_init(&s);
2771
0
                odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
2772
0
                                true, true);
2773
0
                VLOG_ERR("internal error parsing flow mask %s (%s)",
2774
0
                ds_cstr(&s), odp_key_fitness_to_string(fitness));
2775
0
                ds_destroy(&s);
2776
0
            }
2777
0
        }
2778
2779
0
        return EINVAL;
2780
0
    }
2781
2782
0
    return 0;
2783
0
}
2784
2785
static int
2786
dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
2787
                              struct flow *flow, bool probe)
2788
0
{
2789
0
    if (odp_flow_key_to_flow(key, key_len, flow, NULL)) {
2790
0
        if (!probe) {
2791
            /* This should not happen: it indicates that
2792
             * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on
2793
             * the acceptable form of a flow.  Log the problem as an error,
2794
             * with enough details to enable debugging. */
2795
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
2796
2797
0
            if (!VLOG_DROP_ERR(&rl)) {
2798
0
                struct ds s;
2799
2800
0
                ds_init(&s);
2801
0
                odp_flow_format(key, key_len, NULL, 0, NULL, &s, true, false);
2802
0
                VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
2803
0
                ds_destroy(&s);
2804
0
            }
2805
0
        }
2806
2807
0
        return EINVAL;
2808
0
    }
2809
2810
0
    if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
2811
0
        return EINVAL;
2812
0
    }
2813
2814
0
    return 0;
2815
0
}
2816
2817
static int
2818
dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
2819
0
{
2820
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2821
0
    struct dp_netdev_flow *netdev_flow;
2822
0
    struct dp_netdev_pmd_thread *pmd;
2823
0
    struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
2824
0
    struct hmapx_node *node;
2825
0
    int error = EINVAL;
2826
2827
0
    if (get->pmd_id == PMD_ID_NULL) {
2828
0
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2829
0
            if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
2830
0
                dp_netdev_pmd_unref(pmd);
2831
0
            }
2832
0
        }
2833
0
    } else {
2834
0
        pmd = dp_netdev_get_pmd(dp, get->pmd_id);
2835
0
        if (!pmd) {
2836
0
            goto out;
2837
0
        }
2838
0
        hmapx_add(&to_find, pmd);
2839
0
    }
2840
2841
0
    if (!hmapx_count(&to_find)) {
2842
0
        goto out;
2843
0
    }
2844
2845
0
    HMAPX_FOR_EACH (node, &to_find) {
2846
0
        pmd = (struct dp_netdev_pmd_thread *) node->data;
2847
0
        netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
2848
0
                                              get->key_len);
2849
0
        if (netdev_flow) {
2850
0
            dp_netdev_flow_to_dpif_flow(dp, netdev_flow, get->buffer,
2851
0
                                        get->buffer, get->flow, false);
2852
0
            error = 0;
2853
0
            break;
2854
0
        } else {
2855
0
            error = ENOENT;
2856
0
        }
2857
0
    }
2858
2859
0
    HMAPX_FOR_EACH (node, &to_find) {
2860
0
        pmd = (struct dp_netdev_pmd_thread *) node->data;
2861
0
        dp_netdev_pmd_unref(pmd);
2862
0
    }
2863
0
out:
2864
0
    hmapx_destroy(&to_find);
2865
0
    return error;
2866
0
}
2867
2868
static void
2869
dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid)
2870
0
{
2871
0
    struct {
2872
0
        struct flow masked_flow;
2873
0
        struct flow wc;
2874
0
    } key;
2875
0
    size_t i;
2876
2877
0
    memset(&key, 0, sizeof key);
2878
0
    for (i = 0; i < sizeof(struct flow); i++) {
2879
0
        ((uint8_t *)&key.masked_flow)[i] = ((uint8_t *)&match->flow)[i] &
2880
0
                                           ((uint8_t *)&match->wc)[i];
2881
0
        ((uint8_t *)&key.wc)[i] = ((uint8_t *)&match->wc)[i];
2882
0
    }
2883
2884
0
    odp_flow_key_hash(&key, sizeof key, mega_ufid);
2885
0
}
2886
2887
static uint64_t
2888
dp_netdev_simple_match_mark(odp_port_t in_port, ovs_be16 dl_type,
2889
                            uint8_t nw_frag, ovs_be16 vlan_tci)
2890
0
{
2891
    /* Simple Match Mark:
2892
     *
2893
     * BE:
2894
     * +-----------------+-------------++---------+---+-----------+
2895
     * |     in_port     |   dl_type   || nw_frag |CFI|  VID(12)  |
2896
     * +-----------------+-------------++---------+---+-----------+
2897
     * 0                 32          47 49         51  52     63
2898
     *
2899
     * LE:
2900
     * +-----------------+-------------+------++-------+---+------+
2901
     * |     in_port     |   dl_type   |VID(8)||nw_frag|CFI|VID(4)|
2902
     * +-----------------+-------------+------++-------+---+------+
2903
     * 0                 32          47 48  55  57   59 60  61   63
2904
     *
2905
     *         Big Endian              Little Endian
2906
     * in_port : 32 bits [ 0..31]  in_port : 32 bits [ 0..31]
2907
     * dl_type : 16 bits [32..47]  dl_type : 16 bits [32..47]
2908
     * <empty> :  1 bit  [48..48]  vlan VID:  8 bits [48..55]
2909
     * nw_frag :  2 bits [49..50]  <empty> :  1 bit  [56..56]
2910
     * vlan CFI:  1 bit  [51..51]  nw_frag :  2 bits [57..59]
2911
     * vlan VID: 12 bits [52..63]  vlan CFI:  1 bit  [60..60]
2912
     *                             vlan VID:  4 bits [61..63]
2913
     *
2914
     * Layout is different for LE and BE in order to save a couple of
2915
     * network to host translations.
2916
     * */
2917
0
    return ((uint64_t) odp_to_u32(in_port) << 32)
2918
0
           | ((OVS_FORCE uint32_t) dl_type << 16)
2919
#if WORDS_BIGENDIAN
2920
           | (((uint16_t) nw_frag & FLOW_NW_FRAG_MASK) << VLAN_PCP_SHIFT)
2921
#else
2922
0
           | ((nw_frag & FLOW_NW_FRAG_MASK) << (VLAN_PCP_SHIFT - 8))
2923
0
#endif
2924
0
           | (OVS_FORCE uint16_t) (vlan_tci & htons(VLAN_VID_MASK | VLAN_CFI));
2925
0
}
2926
2927
static struct dp_netdev_flow *
2928
dp_netdev_simple_match_lookup(const struct dp_netdev_pmd_thread *pmd,
2929
                              odp_port_t in_port, ovs_be16 dl_type,
2930
                              uint8_t nw_frag, ovs_be16 vlan_tci)
2931
0
{
2932
0
    uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type,
2933
0
                                                nw_frag, vlan_tci);
2934
0
    uint32_t hash = hash_uint64(mark);
2935
0
    struct dp_netdev_flow *flow;
2936
0
    bool found = false;
2937
2938
0
    CMAP_FOR_EACH_WITH_HASH (flow, simple_match_node,
2939
0
                             hash, &pmd->simple_match_table) {
2940
0
        if (flow->simple_match_mark == mark) {
2941
0
            found = true;
2942
0
            break;
2943
0
        }
2944
0
    }
2945
0
    return found ? flow : NULL;
2946
0
}
2947
2948
static bool
2949
dp_netdev_simple_match_enabled(const struct dp_netdev_pmd_thread *pmd,
2950
                               odp_port_t in_port)
2951
0
{
2952
0
    return ccmap_find(&pmd->n_flows, odp_to_u32(in_port))
2953
0
           == ccmap_find(&pmd->n_simple_flows, odp_to_u32(in_port));
2954
0
}
2955
2956
static void
2957
dp_netdev_simple_match_insert(struct dp_netdev_pmd_thread *pmd,
2958
                              struct dp_netdev_flow *dp_flow)
2959
    OVS_REQUIRES(pmd->flow_mutex)
2960
0
{
2961
0
    odp_port_t in_port = dp_flow->flow.in_port.odp_port;
2962
0
    ovs_be16 vlan_tci = dp_flow->flow.vlans[0].tci;
2963
0
    ovs_be16 dl_type = dp_flow->flow.dl_type;
2964
0
    uint8_t nw_frag = dp_flow->flow.nw_frag;
2965
2966
0
    if (!dp_netdev_flow_ref(dp_flow)) {
2967
0
        return;
2968
0
    }
2969
2970
    /* Avoid double insertion.  Should not happen in practice. */
2971
0
    dp_netdev_simple_match_remove(pmd, dp_flow);
2972
2973
0
    uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type,
2974
0
                                                nw_frag, vlan_tci);
2975
0
    uint32_t hash = hash_uint64(mark);
2976
2977
0
    dp_flow->simple_match_mark = mark;
2978
0
    cmap_insert(&pmd->simple_match_table,
2979
0
                CONST_CAST(struct cmap_node *, &dp_flow->simple_match_node),
2980
0
                hash);
2981
0
    ccmap_inc(&pmd->n_simple_flows, odp_to_u32(in_port));
2982
2983
0
    VLOG_DBG("Simple match insert: "
2984
0
             "core_id(%d),in_port(%"PRIu32"),mark(0x%016"PRIx64").",
2985
0
             pmd->core_id, in_port, mark);
2986
0
}
2987
2988
static void
2989
dp_netdev_simple_match_remove(struct dp_netdev_pmd_thread *pmd,
2990
                               struct dp_netdev_flow *dp_flow)
2991
    OVS_REQUIRES(pmd->flow_mutex)
2992
0
{
2993
0
    odp_port_t in_port = dp_flow->flow.in_port.odp_port;
2994
0
    ovs_be16 vlan_tci = dp_flow->flow.vlans[0].tci;
2995
0
    ovs_be16 dl_type = dp_flow->flow.dl_type;
2996
0
    uint8_t nw_frag = dp_flow->flow.nw_frag;
2997
0
    struct dp_netdev_flow *flow;
2998
0
    uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type,
2999
0
                                                nw_frag, vlan_tci);
3000
0
    uint32_t hash = hash_uint64(mark);
3001
3002
0
    flow = dp_netdev_simple_match_lookup(pmd, in_port, dl_type,
3003
0
                                         nw_frag, vlan_tci);
3004
0
    if (flow == dp_flow) {
3005
0
        VLOG_DBG("Simple match remove: "
3006
0
                 "core_id(%d),in_port(%"PRIu32"),mark(0x%016"PRIx64").",
3007
0
                 pmd->core_id, in_port, mark);
3008
0
        cmap_remove(&pmd->simple_match_table,
3009
0
                    CONST_CAST(struct cmap_node *, &flow->simple_match_node),
3010
0
                    hash);
3011
0
        ccmap_dec(&pmd->n_simple_flows, odp_to_u32(in_port));
3012
0
        dp_netdev_flow_unref(flow);
3013
0
    }
3014
0
}
3015
3016
static bool
3017
dp_netdev_flow_is_simple_match(const struct match *match)
3018
0
{
3019
0
    const struct flow *flow = &match->flow;
3020
0
    const struct flow_wildcards *wc = &match->wc;
3021
3022
0
    if (flow->recirc_id || flow->packet_type != htonl(PT_ETH)) {
3023
0
        return false;
3024
0
    }
3025
3026
    /* Check that flow matches only minimal set of fields that always set.
3027
     * Also checking that VLAN VID+CFI is an exact match, because these
3028
     * are not mandatory and could be masked. */
3029
0
    struct flow_wildcards *minimal = xmalloc(sizeof *minimal);
3030
0
    ovs_be16 vlan_tci_mask = htons(VLAN_VID_MASK | VLAN_CFI);
3031
3032
0
    flow_wildcards_init_catchall(minimal);
3033
    /* 'dpif-netdev' always has following in exact match:
3034
     *   - recirc_id                   <-- recirc_id == 0 checked on input.
3035
     *   - in_port                     <-- Will be checked on input.
3036
     *   - packet_type                 <-- Assuming all packets are PT_ETH.
3037
     *   - dl_type                     <-- Need to match with.
3038
     *   - vlan_tci                    <-- Need to match with.
3039
     *   - and nw_frag for ip packets. <-- Need to match with.
3040
     */
3041
0
    WC_MASK_FIELD(minimal, recirc_id);
3042
0
    WC_MASK_FIELD(minimal, in_port);
3043
0
    WC_MASK_FIELD(minimal, packet_type);
3044
0
    WC_MASK_FIELD(minimal, dl_type);
3045
0
    WC_MASK_FIELD_MASK(minimal, vlans[0].tci, vlan_tci_mask);
3046
0
    WC_MASK_FIELD_MASK(minimal, nw_frag, FLOW_NW_FRAG_MASK);
3047
3048
0
    if (flow_wildcards_has_extra(minimal, wc)
3049
0
        || wc->masks.vlans[0].tci != vlan_tci_mask) {
3050
0
        free(minimal);
3051
0
        return false;
3052
0
    }
3053
0
    free(minimal);
3054
3055
0
    return true;
3056
0
}
3057
3058
static void
3059
offload_flow_put_resume(struct dp_netdev *dp, struct dp_netdev_flow *flow,
3060
                        struct dp_netdev_flow *previous_flow_reference,
3061
                        unsigned pmd_id, int error)
3062
0
{
3063
0
    if (error == EINPROGRESS) {
3064
0
        return;
3065
0
    }
3066
3067
0
    if (!error) {
3068
0
        flow->offloaded = true;
3069
0
    } else {
3070
        /* If the flow was already offloaded, the new action set can no
3071
         * longer be offloaded.  In theory, we should disassociate the
3072
         * offload from all PMDs that have this flow marked as offloaded.
3073
         * Unfortunately, there is no mechanism to inform other PMDs, so
3074
         * we cannot explicitly mark such flows.  This situation typically
3075
         * occurs when the revalidator modifies the flow, so it is safe to
3076
         * assume it will update all affected flows and that the offload
3077
         * will subsequently fail. */
3078
0
        flow->offloaded = false;
3079
3080
        /* On error, the flow reference was not stored by the offload provider,
3081
         * so we should decrease the reference. */
3082
0
        dp_netdev_flow_unref(flow);
3083
0
    }
3084
3085
0
    if (offload_queue_dec(flow) && flow->dead) {
3086
        /* If flows are processed asynchronously, modifications might
3087
         * still be queued up while the flow is being removed.  If this
3088
         * was the last flow in the queue on a dead flow, we try again
3089
         * to see if we need to remove this flow. */
3090
0
        offload_flow_del(dp, pmd_id, flow);
3091
0
    }
3092
3093
0
    if (previous_flow_reference) {
3094
0
        dp_netdev_flow_unref(previous_flow_reference);
3095
0
        if (previous_flow_reference != flow) {
3096
0
            VLOG_DBG("Updated flow reference was from outdated flow");
3097
0
        }
3098
0
    }
3099
0
}
3100
3101
static void
3102
offload_flow_put_resume_cb(void *aux, struct dpif_flow_stats *stats OVS_UNUSED,
3103
                           unsigned pmd_id, void *flow_reference_,
3104
                           void *old_flow_reference_,
3105
                           int error)
3106
0
{
3107
0
    struct dp_netdev *dp = aux;
3108
0
    struct dp_netdev_flow *flow_reference = flow_reference_;
3109
0
    struct dp_netdev_flow *old_flow_reference = old_flow_reference_;
3110
3111
0
    offload_flow_put_resume(dp, flow_reference, old_flow_reference,
3112
0
                            pmd_id, error);
3113
0
}
3114
3115
static void
3116
offload_flow_put(struct dp_netdev_pmd_thread *pmd, struct dp_netdev_flow *flow,
3117
                 struct match *match, const struct nlattr *actions,
3118
                 size_t actions_len)
3119
0
{
3120
0
    struct dpif_offload_flow_put put = {
3121
0
        .in_port = match->flow.in_port.odp_port,
3122
0
        .orig_in_port = flow->orig_in_port,
3123
0
        .pmd_id = pmd->core_id,
3124
0
        .ufid = CONST_CAST(ovs_u128 *, &flow->mega_ufid),
3125
0
        .match = match,
3126
0
        .actions = actions,
3127
0
        .actions_len = actions_len,
3128
0
        .stats = NULL,
3129
0
        .flow_reference = flow,
3130
0
        .cb_data = {
3131
0
            .callback = offload_flow_put_resume_cb,
3132
0
            .callback_aux = pmd->dp,
3133
0
        },
3134
0
    };
3135
0
    void *previous_flow_reference = NULL;
3136
0
    int error;
3137
3138
0
    if (!dpif_offload_enabled() || flow->dead || !offload_queue_inc(flow)) {
3139
0
        return;
3140
0
    }
3141
3142
0
    dp_netdev_flow_ref(flow);
3143
3144
0
    error = dpif_offload_datapath_flow_put(pmd->dp->full_name, &put,
3145
0
                                           &previous_flow_reference);
3146
0
    offload_flow_put_resume(pmd->dp, put.flow_reference,
3147
0
                            previous_flow_reference,
3148
0
                            pmd->core_id, error);
3149
0
}
3150
3151
static struct dp_netdev_flow *
3152
dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
3153
                   struct match *match, const ovs_u128 *ufid,
3154
                   const struct nlattr *actions, size_t actions_len,
3155
                   odp_port_t orig_in_port)
3156
    OVS_REQUIRES(pmd->flow_mutex)
3157
0
{
3158
0
    struct ds extra_info = DS_EMPTY_INITIALIZER;
3159
0
    struct dp_netdev_flow *flow;
3160
0
    struct netdev_flow_key mask;
3161
0
    struct dpcls *cls;
3162
0
    size_t unit;
3163
3164
    /* Make sure in_port is exact matched before we read it. */
3165
0
    ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
3166
0
    odp_port_t in_port = match->flow.in_port.odp_port;
3167
3168
    /* As we select the dpcls based on the port number, each netdev flow
3169
     * belonging to the same dpcls will have the same odp_port value.
3170
     * For performance reasons we wildcard odp_port here in the mask.  In the
3171
     * typical case dp_hash is also wildcarded, and the resulting 8-byte
3172
     * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
3173
     * will not be part of the subtable mask.
3174
     * This will speed up the hash computation during dpcls_lookup() because
3175
     * there is one less call to hash_add64() in this case. */
3176
0
    match->wc.masks.in_port.odp_port = 0;
3177
0
    netdev_flow_mask_init(&mask, match);
3178
0
    match->wc.masks.in_port.odp_port = ODPP_NONE;
3179
3180
    /* Make sure wc does not have metadata. */
3181
0
    ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
3182
0
               && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
3183
3184
    /* Do not allocate extra space. */
3185
0
    flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
3186
0
    memset(&flow->stats, 0, sizeof flow->stats);
3187
0
    flow->dead = false;
3188
0
    flow->offloaded = false;
3189
0
    atomic_init(&flow->offload_queue_depth, 0);
3190
0
    flow->batch = NULL;
3191
0
    flow->orig_in_port = orig_in_port;
3192
0
    *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
3193
0
    *CONST_CAST(struct flow *, &flow->flow) = match->flow;
3194
0
    *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
3195
0
    ovs_refcount_init(&flow->ref_cnt);
3196
0
    ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
3197
3198
0
    dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid));
3199
0
    netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
3200
3201
    /* Select dpcls for in_port. Relies on in_port to be exact match. */
3202
0
    cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
3203
0
    dpcls_insert(cls, &flow->cr, &mask);
3204
3205
0
    ds_put_cstr(&extra_info, "miniflow_bits(");
3206
0
    FLOWMAP_FOR_EACH_UNIT (unit) {
3207
0
        if (unit) {
3208
0
            ds_put_char(&extra_info, ',');
3209
0
        }
3210
0
        ds_put_format(&extra_info, "%d",
3211
0
                      count_1bits(flow->cr.mask->mf.map.bits[unit]));
3212
0
    }
3213
0
    ds_put_char(&extra_info, ')');
3214
0
    flow->dp_extra_info = ds_steal_cstr(&extra_info);
3215
0
    ds_destroy(&extra_info);
3216
3217
0
    cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
3218
0
                dp_netdev_flow_hash(&flow->ufid));
3219
0
    ccmap_inc(&pmd->n_flows, odp_to_u32(in_port));
3220
3221
0
    if (dp_netdev_flow_is_simple_match(match)) {
3222
0
        dp_netdev_simple_match_insert(pmd, flow);
3223
0
    }
3224
3225
0
    offload_flow_put(pmd, flow, match, actions, actions_len);
3226
0
    log_netdev_flow_change(flow, match, NULL, actions, actions_len);
3227
3228
0
    return flow;
3229
0
}
3230
3231
static int
3232
flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
3233
                struct netdev_flow_key *key,
3234
                struct match *match,
3235
                ovs_u128 *ufid,
3236
                const struct dpif_flow_put *put,
3237
                struct dpif_flow_stats *stats)
3238
0
{
3239
0
    struct dp_netdev_flow *netdev_flow = NULL;
3240
0
    int error = 0;
3241
3242
0
    if (stats) {
3243
0
        memset(stats, 0, sizeof *stats);
3244
0
    }
3245
3246
0
    ovs_mutex_lock(&pmd->flow_mutex);
3247
0
    if (put->ufid) {
3248
0
        netdev_flow = dp_netdev_pmd_find_flow(pmd, put->ufid,
3249
0
                                              put->key, put->key_len);
3250
0
    } else {
3251
        /* Use key instead of the locally generated ufid
3252
         * to search netdev_flow. */
3253
0
        netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
3254
0
    }
3255
3256
0
    if (put->flags & DPIF_FP_CREATE) {
3257
0
        if (!netdev_flow) {
3258
0
            dp_netdev_flow_add(pmd, match, ufid,
3259
0
                               put->actions, put->actions_len, ODPP_NONE);
3260
0
        } else {
3261
0
            error = EEXIST;
3262
0
        }
3263
0
        goto exit;
3264
0
    }
3265
3266
0
    if (put->flags & DPIF_FP_MODIFY) {
3267
0
        if (!netdev_flow) {
3268
0
            error = ENOENT;
3269
0
        } else {
3270
0
            if (!put->ufid && !flow_equal(&match->flow, &netdev_flow->flow)) {
3271
                /* Overlapping flow. */
3272
0
                error = EINVAL;
3273
0
                goto exit;
3274
0
            }
3275
3276
0
            struct dp_netdev_actions *new_actions;
3277
0
            struct dp_netdev_actions *old_actions;
3278
3279
0
            new_actions = dp_netdev_actions_create(put->actions,
3280
0
                                                   put->actions_len);
3281
3282
0
            old_actions = dp_netdev_flow_get_actions(netdev_flow);
3283
0
            ovsrcu_set(&netdev_flow->actions, new_actions);
3284
3285
0
            offload_flow_put(pmd, netdev_flow, match, put->actions,
3286
0
                             put->actions_len);
3287
0
            log_netdev_flow_change(netdev_flow, match, old_actions,
3288
0
                                   put->actions, put->actions_len);
3289
3290
0
            if (stats) {
3291
0
                get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
3292
0
            }
3293
0
            if (put->flags & DPIF_FP_ZERO_STATS) {
3294
                /* XXX: The userspace datapath uses thread local statistics
3295
                 * (for flows), which should be updated only by the owning
3296
                 * thread.  Since we cannot write on stats memory here,
3297
                 * we choose not to support this flag.  Please note:
3298
                 * - This feature is currently used only by dpctl commands with
3299
                 *   option --clear.
3300
                 * - Should the need arise, this operation can be implemented
3301
                 *   by keeping a base value (to be update here) for each
3302
                 *   counter, and subtracting it before outputting the stats */
3303
0
                error = EOPNOTSUPP;
3304
0
            }
3305
0
            ovsrcu_postpone(dp_netdev_actions_free, old_actions);
3306
0
        }
3307
0
    }
3308
3309
0
exit:
3310
0
    ovs_mutex_unlock(&pmd->flow_mutex);
3311
0
    return error;
3312
0
}
3313
3314
static int
3315
dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
3316
0
{
3317
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
3318
0
    struct netdev_flow_key key;
3319
0
    struct dp_netdev_pmd_thread *pmd;
3320
0
    struct match match;
3321
0
    ovs_u128 ufid;
3322
0
    int error;
3323
0
    bool probe = put->flags & DPIF_FP_PROBE;
3324
3325
0
    if (put->stats) {
3326
0
        memset(put->stats, 0, sizeof *put->stats);
3327
0
    }
3328
0
    error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
3329
0
                                          probe);
3330
0
    if (error) {
3331
0
        return error;
3332
0
    }
3333
0
    error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
3334
0
                                          put->mask, put->mask_len,
3335
0
                                          &match.flow, &match.wc, probe);
3336
0
    if (error) {
3337
0
        return error;
3338
0
    }
3339
3340
0
    if (match.wc.masks.in_port.odp_port != ODPP_NONE) {
3341
0
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3342
3343
0
        VLOG_ERR_RL(&rl, "failed to put%s flow: in_port is not an exact match",
3344
0
                    (put->flags & DPIF_FP_CREATE) ? "[create]"
3345
0
                    : (put->flags & DPIF_FP_MODIFY) ? "[modify]" : "[zero]");
3346
0
        return EINVAL;
3347
0
    }
3348
3349
0
    if (put->ufid) {
3350
0
        ufid = *put->ufid;
3351
0
    } else {
3352
0
        odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
3353
0
    }
3354
3355
    /* The Netlink encoding of datapath flow keys cannot express
3356
     * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
3357
     * tag is interpreted as exact match on the fact that there is no
3358
     * VLAN.  Unless we refactor a lot of code that translates between
3359
     * Netlink and struct flow representations, we have to do the same
3360
     * here.  This must be in sync with 'match' in handle_packet_upcall(). */
3361
0
    if (!match.wc.masks.vlans[0].tci) {
3362
0
        match.wc.masks.vlans[0].tci = htons(VLAN_VID_MASK | VLAN_CFI);
3363
0
    }
3364
3365
    /* Must produce a netdev_flow_key for lookup.
3366
     * Use the same method as employed to create the key when adding
3367
     * the flow to the dplcs to make sure they match.
3368
     * We need to put in the unmasked key as flow_put_on_pmd() will first try
3369
     * to see if an entry exists doing a packet type lookup. As masked-out
3370
     * fields are interpreted as zeros, they could falsely match a wider IP
3371
     * address mask. Installation of the flow will use the match variable. */
3372
0
    netdev_flow_key_init(&key, &match.flow);
3373
3374
0
    if (put->pmd_id == PMD_ID_NULL) {
3375
0
        if (cmap_count(&dp->poll_threads) == 0) {
3376
0
            return EINVAL;
3377
0
        }
3378
0
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3379
0
            struct dpif_flow_stats pmd_stats;
3380
0
            int pmd_error;
3381
3382
0
            pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
3383
0
                                        &pmd_stats);
3384
0
            if (pmd_error) {
3385
0
                error = pmd_error;
3386
0
            } else if (put->stats) {
3387
0
                put->stats->n_packets += pmd_stats.n_packets;
3388
0
                put->stats->n_bytes += pmd_stats.n_bytes;
3389
0
                put->stats->used = MAX(put->stats->used, pmd_stats.used);
3390
0
                put->stats->tcp_flags |= pmd_stats.tcp_flags;
3391
0
            }
3392
0
        }
3393
0
    } else {
3394
0
        pmd = dp_netdev_get_pmd(dp, put->pmd_id);
3395
0
        if (!pmd) {
3396
0
            return EINVAL;
3397
0
        }
3398
0
        error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
3399
0
        dp_netdev_pmd_unref(pmd);
3400
0
    }
3401
3402
0
    return error;
3403
0
}
3404
3405
static int
3406
flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
3407
                struct dpif_flow_stats *stats,
3408
                const struct dpif_flow_del *del)
3409
0
{
3410
0
    struct dp_netdev_flow *netdev_flow;
3411
0
    int error = 0;
3412
3413
0
    ovs_mutex_lock(&pmd->flow_mutex);
3414
0
    netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
3415
0
                                          del->key_len);
3416
0
    if (netdev_flow) {
3417
0
        if (stats) {
3418
0
            get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
3419
0
        }
3420
0
        dp_netdev_pmd_remove_flow(pmd, netdev_flow);
3421
0
    } else {
3422
0
        error = ENOENT;
3423
0
    }
3424
0
    ovs_mutex_unlock(&pmd->flow_mutex);
3425
3426
0
    return error;
3427
0
}
3428
3429
static int
3430
dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
3431
0
{
3432
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
3433
0
    struct dp_netdev_pmd_thread *pmd;
3434
0
    int error = 0;
3435
3436
0
    if (del->stats) {
3437
0
        memset(del->stats, 0, sizeof *del->stats);
3438
0
    }
3439
3440
0
    if (del->pmd_id == PMD_ID_NULL) {
3441
0
        if (cmap_count(&dp->poll_threads) == 0) {
3442
0
            return EINVAL;
3443
0
        }
3444
0
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3445
0
            struct dpif_flow_stats pmd_stats;
3446
0
            int pmd_error;
3447
3448
0
            pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
3449
0
            if (pmd_error) {
3450
0
                error = pmd_error;
3451
0
            } else if (del->stats) {
3452
0
                del->stats->n_packets += pmd_stats.n_packets;
3453
0
                del->stats->n_bytes += pmd_stats.n_bytes;
3454
0
                del->stats->used = MAX(del->stats->used, pmd_stats.used);
3455
0
                del->stats->tcp_flags |= pmd_stats.tcp_flags;
3456
0
            }
3457
0
        }
3458
0
    } else {
3459
0
        pmd = dp_netdev_get_pmd(dp, del->pmd_id);
3460
0
        if (!pmd) {
3461
0
            return EINVAL;
3462
0
        }
3463
0
        error = flow_del_on_pmd(pmd, del->stats, del);
3464
0
        dp_netdev_pmd_unref(pmd);
3465
0
    }
3466
3467
3468
0
    return error;
3469
0
}
3470
3471
struct dpif_netdev_flow_dump {
3472
    struct dpif_flow_dump up;
3473
    struct cmap_position poll_thread_pos;
3474
    struct cmap_position flow_pos;
3475
    struct dp_netdev_pmd_thread *cur_pmd;
3476
    int status;
3477
    struct ovs_mutex mutex;
3478
};
3479
3480
static struct dpif_netdev_flow_dump *
3481
dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
3482
0
{
3483
0
    return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
3484
0
}
3485
3486
static struct dpif_flow_dump *
3487
dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse,
3488
                             struct dpif_flow_dump_types *types)
3489
0
{
3490
0
    struct dpif_netdev_flow_dump *dump;
3491
3492
0
    dump = xzalloc(sizeof *dump);
3493
0
    dpif_flow_dump_init(&dump->up, dpif_, terse, types);
3494
0
    ovs_mutex_init(&dump->mutex);
3495
3496
0
    return &dump->up;
3497
0
}
3498
3499
static int
3500
dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
3501
0
{
3502
0
    struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3503
3504
0
    ovs_mutex_destroy(&dump->mutex);
3505
0
    free(dump);
3506
0
    return 0;
3507
0
}
3508
3509
struct dpif_netdev_flow_dump_thread {
3510
    struct dpif_flow_dump_thread up;
3511
    struct dpif_netdev_flow_dump *dump;
3512
    struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
3513
    struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
3514
};
3515
3516
static struct dpif_netdev_flow_dump_thread *
3517
dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
3518
0
{
3519
0
    return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
3520
0
}
3521
3522
static struct dpif_flow_dump_thread *
3523
dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
3524
0
{
3525
0
    struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3526
0
    struct dpif_netdev_flow_dump_thread *thread;
3527
3528
0
    thread = xmalloc(sizeof *thread);
3529
0
    dpif_flow_dump_thread_init(&thread->up, &dump->up);
3530
0
    thread->dump = dump;
3531
0
    return &thread->up;
3532
0
}
3533
3534
/* Frees the dump thread behind 'thread_'. */
static void
dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
{
    free(dpif_netdev_flow_dump_thread_cast(thread_));
}
3542
3543
static int
3544
dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
3545
                           struct dpif_flow *flows, int max_flows)
3546
0
{
3547
0
    struct dpif_netdev_flow_dump_thread *thread
3548
0
        = dpif_netdev_flow_dump_thread_cast(thread_);
3549
0
    struct dpif_netdev_flow_dump *dump = thread->dump;
3550
0
    struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
3551
0
    struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dump->dpif);
3552
0
    struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
3553
0
    int n_flows = 0;
3554
0
    int i;
3555
3556
0
    ovs_mutex_lock(&dump->mutex);
3557
0
    if (!dump->status) {
3558
0
        struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
3559
0
        int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
3560
3561
        /* First call to dump_next(), extracts the first pmd thread.
3562
         * If there is no pmd thread, returns immediately. */
3563
0
        if (!pmd) {
3564
0
            pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3565
0
            if (!pmd) {
3566
0
                ovs_mutex_unlock(&dump->mutex);
3567
0
                return n_flows;
3568
3569
0
            }
3570
0
        }
3571
3572
0
        do {
3573
0
            for (n_flows = 0; n_flows < flow_limit; n_flows++) {
3574
0
                struct cmap_node *node;
3575
3576
0
                node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
3577
0
                if (!node) {
3578
0
                    break;
3579
0
                }
3580
0
                netdev_flows[n_flows] = CONTAINER_OF(node,
3581
0
                                                     struct dp_netdev_flow,
3582
0
                                                     node);
3583
0
            }
3584
            /* When finishing dumping the current pmd thread, moves to
3585
             * the next. */
3586
0
            if (n_flows < flow_limit) {
3587
0
                memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
3588
0
                dp_netdev_pmd_unref(pmd);
3589
0
                pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3590
0
                if (!pmd) {
3591
0
                    dump->status = EOF;
3592
0
                    break;
3593
0
                }
3594
0
            }
3595
            /* Keeps the reference to next caller. */
3596
0
            dump->cur_pmd = pmd;
3597
3598
            /* If the current dump is empty, do not exit the loop, since the
3599
             * remaining pmds could have flows to be dumped.  Just dumps again
3600
             * on the new 'pmd'. */
3601
0
        } while (!n_flows);
3602
0
    }
3603
0
    ovs_mutex_unlock(&dump->mutex);
3604
3605
0
    for (i = 0; i < n_flows; i++) {
3606
0
        struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
3607
0
        struct odputil_keybuf *keybuf = &thread->keybuf[i];
3608
0
        struct dp_netdev_flow *netdev_flow = netdev_flows[i];
3609
0
        struct dpif_flow *f = &flows[i];
3610
0
        struct ofpbuf key, mask;
3611
3612
0
        ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
3613
0
        ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
3614
0
        dp_netdev_flow_to_dpif_flow(dp, netdev_flow, &key, &mask, f,
3615
0
                                    dump->up.terse);
3616
0
    }
3617
3618
0
    return n_flows;
3619
0
}
3620
3621
static int
3622
dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
3623
    OVS_NO_THREAD_SAFETY_ANALYSIS
3624
0
{
3625
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
3626
0
    struct dp_netdev_pmd_thread *pmd;
3627
0
    struct dp_packet_batch pp;
3628
3629
0
    if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
3630
0
        dp_packet_size(execute->packet) > UINT16_MAX) {
3631
0
        return EINVAL;
3632
0
    }
3633
3634
    /* Tries finding the 'pmd'.  If NULL is returned, that means
3635
     * the current thread is a non-pmd thread and should use
3636
     * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
3637
0
    pmd = ovsthread_getspecific(dp->per_pmd_key);
3638
0
    if (!pmd) {
3639
0
        pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
3640
0
        if (!pmd) {
3641
0
            return EBUSY;
3642
0
        }
3643
0
    }
3644
3645
0
    if (execute->probe) {
3646
        /* If this is part of a probe, Drop the packet, since executing
3647
         * the action may actually cause spurious packets be sent into
3648
         * the network. */
3649
0
        if (pmd->core_id == NON_PMD_CORE_ID) {
3650
0
            dp_netdev_pmd_unref(pmd);
3651
0
        }
3652
0
        return 0;
3653
0
    }
3654
3655
    /* If the current thread is non-pmd thread, acquires
3656
     * the 'non_pmd_mutex'. */
3657
0
    if (pmd->core_id == NON_PMD_CORE_ID) {
3658
0
        ovs_mutex_lock(&dp->non_pmd_mutex);
3659
0
    }
3660
3661
    /* Update current time in PMD context. We don't care about EMC insertion
3662
     * probability, because we are on a slow path. */
3663
0
    pmd_thread_ctx_time_update(pmd);
3664
3665
    /* The action processing expects the RSS hash to be valid, because
3666
     * it's always initialized at the beginning of datapath processing.
3667
     * In this case, though, 'execute->packet' may not have gone through
3668
     * the datapath at all, it may have been generated by the upper layer
3669
     * (OpenFlow packet-out, BFD frame, ...). */
3670
0
    if (!dp_packet_rss_valid(execute->packet)) {
3671
0
        dp_packet_set_rss_hash(execute->packet,
3672
0
                               flow_hash_5tuple(execute->flow, 0));
3673
0
    }
3674
3675
    /* Making a copy because the packet might be stolen during the execution
3676
     * and caller might still need it.  */
3677
0
    struct dp_packet *packet_clone = dp_packet_clone(execute->packet);
3678
0
    dp_packet_batch_init_packet(&pp, packet_clone);
3679
0
    dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
3680
0
                              execute->actions, execute->actions_len);
3681
0
    dp_netdev_pmd_flush_output_packets(pmd, true);
3682
3683
0
    if (pmd->core_id == NON_PMD_CORE_ID) {
3684
0
        ovs_mutex_unlock(&dp->non_pmd_mutex);
3685
0
        dp_netdev_pmd_unref(pmd);
3686
0
    }
3687
3688
0
    if (dp_packet_batch_size(&pp) == 1) {
3689
        /* Packet wasn't dropped during the execution.  Swapping content with
3690
         * the original packet, because the caller might expect actions to
3691
         * modify it.  Uisng the packet from a batch instead of 'packet_clone'
3692
         * because it maybe stolen and replaced by other packet, e.g. by
3693
         * the fragmentation engine. */
3694
0
        dp_packet_swap(execute->packet, pp.packets[0]);
3695
0
        dp_packet_delete_batch(&pp, true);
3696
0
    } else if (dp_packet_batch_size(&pp)) {
3697
        /* FIXME: We have more packets than expected.  Likely, we got IP
3698
         * fragments of the reassembled packet.  Dropping them here as we have
3699
         * no way to get them to the caller.  It might be that all the required
3700
         * actions with them are already executed, but it also might not be a
3701
         * case, e.g. if dpif_netdev_execute() called to execute a single
3702
         * tunnel push. */
3703
0
        dp_packet_delete_batch(&pp, true);
3704
0
    }
3705
3706
0
    return 0;
3707
0
}
3708
3709
static void
3710
dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
3711
0
{
3712
0
    size_t i;
3713
3714
0
    for (i = 0; i < n_ops; i++) {
3715
0
        struct dpif_op *op = ops[i];
3716
3717
0
        switch (op->type) {
3718
0
        case DPIF_OP_FLOW_PUT:
3719
0
            op->error = dpif_netdev_flow_put(dpif, &op->flow_put);
3720
0
            break;
3721
3722
0
        case DPIF_OP_FLOW_DEL:
3723
0
            op->error = dpif_netdev_flow_del(dpif, &op->flow_del);
3724
0
            break;
3725
3726
0
        case DPIF_OP_EXECUTE:
3727
0
            op->error = dpif_netdev_execute(dpif, &op->execute);
3728
0
            break;
3729
3730
0
        case DPIF_OP_FLOW_GET:
3731
0
            op->error = dpif_netdev_flow_get(dpif, &op->flow_get);
3732
0
            break;
3733
0
        }
3734
0
    }
3735
0
}
3736
3737
/* Enable or Disable PMD auto load balancing. */
3738
static void
3739
set_pmd_auto_lb(struct dp_netdev *dp, bool state, bool always_log)
3740
0
{
3741
0
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
3742
3743
0
    if (pmd_alb->is_enabled != state || always_log) {
3744
0
        pmd_alb->is_enabled = state;
3745
0
        if (pmd_alb->is_enabled) {
3746
0
            uint8_t rebalance_load_thresh;
3747
3748
0
            atomic_read_relaxed(&pmd_alb->rebalance_load_thresh,
3749
0
                                &rebalance_load_thresh);
3750
0
            VLOG_INFO("PMD auto load balance is enabled, "
3751
0
                      "interval %"PRIu64" mins, "
3752
0
                      "pmd load threshold %"PRIu8"%%, "
3753
0
                      "improvement threshold %"PRIu8"%%.",
3754
0
                       pmd_alb->rebalance_intvl / MIN_TO_MSEC,
3755
0
                       rebalance_load_thresh,
3756
0
                       pmd_alb->rebalance_improve_thresh);
3757
0
        } else {
3758
0
            pmd_alb->rebalance_poll_timer = 0;
3759
0
            VLOG_INFO("PMD auto load balance is disabled.");
3760
0
        }
3761
0
    }
3762
0
}
3763
3764
static int
3765
parse_pmd_sleep_list(const char *max_sleep_list,
3766
                     struct pmd_sleep **pmd_sleeps)
3767
0
{
3768
0
    char *list, *copy, *key, *value;
3769
0
    int num_vals = 0;
3770
3771
0
    if (!max_sleep_list) {
3772
0
        return num_vals;
3773
0
    }
3774
3775
0
    list = copy = xstrdup(max_sleep_list);
3776
3777
0
    while (ofputil_parse_key_value(&list, &key, &value)) {
3778
0
        uint64_t temp, pmd_max_sleep;
3779
0
        char *error = NULL;
3780
0
        unsigned core;
3781
0
        int i;
3782
3783
0
        error = str_to_u64(key, &temp);
3784
0
        if (error) {
3785
0
            free(error);
3786
0
            continue;
3787
0
        }
3788
3789
0
        if (value[0] == '\0') {
3790
            /* No value specified. key is dp default. */
3791
0
            core = UINT_MAX;
3792
0
            pmd_max_sleep = temp;
3793
0
        } else {
3794
0
            error = str_to_u64(value, &pmd_max_sleep);
3795
0
            if (!error && temp < UINT_MAX) {
3796
                /* Key is pmd core id. */
3797
0
                core = (unsigned) temp;
3798
0
            } else {
3799
0
                free(error);
3800
0
                continue;
3801
0
            }
3802
0
        }
3803
3804
        /* Detect duplicate max sleep values. */
3805
0
        for (i = 0; i < num_vals; i++) {
3806
0
            if ((*pmd_sleeps)[i].core_id == core) {
3807
0
                break;
3808
0
            }
3809
0
        }
3810
0
        if (i == num_vals) {
3811
            /* Not duplicate, add a new entry. */
3812
0
            *pmd_sleeps = xrealloc(*pmd_sleeps,
3813
0
                                   (num_vals + 1) * sizeof **pmd_sleeps);
3814
0
            num_vals++;
3815
0
        }
3816
3817
0
        pmd_max_sleep = MIN(PMD_RCU_QUIESCE_INTERVAL, pmd_max_sleep);
3818
3819
0
        (*pmd_sleeps)[i].core_id = core;
3820
0
        (*pmd_sleeps)[i].max_sleep = pmd_max_sleep;
3821
0
    }
3822
3823
0
    free(copy);
3824
0
    return num_vals;
3825
0
}
3826
3827
static void
3828
log_pmd_sleep(unsigned core_id, int numa_id, uint64_t pmd_max_sleep)
3829
0
{
3830
0
    if (core_id == NON_PMD_CORE_ID) {
3831
0
        return;
3832
0
    }
3833
0
    VLOG_INFO("PMD thread on numa_id: %d, core id: %2d, "
3834
0
              "max sleep: %4"PRIu64" us.", numa_id, core_id, pmd_max_sleep);
3835
0
}
3836
3837
static void
3838
pmd_init_max_sleep(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
3839
0
{
3840
0
    uint64_t max_sleep = dp->pmd_max_sleep_default;
3841
0
    struct pmd_sleep *pmd_sleeps = NULL;
3842
0
    int num_vals;
3843
3844
0
    num_vals = parse_pmd_sleep_list(dp->max_sleep_list, &pmd_sleeps);
3845
3846
    /* Check if the user has set a specific value for this pmd. */
3847
0
    for (int i = 0; i < num_vals; i++) {
3848
0
        if (pmd_sleeps[i].core_id == pmd->core_id) {
3849
0
            max_sleep = pmd_sleeps[i].max_sleep;
3850
0
            break;
3851
0
        }
3852
0
    }
3853
0
    atomic_init(&pmd->max_sleep, max_sleep);
3854
0
    log_pmd_sleep(pmd->core_id, pmd->numa_id, max_sleep);
3855
0
    free(pmd_sleeps);
3856
0
}
3857
3858
static bool
3859
assign_sleep_values_to_pmds(struct dp_netdev *dp, int num_vals,
3860
                            struct pmd_sleep *pmd_sleeps)
3861
0
{
3862
0
    struct dp_netdev_pmd_thread *pmd;
3863
0
    bool value_changed = false;
3864
3865
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3866
0
        uint64_t new_max_sleep, cur_pmd_max_sleep;
3867
3868
0
        if (pmd->core_id == NON_PMD_CORE_ID) {
3869
0
            continue;
3870
0
        }
3871
3872
        /* Default to global value. */
3873
0
        new_max_sleep = dp->pmd_max_sleep_default;
3874
3875
        /* Check for pmd specific value. */
3876
0
        for (int i = 0;  i < num_vals; i++) {
3877
0
            if (pmd->core_id == pmd_sleeps[i].core_id) {
3878
0
                new_max_sleep = pmd_sleeps[i].max_sleep;
3879
0
                break;
3880
0
            }
3881
0
        }
3882
0
        atomic_read_relaxed(&pmd->max_sleep, &cur_pmd_max_sleep);
3883
0
        if (new_max_sleep != cur_pmd_max_sleep) {
3884
0
            atomic_store_relaxed(&pmd->max_sleep, new_max_sleep);
3885
0
            value_changed = true;
3886
0
        }
3887
0
    }
3888
0
    return value_changed;
3889
0
}
3890
3891
static void
3892
log_all_pmd_sleeps(struct dp_netdev *dp)
3893
0
{
3894
0
    struct dp_netdev_pmd_thread **pmd_list = NULL;
3895
0
    struct dp_netdev_pmd_thread *pmd;
3896
0
    size_t n;
3897
3898
0
    VLOG_INFO("Default PMD thread max sleep: %4"PRIu64" us.",
3899
0
              dp->pmd_max_sleep_default);
3900
3901
0
    sorted_poll_thread_list(dp, &pmd_list, &n);
3902
3903
0
    for (size_t i = 0; i < n; i++) {
3904
0
        uint64_t cur_pmd_max_sleep;
3905
3906
0
        pmd = pmd_list[i];
3907
0
        atomic_read_relaxed(&pmd->max_sleep, &cur_pmd_max_sleep);
3908
0
        log_pmd_sleep(pmd->core_id, pmd->numa_id, cur_pmd_max_sleep);
3909
0
    }
3910
0
    free(pmd_list);
3911
0
}
3912
3913
static bool
3914
set_all_pmd_max_sleeps(struct dp_netdev *dp, const struct smap *config)
3915
0
{
3916
0
    const char *max_sleep_list = smap_get(config, "pmd-sleep-max");
3917
0
    struct pmd_sleep *pmd_sleeps = NULL;
3918
0
    uint64_t default_max_sleep = 0;
3919
0
    bool default_changed = false;
3920
0
    bool pmd_changed = false;
3921
0
    uint64_t pmd_maxsleep;
3922
0
    int num_vals = 0;
3923
3924
    /* Check for deprecated 'pmd-maxsleep' value. */
3925
0
    pmd_maxsleep = smap_get_ullong(config, "pmd-maxsleep", UINT64_MAX);
3926
0
    if (pmd_maxsleep != UINT64_MAX && !max_sleep_list) {
3927
0
        VLOG_WARN_ONCE("pmd-maxsleep is deprecated. "
3928
0
                       "Please use pmd-sleep-max instead.");
3929
0
        default_max_sleep = pmd_maxsleep;
3930
0
    }
3931
3932
    /* Check if there is no change in string or value. */
3933
0
    if (!!dp->max_sleep_list == !!max_sleep_list) {
3934
0
        if (max_sleep_list
3935
0
            ? nullable_string_is_equal(max_sleep_list, dp->max_sleep_list)
3936
0
            : default_max_sleep == dp->pmd_max_sleep_default) {
3937
0
            return false;
3938
0
        }
3939
0
    }
3940
3941
    /* Free existing string and copy new one (if any). */
3942
0
    free(dp->max_sleep_list);
3943
0
    dp->max_sleep_list = nullable_xstrdup(max_sleep_list);
3944
3945
0
    if (max_sleep_list) {
3946
0
        num_vals = parse_pmd_sleep_list(max_sleep_list, &pmd_sleeps);
3947
3948
        /* Check if the user has set a global value. */
3949
0
        for (int i = 0; i < num_vals; i++) {
3950
0
            if (pmd_sleeps[i].core_id == UINT_MAX) {
3951
0
                default_max_sleep = pmd_sleeps[i].max_sleep;
3952
0
                break;
3953
0
            }
3954
0
        }
3955
0
    }
3956
3957
0
    if (dp->pmd_max_sleep_default != default_max_sleep) {
3958
0
        dp->pmd_max_sleep_default = default_max_sleep;
3959
0
        default_changed = true;
3960
0
    }
3961
0
    pmd_changed = assign_sleep_values_to_pmds(dp, num_vals, pmd_sleeps);
3962
3963
0
    free(pmd_sleeps);
3964
0
    return default_changed || pmd_changed;
3965
0
}
3966
3967
/* Applies datapath configuration from the database. Some of the changes are
3968
 * actually applied in dpif_netdev_run(). */
3969
static int
3970
dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
3971
0
{
3972
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
3973
0
    const char *cmask = smap_get(other_config, "pmd-cpu-mask");
3974
0
    const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign",
3975
0
                                             "cycles");
3976
0
    unsigned long long insert_prob =
3977
0
        smap_get_ullong(other_config, "emc-insert-inv-prob",
3978
0
                        DEFAULT_EM_FLOW_INSERT_INV_PROB);
3979
0
    uint32_t insert_min, cur_min;
3980
0
    uint32_t tx_flush_interval, cur_tx_flush_interval;
3981
0
    uint64_t rebalance_intvl;
3982
0
    uint8_t cur_rebalance_load;
3983
0
    uint32_t rebalance_load, rebalance_improve;
3984
0
    bool log_autolb = false;
3985
0
    enum sched_assignment_type pmd_rxq_assign_type;
3986
3987
0
    tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
3988
0
                                     DEFAULT_TX_FLUSH_INTERVAL);
3989
0
    atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
3990
0
    if (tx_flush_interval != cur_tx_flush_interval) {
3991
0
        atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
3992
0
        VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
3993
0
                  tx_flush_interval);
3994
0
    }
3995
3996
0
    if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
3997
0
        free(dp->pmd_cmask);
3998
0
        dp->pmd_cmask = nullable_xstrdup(cmask);
3999
0
        dp_netdev_request_reconfigure(dp);
4000
0
    }
4001
4002
0
    atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
4003
0
    if (insert_prob <= UINT32_MAX) {
4004
0
        insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
4005
0
    } else {
4006
0
        insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
4007
0
        insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
4008
0
    }
4009
4010
0
    if (insert_min != cur_min) {
4011
0
        atomic_store_relaxed(&dp->emc_insert_min, insert_min);
4012
0
        if (insert_min == 0) {
4013
0
            VLOG_INFO("EMC insertion probability changed to zero");
4014
0
        } else {
4015
0
            VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
4016
0
                      insert_prob, (100 / (float)insert_prob));
4017
0
        }
4018
0
    }
4019
4020
0
    bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false);
4021
0
    bool cur_perf_enabled;
4022
0
    atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled);
4023
0
    if (perf_enabled != cur_perf_enabled) {
4024
0
        atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled);
4025
0
        if (perf_enabled) {
4026
0
            VLOG_INFO("PMD performance metrics collection enabled");
4027
0
        } else {
4028
0
            VLOG_INFO("PMD performance metrics collection disabled");
4029
0
        }
4030
0
    }
4031
4032
0
    bool smc_enable = smap_get_bool(other_config, "smc-enable", false);
4033
0
    bool cur_smc;
4034
0
    atomic_read_relaxed(&dp->smc_enable_db, &cur_smc);
4035
0
    if (smc_enable != cur_smc) {
4036
0
        atomic_store_relaxed(&dp->smc_enable_db, smc_enable);
4037
0
        if (smc_enable) {
4038
0
            VLOG_INFO("SMC cache is enabled");
4039
0
        } else {
4040
0
            VLOG_INFO("SMC cache is disabled");
4041
0
        }
4042
0
    }
4043
4044
0
    if (!strcmp(pmd_rxq_assign, "roundrobin")) {
4045
0
        pmd_rxq_assign_type = SCHED_ROUNDROBIN;
4046
0
    } else if (!strcmp(pmd_rxq_assign, "cycles")) {
4047
0
        pmd_rxq_assign_type = SCHED_CYCLES;
4048
0
    } else if (!strcmp(pmd_rxq_assign, "group")) {
4049
0
        pmd_rxq_assign_type = SCHED_GROUP;
4050
0
    } else {
4051
        /* Default. */
4052
0
        VLOG_WARN("Unsupported rx queue to PMD assignment mode in "
4053
0
                  "pmd-rxq-assign. Defaulting to 'cycles'.");
4054
0
        pmd_rxq_assign_type = SCHED_CYCLES;
4055
0
        pmd_rxq_assign = "cycles";
4056
0
    }
4057
0
    if (dp->pmd_rxq_assign_type != pmd_rxq_assign_type) {
4058
0
        dp->pmd_rxq_assign_type = pmd_rxq_assign_type;
4059
0
        VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.",
4060
0
                  pmd_rxq_assign);
4061
0
        dp_netdev_request_reconfigure(dp);
4062
0
    }
4063
4064
0
    bool pmd_iso = smap_get_bool(other_config, "pmd-rxq-isolate", true);
4065
4066
0
    if (pmd_rxq_assign_type != SCHED_GROUP && pmd_iso == false) {
4067
        /* Invalid combination. */
4068
0
        VLOG_WARN("pmd-rxq-isolate can only be set false "
4069
0
                  "when using pmd-rxq-assign=group");
4070
0
        pmd_iso = true;
4071
0
    }
4072
0
    if (dp->pmd_iso != pmd_iso) {
4073
0
        dp->pmd_iso = pmd_iso;
4074
0
        if (pmd_iso) {
4075
0
            VLOG_INFO("pmd-rxq-affinity isolates PMD core");
4076
0
        } else {
4077
0
            VLOG_INFO("pmd-rxq-affinity does not isolate PMD core");
4078
0
        }
4079
0
        dp_netdev_request_reconfigure(dp);
4080
0
    }
4081
4082
0
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
4083
4084
0
    rebalance_intvl = smap_get_ullong(other_config,
4085
0
                                      "pmd-auto-lb-rebal-interval",
4086
0
                                      ALB_REBALANCE_INTERVAL);
4087
0
    if (rebalance_intvl > MAX_ALB_REBALANCE_INTERVAL) {
4088
0
        rebalance_intvl = ALB_REBALANCE_INTERVAL;
4089
0
    }
4090
4091
    /* Input is in min, convert it to msec. */
4092
0
    rebalance_intvl =
4093
0
        rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC;
4094
4095
0
    if (pmd_alb->rebalance_intvl != rebalance_intvl) {
4096
0
        pmd_alb->rebalance_intvl = rebalance_intvl;
4097
0
        VLOG_INFO("PMD auto load balance interval set to "
4098
0
                  "%"PRIu64" mins\n", rebalance_intvl / MIN_TO_MSEC);
4099
0
        log_autolb = true;
4100
0
    }
4101
4102
0
    rebalance_improve = smap_get_uint(other_config,
4103
0
                                      "pmd-auto-lb-improvement-threshold",
4104
0
                                      ALB_IMPROVEMENT_THRESHOLD);
4105
0
    if (rebalance_improve > 100) {
4106
0
        rebalance_improve = ALB_IMPROVEMENT_THRESHOLD;
4107
0
    }
4108
0
    if (rebalance_improve != pmd_alb->rebalance_improve_thresh) {
4109
0
        pmd_alb->rebalance_improve_thresh = rebalance_improve;
4110
0
        VLOG_INFO("PMD auto load balance improvement threshold set to "
4111
0
                  "%"PRIu32"%%", rebalance_improve);
4112
0
        log_autolb = true;
4113
0
    }
4114
4115
0
    rebalance_load = smap_get_uint(other_config, "pmd-auto-lb-load-threshold",
4116
0
                                   ALB_LOAD_THRESHOLD);
4117
0
    if (rebalance_load > 100) {
4118
0
        rebalance_load = ALB_LOAD_THRESHOLD;
4119
0
    }
4120
0
    atomic_read_relaxed(&pmd_alb->rebalance_load_thresh, &cur_rebalance_load);
4121
0
    if (rebalance_load != cur_rebalance_load) {
4122
0
        atomic_store_relaxed(&pmd_alb->rebalance_load_thresh,
4123
0
                             rebalance_load);
4124
0
        VLOG_INFO("PMD auto load balance load threshold set to %"PRIu32"%%",
4125
0
                  rebalance_load);
4126
0
        log_autolb = true;
4127
0
    }
4128
4129
0
    bool autolb_state = smap_get_bool(other_config, "pmd-auto-lb", false);
4130
4131
0
    set_pmd_auto_lb(dp, autolb_state, log_autolb);
4132
4133
0
    bool sleep_changed = set_all_pmd_max_sleeps(dp, other_config);
4134
4135
0
    if (ovsthread_once_start(&dp->once_set_config)) {
4136
0
        log_all_pmd_sleeps(dp);
4137
0
        dpif_offload_datapath_register_flow_unreference_cb(
4138
0
            dpif, offload_flow_reference_unreference_cb);
4139
4140
0
        ovsthread_once_done(&dp->once_set_config);
4141
0
    } else if (sleep_changed) {
4142
0
        log_all_pmd_sleeps(dp);
4143
0
    }
4144
4145
0
    return 0;
4146
0
}
4147
4148
static bool
4149
dpif_netdev_number_handlers_required(struct dpif *dpif_ OVS_UNUSED,
4150
                                     uint32_t *n_handlers)
4151
0
{
4152
0
    *n_handlers = 0;
4153
0
    return true;
4154
0
}
4155
4156
/* Parses affinity list and returns result in 'core_ids'. */
4157
static int
4158
parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
4159
0
{
4160
0
    unsigned i;
4161
0
    char *list, *copy, *key, *value;
4162
0
    int error = 0;
4163
4164
0
    for (i = 0; i < n_rxq; i++) {
4165
0
        core_ids[i] = OVS_CORE_UNSPEC;
4166
0
    }
4167
4168
0
    if (!affinity_list) {
4169
0
        return 0;
4170
0
    }
4171
4172
0
    list = copy = xstrdup(affinity_list);
4173
4174
0
    while (ofputil_parse_key_value(&list, &key, &value)) {
4175
0
        int rxq_id, core_id;
4176
4177
0
        if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
4178
0
            || !str_to_int(value, 0, &core_id) || core_id < 0) {
4179
0
            error = EINVAL;
4180
0
            break;
4181
0
        }
4182
4183
0
        if (rxq_id < n_rxq) {
4184
0
            core_ids[rxq_id] = core_id;
4185
0
        }
4186
0
    }
4187
4188
0
    free(copy);
4189
0
    return error;
4190
0
}
4191
4192
/* Parses 'affinity_list' and applies configuration if it is valid. */
4193
static int
4194
dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
4195
                                  const char *affinity_list)
4196
0
{
4197
0
    unsigned *core_ids, i;
4198
0
    int error = 0;
4199
4200
0
    core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
4201
0
    if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
4202
0
        error = EINVAL;
4203
0
        goto exit;
4204
0
    }
4205
4206
0
    for (i = 0; i < port->n_rxq; i++) {
4207
0
        port->rxqs[i].core_id = core_ids[i];
4208
0
    }
4209
4210
0
exit:
4211
0
    free(core_ids);
4212
0
    return error;
4213
0
}
4214
4215
/* Returns 'true' if one of the 'port's RX queues exists in 'poll_list'
4216
 * of given PMD thread. */
4217
static bool
4218
dpif_netdev_pmd_polls_port(struct dp_netdev_pmd_thread *pmd,
4219
                           struct dp_netdev_port *port)
4220
    OVS_EXCLUDED(pmd->port_mutex)
4221
0
{
4222
0
    struct rxq_poll *poll;
4223
0
    bool found = false;
4224
4225
0
    ovs_mutex_lock(&pmd->port_mutex);
4226
0
    HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
4227
0
        if (port == poll->rxq->port) {
4228
0
            found = true;
4229
0
            break;
4230
0
        }
4231
0
    }
4232
0
    ovs_mutex_unlock(&pmd->port_mutex);
4233
0
    return found;
4234
0
}
4235
4236
/* Updates port configuration from the database.  The changes are actually
4237
 * applied in dpif_netdev_run(). */
4238
static int
4239
dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
4240
                            const struct smap *cfg)
4241
0
{
4242
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
4243
0
    struct dp_netdev_port *port;
4244
0
    int error = 0;
4245
0
    const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
4246
0
    bool emc_enabled = smap_get_bool(cfg, "emc-enable", true);
4247
0
    const char *tx_steering_mode = smap_get(cfg, "tx-steering");
4248
0
    enum txq_req_mode txq_mode;
4249
4250
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
4251
0
    error = get_port_by_number(dp, port_no, &port);
4252
0
    if (error) {
4253
0
        goto unlock;
4254
0
    }
4255
4256
0
    if (emc_enabled != port->emc_enabled) {
4257
0
        struct dp_netdev_pmd_thread *pmd;
4258
0
        struct ds ds = DS_EMPTY_INITIALIZER;
4259
0
        uint32_t cur_min, insert_prob;
4260
4261
0
        port->emc_enabled = emc_enabled;
4262
        /* Mark for reload all the threads that polls this port and request
4263
         * for reconfiguration for the actual reloading of threads. */
4264
0
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4265
0
            if (dpif_netdev_pmd_polls_port(pmd, port)) {
4266
0
                pmd->need_reload = true;
4267
0
            }
4268
0
        }
4269
0
        dp_netdev_request_reconfigure(dp);
4270
4271
0
        ds_put_format(&ds, "%s: EMC has been %s.",
4272
0
                      netdev_get_name(port->netdev),
4273
0
                      (emc_enabled) ? "enabled" : "disabled");
4274
0
        if (emc_enabled) {
4275
0
            ds_put_cstr(&ds, " Current insertion probability is ");
4276
0
            atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
4277
0
            if (!cur_min) {
4278
0
                ds_put_cstr(&ds, "zero.");
4279
0
            } else {
4280
0
                insert_prob = UINT32_MAX / cur_min;
4281
0
                ds_put_format(&ds, "1/%"PRIu32" (~%.2f%%).",
4282
0
                              insert_prob, 100 / (float) insert_prob);
4283
0
            }
4284
0
        }
4285
0
        VLOG_INFO("%s", ds_cstr(&ds));
4286
0
        ds_destroy(&ds);
4287
0
    }
4288
4289
    /* Checking for RXq affinity changes. */
4290
0
    if (netdev_is_pmd(port->netdev)
4291
0
        && !nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
4292
4293
0
        error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
4294
0
        if (error) {
4295
0
            goto unlock;
4296
0
        }
4297
0
        free(port->rxq_affinity_list);
4298
0
        port->rxq_affinity_list = nullable_xstrdup(affinity_list);
4299
4300
0
        dp_netdev_request_reconfigure(dp);
4301
0
    }
4302
4303
0
    if (nullable_string_is_equal(tx_steering_mode, "hash")) {
4304
0
        txq_mode = TXQ_REQ_MODE_HASH;
4305
0
    } else {
4306
0
        txq_mode = TXQ_REQ_MODE_THREAD;
4307
0
    }
4308
4309
0
    if (txq_mode != port->txq_requested_mode) {
4310
0
        port->txq_requested_mode = txq_mode;
4311
0
        VLOG_INFO("%s: Tx packet steering mode has been set to '%s'.",
4312
0
                  netdev_get_name(port->netdev),
4313
0
                  (txq_mode == TXQ_REQ_MODE_THREAD) ? "thread" : "hash");
4314
0
        dp_netdev_request_reconfigure(dp);
4315
0
    }
4316
4317
0
unlock:
4318
0
    ovs_rwlock_unlock(&dp->port_rwlock);
4319
0
    return error;
4320
0
}
4321
4322
static int
4323
dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
4324
                              uint32_t queue_id, uint32_t *priority)
4325
0
{
4326
0
    *priority = queue_id;
4327
0
    return 0;
4328
0
}
4329
4330

4331
/* Creates and returns a new 'struct dp_netdev_actions', whose actions are
4332
 * a copy of the 'size' bytes of 'actions' input parameters. */
4333
struct dp_netdev_actions *
4334
dp_netdev_actions_create(const struct nlattr *actions, size_t size)
4335
0
{
4336
0
    struct dp_netdev_actions *netdev_actions;
4337
4338
0
    netdev_actions = xmalloc(sizeof *netdev_actions + size);
4339
0
    netdev_actions->size = size;
4340
0
    if (size) {
4341
0
        memcpy(netdev_actions->actions, actions, size);
4342
0
    }
4343
4344
0
    return netdev_actions;
4345
0
}
4346
4347
struct dp_netdev_actions *
4348
dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
4349
0
{
4350
0
    return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
4351
0
}
4352
4353
/* Releases 'actions'.  The structure and its trailing action bytes are a
 * single allocation (see dp_netdev_actions_create()), so one free() is
 * enough. */
static void
dp_netdev_actions_free(struct dp_netdev_actions *actions)
{
    free(actions);
}
4358

4359
static void
4360
dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
4361
                         enum rxq_cycles_counter_type type,
4362
                         unsigned long long cycles)
4363
0
{
4364
0
   atomic_store_relaxed(&rx->cycles[type], cycles);
4365
0
}
4366
4367
static void
4368
dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
4369
                         enum rxq_cycles_counter_type type,
4370
                         unsigned long long cycles)
4371
0
{
4372
0
    non_atomic_ullong_add(&rx->cycles[type], cycles);
4373
0
}
4374
4375
static uint64_t
4376
dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
4377
                         enum rxq_cycles_counter_type type)
4378
0
{
4379
0
    unsigned long long processing_cycles;
4380
0
    atomic_read_relaxed(&rx->cycles[type], &processing_cycles);
4381
0
    return processing_cycles;
4382
0
}
4383
4384
static void
4385
dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
4386
                                unsigned long long cycles)
4387
0
{
4388
0
    unsigned int idx = atomic_count_inc(&rx->intrvl_idx) % PMD_INTERVAL_MAX;
4389
0
    atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
4390
0
}
4391
4392
static uint64_t
4393
dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
4394
0
{
4395
0
    unsigned long long processing_cycles;
4396
0
    atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
4397
0
    return processing_cycles;
4398
0
}
4399
4400
#if ATOMIC_ALWAYS_LOCK_FREE_8B
4401
static inline bool
4402
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
4403
0
{
4404
0
    bool pmd_perf_enabled;
4405
0
    atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled);
4406
0
    return pmd_perf_enabled;
4407
0
}
4408
#else
4409
/* If stores and reads of 64-bit integers are not atomic, the full PMD
4410
 * performance metrics are not available as locked access to 64 bit
4411
 * integers would be prohibitively expensive. */
4412
static inline bool
4413
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED)
4414
{
4415
    return false;
4416
}
4417
#endif
4418
4419
static int
4420
dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
4421
                                   struct tx_port *p)
4422
0
{
4423
0
    int i;
4424
0
    int tx_qid;
4425
0
    int output_cnt;
4426
0
    bool concurrent_txqs;
4427
0
    struct cycle_timer timer;
4428
0
    uint64_t cycles;
4429
0
    uint32_t tx_flush_interval;
4430
4431
0
    cycle_timer_start(&pmd->perf_stats, &timer);
4432
4433
0
    output_cnt = dp_packet_batch_size(&p->output_pkts);
4434
0
    ovs_assert(output_cnt > 0);
4435
4436
0
    if (p->port->txq_mode == TXQ_MODE_XPS_HASH) {
4437
0
        int n_txq = netdev_n_txq(p->port->netdev);
4438
4439
        /* Re-batch per txq based on packet hash. */
4440
0
        struct dp_packet *packet;
4441
0
        DP_PACKET_BATCH_FOR_EACH (j, packet, &p->output_pkts) {
4442
0
            uint32_t hash;
4443
4444
0
            if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
4445
0
                hash = dp_packet_get_rss_hash(packet);
4446
0
            } else {
4447
0
                struct flow flow;
4448
4449
0
                flow_extract(packet, &flow);
4450
0
                hash = flow_hash_5tuple(&flow, 0);
4451
0
            }
4452
0
            dp_packet_batch_add(&p->txq_pkts[hash % n_txq], packet);
4453
0
        }
4454
4455
        /* Flush batches of each Tx queues. */
4456
0
        for (i = 0; i < n_txq; i++) {
4457
0
            if (dp_packet_batch_is_empty(&p->txq_pkts[i])) {
4458
0
                continue;
4459
0
            }
4460
0
            netdev_send(p->port->netdev, i, &p->txq_pkts[i], true);
4461
0
            dp_packet_batch_init(&p->txq_pkts[i]);
4462
0
        }
4463
0
    } else {
4464
0
        if (p->port->txq_mode == TXQ_MODE_XPS) {
4465
0
            tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p);
4466
0
            concurrent_txqs = true;
4467
0
        } else {
4468
0
            tx_qid = pmd->static_tx_qid;
4469
0
            concurrent_txqs = false;
4470
0
        }
4471
0
        netdev_send(p->port->netdev, tx_qid, &p->output_pkts, concurrent_txqs);
4472
0
    }
4473
0
    dp_packet_batch_init(&p->output_pkts);
4474
4475
    /* Update time of the next flush. */
4476
0
    atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
4477
0
    p->flush_time = pmd->ctx.now + tx_flush_interval;
4478
4479
0
    ovs_assert(pmd->n_output_batches > 0);
4480
0
    pmd->n_output_batches--;
4481
4482
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
4483
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
4484
4485
    /* Distribute send cycles evenly among transmitted packets and assign to
4486
     * their respective rx queues. */
4487
0
    cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
4488
0
    for (i = 0; i < output_cnt; i++) {
4489
0
        if (p->output_pkts_rxqs[i]) {
4490
0
            dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
4491
0
                                     RXQ_CYCLES_PROC_CURR, cycles);
4492
0
        }
4493
0
    }
4494
4495
0
    return output_cnt;
4496
0
}
4497
4498
static int
4499
dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
4500
                                   bool force)
4501
0
{
4502
0
    struct tx_port *p;
4503
0
    int output_cnt = 0;
4504
4505
0
    if (!pmd->n_output_batches) {
4506
0
        return 0;
4507
0
    }
4508
4509
0
    HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
4510
0
        if (!dp_packet_batch_is_empty(&p->output_pkts)
4511
0
            && (force || pmd->ctx.now >= p->flush_time)) {
4512
0
            output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
4513
0
        }
4514
0
    }
4515
0
    return output_cnt;
4516
0
}
4517
4518
static int
4519
dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
4520
                           struct dp_netdev_rxq *rxq,
4521
                           odp_port_t port_no)
4522
0
{
4523
0
    struct pmd_perf_stats *s = &pmd->perf_stats;
4524
0
    struct dp_packet_batch batch;
4525
0
    struct cycle_timer timer;
4526
0
    int error;
4527
0
    int batch_cnt = 0;
4528
0
    int rem_qlen = 0, *qlen_p = NULL;
4529
0
    uint64_t cycles;
4530
4531
    /* Measure duration for polling and processing rx burst. */
4532
0
    cycle_timer_start(&pmd->perf_stats, &timer);
4533
4534
0
    pmd->ctx.last_rxq = rxq;
4535
0
    dp_packet_batch_init(&batch);
4536
4537
    /* Fetch the rx queue length only for vhostuser ports. */
4538
0
    if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) {
4539
0
        qlen_p = &rem_qlen;
4540
0
    }
4541
4542
0
    error = netdev_rxq_recv(rxq->rx, &batch, qlen_p);
4543
0
    if (!error) {
4544
        /* At least one packet received. */
4545
0
        *recirc_depth_get() = 0;
4546
0
        pmd_thread_ctx_time_update(pmd);
4547
0
        batch_cnt = dp_packet_batch_size(&batch);
4548
0
        if (pmd_perf_metrics_enabled(pmd)) {
4549
            /* Update batch histogram. */
4550
0
            s->current.batches++;
4551
0
            histogram_add_sample(&s->pkts_per_batch, batch_cnt);
4552
            /* Update the maximum vhost rx queue fill level. */
4553
0
            if (rxq->is_vhost && rem_qlen >= 0) {
4554
0
                uint32_t qfill = batch_cnt + rem_qlen;
4555
0
                if (qfill > s->current.max_vhost_qfill) {
4556
0
                    s->current.max_vhost_qfill = qfill;
4557
0
                }
4558
0
            }
4559
0
        }
4560
4561
        /* Process packet batch. */
4562
0
        dp_netdev_input(pmd, &batch, port_no);
4563
4564
        /* Assign processing cycles to rx queue. */
4565
0
        cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
4566
0
        dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
4567
4568
0
        dp_netdev_pmd_flush_output_packets(pmd, false);
4569
0
    } else {
4570
        /* Discard cycles. */
4571
0
        cycle_timer_stop(&pmd->perf_stats, &timer);
4572
0
        if (error != EAGAIN && error != EOPNOTSUPP) {
4573
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
4574
4575
0
            VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
4576
0
                    netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
4577
0
        }
4578
0
    }
4579
4580
0
    pmd->ctx.last_rxq = NULL;
4581
4582
0
    return batch_cnt;
4583
0
}
4584
4585
static struct tx_port *
4586
tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
4587
0
{
4588
0
    struct tx_port *tx;
4589
4590
0
    HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
4591
0
        if (tx->port->port_no == port_no) {
4592
0
            return tx;
4593
0
        }
4594
0
    }
4595
4596
0
    return NULL;
4597
0
}
4598
4599
static struct tx_bond *
4600
tx_bond_lookup(const struct cmap *tx_bonds, uint32_t bond_id)
4601
0
{
4602
0
    uint32_t hash = hash_bond_id(bond_id);
4603
0
    struct tx_bond *tx;
4604
4605
0
    CMAP_FOR_EACH_WITH_HASH (tx, node, hash, tx_bonds) {
4606
0
        if (tx->bond_id == bond_id) {
4607
0
            return tx;
4608
0
        }
4609
0
    }
4610
0
    return NULL;
4611
0
}
4612
4613
static int
4614
port_reconfigure(struct dp_netdev_port *port)
4615
0
{
4616
0
    struct netdev *netdev = port->netdev;
4617
0
    int i, err;
4618
4619
    /* Closes the existing 'rxq's. */
4620
0
    for (i = 0; i < port->n_rxq; i++) {
4621
0
        netdev_rxq_close(port->rxqs[i].rx);
4622
0
        port->rxqs[i].rx = NULL;
4623
0
    }
4624
0
    unsigned last_nrxq = port->n_rxq;
4625
0
    port->n_rxq = 0;
4626
4627
    /* Allows 'netdev' to apply the pending configuration changes. */
4628
0
    if (netdev_is_reconf_required(netdev) || port->need_reconfigure) {
4629
0
        err = netdev_reconfigure(netdev);
4630
0
        if (err && (err != EOPNOTSUPP)) {
4631
0
            VLOG_ERR("Failed to set interface %s new configuration",
4632
0
                     netdev_get_name(netdev));
4633
0
            return err;
4634
0
        }
4635
0
    }
4636
    /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
4637
0
    port->rxqs = xrealloc(port->rxqs,
4638
0
                          sizeof *port->rxqs * netdev_n_rxq(netdev));
4639
    /* Realloc 'used' counters for tx queues. */
4640
0
    free(port->txq_used);
4641
0
    port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
4642
4643
0
    for (i = 0; i < netdev_n_rxq(netdev); i++) {
4644
0
        bool new_queue = i >= last_nrxq;
4645
0
        if (new_queue) {
4646
0
            memset(&port->rxqs[i], 0, sizeof port->rxqs[i]);
4647
0
        }
4648
4649
0
        port->rxqs[i].port = port;
4650
0
        port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9);
4651
4652
0
        err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
4653
0
        if (err) {
4654
0
            return err;
4655
0
        }
4656
0
        port->n_rxq++;
4657
0
    }
4658
4659
    /* Parse affinity list to apply configuration for new queues. */
4660
0
    dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
4661
4662
    /* If reconfiguration was successful mark it as such, so we can use it */
4663
0
    port->need_reconfigure = false;
4664
4665
0
    return 0;
4666
0
}
4667
4668
/* Set of NUMA nodes used by the sched_*() helpers below.  Initialized by
 * sched_numa_list_populate() and torn down by
 * sched_numa_list_free_entries(). */
struct sched_numa_list {
4669
    struct hmap numas;  /* Contains 'struct sched_numa', inserted with
                         * hash_int(numa_id, 0) as the hash. */
4670
};
4671
4672
/* Meta data for out-of-place pmd rxq assignments.  One instance exists per
 * PMD thread, stored in the 'pmds' array of its 'struct sched_numa'. */
4673
struct sched_pmd {
4674
    /* NUMA node whose 'pmds' array holds this entry. */
    struct sched_numa *numa;
4675
    /* Associated PMD thread. */
4676
    struct dp_netdev_pmd_thread *pmd;
4677
    /* Sum of the 'cycles' passed to sched_pmd_add_rxq() for this entry. */
    uint64_t pmd_proc_cycles;
4678
    /* Array of 'n_rxq' rx queues added through sched_pmd_add_rxq(); freed by
     * sched_numa_list_free_entries(). */
    struct dp_netdev_rxq **rxqs;
4679
    /* Number of valid entries in 'rxqs'. */
    unsigned n_rxq;
4680
    /* When true, the round-robin and lowest-load selectors skip this PMD. */
    bool isolated;
4681
};
4682
4683
/* One NUMA node and the PMD threads that run on it. */
struct sched_numa {
4684
    /* Node in the 'numas' hmap of 'struct sched_numa_list'. */
    struct hmap_node node;
4685
    /* NUMA id, copied from the 'numa_id' of the first PMD found on it. */
    int numa_id;
4686
    /* PMDs on numa node. */
4687
    struct sched_pmd *pmds;
4688
    /* Num of PMDs on numa node. */
4689
    unsigned n_pmds;
4690
    /* Num of isolated PMDs on numa node. */
4691
    unsigned n_isolated;
4692
    /* Index into 'pmds' that sched_pmd_next_rr() returns next. */
    int rr_cur_index;
4693
    /* Walk direction of sched_pmd_next_rr(): true while incrementing
     * 'rr_cur_index', false while decrementing it. */
    bool rr_idx_inc;
4694
};
4695
4696
static size_t
4697
sched_numa_list_count(struct sched_numa_list *numa_list)
4698
0
{
4699
0
    return hmap_count(&numa_list->numas);
4700
0
}
4701
4702
static struct sched_numa *
4703
sched_numa_list_next(struct sched_numa_list *numa_list,
4704
                     const struct sched_numa *numa)
4705
0
{
4706
0
    struct hmap_node *node = NULL;
4707
4708
0
    if (numa) {
4709
0
        node = hmap_next(&numa_list->numas, &numa->node);
4710
0
    }
4711
0
    if (!node) {
4712
0
        node = hmap_first(&numa_list->numas);
4713
0
    }
4714
4715
0
    return (node) ? CONTAINER_OF(node, struct sched_numa, node) : NULL;
4716
0
}
4717
4718
static struct sched_numa *
4719
sched_numa_list_lookup(struct sched_numa_list *numa_list, int numa_id)
4720
0
{
4721
0
    struct sched_numa *numa;
4722
4723
0
    HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0),
4724
0
                             &numa_list->numas) {
4725
0
        if (numa->numa_id == numa_id) {
4726
0
            return numa;
4727
0
        }
4728
0
    }
4729
0
    return NULL;
4730
0
}
4731
4732
static int
4733
compare_sched_pmd_list(const void *a_, const void *b_)
4734
0
{
4735
0
    struct sched_pmd *a, *b;
4736
4737
0
    a = (struct sched_pmd *) a_;
4738
0
    b = (struct sched_pmd *) b_;
4739
4740
0
    return compare_poll_thread_list(&a->pmd, &b->pmd);
4741
0
}
4742
4743
static void
4744
sort_numa_list_pmds(struct sched_numa_list *numa_list)
4745
0
{
4746
0
    struct sched_numa *numa;
4747
4748
0
    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
4749
0
        if (numa->n_pmds > 1) {
4750
0
            qsort(numa->pmds, numa->n_pmds, sizeof *numa->pmds,
4751
0
                  compare_sched_pmd_list);
4752
0
        }
4753
0
    }
4754
0
}
4755
4756
/* Populate numas and pmds on those numas. */
4757
static void
4758
sched_numa_list_populate(struct sched_numa_list *numa_list,
4759
                         struct dp_netdev *dp)
4760
0
{
4761
0
    struct dp_netdev_pmd_thread *pmd;
4762
4763
0
    hmap_init(&numa_list->numas);
4764
4765
    /* For each pmd on this datapath. */
4766
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4767
0
        struct sched_numa *numa;
4768
0
        struct sched_pmd *sched_pmd;
4769
0
        if (pmd->core_id == NON_PMD_CORE_ID) {
4770
0
            continue;
4771
0
        }
4772
4773
        /* Get the numa of the PMD. */
4774
0
        numa = sched_numa_list_lookup(numa_list, pmd->numa_id);
4775
        /* Create a new numa node for it if not already created. */
4776
0
        if (!numa) {
4777
0
            numa = xzalloc(sizeof *numa);
4778
0
            numa->numa_id = pmd->numa_id;
4779
0
            hmap_insert(&numa_list->numas, &numa->node,
4780
0
                        hash_int(pmd->numa_id, 0));
4781
0
        }
4782
4783
        /* Create a sched_pmd on this numa for the pmd. */
4784
0
        numa->n_pmds++;
4785
0
        numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
4786
0
        sched_pmd = &numa->pmds[numa->n_pmds - 1];
4787
0
        memset(sched_pmd, 0, sizeof *sched_pmd);
4788
0
        sched_pmd->numa = numa;
4789
0
        sched_pmd->pmd = pmd;
4790
        /* At least one pmd is present so initialize curr_idx and idx_inc. */
4791
0
        numa->rr_cur_index = 0;
4792
0
        numa->rr_idx_inc = true;
4793
0
    }
4794
0
    sort_numa_list_pmds(numa_list);
4795
0
}
4796
4797
static void
4798
sched_numa_list_free_entries(struct sched_numa_list *numa_list)
4799
0
{
4800
0
    struct sched_numa *numa;
4801
4802
0
    HMAP_FOR_EACH_POP (numa, node, &numa_list->numas) {
4803
0
        for (unsigned i = 0; i < numa->n_pmds; i++) {
4804
0
            struct sched_pmd *sched_pmd;
4805
4806
0
            sched_pmd = &numa->pmds[i];
4807
0
            sched_pmd->n_rxq = 0;
4808
0
            free(sched_pmd->rxqs);
4809
0
        }
4810
0
        numa->n_pmds = 0;
4811
0
        free(numa->pmds);
4812
0
        free(numa);
4813
0
    }
4814
0
    hmap_destroy(&numa_list->numas);
4815
0
}
4816
4817
static struct sched_pmd *
4818
sched_pmd_find_by_pmd(struct sched_numa_list *numa_list,
4819
                      struct dp_netdev_pmd_thread *pmd)
4820
0
{
4821
0
    struct sched_numa *numa;
4822
4823
0
    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
4824
0
        for (unsigned i = 0; i < numa->n_pmds; i++) {
4825
0
            struct sched_pmd *sched_pmd;
4826
4827
0
            sched_pmd = &numa->pmds[i];
4828
0
            if (pmd == sched_pmd->pmd) {
4829
0
                return sched_pmd;
4830
0
            }
4831
0
        }
4832
0
    }
4833
0
    return NULL;
4834
0
}
4835
4836
static void
4837
sched_pmd_add_rxq(struct sched_pmd *sched_pmd, struct dp_netdev_rxq *rxq,
4838
                  uint64_t cycles)
4839
0
{
4840
    /* As sched_pmd is allocated outside this fn. better to not assume
4841
     * rxqs is initialized to NULL. */
4842
0
    if (sched_pmd->n_rxq == 0) {
4843
0
        sched_pmd->rxqs = xmalloc(sizeof *sched_pmd->rxqs);
4844
0
    } else {
4845
0
        sched_pmd->rxqs = xrealloc(sched_pmd->rxqs, (sched_pmd->n_rxq + 1) *
4846
0
                                                    sizeof *sched_pmd->rxqs);
4847
0
    }
4848
4849
0
    sched_pmd->rxqs[sched_pmd->n_rxq++] = rxq;
4850
0
    sched_pmd->pmd_proc_cycles += cycles;
4851
0
}
4852
4853
static void
4854
sched_numa_list_assignments(struct sched_numa_list *numa_list,
4855
                            struct dp_netdev *dp)
4856
    OVS_REQ_RDLOCK(dp->port_rwlock)
4857
0
{
4858
0
    struct dp_netdev_port *port;
4859
4860
    /* For each port. */
4861
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
4862
0
        if (!netdev_is_pmd(port->netdev)) {
4863
0
            continue;
4864
0
        }
4865
        /* For each rxq on the port. */
4866
0
        for (unsigned qid = 0; qid < port->n_rxq; qid++) {
4867
0
            struct dp_netdev_rxq *rxq = &port->rxqs[qid];
4868
0
            struct sched_pmd *sched_pmd;
4869
0
            uint64_t proc_cycles = 0;
4870
4871
0
            for (int i = 0; i < PMD_INTERVAL_MAX; i++) {
4872
0
                proc_cycles  += dp_netdev_rxq_get_intrvl_cycles(rxq, i);
4873
0
            }
4874
4875
0
            sched_pmd = sched_pmd_find_by_pmd(numa_list, rxq->pmd);
4876
0
            if (sched_pmd) {
4877
0
                if (rxq->core_id != OVS_CORE_UNSPEC && dp->pmd_iso) {
4878
0
                    sched_pmd->isolated = true;
4879
0
                }
4880
0
                sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles);
4881
0
            }
4882
0
        }
4883
0
    }
4884
0
}
4885
4886
static void
4887
sched_numa_list_put_in_place(struct sched_numa_list *numa_list)
4888
0
{
4889
0
    struct sched_numa *numa;
4890
4891
    /* For each numa. */
4892
0
    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
4893
        /* For each pmd. */
4894
0
        for (int i = 0; i < numa->n_pmds; i++) {
4895
0
            struct sched_pmd *sched_pmd;
4896
4897
0
            sched_pmd = &numa->pmds[i];
4898
0
            sched_pmd->pmd->isolated = sched_pmd->isolated;
4899
            /* For each rxq. */
4900
0
            for (unsigned k = 0; k < sched_pmd->n_rxq; k++) {
4901
                /* Store the new pmd from the out of place sched_numa_list
4902
                 * struct to the dp_netdev_rxq struct */
4903
0
                sched_pmd->rxqs[k]->pmd = sched_pmd->pmd;
4904
0
            }
4905
0
        }
4906
0
    }
4907
0
}
4908
4909
/* Returns 'true' if OVS rxq scheduling algorithm assigned any unpinned rxq to
4910
 * a PMD thread core on a non-local numa node. */
4911
static bool
4912
sched_numa_list_cross_numa_polling(struct sched_numa_list *numa_list)
4913
0
{
4914
0
    struct sched_numa *numa;
4915
4916
0
    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
4917
0
        for (int i = 0; i < numa->n_pmds; i++) {
4918
0
            struct sched_pmd *sched_pmd;
4919
4920
0
            sched_pmd = &numa->pmds[i];
4921
0
            if (sched_pmd->isolated) {
4922
                /* All rxqs on this PMD thread core are pinned. */
4923
0
                continue;
4924
0
            }
4925
0
            for (unsigned k = 0; k < sched_pmd->n_rxq; k++) {
4926
0
                struct dp_netdev_rxq *rxq = sched_pmd->rxqs[k];
4927
                /* Check if the rxq is not pinned to a specific PMD thread core
4928
                 * by the user AND the PMD thread core that OVS assigned is
4929
                 * non-local to the rxq port. */
4930
0
                if (rxq->core_id == OVS_CORE_UNSPEC &&
4931
0
                    rxq->pmd->numa_id !=
4932
0
                        netdev_get_numa_id(rxq->port->netdev)) {
4933
0
                    return true;
4934
0
                }
4935
0
            }
4936
0
        }
4937
0
    }
4938
0
    return false;
4939
0
}
4940
4941
static unsigned
4942
sched_numa_noniso_pmd_count(struct sched_numa *numa)
4943
0
{
4944
0
    if (numa->n_pmds > numa->n_isolated) {
4945
0
        return numa->n_pmds - numa->n_isolated;
4946
0
    }
4947
0
    return 0;
4948
0
}
4949
4950
/* Sort Rx Queues by the processing cycles they are consuming. */
4951
static int
4952
compare_rxq_cycles(const void *a, const void *b)
4953
0
{
4954
0
    struct dp_netdev_rxq *qa;
4955
0
    struct dp_netdev_rxq *qb;
4956
0
    uint64_t cycles_qa, cycles_qb;
4957
4958
0
    qa = *(struct dp_netdev_rxq **) a;
4959
0
    qb = *(struct dp_netdev_rxq **) b;
4960
4961
0
    cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
4962
0
    cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
4963
4964
0
    if (cycles_qa != cycles_qb) {
4965
0
        return (cycles_qa < cycles_qb) ? 1 : -1;
4966
0
    } else {
4967
        /* Cycles are the same so tiebreak on port/queue id.
4968
         * Tiebreaking (as opposed to return 0) ensures consistent
4969
         * sort results across multiple OS's. */
4970
0
        uint32_t port_qa = odp_to_u32(qa->port->port_no);
4971
0
        uint32_t port_qb = odp_to_u32(qb->port->port_no);
4972
0
        if (port_qa != port_qb) {
4973
0
            return port_qa > port_qb ? 1 : -1;
4974
0
        } else {
4975
0
            return netdev_rxq_get_queue_id(qa->rx)
4976
0
                    - netdev_rxq_get_queue_id(qb->rx);
4977
0
        }
4978
0
    }
4979
0
}
4980
4981
static bool
4982
sched_pmd_new_lowest(struct sched_pmd *current_lowest, struct sched_pmd *pmd,
4983
                     bool has_proc)
4984
0
{
4985
0
    uint64_t current_num, pmd_num;
4986
4987
0
    if (current_lowest == NULL) {
4988
0
        return true;
4989
0
    }
4990
4991
0
    if (has_proc) {
4992
0
        current_num = current_lowest->pmd_proc_cycles;
4993
0
        pmd_num = pmd->pmd_proc_cycles;
4994
0
    } else {
4995
0
        current_num = current_lowest->n_rxq;
4996
0
        pmd_num = pmd->n_rxq;
4997
0
    }
4998
4999
0
    if (pmd_num < current_num) {
5000
0
        return true;
5001
0
    }
5002
0
    return false;
5003
0
}
5004
5005
static struct sched_pmd *
5006
sched_pmd_get_lowest(struct sched_numa *numa, bool has_cyc)
5007
0
{
5008
0
    struct sched_pmd *lowest_sched_pmd = NULL;
5009
5010
0
    for (unsigned i = 0; i < numa->n_pmds; i++) {
5011
0
        struct sched_pmd *sched_pmd;
5012
5013
0
        sched_pmd = &numa->pmds[i];
5014
0
        if (sched_pmd->isolated) {
5015
0
            continue;
5016
0
        }
5017
0
        if (sched_pmd_new_lowest(lowest_sched_pmd, sched_pmd, has_cyc)) {
5018
0
            lowest_sched_pmd = sched_pmd;
5019
0
        }
5020
0
    }
5021
0
    return lowest_sched_pmd;
5022
0
}
5023
5024
/*
5025
 * Returns the next pmd from the numa node.
5026
 *
5027
 * If 'updown' is 'true' it will alternate between selecting the next pmd in
5028
 * either an up or down walk, switching between up/down when the first or last
5029
 * core is reached. e.g. 1,2,3,3,2,1,1,2...
5030
 *
5031
 * If 'updown' is 'false' it will select the next pmd wrapping around when
5032
 * last core reached. e.g. 1,2,3,1,2,3,1,2...
5033
 */
5034
static struct sched_pmd *
5035
sched_pmd_next_rr(struct sched_numa *numa, bool updown)
5036
0
{
5037
0
    int numa_idx = numa->rr_cur_index;
5038
5039
0
    if (numa->rr_idx_inc == true) {
5040
        /* Incrementing through list of pmds. */
5041
0
        if (numa->rr_cur_index == numa->n_pmds - 1) {
5042
            /* Reached the last pmd. */
5043
0
            if (updown) {
5044
0
                numa->rr_idx_inc = false;
5045
0
            } else {
5046
0
                numa->rr_cur_index = 0;
5047
0
            }
5048
0
        } else {
5049
0
            numa->rr_cur_index++;
5050
0
        }
5051
0
    } else {
5052
        /* Decrementing through list of pmds. */
5053
0
        if (numa->rr_cur_index == 0) {
5054
            /* Reached the first pmd. */
5055
0
            numa->rr_idx_inc = true;
5056
0
        } else {
5057
0
            numa->rr_cur_index--;
5058
0
        }
5059
0
    }
5060
0
    return &numa->pmds[numa_idx];
5061
0
}
5062
5063
static struct sched_pmd *
5064
sched_pmd_next_noniso_rr(struct sched_numa *numa, bool updown)
5065
0
{
5066
0
    struct sched_pmd *sched_pmd = NULL;
5067
5068
    /* sched_pmd_next_rr() may return duplicate PMDs before all PMDs have been
5069
     * returned depending on updown. Call it more than n_pmds to ensure all
5070
     * PMDs can be searched for the next non-isolated PMD. */
5071
0
    for (unsigned i = 0; i < numa->n_pmds * 2; i++) {
5072
0
        sched_pmd = sched_pmd_next_rr(numa, updown);
5073
0
        if (!sched_pmd->isolated) {
5074
0
            break;
5075
0
        }
5076
0
        sched_pmd = NULL;
5077
0
    }
5078
0
    return sched_pmd;
5079
0
}
5080
5081
static struct sched_pmd *
5082
sched_pmd_next(struct sched_numa *numa, enum sched_assignment_type algo,
5083
               bool has_proc)
5084
0
{
5085
0
    if (algo == SCHED_GROUP) {
5086
0
        return sched_pmd_get_lowest(numa, has_proc);
5087
0
    }
5088
5089
    /* By default RR the PMDs. */
5090
0
    return sched_pmd_next_noniso_rr(numa, algo == SCHED_CYCLES ? true : false);
5091
0
}
5092
5093
static const char *
5094
get_assignment_type_string(enum sched_assignment_type algo)
5095
0
{
5096
0
    switch (algo) {
5097
0
    case SCHED_ROUNDROBIN: return "roundrobin";
5098
0
    case SCHED_CYCLES: return "cycles";
5099
0
    case SCHED_GROUP: return "group";
5100
0
    default: return "Unknown";
5101
0
    }
5102
0
}
5103
5104
0
/* Space reserved for the fixed text that get_rxq_cyc_log() puts around the
 * cycle count. */
#define MAX_RXQ_CYC_TEXT 40
5105
0
/* Size of the buffer that callers of get_rxq_cyc_log() must provide: the
 * fixed text plus a formatted uint64_t. */
#define MAX_RXQ_CYC_STRLEN (INT_STRLEN(uint64_t) + MAX_RXQ_CYC_TEXT)
5106
5107
static char *
5108
get_rxq_cyc_log(char *a, enum sched_assignment_type algo, uint64_t cycles)
5109
0
{
5110
0
    int ret = 0;
5111
5112
0
    if (algo != SCHED_ROUNDROBIN) {
5113
0
        ret = snprintf(a, MAX_RXQ_CYC_STRLEN,
5114
0
                       " (measured processing cycles %"PRIu64")", cycles);
5115
0
    }
5116
5117
0
    if (algo == SCHED_ROUNDROBIN || ret <= 0) {
5118
0
        a[0] = '\0';
5119
0
    }
5120
0
    return a;
5121
0
}
5122
5123
static void
5124
sched_numa_list_schedule(struct sched_numa_list *numa_list,
5125
                         struct dp_netdev *dp,
5126
                         enum sched_assignment_type algo,
5127
                         enum vlog_level level)
5128
    OVS_REQ_RDLOCK(dp->port_rwlock)
5129
0
{
5130
0
    struct dp_netdev_port *port;
5131
0
    struct dp_netdev_rxq **rxqs = NULL;
5132
0
    struct sched_numa *last_cross_numa;
5133
0
    unsigned n_rxqs = 0;
5134
0
    bool start_logged = false;
5135
0
    size_t n_numa;
5136
5137
    /* For each port. */
5138
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
5139
0
        if (!netdev_is_pmd(port->netdev)) {
5140
0
            continue;
5141
0
        }
5142
5143
        /* For each rxq on the port. */
5144
0
        for (int qid = 0; qid < port->n_rxq; qid++) {
5145
0
            struct dp_netdev_rxq *rxq = &port->rxqs[qid];
5146
5147
0
            if (algo != SCHED_ROUNDROBIN) {
5148
0
                uint64_t cycle_hist = 0;
5149
5150
                /* Sum the queue intervals and store the cycle history. */
5151
0
                for (unsigned i = 0; i < PMD_INTERVAL_MAX; i++) {
5152
0
                    cycle_hist += dp_netdev_rxq_get_intrvl_cycles(rxq, i);
5153
0
                }
5154
0
                dp_netdev_rxq_set_cycles(rxq, RXQ_CYCLES_PROC_HIST,
5155
0
                                         cycle_hist);
5156
0
            }
5157
5158
            /* Check if this rxq is pinned. */
5159
0
            if (rxq->core_id != OVS_CORE_UNSPEC) {
5160
0
                struct sched_pmd *sched_pmd;
5161
0
                struct dp_netdev_pmd_thread *pmd;
5162
0
                struct sched_numa *numa;
5163
0
                bool iso = dp->pmd_iso;
5164
0
                uint64_t proc_cycles;
5165
0
                char rxq_cyc_log[MAX_RXQ_CYC_STRLEN];
5166
5167
                /* This rxq should be pinned, pin it now. */
5168
0
                pmd = dp_netdev_get_pmd(dp, rxq->core_id);
5169
0
                sched_pmd = sched_pmd_find_by_pmd(numa_list, pmd);
5170
0
                dp_netdev_pmd_unref(pmd);
5171
0
                if (!sched_pmd) {
5172
                    /* Cannot find the PMD.  Cannot pin this rxq. */
5173
0
                    VLOG(level == VLL_DBG ? VLL_DBG : VLL_WARN,
5174
0
                            "Core %2u cannot be pinned with "
5175
0
                            "port \'%s\' rx queue %d. Use pmd-cpu-mask to "
5176
0
                            "enable a pmd on core %u. An alternative core "
5177
0
                            "will be assigned.",
5178
0
                            rxq->core_id,
5179
0
                            netdev_rxq_get_name(rxq->rx),
5180
0
                            netdev_rxq_get_queue_id(rxq->rx),
5181
0
                            rxq->core_id);
5182
0
                    rxqs = xrealloc(rxqs, (n_rxqs + 1) * sizeof *rxqs);
5183
0
                    rxqs[n_rxqs++] = rxq;
5184
0
                    continue;
5185
0
                }
5186
0
                if (iso) {
5187
                    /* Mark PMD as isolated if not done already. */
5188
0
                    if (sched_pmd->isolated == false) {
5189
0
                        sched_pmd->isolated = true;
5190
0
                        numa = sched_pmd->numa;
5191
0
                        numa->n_isolated++;
5192
0
                    }
5193
0
                }
5194
0
                proc_cycles = dp_netdev_rxq_get_cycles(rxq,
5195
0
                                                       RXQ_CYCLES_PROC_HIST);
5196
0
                VLOG(level, "Core %2u on numa node %d is pinned with "
5197
0
                            "port \'%s\' rx queue %d%s",
5198
0
                            sched_pmd->pmd->core_id, sched_pmd->pmd->numa_id,
5199
0
                            netdev_rxq_get_name(rxq->rx),
5200
0
                            netdev_rxq_get_queue_id(rxq->rx),
5201
0
                            get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles));
5202
0
                sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles);
5203
0
            } else {
5204
0
                rxqs = xrealloc(rxqs, (n_rxqs + 1) * sizeof *rxqs);
5205
0
                rxqs[n_rxqs++] = rxq;
5206
0
            }
5207
0
        }
5208
0
    }
5209
5210
0
    if (n_rxqs > 1 && algo != SCHED_ROUNDROBIN) {
5211
        /* Sort the queues in order of the processing cycles
5212
         * they consumed during their last pmd interval. */
5213
0
        qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
5214
0
    }
5215
5216
0
    last_cross_numa = NULL;
5217
0
    n_numa = sched_numa_list_count(numa_list);
5218
0
    for (unsigned i = 0; i < n_rxqs; i++) {
5219
0
        struct dp_netdev_rxq *rxq = rxqs[i];
5220
0
        struct sched_pmd *sched_pmd = NULL;
5221
0
        struct sched_numa *numa;
5222
0
        int port_numa_id;
5223
0
        uint64_t proc_cycles;
5224
0
        char rxq_cyc_log[MAX_RXQ_CYC_STRLEN];
5225
5226
0
        if (start_logged == false && level != VLL_DBG) {
5227
0
            VLOG(level, "Performing pmd to rx queue assignment using %s "
5228
0
                        "algorithm.", get_assignment_type_string(algo));
5229
0
            start_logged = true;
5230
0
        }
5231
5232
        /* Store the cycles for this rxq as we will log these later. */
5233
0
        proc_cycles = dp_netdev_rxq_get_cycles(rxq, RXQ_CYCLES_PROC_HIST);
5234
5235
0
        port_numa_id = netdev_get_numa_id(rxq->port->netdev);
5236
5237
        /* Select numa. */
5238
0
        numa = sched_numa_list_lookup(numa_list, port_numa_id);
5239
5240
        /* Check if numa has no PMDs or no non-isolated PMDs. */
5241
0
        if (!numa || !sched_numa_noniso_pmd_count(numa)) {
5242
            /* Unable to use this numa to find a PMD. */
5243
0
            numa = NULL;
5244
            /* Find any numa with available PMDs. */
5245
0
            for (int j = 0; j < n_numa; j++) {
5246
0
                numa = sched_numa_list_next(numa_list, last_cross_numa);
5247
0
                last_cross_numa = numa;
5248
0
                if (sched_numa_noniso_pmd_count(numa)) {
5249
0
                    break;
5250
0
                }
5251
0
                numa = NULL;
5252
0
            }
5253
0
        }
5254
5255
0
        if (numa) {
5256
            /* Select the PMD that should be used for this rxq. */
5257
0
            sched_pmd = sched_pmd_next(numa, algo,
5258
0
                                       proc_cycles ? true : false);
5259
0
        }
5260
5261
        /* Check that a pmd has been selected. */
5262
0
        if (sched_pmd) {
5263
0
            int pmd_numa_id;
5264
5265
0
            pmd_numa_id = sched_pmd->numa->numa_id;
5266
            /* Check if selected pmd numa matches port numa. */
5267
0
            if (pmd_numa_id != port_numa_id) {
5268
0
                VLOG(level, "There's no available (non-isolated) pmd thread "
5269
0
                            "on numa node %d. Port \'%s\' rx queue %d will "
5270
0
                            "be assigned to a pmd on numa node %d. "
5271
0
                            "This may lead to reduced performance.",
5272
0
                            port_numa_id, netdev_rxq_get_name(rxq->rx),
5273
0
                            netdev_rxq_get_queue_id(rxq->rx), pmd_numa_id);
5274
0
            }
5275
0
            VLOG(level, "Core %2u on numa node %d assigned port \'%s\' "
5276
0
                        "rx queue %d%s.",
5277
0
                        sched_pmd->pmd->core_id, sched_pmd->pmd->numa_id,
5278
0
                        netdev_rxq_get_name(rxq->rx),
5279
0
                        netdev_rxq_get_queue_id(rxq->rx),
5280
0
                        get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles));
5281
0
            sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles);
5282
0
        } else  {
5283
0
            VLOG(level == VLL_DBG ? level : VLL_WARN,
5284
0
                 "No non-isolated pmd on any numa available for "
5285
0
                 "port \'%s\' rx queue %d%s. "
5286
0
                 "This rx queue will not be polled.",
5287
0
                 netdev_rxq_get_name(rxq->rx),
5288
0
                 netdev_rxq_get_queue_id(rxq->rx),
5289
0
                 get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles));
5290
0
        }
5291
0
    }
5292
0
    free(rxqs);
5293
0
}
5294
5295
static void
5296
rxq_scheduling(struct dp_netdev *dp)
5297
    OVS_REQ_RDLOCK(dp->port_rwlock)
5298
0
{
5299
0
    struct sched_numa_list numa_list;
5300
0
    enum sched_assignment_type algo = dp->pmd_rxq_assign_type;
5301
5302
0
    sched_numa_list_populate(&numa_list, dp);
5303
0
    sched_numa_list_schedule(&numa_list, dp, algo, VLL_INFO);
5304
0
    sched_numa_list_put_in_place(&numa_list);
5305
5306
0
    sched_numa_list_free_entries(&numa_list);
5307
0
}
5308
5309
static uint64_t variance(uint64_t a[], int n);
5310
5311
static uint64_t
5312
sched_numa_variance(struct sched_numa *numa)
5313
0
{
5314
0
    uint64_t *percent_busy = NULL;
5315
0
    int n_proc = 0;
5316
0
    uint64_t var;
5317
5318
0
    percent_busy = xmalloc(numa->n_pmds * sizeof *percent_busy);
5319
5320
0
    for (unsigned i = 0; i < numa->n_pmds; i++) {
5321
0
        struct sched_pmd *sched_pmd;
5322
0
        uint64_t total_cycles = 0;
5323
5324
0
        sched_pmd = &numa->pmds[i];
5325
        /* Exclude isolated PMDs from variance calculations. */
5326
0
        if (sched_pmd->isolated == true) {
5327
0
            continue;
5328
0
        }
5329
        /* Get the total pmd cycles for an interval. */
5330
0
        atomic_read_relaxed(&sched_pmd->pmd->intrvl_cycles, &total_cycles);
5331
5332
0
        if (total_cycles) {
5333
            /* Estimate the cycles to cover all intervals. */
5334
0
            total_cycles *= PMD_INTERVAL_MAX;
5335
0
            percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100)
5336
0
                                            / total_cycles;
5337
0
        } else {
5338
0
            percent_busy[n_proc++] = 0;
5339
0
        }
5340
0
    }
5341
0
    var = variance(percent_busy, n_proc);
5342
0
    free(percent_busy);
5343
0
    return var;
5344
0
}
5345
5346
/*
5347
 * This function checks that some basic conditions needed for a rebalance to be
5348
 * effective are met. Such as Rxq scheduling assignment type, more than one
5349
 * PMD, more than 2 Rxqs on a PMD. If there was no reconfiguration change
5350
 * since the last check, it reuses the last result.
5351
 *
5352
 * It is not intended to be an inclusive check of every condition that may make
5353
 * a rebalance ineffective. It is done as a quick check so a full
5354
 * pmd_rebalance_dry_run() can be avoided when it is not needed.
5355
 */
5356
static bool
5357
pmd_rebalance_dry_run_needed(struct dp_netdev *dp)
5358
    OVS_REQ_RDLOCK(dp->port_rwlock)
5359
0
{
5360
0
    struct dp_netdev_pmd_thread *pmd;
5361
0
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
5362
0
    unsigned int cnt = 0;
5363
0
    bool multi_rxq = false;
5364
5365
    /* Check if there was no reconfiguration since last check. */
5366
0
    if (!pmd_alb->recheck_config) {
5367
0
        if (!pmd_alb->do_dry_run) {
5368
0
            VLOG_DBG("PMD auto load balance nothing to do, "
5369
0
                     "no configuration changes since last check.");
5370
0
            return false;
5371
0
        }
5372
0
        return true;
5373
0
    }
5374
0
    pmd_alb->recheck_config = false;
5375
5376
    /* Check for incompatible assignment type. */
5377
0
    if (dp->pmd_rxq_assign_type == SCHED_ROUNDROBIN) {
5378
0
        VLOG_DBG("PMD auto load balance nothing to do, "
5379
0
                 "pmd-rxq-assign=roundrobin assignment type configured.");
5380
0
        return pmd_alb->do_dry_run = false;
5381
0
    }
5382
5383
    /* Check that there is at least 2 non-isolated PMDs and
5384
     * one of them is polling more than one rxq. */
5385
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5386
0
        if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
5387
0
            continue;
5388
0
        }
5389
5390
0
        if (hmap_count(&pmd->poll_list) > 1) {
5391
0
            multi_rxq = true;
5392
0
        }
5393
0
        if (cnt && multi_rxq) {
5394
0
            return pmd_alb->do_dry_run = true;
5395
0
        }
5396
0
        cnt++;
5397
0
    }
5398
5399
0
    VLOG_DBG("PMD auto load balance nothing to do, "
5400
0
             "not enough non-isolated PMDs or RxQs.");
5401
0
    return pmd_alb->do_dry_run = false;
5402
0
}
5403
5404
static bool
5405
pmd_rebalance_dry_run(struct dp_netdev *dp)
5406
    OVS_REQ_RDLOCK(dp->port_rwlock)
5407
0
{
5408
0
    struct sched_numa_list numa_list_cur;
5409
0
    struct sched_numa_list numa_list_est;
5410
0
    bool thresh_met = false;
5411
5412
0
    VLOG_DBG("PMD auto load balance performing dry run.");
5413
5414
    /* Populate current assignments. */
5415
0
    sched_numa_list_populate(&numa_list_cur, dp);
5416
0
    sched_numa_list_assignments(&numa_list_cur, dp);
5417
5418
    /* Populate estimated assignments. */
5419
0
    sched_numa_list_populate(&numa_list_est, dp);
5420
0
    sched_numa_list_schedule(&numa_list_est, dp,
5421
0
                             dp->pmd_rxq_assign_type, VLL_DBG);
5422
5423
    /* Check if cross-numa polling, there is only one numa with PMDs. */
5424
0
    if (!sched_numa_list_cross_numa_polling(&numa_list_est) ||
5425
0
            sched_numa_list_count(&numa_list_est) == 1) {
5426
0
        struct sched_numa *numa_cur;
5427
5428
        /* Calculate variances. */
5429
0
        HMAP_FOR_EACH (numa_cur, node, &numa_list_cur.numas) {
5430
0
            uint64_t current_var, estimate_var;
5431
0
            struct sched_numa *numa_est;
5432
0
            uint64_t improvement = 0;
5433
5434
0
            numa_est = sched_numa_list_lookup(&numa_list_est,
5435
0
                                              numa_cur->numa_id);
5436
0
            if (!numa_est) {
5437
0
                continue;
5438
0
            }
5439
0
            current_var = sched_numa_variance(numa_cur);
5440
0
            estimate_var = sched_numa_variance(numa_est);
5441
0
            if (estimate_var < current_var) {
5442
0
                improvement = ((current_var - estimate_var) * 100)
5443
0
                              / current_var;
5444
0
            }
5445
0
            VLOG_DBG("Numa node %d. Current variance %"PRIu64" Estimated "
5446
0
                     "variance %"PRIu64". Variance improvement %"PRIu64"%%.",
5447
0
                     numa_cur->numa_id, current_var,
5448
0
                     estimate_var, improvement);
5449
0
            if (improvement >= dp->pmd_alb.rebalance_improve_thresh) {
5450
0
                thresh_met = true;
5451
0
            }
5452
0
        }
5453
0
        VLOG_DBG("PMD load variance improvement threshold %u%% is %s.",
5454
0
                 dp->pmd_alb.rebalance_improve_thresh,
5455
0
                 thresh_met ? "met" : "not met");
5456
0
    } else {
5457
0
        VLOG_DBG("PMD auto load balance detected cross-numa polling with "
5458
0
                 "multiple numa nodes. Unable to accurately estimate.");
5459
0
    }
5460
5461
0
    sched_numa_list_free_entries(&numa_list_cur);
5462
0
    sched_numa_list_free_entries(&numa_list_est);
5463
5464
0
    return thresh_met;
5465
0
}
5466
5467
static void
5468
reload_affected_pmds(struct dp_netdev *dp)
5469
0
{
5470
0
    struct dp_netdev_pmd_thread *pmd;
5471
5472
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5473
0
        if (pmd->need_reload) {
5474
0
            dp_netdev_reload_pmd__(pmd);
5475
0
        }
5476
0
    }
5477
5478
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5479
0
        if (pmd->need_reload) {
5480
0
            if (pmd->core_id != NON_PMD_CORE_ID) {
5481
0
                bool reload;
5482
5483
0
                do {
5484
0
                    atomic_read_explicit(&pmd->reload, &reload,
5485
0
                                         memory_order_acquire);
5486
0
                } while (reload);
5487
0
            }
5488
0
            pmd->need_reload = false;
5489
0
        }
5490
0
    }
5491
0
}
5492
5493
static void
5494
reconfigure_pmd_threads(struct dp_netdev *dp)
5495
    OVS_REQ_RDLOCK(dp->port_rwlock)
5496
0
{
5497
0
    struct dp_netdev_pmd_thread *pmd;
5498
0
    struct ovs_numa_dump *pmd_cores;
5499
0
    struct ovs_numa_info_core *core;
5500
0
    struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
5501
0
    struct hmapx_node *node;
5502
0
    bool changed = false;
5503
0
    bool need_to_adjust_static_tx_qids = false;
5504
5505
    /* The pmd threads should be started only if there's a pmd port in the
5506
     * datapath.  If the user didn't provide any "pmd-cpu-mask", we start
5507
     * NR_PMD_THREADS per numa node. */
5508
0
    if (!has_pmd_port(dp)) {
5509
0
        pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
5510
0
    } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
5511
0
        pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
5512
0
    } else {
5513
0
        pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
5514
0
    }
5515
5516
    /* We need to adjust 'static_tx_qid's only if we're reducing number of
5517
     * PMD threads. Otherwise, new threads will allocate all the freed ids. */
5518
0
    if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
5519
        /* Adjustment is required to keep 'static_tx_qid's sequential and
5520
         * avoid possible issues, for example, imbalanced tx queue usage
5521
         * and unnecessary locking caused by remapping on netdev level. */
5522
0
        need_to_adjust_static_tx_qids = true;
5523
0
    }
5524
5525
    /* Check for unwanted pmd threads */
5526
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5527
0
        if (pmd->core_id == NON_PMD_CORE_ID) {
5528
0
            continue;
5529
0
        }
5530
0
        if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
5531
0
                                                    pmd->core_id)) {
5532
0
            hmapx_add(&to_delete, pmd);
5533
0
        } else if (need_to_adjust_static_tx_qids) {
5534
0
            atomic_store_relaxed(&pmd->reload_tx_qid, true);
5535
0
            pmd->need_reload = true;
5536
0
        }
5537
0
    }
5538
5539
0
    HMAPX_FOR_EACH (node, &to_delete) {
5540
0
        pmd = (struct dp_netdev_pmd_thread *) node->data;
5541
0
        VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
5542
0
                  pmd->numa_id, pmd->core_id);
5543
0
        dp_netdev_del_pmd(dp, pmd);
5544
0
    }
5545
0
    changed = !hmapx_is_empty(&to_delete);
5546
0
    hmapx_destroy(&to_delete);
5547
5548
0
    if (need_to_adjust_static_tx_qids) {
5549
        /* 'static_tx_qid's are not sequential now.
5550
         * Reload remaining threads to fix this. */
5551
0
        reload_affected_pmds(dp);
5552
0
    }
5553
5554
    /* Check for required new pmd threads */
5555
0
    FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
5556
0
        pmd = dp_netdev_get_pmd(dp, core->core_id);
5557
0
        if (!pmd) {
5558
0
            struct ds name = DS_EMPTY_INITIALIZER;
5559
5560
0
            pmd = xzalloc(sizeof *pmd);
5561
0
            dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
5562
5563
0
            ds_put_format(&name, "pmd-c%02d/id:", core->core_id);
5564
0
            pmd->thread = ovs_thread_create(ds_cstr(&name),
5565
0
                                            pmd_thread_main, pmd);
5566
0
            ds_destroy(&name);
5567
5568
0
            VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
5569
0
                      pmd->numa_id, pmd->core_id);
5570
0
            changed = true;
5571
0
        } else {
5572
0
            dp_netdev_pmd_unref(pmd);
5573
0
        }
5574
0
    }
5575
5576
0
    if (changed) {
5577
0
        struct ovs_numa_info_numa *numa;
5578
5579
        /* Log the number of pmd threads per numa node. */
5580
0
        FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
5581
0
            VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
5582
0
                      numa->n_cores, numa->numa_id);
5583
0
        }
5584
0
    }
5585
5586
0
    ovs_numa_dump_destroy(pmd_cores);
5587
0
}
5588
5589
static void
5590
pmd_remove_stale_ports(struct dp_netdev *dp,
5591
                       struct dp_netdev_pmd_thread *pmd)
5592
    OVS_EXCLUDED(pmd->port_mutex)
5593
    OVS_REQ_RDLOCK(dp->port_rwlock)
5594
0
{
5595
0
    struct rxq_poll *poll;
5596
0
    struct tx_port *tx;
5597
5598
0
    ovs_mutex_lock(&pmd->port_mutex);
5599
0
    HMAP_FOR_EACH_SAFE (poll, node, &pmd->poll_list) {
5600
0
        struct dp_netdev_port *port = poll->rxq->port;
5601
5602
0
        if (port->need_reconfigure
5603
0
            || !hmap_contains(&dp->ports, &port->node)) {
5604
0
            dp_netdev_del_rxq_from_pmd(pmd, poll);
5605
0
        }
5606
0
    }
5607
0
    HMAP_FOR_EACH_SAFE (tx, node, &pmd->tx_ports) {
5608
0
        struct dp_netdev_port *port = tx->port;
5609
5610
0
        if (port->need_reconfigure
5611
0
            || !hmap_contains(&dp->ports, &port->node)) {
5612
0
            dp_netdev_del_port_tx_from_pmd(pmd, tx);
5613
0
        }
5614
0
    }
5615
0
    ovs_mutex_unlock(&pmd->port_mutex);
5616
0
}
5617
5618
/* Must be called each time a port is added/removed or the cmask changes.
5619
 * This creates and destroys pmd threads, reconfigures ports, opens their
5620
 * rxqs and assigns all rxqs/txqs to pmd threads. */
5621
static void
5622
reconfigure_datapath(struct dp_netdev *dp)
5623
    OVS_REQ_RDLOCK(dp->port_rwlock)
5624
0
{
5625
0
    struct hmapx busy_threads = HMAPX_INITIALIZER(&busy_threads);
5626
0
    struct dp_netdev_pmd_thread *pmd;
5627
0
    struct dp_netdev_port *port;
5628
0
    int wanted_txqs;
5629
5630
0
    dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
5631
5632
    /* Step 1: Adjust the pmd threads based on the datapath ports, the cores
5633
     * on the system and the user configuration. */
5634
0
    reconfigure_pmd_threads(dp);
5635
5636
0
    wanted_txqs = cmap_count(&dp->poll_threads);
5637
5638
    /* The number of pmd threads might have changed, or a port can be new:
5639
     * adjust the txqs. */
5640
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
5641
0
        netdev_set_tx_multiq(port->netdev, wanted_txqs);
5642
0
    }
5643
5644
    /* Step 2: Remove from the pmd threads ports that have been removed or
5645
     * need reconfiguration. */
5646
5647
    /* Check for all the ports that need reconfiguration.  We cache this in
5648
     * 'port->need_reconfigure', because netdev_is_reconf_required() can
5649
     * change at any time.
5650
     * Also mark for reconfiguration all ports which will likely change their
5651
     * 'txq_mode' parameter.  It's required to stop using them before
5652
     * changing this setting and it's simpler to mark ports here and allow
5653
     * 'pmd_remove_stale_ports' to remove them from threads.  There will be
5654
     * no actual reconfiguration in 'port_reconfigure' because it's
5655
     * unnecessary.  */
5656
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
5657
0
        if (netdev_is_reconf_required(port->netdev)
5658
0
            || ((port->txq_mode == TXQ_MODE_XPS)
5659
0
                != (netdev_n_txq(port->netdev) < wanted_txqs))
5660
0
            || ((port->txq_mode == TXQ_MODE_XPS_HASH)
5661
0
                != (port->txq_requested_mode == TXQ_REQ_MODE_HASH
5662
0
                    && netdev_n_txq(port->netdev) > 1))) {
5663
0
            port->need_reconfigure = true;
5664
0
        }
5665
0
    }
5666
5667
    /* Remove from the pmd threads all the ports that have been deleted or
5668
     * need reconfiguration. */
5669
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5670
0
        pmd_remove_stale_ports(dp, pmd);
5671
0
    }
5672
5673
    /* Reload affected pmd threads.  We must wait for the pmd threads before
5674
     * reconfiguring the ports, because a port cannot be reconfigured while
5675
     * it's being used. */
5676
0
    reload_affected_pmds(dp);
5677
5678
    /* Step 3: Reconfigure ports. */
5679
5680
    /* We only reconfigure the ports that we determined above, because they're
5681
     * not being used by any pmd thread at the moment.  If a port fails to
5682
     * reconfigure we remove it from the datapath. */
5683
0
    HMAP_FOR_EACH_SAFE (port, node, &dp->ports) {
5684
0
        int err;
5685
5686
0
        if (!port->need_reconfigure) {
5687
0
            continue;
5688
0
        }
5689
5690
0
        err = port_reconfigure(port);
5691
0
        if (err) {
5692
0
            hmap_remove(&dp->ports, &port->node);
5693
0
            seq_change(dp->port_seq);
5694
0
            port_destroy(port);
5695
0
        } else {
5696
            /* With a single queue, there is no point in using hash mode. */
5697
0
            if (port->txq_requested_mode == TXQ_REQ_MODE_HASH &&
5698
0
                netdev_n_txq(port->netdev) > 1) {
5699
0
                port->txq_mode = TXQ_MODE_XPS_HASH;
5700
0
            } else if (netdev_n_txq(port->netdev) < wanted_txqs) {
5701
0
                port->txq_mode = TXQ_MODE_XPS;
5702
0
            } else {
5703
0
                port->txq_mode = TXQ_MODE_STATIC;
5704
0
            }
5705
0
        }
5706
0
    }
5707
5708
    /* Step 4: Compute new rxq scheduling.  We don't touch the pmd threads
5709
     * for now, we just update the 'pmd' pointer in each rxq to point to the
5710
     * wanted thread according to the scheduling policy. */
5711
5712
    /* Reset all the pmd threads to non isolated. */
5713
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5714
0
        pmd->isolated = false;
5715
0
    }
5716
5717
    /* Reset all the queues to unassigned */
5718
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
5719
0
        for (int i = 0; i < port->n_rxq; i++) {
5720
0
            port->rxqs[i].pmd = NULL;
5721
0
        }
5722
0
    }
5723
0
    rxq_scheduling(dp);
5724
5725
    /* Step 5: Remove queues not compliant with new scheduling. */
5726
5727
    /* Count all the threads that will have at least one queue to poll. */
5728
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
5729
0
        for (int qid = 0; qid < port->n_rxq; qid++) {
5730
0
            struct dp_netdev_rxq *q = &port->rxqs[qid];
5731
5732
0
            if (q->pmd) {
5733
0
                hmapx_add(&busy_threads, q->pmd);
5734
0
            }
5735
0
        }
5736
0
    }
5737
5738
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5739
0
        struct rxq_poll *poll;
5740
5741
0
        ovs_mutex_lock(&pmd->port_mutex);
5742
0
        HMAP_FOR_EACH_SAFE (poll, node, &pmd->poll_list) {
5743
0
            if (poll->rxq->pmd != pmd) {
5744
0
                dp_netdev_del_rxq_from_pmd(pmd, poll);
5745
5746
                /* This pmd might sleep after this step if it has no rxq
5747
                 * remaining. Tell it to busy wait for new assignment if it
5748
                 * has at least one scheduled queue. */
5749
0
                if (hmap_count(&pmd->poll_list) == 0 &&
5750
0
                    hmapx_contains(&busy_threads, pmd)) {
5751
0
                    atomic_store_relaxed(&pmd->wait_for_reload, true);
5752
0
                }
5753
0
            }
5754
0
        }
5755
0
        ovs_mutex_unlock(&pmd->port_mutex);
5756
0
    }
5757
5758
0
    hmapx_destroy(&busy_threads);
5759
5760
    /* Reload affected pmd threads.  We must wait for the pmd threads to remove
5761
     * the old queues before readding them, otherwise a queue can be polled by
5762
     * two threads at the same time. */
5763
0
    reload_affected_pmds(dp);
5764
5765
    /* Step 6: Add queues from scheduling, if they're not there already. */
5766
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
5767
0
        if (!netdev_is_pmd(port->netdev)) {
5768
0
            continue;
5769
0
        }
5770
5771
0
        for (int qid = 0; qid < port->n_rxq; qid++) {
5772
0
            struct dp_netdev_rxq *q = &port->rxqs[qid];
5773
5774
0
            if (q->pmd) {
5775
0
                ovs_mutex_lock(&q->pmd->port_mutex);
5776
0
                dp_netdev_add_rxq_to_pmd(q->pmd, q);
5777
0
                ovs_mutex_unlock(&q->pmd->port_mutex);
5778
0
            }
5779
0
        }
5780
0
    }
5781
5782
    /* Add every port and bond to the tx port and bond caches of
5783
     * every pmd thread, if it's not there already and if this pmd
5784
     * has at least one rxq to poll.
5785
     */
5786
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5787
0
        ovs_mutex_lock(&pmd->port_mutex);
5788
0
        if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
5789
0
            struct tx_bond *bond;
5790
5791
0
            HMAP_FOR_EACH (port, node, &dp->ports) {
5792
0
                dp_netdev_add_port_tx_to_pmd(pmd, port);
5793
0
            }
5794
5795
0
            CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
5796
0
                dp_netdev_add_bond_tx_to_pmd(pmd, bond, false);
5797
0
            }
5798
0
        }
5799
0
        ovs_mutex_unlock(&pmd->port_mutex);
5800
0
    }
5801
5802
    /* Reload affected pmd threads. */
5803
0
    reload_affected_pmds(dp);
5804
5805
    /* PMD ALB will need to recheck if dry run needed. */
5806
0
    dp->pmd_alb.recheck_config = true;
5807
0
}
5808
5809
/* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
5810
static bool
5811
ports_require_restart(const struct dp_netdev *dp)
5812
    OVS_REQ_RDLOCK(dp->port_rwlock)
5813
0
{
5814
0
    struct dp_netdev_port *port;
5815
5816
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
5817
0
        if (netdev_is_reconf_required(port->netdev)) {
5818
0
            return true;
5819
0
        }
5820
0
    }
5821
5822
0
    return false;
5823
0
}
5824
5825
/* Returns the population variance of the first 'n' elements of 'a', using
 * integer arithmetic throughout (the mean and the result are both truncated).
 * Returns 0 when 'n' is 0 or when all considered elements are 0.
 *
 * Usage example: 'a' holds the processing load of each PMD and 'n' is the
 * number of PMDs; the result is the variance in processing load of the
 * PMDs. */
static uint64_t
variance(uint64_t a[], int n)
{
    uint64_t total = 0;
    uint64_t sq_dev_sum = 0;
    uint64_t avg;

    if (n == 0) {
        return 0;
    }

    for (int idx = 0; idx < n; idx++) {
        total += a[idx];
    }
    if (total == 0) {
        return 0;
    }

    avg = total / n;

    /* Accumulate the squared deviations from the mean.  For elements below
     * the mean the unsigned difference wraps, but its square is still the
     * intended value modulo 2^64. */
    for (int idx = 0; idx < n; idx++) {
        uint64_t dev = a[idx] - avg;

        sq_dev_sum += dev * dev;
    }

    return sq_dev_sum / n;
}
5856
5857
/* Return true if needs to revalidate datapath flows. */
5858
static bool
5859
dpif_netdev_run(struct dpif *dpif)
5860
0
{
5861
0
    struct dp_netdev_port *port;
5862
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
5863
0
    struct dp_netdev_pmd_thread *non_pmd;
5864
0
    uint64_t new_tnl_seq;
5865
0
    bool need_to_flush = true;
5866
0
    bool pmd_rebalance = false;
5867
0
    long long int now = time_msec();
5868
0
    struct dp_netdev_pmd_thread *pmd;
5869
5870
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
5871
0
    non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
5872
0
    if (non_pmd) {
5873
0
        ovs_mutex_lock(&dp->non_pmd_mutex);
5874
5875
0
        atomic_read_relaxed(&dp->smc_enable_db, &non_pmd->ctx.smc_enable_db);
5876
5877
0
        HMAP_FOR_EACH (port, node, &dp->ports) {
5878
0
            if (!netdev_is_pmd(port->netdev)) {
5879
0
                int i;
5880
5881
0
                if (port->emc_enabled) {
5882
0
                    atomic_read_relaxed(&dp->emc_insert_min,
5883
0
                                        &non_pmd->ctx.emc_insert_min);
5884
0
                } else {
5885
0
                    non_pmd->ctx.emc_insert_min = 0;
5886
0
                }
5887
5888
0
                for (i = 0; i < port->n_rxq; i++) {
5889
5890
0
                    if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
5891
0
                        continue;
5892
0
                    }
5893
5894
0
                    if (dp_netdev_process_rxq_port(non_pmd,
5895
0
                                                   &port->rxqs[i],
5896
0
                                                   port->port_no)) {
5897
0
                        need_to_flush = false;
5898
0
                    }
5899
0
                }
5900
0
            }
5901
0
        }
5902
0
        if (need_to_flush) {
5903
            /* We didn't receive anything in the process loop.
5904
             * Check if we need to send something.
5905
             * There was no time updates on current iteration. */
5906
0
            pmd_thread_ctx_time_update(non_pmd);
5907
0
            dp_netdev_pmd_flush_output_packets(non_pmd, false);
5908
0
        }
5909
5910
0
        dpif_netdev_xps_revalidate_pmd(non_pmd, false);
5911
0
        ovs_mutex_unlock(&dp->non_pmd_mutex);
5912
5913
0
        dp_netdev_pmd_unref(non_pmd);
5914
0
    }
5915
5916
0
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
5917
0
    if (pmd_alb->is_enabled) {
5918
0
        if (!pmd_alb->rebalance_poll_timer) {
5919
0
            pmd_alb->rebalance_poll_timer = now;
5920
0
        } else if ((pmd_alb->rebalance_poll_timer +
5921
0
                   pmd_alb->rebalance_intvl) < now) {
5922
0
            pmd_alb->rebalance_poll_timer = now;
5923
0
            CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5924
0
                if (atomic_count_get(&pmd->pmd_overloaded) >=
5925
0
                                    PMD_INTERVAL_MAX) {
5926
0
                    pmd_rebalance = true;
5927
0
                    break;
5928
0
                }
5929
0
            }
5930
5931
0
            if (pmd_rebalance &&
5932
0
                !dp_netdev_is_reconf_required(dp) &&
5933
0
                !ports_require_restart(dp) &&
5934
0
                pmd_rebalance_dry_run_needed(dp) &&
5935
0
                pmd_rebalance_dry_run(dp)) {
5936
0
                VLOG_INFO("PMD auto load balance dry run. "
5937
0
                          "Requesting datapath reconfigure.");
5938
0
                dp_netdev_request_reconfigure(dp);
5939
0
            }
5940
0
        }
5941
0
    }
5942
5943
0
    if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
5944
0
        reconfigure_datapath(dp);
5945
0
    }
5946
0
    ovs_rwlock_unlock(&dp->port_rwlock);
5947
5948
0
    tnl_neigh_cache_run();
5949
0
    tnl_port_map_run();
5950
0
    new_tnl_seq = seq_read(tnl_conf_seq);
5951
5952
0
    if (dp->last_tnl_conf_seq != new_tnl_seq) {
5953
0
        dp->last_tnl_conf_seq = new_tnl_seq;
5954
0
        return true;
5955
0
    }
5956
0
    return false;
5957
0
}
5958
5959
static void
5960
dpif_netdev_wait(struct dpif *dpif)
5961
0
{
5962
0
    struct dp_netdev_port *port;
5963
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
5964
5965
0
    ovs_mutex_lock(&dp_netdev_mutex);
5966
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
5967
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
5968
0
        netdev_wait_reconf_required(port->netdev);
5969
0
        if (!netdev_is_pmd(port->netdev)) {
5970
0
            int i;
5971
5972
0
            for (i = 0; i < port->n_rxq; i++) {
5973
0
                netdev_rxq_wait(port->rxqs[i].rx);
5974
0
            }
5975
0
        }
5976
0
    }
5977
0
    ovs_rwlock_unlock(&dp->port_rwlock);
5978
0
    ovs_mutex_unlock(&dp_netdev_mutex);
5979
0
    seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
5980
0
}
5981
5982
static void
5983
pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
5984
0
{
5985
0
    struct tx_port *tx_port_cached;
5986
5987
    /* Flush all the queued packets. */
5988
0
    dp_netdev_pmd_flush_output_packets(pmd, true);
5989
    /* Free all used tx queue ids. */
5990
0
    dpif_netdev_xps_revalidate_pmd(pmd, true);
5991
5992
0
    HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
5993
0
        free(tx_port_cached->txq_pkts);
5994
0
        free(tx_port_cached);
5995
0
    }
5996
0
    HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
5997
0
        free(tx_port_cached->txq_pkts);
5998
0
        free(tx_port_cached);
5999
0
    }
6000
0
}
6001
6002
/* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
6003
 * thread-local copies. Copy to 'pmd->tnl_port_cache' if it is a tunnel
6004
 * device, otherwise to 'pmd->send_port_cache' if the port has at least
6005
 * one txq. */
6006
static void
6007
pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
6008
    OVS_REQUIRES(pmd->port_mutex)
6009
0
{
6010
0
    struct tx_port *tx_port, *tx_port_cached;
6011
6012
0
    pmd_free_cached_ports(pmd);
6013
0
    hmap_shrink(&pmd->send_port_cache);
6014
0
    hmap_shrink(&pmd->tnl_port_cache);
6015
6016
0
    HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
6017
0
        int n_txq = netdev_n_txq(tx_port->port->netdev);
6018
0
        struct dp_packet_batch *txq_pkts_cached;
6019
6020
0
        if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
6021
0
            tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
6022
0
            if (tx_port->txq_pkts) {
6023
0
                txq_pkts_cached = xmemdup(tx_port->txq_pkts,
6024
0
                                          n_txq * sizeof *tx_port->txq_pkts);
6025
0
                tx_port_cached->txq_pkts = txq_pkts_cached;
6026
0
            }
6027
0
            hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
6028
0
                        hash_port_no(tx_port_cached->port->port_no));
6029
0
        }
6030
6031
0
        if (n_txq) {
6032
0
            tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
6033
0
            if (tx_port->txq_pkts) {
6034
0
                txq_pkts_cached = xmemdup(tx_port->txq_pkts,
6035
0
                                          n_txq * sizeof *tx_port->txq_pkts);
6036
0
                tx_port_cached->txq_pkts = txq_pkts_cached;
6037
0
            }
6038
0
            hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
6039
0
                        hash_port_no(tx_port_cached->port->port_no));
6040
0
        }
6041
0
    }
6042
0
}
6043
6044
static void
6045
pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
6046
0
{
6047
0
    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
6048
0
    if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
6049
0
        VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
6050
0
                   ", numa_id %d.", pmd->core_id, pmd->numa_id);
6051
0
    }
6052
0
    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
6053
6054
0
    VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
6055
0
             ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
6056
0
}
6057
6058
static void
6059
pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
6060
0
{
6061
0
    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
6062
0
    id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
6063
0
    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
6064
0
}
6065
6066
static int
6067
pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
6068
                          struct polled_queue **ppoll_list)
6069
0
{
6070
0
    struct polled_queue *poll_list = *ppoll_list;
6071
0
    struct rxq_poll *poll;
6072
0
    int i;
6073
6074
0
    ovs_mutex_lock(&pmd->port_mutex);
6075
0
    poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
6076
0
                                    * sizeof *poll_list);
6077
6078
0
    i = 0;
6079
0
    HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
6080
0
        poll_list[i].rxq = poll->rxq;
6081
0
        poll_list[i].port_no = poll->rxq->port->port_no;
6082
0
        poll_list[i].emc_enabled = poll->rxq->port->emc_enabled;
6083
0
        poll_list[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx);
6084
0
        poll_list[i].change_seq =
6085
0
                     netdev_get_change_seq(poll->rxq->port->netdev);
6086
0
        i++;
6087
0
    }
6088
6089
0
    pmd_load_cached_ports(pmd);
6090
6091
0
    ovs_mutex_unlock(&pmd->port_mutex);
6092
6093
0
    *ppoll_list = poll_list;
6094
0
    return i;
6095
0
}
6096
6097
static void *
6098
pmd_thread_main(void *f_)
6099
0
{
6100
0
    struct dp_netdev_pmd_thread *pmd = f_;
6101
0
    struct pmd_perf_stats *s = &pmd->perf_stats;
6102
0
    unsigned int lc = 0;
6103
0
    struct polled_queue *poll_list;
6104
0
    bool wait_for_reload = false;
6105
0
    bool dpdk_attached;
6106
0
    bool reload_tx_qid;
6107
0
    bool exiting;
6108
0
    bool reload;
6109
0
    int poll_cnt;
6110
0
    int i;
6111
0
    int process_packets = 0;
6112
0
    uint64_t sleep_time = 0;
6113
6114
0
    poll_list = NULL;
6115
6116
    /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
6117
0
    ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
6118
0
    ovs_numa_thread_setaffinity_core(pmd->core_id);
6119
0
    dpdk_attached = dpdk_attach_thread(pmd->core_id);
6120
0
    poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
6121
0
    dfc_cache_init(&pmd->flow_cache);
6122
0
    pmd_alloc_static_tx_qid(pmd);
6123
0
    set_timer_resolution(PMD_TIMER_RES_NS);
6124
6125
0
reload:
6126
0
    atomic_count_init(&pmd->pmd_overloaded, 0);
6127
6128
0
    pmd->intrvl_tsc_prev = 0;
6129
0
    atomic_store_relaxed(&pmd->intrvl_cycles, 0);
6130
6131
0
    if (!dpdk_attached) {
6132
0
        dpdk_attached = dpdk_attach_thread(pmd->core_id);
6133
0
    }
6134
6135
    /* List port/core affinity */
6136
0
    for (i = 0; i < poll_cnt; i++) {
6137
0
       VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
6138
0
                pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
6139
0
                netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
6140
       /* Reset the rxq current cycles counter. */
6141
0
       dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
6142
0
       for (int j = 0; j < PMD_INTERVAL_MAX; j++) {
6143
0
           dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, 0);
6144
0
       }
6145
0
    }
6146
6147
0
    if (!poll_cnt) {
6148
0
        if (wait_for_reload) {
6149
            /* Don't sleep, control thread will ask for a reload shortly. */
6150
0
            do {
6151
0
                atomic_read_explicit(&pmd->reload, &reload,
6152
0
                                     memory_order_acquire);
6153
0
            } while (!reload);
6154
0
        } else {
6155
0
            while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
6156
0
                seq_wait(pmd->reload_seq, pmd->last_reload_seq);
6157
0
                poll_block();
6158
0
            }
6159
0
        }
6160
0
    }
6161
6162
0
    for (i = 0; i < PMD_INTERVAL_MAX; i++) {
6163
0
        atomic_store_relaxed(&pmd->busy_cycles_intrvl[i], 0);
6164
0
    }
6165
0
    atomic_count_set(&pmd->intrvl_idx, 0);
6166
0
    cycles_counter_update(s);
6167
6168
0
    pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
6169
6170
    /* Protect pmd stats from external clearing while polling. */
6171
0
    ovs_mutex_lock(&pmd->perf_stats.stats_mutex);
6172
0
    for (;;) {
6173
0
        uint64_t rx_packets = 0, tx_packets = 0;
6174
0
        uint64_t time_slept = 0;
6175
0
        uint64_t max_sleep;
6176
6177
0
        pmd_perf_start_iteration(s);
6178
6179
0
        atomic_read_relaxed(&pmd->dp->smc_enable_db, &pmd->ctx.smc_enable_db);
6180
0
        atomic_read_relaxed(&pmd->max_sleep, &max_sleep);
6181
6182
0
        for (i = 0; i < poll_cnt; i++) {
6183
6184
0
            if (!poll_list[i].rxq_enabled) {
6185
0
                continue;
6186
0
            }
6187
6188
0
            if (poll_list[i].emc_enabled) {
6189
0
                atomic_read_relaxed(&pmd->dp->emc_insert_min,
6190
0
                                    &pmd->ctx.emc_insert_min);
6191
0
            } else {
6192
0
                pmd->ctx.emc_insert_min = 0;
6193
0
            }
6194
6195
0
            process_packets =
6196
0
                dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
6197
0
                                           poll_list[i].port_no);
6198
0
            rx_packets += process_packets;
6199
0
            if (process_packets >= PMD_SLEEP_THRESH) {
6200
0
                sleep_time = 0;
6201
0
            }
6202
0
        }
6203
6204
0
        if (!rx_packets) {
6205
            /* We didn't receive anything in the process loop.
6206
             * Check if we need to send something.
6207
             * There was no time updates on current iteration. */
6208
0
            pmd_thread_ctx_time_update(pmd);
6209
0
            tx_packets = dp_netdev_pmd_flush_output_packets(pmd,
6210
0
                                                   max_sleep && sleep_time
6211
0
                                                   ? true : false);
6212
0
        }
6213
6214
0
        if (max_sleep) {
6215
            /* Check if a sleep should happen on this iteration. */
6216
0
            if (sleep_time) {
6217
0
                struct cycle_timer sleep_timer;
6218
6219
0
                cycle_timer_start(&pmd->perf_stats, &sleep_timer);
6220
0
                xnanosleep_no_quiesce(sleep_time * 1000);
6221
0
                time_slept = cycle_timer_stop(&pmd->perf_stats, &sleep_timer);
6222
0
                pmd_thread_ctx_time_update(pmd);
6223
0
            }
6224
0
            if (sleep_time < max_sleep) {
6225
                /* Increase sleep time for next iteration. */
6226
0
                sleep_time += PMD_SLEEP_INC_US;
6227
0
            } else {
6228
0
                sleep_time = max_sleep;
6229
0
            }
6230
0
        } else {
6231
            /* Reset sleep time as max sleep policy may have been changed. */
6232
0
            sleep_time = 0;
6233
0
        }
6234
6235
        /* Do RCU synchronization at fixed interval.  This ensures that
6236
         * synchronization would not be delayed long even at high load of
6237
         * packet processing. */
6238
0
        if (pmd->ctx.now > pmd->next_rcu_quiesce) {
6239
0
            if (!ovsrcu_try_quiesce()) {
6240
0
                pmd->next_rcu_quiesce =
6241
0
                    pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
6242
0
            }
6243
0
        }
6244
6245
0
        if (lc++ > 1024) {
6246
0
            lc = 0;
6247
6248
0
            coverage_try_clear();
6249
0
            dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
6250
0
            if (!ovsrcu_try_quiesce()) {
6251
0
                emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache));
6252
0
                pmd->next_rcu_quiesce =
6253
0
                    pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
6254
0
            }
6255
6256
0
            for (i = 0; i < poll_cnt; i++) {
6257
0
                uint64_t current_seq =
6258
0
                         netdev_get_change_seq(poll_list[i].rxq->port->netdev);
6259
0
                if (poll_list[i].change_seq != current_seq) {
6260
0
                    poll_list[i].change_seq = current_seq;
6261
0
                    poll_list[i].rxq_enabled =
6262
0
                                 netdev_rxq_enabled(poll_list[i].rxq->rx);
6263
0
                }
6264
0
            }
6265
0
        }
6266
6267
0
        atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire);
6268
0
        if (OVS_UNLIKELY(reload)) {
6269
0
            break;
6270
0
        }
6271
6272
0
        pmd_perf_end_iteration(s, rx_packets, tx_packets, time_slept,
6273
0
                               pmd_perf_metrics_enabled(pmd));
6274
0
    }
6275
0
    ovs_mutex_unlock(&pmd->perf_stats.stats_mutex);
6276
6277
0
    poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
6278
0
    atomic_read_relaxed(&pmd->wait_for_reload, &wait_for_reload);
6279
0
    atomic_read_relaxed(&pmd->reload_tx_qid, &reload_tx_qid);
6280
0
    atomic_read_relaxed(&pmd->exit, &exiting);
6281
    /* Signal here to make sure the pmd finishes
6282
     * reloading the updated configuration. */
6283
0
    dp_netdev_pmd_reload_done(pmd);
6284
6285
0
    if (reload_tx_qid) {
6286
0
        pmd_free_static_tx_qid(pmd);
6287
0
        pmd_alloc_static_tx_qid(pmd);
6288
0
    }
6289
6290
0
    if (!exiting) {
6291
0
        goto reload;
6292
0
    }
6293
6294
0
    pmd_free_static_tx_qid(pmd);
6295
0
    dfc_cache_uninit(&pmd->flow_cache);
6296
0
    free(poll_list);
6297
0
    pmd_free_cached_ports(pmd);
6298
0
    if (dpdk_attached) {
6299
0
        dpdk_detach_thread();
6300
0
    }
6301
0
    return NULL;
6302
0
}
6303
6304
static void
6305
dp_netdev_disable_upcall(struct dp_netdev *dp)
6306
    OVS_ACQUIRES(dp->upcall_rwlock)
6307
0
{
6308
0
    fat_rwlock_wrlock(&dp->upcall_rwlock);
6309
0
}
6310
6311

6312
/* Meters */
6313
static void
6314
dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
6315
                               struct ofputil_meter_features *features)
6316
0
{
6317
0
    features->max_meters = MAX_METERS;
6318
0
    features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
6319
0
    features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
6320
0
    features->max_bands = MAX_BANDS;
6321
0
    features->max_color = 0;
6322
0
}
6323
6324
/* Tries to atomically add 'n' to 'value' in terms of saturation arithmetic,
6325
 * i.e., if the result will be larger than 'max_value', will store 'max_value'
6326
 * instead. */
6327
static void
6328
atomic_sat_add(atomic_uint64_t *value, uint64_t n, uint64_t max_value)
6329
0
{
6330
0
    uint64_t current, new_value;
6331
6332
0
    atomic_read_relaxed(value, &current);
6333
0
    do {
6334
0
        new_value = current + n;
6335
0
        new_value = MIN(new_value, max_value);
6336
0
    } while (!atomic_compare_exchange_weak_relaxed(value, &current,
6337
0
                                                   new_value));
6338
0
}
6339
6340
/* Tries to atomically subtract 'n' from 'value'.  Does not perform the
6341
 * operation and returns 'false' if the result will be less than 'min_value'.
6342
 * Otherwise, stores the result and returns 'true'. */
6343
static bool
6344
atomic_bound_sub(atomic_uint64_t *value, uint64_t n, uint64_t min_value)
6345
0
{
6346
0
    uint64_t current;
6347
6348
0
    atomic_read_relaxed(value, &current);
6349
0
    do {
6350
0
        if (current < min_value + n) {
6351
0
            return false;
6352
0
        }
6353
0
    } while (!atomic_compare_exchange_weak_relaxed(value, &current,
6354
0
                                                   current - n));
6355
0
    return true;
6356
0
}
6357
6358
/* Applies the meter identified by 'meter_id' to 'packets_'.  Packets
6359
 * that exceed a band are dropped in-place. */
6360
static void
6361
dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
6362
                    uint32_t meter_id, long long int now_ms)
6363
0
{
6364
0
    const size_t cnt = dp_packet_batch_size(packets_);
6365
0
    uint32_t exceeded_rate[NETDEV_MAX_BURST];
6366
0
    uint32_t exceeded_band[NETDEV_MAX_BURST];
6367
0
    uint64_t bytes, volume, meter_used, old;
6368
0
    uint64_t band_packets[MAX_BANDS];
6369
0
    uint64_t band_bytes[MAX_BANDS];
6370
0
    struct dp_meter_band *band;
6371
0
    struct dp_packet *packet;
6372
0
    struct dp_meter *meter;
6373
0
    bool exceeded = false;
6374
6375
0
    if (meter_id >= MAX_METERS) {
6376
0
        return;
6377
0
    }
6378
6379
0
    meter = dp_meter_lookup(&dp->meters, meter_id);
6380
0
    if (!meter) {
6381
0
        return;
6382
0
    }
6383
6384
    /* Initialize as negative values. */
6385
0
    memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
6386
    /* Initialize as zeroes. */
6387
0
    memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
6388
6389
0
    atomic_read_relaxed(&meter->used, &meter_used);
6390
0
    do {
6391
0
        if (meter_used >= now_ms) {
6392
            /* The '>' condition means that we have several threads hitting the
6393
             * same meter, and the other one already advanced the time. */
6394
0
            meter_used = now_ms;
6395
0
            break;
6396
0
        }
6397
0
    } while (!atomic_compare_exchange_weak_relaxed(&meter->used,
6398
0
                                                   &meter_used, now_ms));
6399
6400
    /* Refill all buckets right away, since other threads may use them. */
6401
0
    if (meter_used < now_ms) {
6402
        /* All packets will hit the meter at the same time. */
6403
0
        uint64_t delta_t = now_ms - meter_used;
6404
6405
        /* Make sure delta_t will not be too large, so that bucket will not
6406
         * wrap around below. */
6407
0
        delta_t = MIN(delta_t, meter->max_delta_t);
6408
6409
0
        for (int m = 0; m < meter->n_bands; m++) {
6410
0
            band = &meter->bands[m];
6411
            /* Update band's bucket.  We can't just use atomic add here,
6412
             * because we should never add above the max capacity. */
6413
0
            atomic_sat_add(&band->bucket, delta_t * band->rate,
6414
0
                           band->burst_size * 1000ULL);
6415
0
        }
6416
0
    }
6417
6418
    /* Update meter stats. */
6419
0
    atomic_add_relaxed(&meter->packet_count, cnt, &old);
6420
0
    bytes = 0;
6421
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6422
0
        bytes += dp_packet_size(packet);
6423
0
    }
6424
0
    atomic_add_relaxed(&meter->byte_count, bytes, &old);
6425
6426
    /* Meters can operate in terms of packets per second or kilobits per
6427
     * second. */
6428
0
    if (meter->flags & OFPMF13_PKTPS) {
6429
        /* Rate in packets/second, bucket 1/1000 packets.
6430
         * msec * packets/sec = 1/1000 packets. */
6431
0
        volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
6432
0
    } else {
6433
        /* Rate in kbps, bucket in bits.
6434
         * msec * kbps = bits */
6435
0
        volume = bytes * 8;
6436
0
    }
6437
6438
    /* Find the band hit with the highest rate for each packet (if any). */
6439
0
    for (int m = 0; m < meter->n_bands; m++) {
6440
0
        band = &meter->bands[m];
6441
6442
        /* Drain the bucket for all the packets, if possible. */
6443
0
        if (atomic_bound_sub(&band->bucket, volume, 0)) {
6444
0
            continue;
6445
0
        }
6446
6447
        /* Band limit hit, must process packet-by-packet. */
6448
0
        DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6449
0
            uint64_t packet_volume = (meter->flags & OFPMF13_PKTPS)
6450
0
                                     ? 1000 : (dp_packet_size(packet) * 8);
6451
6452
0
            if (!atomic_bound_sub(&band->bucket, packet_volume, 0)) {
6453
                /* Update the exceeding band for the exceeding packet.
6454
                 * Only one band will be fired by a packet, and that can
6455
                 * be different for each packet. */
6456
0
                if (band->rate > exceeded_rate[i]) {
6457
0
                    exceeded_rate[i] = band->rate;
6458
0
                    exceeded_band[i] = m;
6459
0
                    exceeded = true;
6460
0
                }
6461
0
            }
6462
0
        }
6463
0
    }
6464
6465
    /* No need to iterate over packets if there are no drops. */
6466
0
    if (!exceeded) {
6467
0
        return;
6468
0
    }
6469
6470
    /* Fire the highest rate band exceeded by each packet, and drop
6471
     * packets if needed. */
6472
6473
0
    memset(band_packets, 0, sizeof band_packets);
6474
0
    memset(band_bytes,   0, sizeof band_bytes);
6475
6476
0
    size_t j;
6477
0
    DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
6478
0
        uint32_t m = exceeded_band[j];
6479
6480
0
        if (m != UINT32_MAX) {
6481
            /* Meter drop packet. */
6482
0
            band_packets[m]++;
6483
0
            band_bytes[m] += dp_packet_size(packet);
6484
0
            dp_packet_delete(packet);
6485
0
        } else {
6486
            /* Meter accepts packet. */
6487
0
            dp_packet_batch_refill(packets_, packet, j);
6488
0
        }
6489
0
    }
6490
6491
0
    for (int m = 0; m < meter->n_bands; m++) {
6492
0
        if (!band_packets[m]) {
6493
0
            continue;
6494
0
        }
6495
0
        band = &meter->bands[m];
6496
0
        atomic_add_relaxed(&band->packet_count, band_packets[m], &old);
6497
0
        atomic_add_relaxed(&band->byte_count,   band_bytes[m],   &old);
6498
0
        COVERAGE_ADD(datapath_drop_meter, band_packets[m]);
6499
0
    }
6500
0
}
6501
6502
/* Meter set/get/del processing is still single-threaded. */
6503
static int
6504
dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
6505
                      struct ofputil_meter_config *config)
6506
0
{
6507
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
6508
0
    uint32_t mid = meter_id.uint32;
6509
0
    struct dp_meter *meter;
6510
0
    int i;
6511
6512
0
    if (mid >= MAX_METERS) {
6513
0
        return EFBIG; /* Meter_id out of range. */
6514
0
    }
6515
6516
0
    if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) {
6517
0
        return EBADF; /* Unsupported flags set */
6518
0
    }
6519
6520
0
    if (config->n_bands > MAX_BANDS) {
6521
0
        return EINVAL;
6522
0
    }
6523
6524
0
    for (i = 0; i < config->n_bands; ++i) {
6525
0
        switch (config->bands[i].type) {
6526
0
        case OFPMBT13_DROP:
6527
0
            break;
6528
0
        default:
6529
0
            return ENODEV; /* Unsupported band type */
6530
0
        }
6531
0
    }
6532
6533
    /* Allocate meter */
6534
0
    meter = xzalloc(sizeof *meter
6535
0
                    + config->n_bands * sizeof(struct dp_meter_band));
6536
6537
0
    meter->flags = config->flags;
6538
0
    meter->n_bands = config->n_bands;
6539
0
    meter->max_delta_t = 0;
6540
0
    meter->id = mid;
6541
0
    atomic_init(&meter->used, time_msec());
6542
6543
    /* set up bands */
6544
0
    for (i = 0; i < config->n_bands; ++i) {
6545
0
        uint32_t band_max_delta_t;
6546
0
        uint64_t bucket_size;
6547
6548
        /* Set burst size to a workable value if none specified. */
6549
0
        if (config->bands[i].burst_size == 0) {
6550
0
            config->bands[i].burst_size = config->bands[i].rate;
6551
0
        }
6552
6553
0
        meter->bands[i].rate = config->bands[i].rate;
6554
0
        meter->bands[i].burst_size = config->bands[i].burst_size;
6555
        /* Start with a full bucket. */
6556
0
        bucket_size = meter->bands[i].burst_size * 1000ULL;
6557
0
        atomic_init(&meter->bands[i].bucket, bucket_size);
6558
6559
        /* Figure out max delta_t that is enough to fill any bucket. */
6560
0
        band_max_delta_t = bucket_size / meter->bands[i].rate;
6561
0
        if (band_max_delta_t > meter->max_delta_t) {
6562
0
            meter->max_delta_t = band_max_delta_t;
6563
0
        }
6564
0
    }
6565
6566
0
    ovs_mutex_lock(&dp->meters_lock);
6567
6568
0
    dp_meter_detach_free(&dp->meters, mid); /* Free existing meter, if any. */
6569
0
    dp_meter_attach(&dp->meters, meter);
6570
6571
0
    ovs_mutex_unlock(&dp->meters_lock);
6572
6573
0
    return 0;
6574
0
}
6575
6576
static int
6577
dpif_netdev_meter_get(const struct dpif *dpif,
6578
                      ofproto_meter_id meter_id_,
6579
                      struct ofputil_meter_stats *stats, uint16_t n_bands)
6580
0
{
6581
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
6582
0
    uint32_t meter_id = meter_id_.uint32;
6583
0
    struct dp_meter *meter;
6584
6585
0
    if (meter_id >= MAX_METERS) {
6586
0
        return EFBIG;
6587
0
    }
6588
6589
0
    meter = dp_meter_lookup(&dp->meters, meter_id);
6590
0
    if (!meter) {
6591
0
        return ENOENT;
6592
0
    }
6593
6594
0
    if (stats) {
6595
0
        int i = 0;
6596
6597
0
        atomic_read_relaxed(&meter->packet_count, &stats->packet_in_count);
6598
0
        atomic_read_relaxed(&meter->byte_count, &stats->byte_in_count);
6599
6600
0
        for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
6601
0
            atomic_read_relaxed(&meter->bands[i].packet_count,
6602
0
                                &stats->bands[i].packet_count);
6603
0
            atomic_read_relaxed(&meter->bands[i].byte_count,
6604
0
                                &stats->bands[i].byte_count);
6605
0
        }
6606
0
        stats->n_bands = i;
6607
0
    }
6608
6609
0
    return 0;
6610
0
}
6611
6612
static int
6613
dpif_netdev_meter_del(struct dpif *dpif,
6614
                      ofproto_meter_id meter_id_,
6615
                      struct ofputil_meter_stats *stats, uint16_t n_bands)
6616
0
{
6617
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
6618
0
    int error;
6619
6620
0
    error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
6621
0
    if (!error) {
6622
0
        uint32_t meter_id = meter_id_.uint32;
6623
6624
0
        ovs_mutex_lock(&dp->meters_lock);
6625
0
        dp_meter_detach_free(&dp->meters, meter_id);
6626
0
        ovs_mutex_unlock(&dp->meters_lock);
6627
0
    }
6628
0
    return error;
6629
0
}
6630
6631

6632
static void
6633
dpif_netdev_disable_upcall(struct dpif *dpif)
6634
    OVS_NO_THREAD_SAFETY_ANALYSIS
6635
0
{
6636
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
6637
0
    dp_netdev_disable_upcall(dp);
6638
0
}
6639
6640
static void
6641
dp_netdev_enable_upcall(struct dp_netdev *dp)
6642
    OVS_RELEASES(dp->upcall_rwlock)
6643
0
{
6644
0
    fat_rwlock_unlock(&dp->upcall_rwlock);
6645
0
}
6646
6647
static void
6648
dpif_netdev_enable_upcall(struct dpif *dpif)
6649
    OVS_NO_THREAD_SAFETY_ANALYSIS
6650
0
{
6651
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
6652
0
    dp_netdev_enable_upcall(dp);
6653
0
}
6654
6655
static void
6656
dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
6657
0
{
6658
0
    atomic_store_relaxed(&pmd->wait_for_reload, false);
6659
0
    atomic_store_relaxed(&pmd->reload_tx_qid, false);
6660
0
    pmd->last_reload_seq = seq_read(pmd->reload_seq);
6661
0
    atomic_store_explicit(&pmd->reload, false, memory_order_release);
6662
0
}
6663
6664
/* Finds and refs the dp_netdev_pmd_thread on core 'core_id'.  Returns
6665
 * the pointer if succeeds, otherwise, NULL (it can return NULL even if
6666
 * 'core_id' is NON_PMD_CORE_ID).
6667
 *
6668
 * Caller must unrefs the returned reference.  */
6669
static struct dp_netdev_pmd_thread *
6670
dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
6671
0
{
6672
0
    struct dp_netdev_pmd_thread *pmd;
6673
6674
0
    CMAP_FOR_EACH_WITH_HASH (pmd, node, hash_int(core_id, 0),
6675
0
                             &dp->poll_threads) {
6676
0
        if (pmd->core_id == core_id) {
6677
0
            return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
6678
0
        }
6679
0
    }
6680
6681
0
    return NULL;
6682
0
}
6683
6684
/* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
6685
static void
6686
dp_netdev_set_nonpmd(struct dp_netdev *dp)
6687
    OVS_REQ_WRLOCK(dp->port_rwlock)
6688
0
{
6689
0
    struct dp_netdev_pmd_thread *non_pmd;
6690
6691
0
    non_pmd = xzalloc(sizeof *non_pmd);
6692
0
    dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
6693
0
}
6694
6695
/* Caller must have valid pointer to 'pmd'. */
6696
static bool
6697
dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
6698
0
{
6699
0
    return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
6700
0
}
6701
6702
static void
6703
dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
6704
0
{
6705
0
    if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
6706
0
        ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
6707
0
    }
6708
0
}
6709
6710
/* Given cmap position 'pos', tries to ref the next node.  If try_ref()
6711
 * fails, keeps checking for next node until reaching the end of cmap.
6712
 *
6713
 * Caller must unrefs the returned reference. */
6714
static struct dp_netdev_pmd_thread *
6715
dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
6716
0
{
6717
0
    struct dp_netdev_pmd_thread *next;
6718
6719
0
    do {
6720
0
        struct cmap_node *node;
6721
6722
0
        node = cmap_next_position(&dp->poll_threads, pos);
6723
0
        next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
6724
0
            : NULL;
6725
0
    } while (next && !dp_netdev_pmd_try_ref(next));
6726
6727
0
    return next;
6728
0
}
6729
6730
/* Configures the 'pmd' based on the input argument. */
6731
static void
6732
dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
6733
                        unsigned core_id, int numa_id)
6734
    OVS_NO_THREAD_SAFETY_ANALYSIS
6735
0
{
6736
0
    pmd->dp = dp;
6737
0
    pmd->core_id = core_id;
6738
0
    pmd->numa_id = numa_id;
6739
0
    pmd->need_reload = false;
6740
0
    pmd->n_output_batches = 0;
6741
6742
0
    ovs_refcount_init(&pmd->ref_cnt);
6743
0
    atomic_init(&pmd->exit, false);
6744
0
    pmd->reload_seq = seq_create();
6745
0
    pmd->last_reload_seq = seq_read(pmd->reload_seq);
6746
0
    atomic_init(&pmd->reload, false);
6747
0
    ovs_mutex_init(&pmd->flow_mutex);
6748
0
    ovs_mutex_init(&pmd->port_mutex);
6749
0
    ovs_mutex_init(&pmd->bond_mutex);
6750
0
    cmap_init(&pmd->flow_table);
6751
0
    cmap_init(&pmd->classifiers);
6752
0
    cmap_init(&pmd->simple_match_table);
6753
0
    ccmap_init(&pmd->n_flows);
6754
0
    ccmap_init(&pmd->n_simple_flows);
6755
0
    pmd->ctx.last_rxq = NULL;
6756
0
    pmd_thread_ctx_time_update(pmd);
6757
0
    pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
6758
0
    pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
6759
0
    pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN;
6760
0
    pmd->busy_cycles_intrvl = xzalloc(PMD_INTERVAL_MAX *
6761
0
                                      sizeof *pmd->busy_cycles_intrvl);
6762
0
    hmap_init(&pmd->poll_list);
6763
0
    hmap_init(&pmd->tx_ports);
6764
0
    hmap_init(&pmd->tnl_port_cache);
6765
0
    hmap_init(&pmd->send_port_cache);
6766
0
    cmap_init(&pmd->tx_bonds);
6767
6768
0
    pmd_init_max_sleep(dp, pmd);
6769
6770
    /* init the 'flow_cache' since there is no
6771
     * actual thread created for NON_PMD_CORE_ID. */
6772
0
    if (core_id == NON_PMD_CORE_ID) {
6773
0
        dfc_cache_init(&pmd->flow_cache);
6774
0
        pmd_alloc_static_tx_qid(pmd);
6775
0
    }
6776
0
    pmd_perf_stats_init(&pmd->perf_stats);
6777
0
    cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
6778
0
                hash_int(core_id, 0));
6779
0
}
6780
6781
static void
6782
dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
6783
    OVS_NO_THREAD_SAFETY_ANALYSIS
6784
0
{
6785
0
    struct dpcls *cls;
6786
6787
0
    dp_netdev_pmd_flow_flush(pmd);
6788
0
    hmap_destroy(&pmd->send_port_cache);
6789
0
    hmap_destroy(&pmd->tnl_port_cache);
6790
0
    hmap_destroy(&pmd->tx_ports);
6791
0
    cmap_destroy(&pmd->tx_bonds);
6792
0
    hmap_destroy(&pmd->poll_list);
6793
0
    free(pmd->busy_cycles_intrvl);
6794
    /* All flows (including their dpcls_rules) have been deleted already */
6795
0
    CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
6796
0
        dpcls_destroy(cls);
6797
0
        ovsrcu_postpone(free, cls);
6798
0
    }
6799
0
    cmap_destroy(&pmd->classifiers);
6800
0
    cmap_destroy(&pmd->flow_table);
6801
0
    cmap_destroy(&pmd->simple_match_table);
6802
0
    ccmap_destroy(&pmd->n_flows);
6803
0
    ccmap_destroy(&pmd->n_simple_flows);
6804
0
    ovs_mutex_destroy(&pmd->flow_mutex);
6805
0
    seq_destroy(pmd->reload_seq);
6806
0
    ovs_mutex_destroy(&pmd->port_mutex);
6807
0
    ovs_mutex_destroy(&pmd->bond_mutex);
6808
0
    free(pmd);
6809
0
}
6810
6811
/* Stops the pmd thread, removes it from the 'dp->poll_threads',
6812
 * and unrefs the struct. */
6813
static void
6814
dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
6815
0
{
6816
    /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
6817
     * but extra cleanup is necessary */
6818
0
    if (pmd->core_id == NON_PMD_CORE_ID) {
6819
0
        ovs_mutex_lock(&dp->non_pmd_mutex);
6820
0
        dfc_cache_uninit(&pmd->flow_cache);
6821
0
        pmd_free_cached_ports(pmd);
6822
0
        pmd_free_static_tx_qid(pmd);
6823
0
        ovs_mutex_unlock(&dp->non_pmd_mutex);
6824
0
    } else {
6825
0
        atomic_store_relaxed(&pmd->exit, true);
6826
0
        dp_netdev_reload_pmd__(pmd);
6827
0
        xpthread_join(pmd->thread, NULL);
6828
0
    }
6829
6830
0
    dp_netdev_pmd_clear_ports(pmd);
6831
6832
    /* Purges the 'pmd''s flows after stopping the thread, but before
6833
     * destroying the flows, so that the flow stats can be collected. */
6834
0
    if (dp->dp_purge_cb) {
6835
0
        dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
6836
0
    }
6837
0
    cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
6838
0
    dp_netdev_pmd_unref(pmd);
6839
0
}
6840
6841
/* Destroys all pmd threads. If 'non_pmd' is true it also destroys the non pmd
6842
 * thread. */
6843
static void
6844
dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
6845
0
{
6846
0
    struct dp_netdev_pmd_thread *pmd;
6847
0
    struct dp_netdev_pmd_thread **pmd_list;
6848
0
    size_t k = 0, n_pmds;
6849
6850
0
    n_pmds = cmap_count(&dp->poll_threads);
6851
0
    pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
6852
6853
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6854
0
        if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
6855
0
            continue;
6856
0
        }
6857
        /* We cannot call dp_netdev_del_pmd(), since it alters
6858
         * 'dp->poll_threads' (while we're iterating it) and it
6859
         * might quiesce. */
6860
0
        ovs_assert(k < n_pmds);
6861
0
        pmd_list[k++] = pmd;
6862
0
    }
6863
6864
0
    for (size_t i = 0; i < k; i++) {
6865
0
        dp_netdev_del_pmd(dp, pmd_list[i]);
6866
0
    }
6867
0
    free(pmd_list);
6868
0
}
6869
6870
/* Deletes all rx queues from pmd->poll_list and all the ports from
6871
 * pmd->tx_ports. */
6872
static void
6873
dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
6874
0
{
6875
0
    struct rxq_poll *poll;
6876
0
    struct tx_port *port;
6877
0
    struct tx_bond *tx;
6878
6879
0
    ovs_mutex_lock(&pmd->port_mutex);
6880
0
    HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
6881
0
        free(poll);
6882
0
    }
6883
0
    HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
6884
0
        free(port->txq_pkts);
6885
0
        free(port);
6886
0
    }
6887
0
    ovs_mutex_unlock(&pmd->port_mutex);
6888
6889
0
    ovs_mutex_lock(&pmd->bond_mutex);
6890
0
    CMAP_FOR_EACH (tx, node, &pmd->tx_bonds) {
6891
0
        cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
6892
0
        ovsrcu_postpone(free, tx);
6893
0
    }
6894
0
    ovs_mutex_unlock(&pmd->bond_mutex);
6895
0
}
6896
6897
/* Adds rx queue to poll_list of PMD thread, if it's not there already. */
6898
static void
6899
dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
6900
                         struct dp_netdev_rxq *rxq)
6901
    OVS_REQUIRES(pmd->port_mutex)
6902
0
{
6903
0
    int qid = netdev_rxq_get_queue_id(rxq->rx);
6904
0
    uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
6905
0
    struct rxq_poll *poll;
6906
6907
0
    HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
6908
0
        if (poll->rxq == rxq) {
6909
            /* 'rxq' is already polled by this thread. Do nothing. */
6910
0
            return;
6911
0
        }
6912
0
    }
6913
6914
0
    poll = xmalloc(sizeof *poll);
6915
0
    poll->rxq = rxq;
6916
0
    hmap_insert(&pmd->poll_list, &poll->node, hash);
6917
6918
0
    pmd->need_reload = true;
6919
0
}
6920
6921
/* Delete 'poll' from poll_list of PMD thread. */
6922
static void
6923
dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
6924
                           struct rxq_poll *poll)
6925
    OVS_REQUIRES(pmd->port_mutex)
6926
0
{
6927
0
    hmap_remove(&pmd->poll_list, &poll->node);
6928
0
    free(poll);
6929
6930
0
    pmd->need_reload = true;
6931
0
}
6932
6933
/* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
6934
 * changes to take effect. */
6935
static void
6936
dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
6937
                             struct dp_netdev_port *port)
6938
    OVS_REQUIRES(pmd->port_mutex)
6939
0
{
6940
0
    struct tx_port *tx;
6941
6942
0
    tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
6943
0
    if (tx) {
6944
        /* 'port' is already on this thread tx cache. Do nothing. */
6945
0
        return;
6946
0
    }
6947
6948
0
    tx = xzalloc(sizeof *tx);
6949
6950
0
    tx->port = port;
6951
0
    tx->qid = -1;
6952
0
    tx->flush_time = 0LL;
6953
0
    dp_packet_batch_init(&tx->output_pkts);
6954
6955
0
    if (tx->port->txq_mode == TXQ_MODE_XPS_HASH) {
6956
0
        int i, n_txq = netdev_n_txq(tx->port->netdev);
6957
6958
0
        tx->txq_pkts = xzalloc(n_txq * sizeof *tx->txq_pkts);
6959
0
        for (i = 0; i < n_txq; i++) {
6960
0
            dp_packet_batch_init(&tx->txq_pkts[i]);
6961
0
        }
6962
0
    }
6963
6964
0
    hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
6965
0
    pmd->need_reload = true;
6966
0
}
6967
6968
/* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
6969
 * changes to take effect. */
6970
static void
6971
dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
6972
                               struct tx_port *tx)
6973
    OVS_REQUIRES(pmd->port_mutex)
6974
0
{
6975
0
    hmap_remove(&pmd->tx_ports, &tx->node);
6976
0
    free(tx->txq_pkts);
6977
0
    free(tx);
6978
0
    pmd->need_reload = true;
6979
0
}
6980
6981
/* Add bond to the tx bond cmap of 'pmd'. */
6982
static void
6983
dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
6984
                             struct tx_bond *bond, bool update)
6985
    OVS_EXCLUDED(pmd->bond_mutex)
6986
0
{
6987
0
    struct tx_bond *tx;
6988
6989
0
    ovs_mutex_lock(&pmd->bond_mutex);
6990
0
    tx = tx_bond_lookup(&pmd->tx_bonds, bond->bond_id);
6991
6992
0
    if (tx && !update) {
6993
        /* It's not an update and the entry already exists.  Do nothing. */
6994
0
        goto unlock;
6995
0
    }
6996
6997
0
    if (tx) {
6998
0
        struct tx_bond *new_tx = xmemdup(bond, sizeof *bond);
6999
7000
        /* Copy the stats for each bucket. */
7001
0
        for (int i = 0; i < BOND_BUCKETS; i++) {
7002
0
            uint64_t n_packets, n_bytes;
7003
7004
0
            atomic_read_relaxed(&tx->member_buckets[i].n_packets, &n_packets);
7005
0
            atomic_read_relaxed(&tx->member_buckets[i].n_bytes, &n_bytes);
7006
0
            atomic_init(&new_tx->member_buckets[i].n_packets, n_packets);
7007
0
            atomic_init(&new_tx->member_buckets[i].n_bytes, n_bytes);
7008
0
        }
7009
0
        cmap_replace(&pmd->tx_bonds, &tx->node, &new_tx->node,
7010
0
                     hash_bond_id(bond->bond_id));
7011
0
        ovsrcu_postpone(free, tx);
7012
0
    } else {
7013
0
        tx = xmemdup(bond, sizeof *bond);
7014
0
        cmap_insert(&pmd->tx_bonds, &tx->node, hash_bond_id(bond->bond_id));
7015
0
    }
7016
0
unlock:
7017
0
    ovs_mutex_unlock(&pmd->bond_mutex);
7018
0
}
7019
7020
/* Delete bond from the tx bond cmap of 'pmd'. */
7021
static void
7022
dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
7023
                               uint32_t bond_id)
7024
    OVS_EXCLUDED(pmd->bond_mutex)
7025
0
{
7026
0
    struct tx_bond *tx;
7027
7028
0
    ovs_mutex_lock(&pmd->bond_mutex);
7029
0
    tx = tx_bond_lookup(&pmd->tx_bonds, bond_id);
7030
0
    if (tx) {
7031
0
        cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
7032
0
        ovsrcu_postpone(free, tx);
7033
0
    }
7034
0
    ovs_mutex_unlock(&pmd->bond_mutex);
7035
0
}
7036

7037
/* Returns the datapath version as a newly allocated copy of the string
 * "<built-in>". */
static char *
dpif_netdev_get_datapath_version(void)
{
    return xstrdup("<built-in>");
}
7042
7043
static void
7044
dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
7045
                    uint16_t tcp_flags, long long now)
7046
0
{
7047
0
    uint16_t flags;
7048
7049
0
    atomic_store_relaxed(&netdev_flow->stats.used, now);
7050
0
    non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
7051
0
    non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
7052
0
    atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
7053
0
    flags |= tcp_flags;
7054
0
    atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
7055
0
}
7056
7057
static int
7058
dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
7059
                 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
7060
                 enum dpif_upcall_type type, const struct nlattr *userdata,
7061
                 struct ofpbuf *actions, struct ofpbuf *put_actions)
7062
0
{
7063
0
    struct dp_netdev *dp = pmd->dp;
7064
7065
0
    if (OVS_UNLIKELY(!dp->upcall_cb)) {
7066
0
        return ENODEV;
7067
0
    }
7068
7069
0
    if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
7070
0
        struct ds ds = DS_EMPTY_INITIALIZER;
7071
0
        char *packet_str;
7072
0
        struct ofpbuf key;
7073
0
        struct odp_flow_key_parms odp_parms = {
7074
0
            .flow = flow,
7075
0
            .mask = wc ? &wc->masks : NULL,
7076
0
            .support = dp_netdev_support,
7077
0
        };
7078
7079
0
        ofpbuf_init(&key, 0);
7080
0
        odp_flow_key_from_flow(&odp_parms, &key);
7081
0
        packet_str = ofp_dp_packet_to_string(packet_);
7082
7083
0
        odp_flow_key_format(key.data, key.size, &ds);
7084
7085
0
        VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
7086
0
                 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
7087
7088
0
        ofpbuf_uninit(&key);
7089
0
        free(packet_str);
7090
7091
0
        ds_destroy(&ds);
7092
0
    }
7093
7094
0
    if (type != DPIF_UC_MISS) {
7095
0
        dp_packet_ol_send_prepare(packet_, 0);
7096
0
    }
7097
7098
0
    return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
7099
0
                         actions, wc, put_actions, dp->upcall_aux);
7100
0
}
7101
7102
static inline uint32_t
7103
dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
7104
                                const struct miniflow *mf)
7105
0
{
7106
0
    uint32_t hash, recirc_depth;
7107
7108
0
    if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
7109
0
        hash = dp_packet_get_rss_hash(packet);
7110
0
    } else {
7111
0
        hash = miniflow_hash_5tuple(mf, 0);
7112
0
        dp_packet_set_rss_hash(packet, hash);
7113
0
    }
7114
7115
    /* The RSS hash must account for the recirculation depth to avoid
7116
     * collisions in the exact match cache */
7117
0
    recirc_depth = *recirc_depth_get_unsafe();
7118
0
    if (OVS_UNLIKELY(recirc_depth)) {
7119
0
        hash = hash_finish(hash, recirc_depth);
7120
0
    }
7121
0
    return hash;
7122
0
}
7123
7124
struct packet_batch_per_flow {
7125
    unsigned int byte_count;
7126
    uint16_t tcp_flags;
7127
    struct dp_netdev_flow *flow;
7128
7129
    struct dp_packet_batch array;
7130
};
7131
7132
static inline void
7133
packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
7134
                             struct dp_packet *packet,
7135
                             uint16_t tcp_flags)
7136
0
{
7137
0
    batch->byte_count += dp_packet_size(packet);
7138
0
    batch->tcp_flags |= tcp_flags;
7139
0
    dp_packet_batch_add(&batch->array, packet);
7140
0
}
7141
7142
static inline void
7143
packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
7144
                           struct dp_netdev_flow *flow)
7145
0
{
7146
0
    flow->batch = batch;
7147
7148
0
    batch->flow = flow;
7149
0
    dp_packet_batch_init(&batch->array);
7150
0
    batch->byte_count = 0;
7151
0
    batch->tcp_flags = 0;
7152
0
}
7153
7154
static inline void
7155
packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
7156
                              struct dp_netdev_pmd_thread *pmd)
7157
0
{
7158
0
    struct dp_netdev_actions *actions;
7159
0
    struct dp_netdev_flow *flow = batch->flow;
7160
7161
0
    dp_netdev_flow_used(flow, dp_packet_batch_size(&batch->array),
7162
0
                        batch->byte_count,
7163
0
                        batch->tcp_flags, pmd->ctx.now / 1000);
7164
7165
0
    actions = dp_netdev_flow_get_actions(flow);
7166
7167
0
    dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
7168
0
                              actions->actions, actions->size);
7169
0
}
7170
7171
static inline void
7172
dp_netdev_queue_batches(struct dp_packet *pkt,
7173
                        struct dp_netdev_flow *flow, uint16_t tcp_flags,
7174
                        struct packet_batch_per_flow *batches,
7175
                        size_t *n_batches)
7176
0
{
7177
0
    struct packet_batch_per_flow *batch = flow->batch;
7178
7179
0
    if (OVS_UNLIKELY(!batch)) {
7180
0
        batch = &batches[(*n_batches)++];
7181
0
        packet_batch_per_flow_init(batch, flow);
7182
0
    }
7183
7184
0
    packet_batch_per_flow_update(batch, pkt, tcp_flags);
7185
0
}
7186
7187
static inline void
7188
packet_enqueue_to_flow_map(struct dp_packet *packet,
7189
                           struct dp_netdev_flow *flow,
7190
                           uint16_t tcp_flags,
7191
                           struct dp_packet_flow_map *flow_map,
7192
                           size_t index)
7193
0
{
7194
0
    struct dp_packet_flow_map *map = &flow_map[index];
7195
0
    map->flow = flow;
7196
0
    map->packet = packet;
7197
0
    map->tcp_flags = tcp_flags;
7198
0
}
7199
7200
/* SMC lookup function for a batch of packets.
7201
 * By doing batching SMC lookup, we can use prefetch
7202
 * to hide memory access latency.
7203
 */
7204
static inline void
7205
smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
7206
            struct netdev_flow_key *keys,
7207
            struct netdev_flow_key **missed_keys,
7208
            struct dp_packet_batch *packets_,
7209
            const int cnt,
7210
            struct dp_packet_flow_map *flow_map,
7211
            uint8_t *index_map)
7212
0
{
7213
0
    int i;
7214
0
    struct dp_packet *packet;
7215
0
    size_t n_smc_hit = 0, n_missed = 0;
7216
0
    struct dfc_cache *cache = &pmd->flow_cache;
7217
0
    struct smc_cache *smc_cache = &cache->smc_cache;
7218
0
    const struct cmap_node *flow_node;
7219
0
    int recv_idx;
7220
0
    uint16_t tcp_flags;
7221
7222
    /* Prefetch buckets for all packets */
7223
0
    for (i = 0; i < cnt; i++) {
7224
0
        OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]);
7225
0
    }
7226
7227
0
    DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
7228
0
        struct dp_netdev_flow *flow = NULL;
7229
0
        flow_node = smc_entry_get(pmd, keys[i].hash);
7230
0
        bool hit = false;
7231
        /* Get the original order of this packet in received batch. */
7232
0
        recv_idx = index_map[i];
7233
7234
0
        if (OVS_LIKELY(flow_node != NULL)) {
7235
0
            CMAP_NODE_FOR_EACH (flow, node, flow_node) {
7236
                /* Since we dont have per-port megaflow to check the port
7237
                 * number, we need to  verify that the input ports match. */
7238
0
                if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) &&
7239
0
                flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
7240
0
                    tcp_flags = miniflow_get_tcp_flags(&keys[i].mf);
7241
7242
                    /* SMC hit and emc miss, we insert into EMC */
7243
0
                    keys[i].len =
7244
0
                        netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
7245
0
                    emc_probabilistic_insert(pmd, &keys[i], flow);
7246
                    /* Add these packets into the flow map in the same order
7247
                     * as received.
7248
                     */
7249
0
                    packet_enqueue_to_flow_map(packet, flow, tcp_flags,
7250
0
                                               flow_map, recv_idx);
7251
0
                    n_smc_hit++;
7252
0
                    hit = true;
7253
0
                    break;
7254
0
                }
7255
0
            }
7256
0
            if (hit) {
7257
0
                continue;
7258
0
            }
7259
0
        }
7260
7261
        /* SMC missed. Group missed packets together at
7262
         * the beginning of the 'packets' array. */
7263
0
        dp_packet_batch_refill(packets_, packet, i);
7264
7265
        /* Preserve the order of packet for flow batching. */
7266
0
        index_map[n_missed] = recv_idx;
7267
7268
        /* Put missed keys to the pointer arrays return to the caller */
7269
0
        missed_keys[n_missed++] = &keys[i];
7270
0
    }
7271
7272
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
7273
0
}
7274
7275
struct dp_netdev_flow *
7276
smc_lookup_single(struct dp_netdev_pmd_thread *pmd,
7277
                  struct dp_packet *packet,
7278
                  struct netdev_flow_key *key)
7279
0
{
7280
0
    const struct cmap_node *flow_node = smc_entry_get(pmd, key->hash);
7281
7282
0
    if (OVS_LIKELY(flow_node != NULL)) {
7283
0
        struct dp_netdev_flow *flow = NULL;
7284
7285
0
        CMAP_NODE_FOR_EACH (flow, node, flow_node) {
7286
            /* Since we dont have per-port megaflow to check the port
7287
             * number, we need to verify that the input ports match. */
7288
0
            if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, key) &&
7289
0
                flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
7290
7291
0
                return (void *) flow;
7292
0
            }
7293
0
        }
7294
0
    }
7295
7296
0
    return NULL;
7297
0
}
7298
7299
static inline int
7300
dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
7301
                  struct dp_packet *packet,
7302
                  struct dp_netdev_flow **flow)
7303
0
{
7304
0
    struct dp_netdev_rxq *rxq = pmd->ctx.last_rxq;
7305
0
    bool post_process_api_supported;
7306
0
    void *flow_reference = NULL;
7307
0
    int err;
7308
7309
0
    atomic_read_relaxed(&rxq->port->netdev->hw_info.post_process_api_supported,
7310
0
                        &post_process_api_supported);
7311
7312
0
    if (!post_process_api_supported) {
7313
0
        *flow = NULL;
7314
0
        return 0;
7315
0
    }
7316
7317
0
    err = dpif_offload_netdev_hw_post_process(rxq->port->netdev, pmd->core_id,
7318
0
                                              packet, &flow_reference);
7319
0
    if (err && err != EOPNOTSUPP) {
7320
0
        if (err != ECANCELED) {
7321
0
            COVERAGE_INC(datapath_drop_hw_post_process);
7322
0
        } else {
7323
0
            COVERAGE_INC(datapath_drop_hw_post_process_consumed);
7324
0
        }
7325
0
        return -1;
7326
0
    }
7327
7328
0
    *flow = flow_reference;
7329
0
    return 0;
7330
0
}
7331
7332
/* Enqueues the already classified 'packet' either into the per-flow batches
 * (when 'batch_enable' is true) or into the next free slot of 'flow_map'
 * (when it is false), in which case '*map_cnt' is incremented. */
static inline void
dfc_processing_enqueue_classified_packet(struct dp_packet *packet,
                                         struct dp_netdev_flow *flow,
                                         uint16_t tcp_flags,
                                         bool batch_enable,
                                         struct packet_batch_per_flow *batches,
                                         size_t *n_batches,
                                         struct dp_packet_flow_map *flow_map,
                                         size_t *map_cnt)
{
    if (OVS_UNLIKELY(!batch_enable)) {
        /* Flow batching should be performed only after fast-path
         * processing is also completed for packets with emc miss
         * or else it will result in reordering of packets with
         * same datapath flows. */
        packet_enqueue_to_flow_map(packet, flow, tcp_flags,
                                   flow_map, (*map_cnt)++);
        return;
    }

    dp_netdev_queue_batches(packet, flow, tcp_flags, batches, n_batches);
}
7358
7359
/* Try to process all ('cnt') the 'packets' using only the datapath flow cache
7360
 * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
7361
 * miniflow is copied into 'keys' and the packet pointer is moved at the
7362
 * beginning of the 'packets' array. The pointers of missed keys are put in the
7363
 * missed_keys pointer array for future processing.
7364
 *
7365
 * The function returns the number of packets that needs to be processed in the
7366
 * 'packets' array (they have been moved to the beginning of the vector).
7367
 *
7368
 * For performance reasons a caller may choose not to initialize the metadata
7369
 * in 'packets_'.  If 'md_is_valid' is false, the metadata in 'packets'
7370
 * is not valid and must be initialized by this function using 'port_no'.
7371
 * If 'md_is_valid' is true, the metadata is already valid and 'port_no'
7372
 * will be ignored.
7373
 */
7374
static inline size_t
7375
dfc_processing(struct dp_netdev_pmd_thread *pmd,
7376
               struct dp_packet_batch *packets_,
7377
               struct netdev_flow_key *keys,
7378
               struct netdev_flow_key **missed_keys,
7379
               struct packet_batch_per_flow batches[], size_t *n_batches,
7380
               struct dp_packet_flow_map *flow_map,
7381
               size_t *n_flows, uint8_t *index_map,
7382
               bool md_is_valid, odp_port_t port_no)
7383
0
{
7384
0
    size_t n_missed = 0, n_emc_hit = 0, n_phwol_hit = 0, n_simple_hit = 0;
7385
0
    const bool offload_enabled = dpif_offload_enabled();
7386
0
    const uint32_t recirc_depth = *recirc_depth_get();
7387
0
    const size_t cnt = dp_packet_batch_size(packets_);
7388
0
    struct dfc_cache *cache = &pmd->flow_cache;
7389
0
    struct netdev_flow_key *key = &keys[0];
7390
0
    struct dp_packet *packet;
7391
0
    size_t map_cnt = 0;
7392
0
    bool batch_enable = true;
7393
7394
0
    const bool simple_match_enabled =
7395
0
        !md_is_valid && dp_netdev_simple_match_enabled(pmd, port_no);
7396
    /* 'simple_match_table' is a full flow table.  If the flow is not there,
7397
     * upcall is required, and there is no chance to find a match in caches. */
7398
0
    const bool smc_enable_db = !simple_match_enabled && pmd->ctx.smc_enable_db;
7399
0
    const uint32_t cur_min = simple_match_enabled
7400
0
                             ? 0 : pmd->ctx.emc_insert_min;
7401
7402
0
    pmd_perf_update_counter(&pmd->perf_stats,
7403
0
                            md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
7404
0
                            cnt);
7405
0
    int i;
7406
0
    DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
7407
0
        struct dp_netdev_flow *flow = NULL;
7408
0
        uint16_t tcp_flags;
7409
7410
0
        if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
7411
0
            dp_packet_delete(packet);
7412
0
            COVERAGE_INC(datapath_drop_rx_invalid_packet);
7413
0
            continue;
7414
0
        }
7415
7416
0
        if (i != cnt - 1) {
7417
0
            struct dp_packet **packets = packets_->packets;
7418
            /* Prefetch next packet data and metadata. */
7419
0
            OVS_PREFETCH(dp_packet_data(packets[i+1]));
7420
0
            pkt_metadata_prefetch_init(&packets[i+1]->md);
7421
0
        }
7422
7423
0
        if (!md_is_valid) {
7424
0
            pkt_metadata_init(&packet->md, port_no);
7425
0
        }
7426
7427
0
        if (offload_enabled && recirc_depth == 0) {
7428
0
            if (OVS_UNLIKELY(dp_netdev_hw_flow(pmd, packet, &flow))) {
7429
                /* Packet restoration failed and it was dropped, do not
7430
                 * continue processing.
7431
                 */
7432
0
                continue;
7433
0
            }
7434
0
            if (OVS_LIKELY(flow)) {
7435
0
                tcp_flags = parse_tcp_flags(packet, NULL, NULL, NULL);
7436
0
                n_phwol_hit++;
7437
0
                dfc_processing_enqueue_classified_packet(
7438
0
                        packet, flow, tcp_flags, batch_enable,
7439
0
                        batches, n_batches, flow_map, &map_cnt);
7440
0
                continue;
7441
0
            }
7442
0
        }
7443
7444
0
        if (!flow && simple_match_enabled) {
7445
0
            ovs_be16 dl_type = 0, vlan_tci = 0;
7446
0
            uint8_t nw_frag = 0;
7447
7448
0
            tcp_flags = parse_tcp_flags(packet, &dl_type, &nw_frag, &vlan_tci);
7449
0
            flow = dp_netdev_simple_match_lookup(pmd, port_no, dl_type,
7450
0
                                                 nw_frag, vlan_tci);
7451
0
            if (OVS_LIKELY(flow)) {
7452
0
                n_simple_hit++;
7453
0
                dfc_processing_enqueue_classified_packet(
7454
0
                        packet, flow, tcp_flags, batch_enable,
7455
0
                        batches, n_batches, flow_map, &map_cnt);
7456
0
                continue;
7457
0
            }
7458
0
        }
7459
7460
0
        miniflow_extract(packet, &key->mf);
7461
0
        key->len = 0; /* Not computed yet. */
7462
0
        key->hash =
7463
0
                (md_is_valid == false)
7464
0
                ? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf)
7465
0
                : dpif_netdev_packet_get_rss_hash(packet, &key->mf);
7466
7467
        /* If EMC is disabled skip emc_lookup */
7468
0
        flow = (cur_min != 0) ? emc_lookup(&cache->emc_cache, key) : NULL;
7469
0
        if (OVS_LIKELY(flow)) {
7470
0
            tcp_flags = miniflow_get_tcp_flags(&key->mf);
7471
0
            n_emc_hit++;
7472
0
            dfc_processing_enqueue_classified_packet(
7473
0
                    packet, flow, tcp_flags, batch_enable,
7474
0
                    batches, n_batches, flow_map, &map_cnt);
7475
0
        } else {
7476
            /* Exact match cache missed. Group missed packets together at
7477
             * the beginning of the 'packets' array. */
7478
0
            dp_packet_batch_refill(packets_, packet, i);
7479
7480
            /* Preserve the order of packet for flow batching. */
7481
0
            index_map[n_missed] = map_cnt;
7482
0
            flow_map[map_cnt++].flow = NULL;
7483
7484
            /* 'key[n_missed]' contains the key of the current packet and it
7485
             * will be passed to SMC lookup. The next key should be extracted
7486
             * to 'keys[n_missed + 1]'.
7487
             * We also maintain a pointer array to keys missed both SMC and EMC
7488
             * which will be returned to the caller for future processing. */
7489
0
            missed_keys[n_missed] = key;
7490
0
            key = &keys[++n_missed];
7491
7492
            /* Skip batching for subsequent packets to avoid reordering. */
7493
0
            batch_enable = false;
7494
0
        }
7495
0
    }
7496
    /* Count of packets which are not flow batched. */
7497
0
    *n_flows = map_cnt;
7498
7499
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_PHWOL_HIT, n_phwol_hit);
7500
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SIMPLE_HIT,
7501
0
                            n_simple_hit);
7502
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);
7503
7504
0
    if (!smc_enable_db) {
7505
0
        return dp_packet_batch_size(packets_);
7506
0
    }
7507
7508
    /* Packets miss EMC will do a batch lookup in SMC if enabled */
7509
0
    smc_lookup_batch(pmd, keys, missed_keys, packets_,
7510
0
                     n_missed, flow_map, index_map);
7511
7512
0
    return dp_packet_batch_size(packets_);
7513
0
}
7514
7515
static inline int
7516
handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
7517
                     struct dp_packet *packet,
7518
                     const struct netdev_flow_key *key,
7519
                     struct ofpbuf *actions, struct ofpbuf *put_actions)
7520
0
{
7521
0
    struct ofpbuf *add_actions;
7522
0
    struct dp_packet_batch b;
7523
0
    struct match match;
7524
0
    ovs_u128 ufid;
7525
0
    int error;
7526
0
    uint64_t cycles = cycles_counter_update(&pmd->perf_stats);
7527
0
    odp_port_t orig_in_port = packet->md.orig_in_port;
7528
7529
0
    match.tun_md.valid = false;
7530
0
    miniflow_expand(&key->mf, &match.flow);
7531
0
    memset(&match.wc, 0, sizeof match.wc);
7532
7533
0
    ofpbuf_clear(actions);
7534
0
    ofpbuf_clear(put_actions);
7535
7536
0
    odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
7537
0
    error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
7538
0
                             &ufid, DPIF_UC_MISS, NULL, actions,
7539
0
                             put_actions);
7540
0
    if (OVS_UNLIKELY(error && error != ENOSPC)) {
7541
0
        dp_packet_delete(packet);
7542
0
        COVERAGE_INC(datapath_drop_upcall_error);
7543
0
        return error;
7544
0
    }
7545
7546
    /* The Netlink encoding of datapath flow keys cannot express
7547
     * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
7548
     * tag is interpreted as exact match on the fact that there is no
7549
     * VLAN.  Unless we refactor a lot of code that translates between
7550
     * Netlink and struct flow representations, we have to do the same
7551
     * here.  This must be in sync with 'match' in dpif_netdev_flow_put(). */
7552
0
    if (!match.wc.masks.vlans[0].tci) {
7553
0
        match.wc.masks.vlans[0].tci = htons(VLAN_VID_MASK | VLAN_CFI);
7554
0
    }
7555
7556
    /* We can't allow the packet batching in the next loop to execute
7557
     * the actions.  Otherwise, if there are any slow path actions,
7558
     * we'll send the packet up twice. */
7559
0
    dp_packet_batch_init_packet(&b, packet);
7560
0
    dp_netdev_execute_actions(pmd, &b, true, &match.flow,
7561
0
                              actions->data, actions->size);
7562
7563
0
    add_actions = put_actions->size ? put_actions : actions;
7564
0
    if (OVS_LIKELY(error != ENOSPC)) {
7565
0
        struct dp_netdev_flow *netdev_flow;
7566
7567
        /* XXX: There's a race window where a flow covering this packet
7568
         * could have already been installed since we last did the flow
7569
         * lookup before upcall.  This could be solved by moving the
7570
         * mutex lock outside the loop, but that's an awful long time
7571
         * to be locking revalidators out of making flow modifications. */
7572
0
        ovs_mutex_lock(&pmd->flow_mutex);
7573
0
        netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
7574
0
        if (OVS_LIKELY(!netdev_flow)) {
7575
0
            netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
7576
0
                                             add_actions->data,
7577
0
                                             add_actions->size, orig_in_port);
7578
0
        }
7579
0
        ovs_mutex_unlock(&pmd->flow_mutex);
7580
0
        uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid);
7581
0
        smc_insert(pmd, key, hash);
7582
0
        emc_probabilistic_insert(pmd, key, netdev_flow);
7583
0
    }
7584
0
    if (pmd_perf_metrics_enabled(pmd)) {
7585
        /* Update upcall stats. */
7586
0
        cycles = cycles_counter_update(&pmd->perf_stats) - cycles;
7587
0
        struct pmd_perf_stats *s = &pmd->perf_stats;
7588
0
        s->current.upcalls++;
7589
0
        s->current.upcall_cycles += cycles;
7590
0
        histogram_add_sample(&s->cycles_per_upcall, cycles);
7591
0
    }
7592
0
    return error;
7593
0
}
7594
7595
static inline void
7596
fast_path_processing(struct dp_netdev_pmd_thread *pmd,
7597
                     struct dp_packet_batch *packets_,
7598
                     struct netdev_flow_key **keys,
7599
                     struct dp_packet_flow_map *flow_map,
7600
                     uint8_t *index_map,
7601
                     odp_port_t in_port)
7602
0
{
7603
0
    const size_t cnt = dp_packet_batch_size(packets_);
7604
0
#ifndef __CHECKER__
7605
0
    const size_t PKT_ARRAY_SIZE = cnt;
7606
#else
7607
    /* Sparse doesn't like variable length array. */
7608
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
7609
#endif
7610
0
    struct dp_packet *packet;
7611
0
    struct dpcls *cls;
7612
0
    struct dpcls_rule *rules[PKT_ARRAY_SIZE];
7613
0
    struct dp_netdev *dp = pmd->dp;
7614
0
    int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
7615
0
    int lookup_cnt = 0, add_lookup_cnt;
7616
0
    bool any_miss;
7617
7618
0
    for (size_t i = 0; i < cnt; i++) {
7619
        /* Key length is needed in all the cases, hash computed on demand. */
7620
0
        keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf));
7621
0
    }
7622
    /* Get the classifier for the in_port */
7623
0
    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
7624
0
    if (OVS_LIKELY(cls)) {
7625
0
        any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys,
7626
0
                                rules, cnt, &lookup_cnt);
7627
0
    } else {
7628
0
        any_miss = true;
7629
0
        memset(rules, 0, sizeof(rules));
7630
0
    }
7631
0
    if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
7632
0
        uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
7633
0
        struct ofpbuf actions, put_actions;
7634
7635
0
        ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
7636
0
        ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
7637
7638
0
        DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7639
0
            struct dp_netdev_flow *netdev_flow;
7640
7641
0
            if (OVS_LIKELY(rules[i])) {
7642
0
                continue;
7643
0
            }
7644
7645
            /* It's possible that an earlier slow path execution installed
7646
             * a rule covering this flow.  In this case, it's a lot cheaper
7647
             * to catch it here than execute a miss. */
7648
0
            netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i],
7649
0
                                                    &add_lookup_cnt);
7650
0
            if (netdev_flow) {
7651
0
                lookup_cnt += add_lookup_cnt;
7652
0
                rules[i] = &netdev_flow->cr;
7653
0
                continue;
7654
0
            }
7655
7656
0
            int error = handle_packet_upcall(pmd, packet, keys[i],
7657
0
                                             &actions, &put_actions);
7658
7659
0
            if (OVS_UNLIKELY(error)) {
7660
0
                upcall_fail_cnt++;
7661
0
            } else {
7662
0
                upcall_ok_cnt++;
7663
0
            }
7664
0
        }
7665
7666
0
        ofpbuf_uninit(&actions);
7667
0
        ofpbuf_uninit(&put_actions);
7668
0
        fat_rwlock_unlock(&dp->upcall_rwlock);
7669
0
    } else if (OVS_UNLIKELY(any_miss)) {
7670
0
        DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7671
0
            if (OVS_UNLIKELY(!rules[i])) {
7672
0
                dp_packet_delete(packet);
7673
0
                COVERAGE_INC(datapath_drop_lock_error);
7674
0
                upcall_fail_cnt++;
7675
0
            }
7676
0
        }
7677
0
    }
7678
7679
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7680
0
        struct dp_netdev_flow *flow;
7681
        /* Get the original order of this packet in received batch. */
7682
0
        int recv_idx = index_map[i];
7683
0
        uint16_t tcp_flags;
7684
7685
0
        if (OVS_UNLIKELY(!rules[i])) {
7686
0
            continue;
7687
0
        }
7688
7689
0
        flow = dp_netdev_flow_cast(rules[i]);
7690
0
        uint32_t hash =  dp_netdev_flow_hash(&flow->ufid);
7691
0
        smc_insert(pmd, keys[i], hash);
7692
7693
0
        emc_probabilistic_insert(pmd, keys[i], flow);
7694
        /* Add these packets into the flow map in the same order
7695
         * as received.
7696
         */
7697
0
        tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf);
7698
0
        packet_enqueue_to_flow_map(packet, flow, tcp_flags,
7699
0
                                   flow_map, recv_idx);
7700
0
    }
7701
7702
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
7703
0
                            cnt - upcall_ok_cnt - upcall_fail_cnt);
7704
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
7705
0
                            lookup_cnt);
7706
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
7707
0
                            upcall_ok_cnt);
7708
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
7709
0
                            upcall_fail_cnt);
7710
0
}
7711
7712
/* Packets enter the datapath from a port (or from recirculation) here.
7713
 *
7714
 * When 'md_is_valid' is true the metadata in 'packets' are already valid.
7715
 * When false the metadata in 'packets' need to be initialized. */
7716
static void
7717
dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
7718
                  struct dp_packet_batch *packets,
7719
                  bool md_is_valid, odp_port_t port_no)
7720
0
{
7721
0
#ifndef __CHECKER__
7722
0
    const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);
7723
#else
7724
    /* Sparse doesn't like variable length array. */
7725
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
7726
#endif
7727
0
    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
7728
0
        struct netdev_flow_key keys[PKT_ARRAY_SIZE];
7729
0
    struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE];
7730
0
    struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
7731
0
    size_t n_batches;
7732
0
    struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE];
7733
0
    uint8_t index_map[PKT_ARRAY_SIZE];
7734
0
    size_t n_flows, i;
7735
7736
0
    odp_port_t in_port;
7737
7738
0
    n_batches = 0;
7739
0
    dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches,
7740
0
                   flow_map, &n_flows, index_map, md_is_valid, port_no);
7741
7742
0
    if (!dp_packet_batch_is_empty(packets)) {
7743
        /* Get ingress port from first packet's metadata. */
7744
0
        in_port = packets->packets[0]->md.in_port.odp_port;
7745
0
        fast_path_processing(pmd, packets, missed_keys,
7746
0
                             flow_map, index_map, in_port);
7747
0
    }
7748
7749
    /* Batch rest of packets which are in flow map. */
7750
0
    for (i = 0; i < n_flows; i++) {
7751
0
        struct dp_packet_flow_map *map = &flow_map[i];
7752
7753
0
        if (OVS_UNLIKELY(!map->flow)) {
7754
0
            continue;
7755
0
        }
7756
0
        dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags,
7757
0
                                batches, &n_batches);
7758
0
     }
7759
7760
    /* All the flow batches need to be reset before any call to
7761
     * packet_batch_per_flow_execute() as it could potentially trigger
7762
     * recirculation. When a packet matching flow 'j' happens to be
7763
     * recirculated, the nested call to dp_netdev_input__() could potentially
7764
     * classify the packet as matching another flow - say 'k'. It could happen
7765
     * that in the previous call to dp_netdev_input__() that same flow 'k' had
7766
     * already its own batches[k] still waiting to be served.  So if its
7767
     * 'batch' member is not reset, the recirculated packet would be wrongly
7768
     * appended to batches[k] of the 1st call to dp_netdev_input__(). */
7769
0
    for (i = 0; i < n_batches; i++) {
7770
0
        batches[i].flow->batch = NULL;
7771
0
    }
7772
7773
0
    for (i = 0; i < n_batches; i++) {
7774
0
        packet_batch_per_flow_execute(&batches[i], pmd);
7775
0
    }
7776
0
}
7777
7778
static void
7779
dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
7780
                struct dp_packet_batch *packets,
7781
                odp_port_t port_no)
7782
0
{
7783
0
    dp_netdev_input__(pmd, packets, false, port_no);
7784
0
}
7785
7786
/* Sends 'packets' through dp_netdev_input__() again, with 'md_is_valid' set
 * to true (the existing packet metadata is kept) and 0 as the port number
 * argument. */
static void
dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
                      struct dp_packet_batch *packets)
{
    const bool md_is_valid = true;

    dp_netdev_input__(pmd, packets, md_is_valid, 0);
}
7792
7793
/* Context handed to dp_execute_cb() through the opaque 'aux' pointer of
 * odp_execute_actions().  The member order matters to positional
 * initializers, so do not reorder. */
struct dp_netdev_execute_aux {
    struct dp_netdev_pmd_thread *pmd;   /* Thread executing the actions. */
    const struct flow *flow;            /* Flow the actions belong to. */
};
7797
7798
static void
7799
dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
7800
                                 void *aux)
7801
0
{
7802
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
7803
0
    dp->dp_purge_aux = aux;
7804
0
    dp->dp_purge_cb = cb;
7805
0
}
7806
7807
static void
7808
dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
7809
                               void *aux)
7810
0
{
7811
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
7812
0
    dp->upcall_aux = aux;
7813
0
    dp->upcall_cb = cb;
7814
0
}
7815
7816
static void
7817
dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
7818
                               bool purge)
7819
0
{
7820
0
    struct tx_port *tx;
7821
0
    struct dp_netdev_port *port;
7822
0
    long long interval;
7823
7824
0
    HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
7825
0
        if (tx->port->txq_mode != TXQ_MODE_XPS) {
7826
0
            continue;
7827
0
        }
7828
0
        interval = pmd->ctx.now - tx->last_used;
7829
0
        if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
7830
0
            port = tx->port;
7831
0
            ovs_mutex_lock(&port->txq_used_mutex);
7832
0
            port->txq_used[tx->qid]--;
7833
0
            ovs_mutex_unlock(&port->txq_used_mutex);
7834
0
            tx->qid = -1;
7835
0
        }
7836
0
    }
7837
0
}
7838
7839
static int
7840
dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
7841
                           struct tx_port *tx)
7842
0
{
7843
0
    struct dp_netdev_port *port;
7844
0
    long long interval;
7845
0
    int i, min_cnt, min_qid;
7846
7847
0
    interval = pmd->ctx.now - tx->last_used;
7848
0
    tx->last_used = pmd->ctx.now;
7849
7850
0
    if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
7851
0
        return tx->qid;
7852
0
    }
7853
7854
0
    port = tx->port;
7855
7856
0
    ovs_mutex_lock(&port->txq_used_mutex);
7857
0
    if (tx->qid >= 0) {
7858
0
        port->txq_used[tx->qid]--;
7859
0
        tx->qid = -1;
7860
0
    }
7861
7862
0
    min_cnt = -1;
7863
0
    min_qid = 0;
7864
0
    for (i = 0; i < netdev_n_txq(port->netdev); i++) {
7865
0
        if (port->txq_used[i] < min_cnt || min_cnt == -1) {
7866
0
            min_cnt = port->txq_used[i];
7867
0
            min_qid = i;
7868
0
        }
7869
0
    }
7870
7871
0
    port->txq_used[min_qid]++;
7872
0
    tx->qid = min_qid;
7873
7874
0
    ovs_mutex_unlock(&port->txq_used_mutex);
7875
7876
0
    dpif_netdev_xps_revalidate_pmd(pmd, false);
7877
7878
0
    VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
7879
0
             pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
7880
0
    return min_qid;
7881
0
}
7882
7883
static struct tx_port *
7884
pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
7885
                          odp_port_t port_no)
7886
0
{
7887
0
    return tx_port_lookup(&pmd->tnl_port_cache, port_no);
7888
0
}
7889
7890
static struct tx_port *
7891
pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
7892
                           odp_port_t port_no)
7893
0
{
7894
0
    return tx_port_lookup(&pmd->send_port_cache, port_no);
7895
0
}
7896
7897
static int
7898
push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
7899
                const struct nlattr *attr,
7900
                struct dp_packet_batch *batch)
7901
0
{
7902
0
    struct tx_port *tun_port;
7903
0
    const struct ovs_action_push_tnl *data;
7904
0
    int err;
7905
7906
0
    data = nl_attr_get(attr);
7907
7908
0
    tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port);
7909
0
    if (!tun_port) {
7910
0
        err = -EINVAL;
7911
0
        goto error;
7912
0
    }
7913
0
    err = netdev_push_header(tun_port->port->netdev, batch, data);
7914
0
    if (!err) {
7915
0
        return 0;
7916
0
    }
7917
0
error:
7918
0
    dp_packet_delete_batch(batch, true);
7919
0
    return err;
7920
0
}
7921
7922
static void
7923
dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
7924
                            struct dp_packet *packet, bool should_steal,
7925
                            struct flow *flow, ovs_u128 *ufid,
7926
                            struct ofpbuf *actions,
7927
                            const struct nlattr *userdata)
7928
0
{
7929
0
    struct dp_packet_batch b;
7930
0
    int error;
7931
7932
0
    ofpbuf_clear(actions);
7933
7934
0
    error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
7935
0
                             DPIF_UC_ACTION, userdata, actions,
7936
0
                             NULL);
7937
0
    if (!error || error == ENOSPC) {
7938
0
        dp_packet_batch_init_packet(&b, packet);
7939
0
        dp_netdev_execute_actions(pmd, &b, should_steal, flow,
7940
0
                                  actions->data, actions->size);
7941
0
    } else if (should_steal) {
7942
0
        dp_packet_delete(packet);
7943
0
        COVERAGE_INC(datapath_drop_userspace_action_error);
7944
0
    }
7945
0
}
7946
7947
static bool
7948
dp_execute_output_action(struct dp_netdev_pmd_thread *pmd,
7949
                         struct dp_packet_batch *packets_,
7950
                         bool should_steal, odp_port_t port_no)
7951
0
{
7952
0
    struct tx_port *p = pmd_send_port_cache_lookup(pmd, port_no);
7953
0
    struct dp_packet_batch out;
7954
7955
0
    if (!OVS_LIKELY(p)) {
7956
0
        COVERAGE_ADD(datapath_drop_invalid_port,
7957
0
                     dp_packet_batch_size(packets_));
7958
0
        dp_packet_delete_batch(packets_, should_steal);
7959
0
        return false;
7960
0
    }
7961
0
    if (!should_steal) {
7962
0
        dp_packet_batch_clone(&out, packets_);
7963
0
        dp_packet_batch_reset_cutlen(packets_);
7964
0
        packets_ = &out;
7965
0
    }
7966
0
    dp_packet_batch_apply_cutlen(packets_);
7967
0
    if (dp_packet_batch_size(&p->output_pkts)
7968
0
        + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
7969
        /* Flush here to avoid overflow. */
7970
0
        dp_netdev_pmd_flush_output_on_port(pmd, p);
7971
0
    }
7972
0
    if (dp_packet_batch_is_empty(&p->output_pkts)) {
7973
0
        pmd->n_output_batches++;
7974
0
    }
7975
7976
0
    struct dp_packet *packet;
7977
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
7978
0
        p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
7979
0
            pmd->ctx.last_rxq;
7980
0
        dp_packet_batch_add(&p->output_pkts, packet);
7981
0
    }
7982
0
    return true;
7983
0
}
7984
7985
static void
7986
dp_execute_lb_output_action(struct dp_netdev_pmd_thread *pmd,
7987
                            struct dp_packet_batch *packets_,
7988
                            bool should_steal, uint32_t bond)
7989
0
{
7990
0
    struct tx_bond *p_bond = tx_bond_lookup(&pmd->tx_bonds, bond);
7991
0
    struct dp_packet_batch out;
7992
0
    struct dp_packet *packet;
7993
7994
0
    if (!p_bond) {
7995
0
        COVERAGE_ADD(datapath_drop_invalid_bond,
7996
0
                     dp_packet_batch_size(packets_));
7997
0
        dp_packet_delete_batch(packets_, should_steal);
7998
0
        return;
7999
0
    }
8000
0
    if (!should_steal) {
8001
0
        dp_packet_batch_clone(&out, packets_);
8002
0
        dp_packet_batch_reset_cutlen(packets_);
8003
0
        packets_ = &out;
8004
0
    }
8005
0
    dp_packet_batch_apply_cutlen(packets_);
8006
8007
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8008
        /*
8009
         * Lookup the bond-hash table using hash to get the member.
8010
         */
8011
0
        uint32_t hash = dp_packet_get_rss_hash(packet);
8012
0
        struct member_entry *s_entry
8013
0
            = &p_bond->member_buckets[hash & BOND_MASK];
8014
0
        odp_port_t bond_member = s_entry->member_id;
8015
0
        uint32_t size = dp_packet_size(packet);
8016
0
        struct dp_packet_batch output_pkt;
8017
8018
0
        dp_packet_batch_init_packet(&output_pkt, packet);
8019
0
        if (OVS_LIKELY(dp_execute_output_action(pmd, &output_pkt, true,
8020
0
                                                bond_member))) {
8021
            /* Update member stats. */
8022
0
            non_atomic_ullong_add(&s_entry->n_packets, 1);
8023
0
            non_atomic_ullong_add(&s_entry->n_bytes, size);
8024
0
        }
8025
0
    }
8026
0
}
8027
8028
static void
8029
dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
8030
              const struct nlattr *a, bool should_steal)
8031
    OVS_NO_THREAD_SAFETY_ANALYSIS
8032
0
{
8033
0
    struct dp_netdev_execute_aux *aux = aux_;
8034
0
    uint32_t *depth = recirc_depth_get();
8035
0
    struct dp_netdev_pmd_thread *pmd = aux->pmd;
8036
0
    struct dp_netdev *dp = pmd->dp;
8037
0
    int type = nl_attr_type(a);
8038
0
    struct tx_port *p;
8039
0
    uint32_t packet_count, packets_dropped;
8040
8041
0
    switch ((enum ovs_action_attr)type) {
8042
0
    case OVS_ACTION_ATTR_OUTPUT:
8043
0
        dp_execute_output_action(pmd, packets_, should_steal,
8044
0
                                 nl_attr_get_odp_port(a));
8045
0
        return;
8046
8047
0
    case OVS_ACTION_ATTR_LB_OUTPUT:
8048
0
        dp_execute_lb_output_action(pmd, packets_, should_steal,
8049
0
                                    nl_attr_get_u32(a));
8050
0
        return;
8051
8052
0
    case OVS_ACTION_ATTR_TUNNEL_PUSH:
8053
0
        if (should_steal) {
8054
            /* We're requested to push tunnel header, but also we need to take
8055
             * the ownership of these packets. Thus, we can avoid performing
8056
             * the action, because the caller will not use the result anyway.
8057
             * Just break to free the batch. */
8058
0
            break;
8059
0
        }
8060
0
        dp_packet_batch_apply_cutlen(packets_);
8061
0
        packet_count = dp_packet_batch_size(packets_);
8062
0
        if (push_tnl_action(pmd, a, packets_)) {
8063
0
            COVERAGE_ADD(datapath_drop_tunnel_push_error,
8064
0
                         packet_count);
8065
0
        }
8066
0
        return;
8067
8068
0
    case OVS_ACTION_ATTR_TUNNEL_POP:
8069
0
        if (*depth < MAX_RECIRC_DEPTH) {
8070
0
            struct dp_packet_batch *orig_packets_ = packets_;
8071
0
            odp_port_t portno = nl_attr_get_odp_port(a);
8072
8073
0
            p = pmd_tnl_port_cache_lookup(pmd, portno);
8074
0
            if (p) {
8075
0
                struct dp_packet_batch tnl_pkt;
8076
8077
0
                if (!should_steal) {
8078
0
                    dp_packet_batch_clone(&tnl_pkt, packets_);
8079
0
                    packets_ = &tnl_pkt;
8080
0
                    dp_packet_batch_reset_cutlen(orig_packets_);
8081
0
                }
8082
8083
0
                dp_packet_batch_apply_cutlen(packets_);
8084
8085
0
                packet_count = dp_packet_batch_size(packets_);
8086
0
                netdev_pop_header(p->port->netdev, packets_);
8087
0
                packets_dropped =
8088
0
                   packet_count - dp_packet_batch_size(packets_);
8089
0
                if (packets_dropped) {
8090
0
                    COVERAGE_ADD(datapath_drop_tunnel_pop_error,
8091
0
                                 packets_dropped);
8092
0
                }
8093
0
                if (dp_packet_batch_is_empty(packets_)) {
8094
0
                    return;
8095
0
                }
8096
8097
0
                struct dp_packet *packet;
8098
0
                DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8099
0
                    packet->md.in_port.odp_port = portno;
8100
0
                }
8101
8102
0
                (*depth)++;
8103
0
                dp_netdev_recirculate(pmd, packets_);
8104
0
                (*depth)--;
8105
0
                return;
8106
0
            }
8107
0
            COVERAGE_ADD(datapath_drop_invalid_tnl_port,
8108
0
                         dp_packet_batch_size(packets_));
8109
0
        } else {
8110
0
            COVERAGE_ADD(datapath_drop_recirc_error,
8111
0
                         dp_packet_batch_size(packets_));
8112
0
        }
8113
0
        break;
8114
8115
0
    case OVS_ACTION_ATTR_USERSPACE:
8116
0
        if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
8117
0
            struct dp_packet_batch *orig_packets_ = packets_;
8118
0
            const struct nlattr *userdata;
8119
0
            struct dp_packet_batch usr_pkt;
8120
0
            struct ofpbuf actions;
8121
0
            struct flow flow;
8122
0
            ovs_u128 ufid;
8123
0
            bool clone = false;
8124
8125
0
            userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
8126
0
            ofpbuf_init(&actions, 0);
8127
8128
0
            if (packets_->trunc) {
8129
0
                if (!should_steal) {
8130
0
                    dp_packet_batch_clone(&usr_pkt, packets_);
8131
0
                    packets_ = &usr_pkt;
8132
0
                    clone = true;
8133
0
                    dp_packet_batch_reset_cutlen(orig_packets_);
8134
0
                }
8135
8136
0
                dp_packet_batch_apply_cutlen(packets_);
8137
0
            }
8138
8139
0
            struct dp_packet *packet;
8140
0
            DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8141
0
                flow_extract(packet, &flow);
8142
0
                odp_flow_key_hash(&flow, sizeof flow, &ufid);
8143
0
                dp_execute_userspace_action(pmd, packet, should_steal, &flow,
8144
0
                                            &ufid, &actions, userdata);
8145
0
            }
8146
8147
0
            if (clone) {
8148
0
                dp_packet_delete_batch(packets_, true);
8149
0
            }
8150
8151
0
            ofpbuf_uninit(&actions);
8152
0
            fat_rwlock_unlock(&dp->upcall_rwlock);
8153
8154
0
            return;
8155
0
        }
8156
0
        COVERAGE_ADD(datapath_drop_lock_error,
8157
0
                     dp_packet_batch_size(packets_));
8158
0
        break;
8159
8160
0
    case OVS_ACTION_ATTR_RECIRC:
8161
0
        if (*depth < MAX_RECIRC_DEPTH) {
8162
0
            struct dp_packet_batch recirc_pkts;
8163
8164
0
            if (!should_steal) {
8165
0
               dp_packet_batch_clone(&recirc_pkts, packets_);
8166
0
               packets_ = &recirc_pkts;
8167
0
            }
8168
8169
0
            struct dp_packet *packet;
8170
0
            DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8171
0
                packet->md.recirc_id = nl_attr_get_u32(a);
8172
0
            }
8173
8174
0
            (*depth)++;
8175
0
            dp_netdev_recirculate(pmd, packets_);
8176
0
            (*depth)--;
8177
8178
0
            return;
8179
0
        }
8180
8181
0
        COVERAGE_ADD(datapath_drop_recirc_error,
8182
0
                     dp_packet_batch_size(packets_));
8183
0
        VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
8184
0
        break;
8185
8186
0
    case OVS_ACTION_ATTR_CT: {
8187
0
        const struct nlattr *b;
8188
0
        bool force = false;
8189
0
        bool commit = false;
8190
0
        unsigned int left;
8191
0
        uint16_t zone = 0;
8192
0
        uint32_t tp_id = 0;
8193
0
        const char *helper = NULL;
8194
0
        const uint32_t *setmark = NULL;
8195
0
        const struct ovs_key_ct_labels *setlabel = NULL;
8196
0
        struct nat_action_info_t nat_action_info;
8197
0
        struct nat_action_info_t *nat_action_info_ref = NULL;
8198
0
        bool nat_config = false;
8199
8200
0
        NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
8201
0
                                 nl_attr_get_size(a)) {
8202
0
            enum ovs_ct_attr sub_type = nl_attr_type(b);
8203
8204
0
            switch(sub_type) {
8205
0
            case OVS_CT_ATTR_FORCE_COMMIT:
8206
0
                force = true;
8207
                /* fall through. */
8208
0
            case OVS_CT_ATTR_COMMIT:
8209
0
                commit = true;
8210
0
                break;
8211
0
            case OVS_CT_ATTR_ZONE:
8212
0
                zone = nl_attr_get_u16(b);
8213
0
                break;
8214
0
            case OVS_CT_ATTR_HELPER:
8215
0
                helper = nl_attr_get_string(b);
8216
0
                break;
8217
0
            case OVS_CT_ATTR_MARK:
8218
0
                setmark = nl_attr_get(b);
8219
0
                break;
8220
0
            case OVS_CT_ATTR_LABELS:
8221
0
                setlabel = nl_attr_get(b);
8222
0
                break;
8223
0
            case OVS_CT_ATTR_EVENTMASK:
8224
                /* Silently ignored, as userspace datapath does not generate
8225
                 * netlink events. */
8226
0
                break;
8227
0
            case OVS_CT_ATTR_TIMEOUT:
8228
0
                if (!str_to_uint(nl_attr_get_string(b), 10, &tp_id)) {
8229
0
                    VLOG_WARN("Invalid Timeout Policy ID: %s.",
8230
0
                              nl_attr_get_string(b));
8231
0
                    tp_id = DEFAULT_TP_ID;
8232
0
                }
8233
0
                break;
8234
0
            case OVS_CT_ATTR_NAT: {
8235
0
                const struct nlattr *b_nest;
8236
0
                unsigned int left_nest;
8237
0
                bool ip_min_specified = false;
8238
0
                bool proto_num_min_specified = false;
8239
0
                bool ip_max_specified = false;
8240
0
                bool proto_num_max_specified = false;
8241
0
                memset(&nat_action_info, 0, sizeof nat_action_info);
8242
0
                nat_action_info_ref = &nat_action_info;
8243
8244
0
                NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
8245
0
                    enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
8246
8247
0
                    switch (sub_type_nest) {
8248
0
                    case OVS_NAT_ATTR_SRC:
8249
0
                    case OVS_NAT_ATTR_DST:
8250
0
                        nat_config = true;
8251
0
                        nat_action_info.nat_action |=
8252
0
                            ((sub_type_nest == OVS_NAT_ATTR_SRC)
8253
0
                                ? NAT_ACTION_SRC : NAT_ACTION_DST);
8254
0
                        break;
8255
0
                    case OVS_NAT_ATTR_IP_MIN:
8256
0
                        memcpy(&nat_action_info.min_addr,
8257
0
                               nl_attr_get(b_nest),
8258
0
                               nl_attr_get_size(b_nest));
8259
0
                        ip_min_specified = true;
8260
0
                        break;
8261
0
                    case OVS_NAT_ATTR_IP_MAX:
8262
0
                        memcpy(&nat_action_info.max_addr,
8263
0
                               nl_attr_get(b_nest),
8264
0
                               nl_attr_get_size(b_nest));
8265
0
                        ip_max_specified = true;
8266
0
                        break;
8267
0
                    case OVS_NAT_ATTR_PROTO_MIN:
8268
0
                        nat_action_info.min_port =
8269
0
                            nl_attr_get_u16(b_nest);
8270
0
                        proto_num_min_specified = true;
8271
0
                        break;
8272
0
                    case OVS_NAT_ATTR_PROTO_MAX:
8273
0
                        nat_action_info.max_port =
8274
0
                            nl_attr_get_u16(b_nest);
8275
0
                        proto_num_max_specified = true;
8276
0
                        break;
8277
0
                    case OVS_NAT_ATTR_PROTO_RANDOM:
8278
0
                        nat_action_info.nat_flags |= NAT_RANGE_RANDOM;
8279
0
                        break;
8280
0
                    case OVS_NAT_ATTR_PERSISTENT:
8281
0
                        nat_action_info.nat_flags |= NAT_PERSISTENT;
8282
0
                        break;
8283
0
                    case OVS_NAT_ATTR_PROTO_HASH:
8284
0
                        break;
8285
0
                    case OVS_NAT_ATTR_UNSPEC:
8286
0
                    case __OVS_NAT_ATTR_MAX:
8287
0
                        OVS_NOT_REACHED();
8288
0
                    }
8289
0
                }
8290
8291
0
                if (ip_min_specified && !ip_max_specified) {
8292
0
                    nat_action_info.max_addr = nat_action_info.min_addr;
8293
0
                }
8294
0
                if (proto_num_min_specified && !proto_num_max_specified) {
8295
0
                    nat_action_info.max_port = nat_action_info.min_port;
8296
0
                }
8297
0
                if (proto_num_min_specified || proto_num_max_specified) {
8298
0
                    if (nat_action_info.nat_action & NAT_ACTION_SRC) {
8299
0
                        nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
8300
0
                    } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
8301
0
                        nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
8302
0
                    }
8303
0
                }
8304
0
                break;
8305
0
            }
8306
0
            case OVS_CT_ATTR_UNSPEC:
8307
0
            case __OVS_CT_ATTR_MAX:
8308
0
                OVS_NOT_REACHED();
8309
0
            }
8310
0
        }
8311
8312
        /* We won't be able to function properly in this case, hence
8313
         * complain loudly. */
8314
0
        if (nat_config && !commit) {
8315
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
8316
0
            VLOG_WARN_RL(&rl, "NAT specified without commit.");
8317
0
        }
8318
8319
0
        conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force,
8320
0
                          commit, zone, setmark, setlabel, helper,
8321
0
                          nat_action_info_ref, pmd->ctx.now / 1000, tp_id);
8322
0
        break;
8323
0
    }
8324
8325
0
    case OVS_ACTION_ATTR_METER:
8326
0
        dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a),
8327
0
                            pmd->ctx.now / 1000);
8328
0
        break;
8329
8330
0
    case OVS_ACTION_ATTR_PUSH_VLAN:
8331
0
    case OVS_ACTION_ATTR_POP_VLAN:
8332
0
    case OVS_ACTION_ATTR_PUSH_MPLS:
8333
0
    case OVS_ACTION_ATTR_POP_MPLS:
8334
0
    case OVS_ACTION_ATTR_SET:
8335
0
    case OVS_ACTION_ATTR_SET_MASKED:
8336
0
    case OVS_ACTION_ATTR_SAMPLE:
8337
0
    case OVS_ACTION_ATTR_HASH:
8338
0
    case OVS_ACTION_ATTR_UNSPEC:
8339
0
    case OVS_ACTION_ATTR_TRUNC:
8340
0
    case OVS_ACTION_ATTR_PUSH_ETH:
8341
0
    case OVS_ACTION_ATTR_POP_ETH:
8342
0
    case OVS_ACTION_ATTR_CLONE:
8343
0
    case OVS_ACTION_ATTR_PUSH_NSH:
8344
0
    case OVS_ACTION_ATTR_POP_NSH:
8345
0
    case OVS_ACTION_ATTR_CT_CLEAR:
8346
0
    case OVS_ACTION_ATTR_CHECK_PKT_LEN:
8347
0
    case OVS_ACTION_ATTR_DROP:
8348
0
    case OVS_ACTION_ATTR_ADD_MPLS:
8349
0
    case OVS_ACTION_ATTR_DEC_TTL:
8350
0
    case OVS_ACTION_ATTR_PSAMPLE:
8351
0
    case __OVS_ACTION_ATTR_MAX:
8352
0
        OVS_NOT_REACHED();
8353
0
    }
8354
8355
0
    dp_packet_delete_batch(packets_, should_steal);
8356
0
}
8357
8358
static void
8359
dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
8360
                          struct dp_packet_batch *packets,
8361
                          bool should_steal, const struct flow *flow,
8362
                          const struct nlattr *actions, size_t actions_len)
8363
0
{
8364
0
    struct dp_netdev_execute_aux aux = { pmd, flow };
8365
8366
0
    odp_execute_actions(&aux, packets, should_steal, actions,
8367
0
                        actions_len, dp_execute_cb);
8368
0
}
8369
8370
/* State of one in-progress conntrack (or conntrack expectation) dump.
 * Allocated by the *_dump_start() functions and freed by *_dump_done(). */
struct dp_netdev_ct_dump {
    struct ct_dpif_dump_state up;   /* Embedded handle returned to callers;
                                     * the container is recovered from it
                                     * with INIT_CONTAINER(). */
    struct conntrack_dump dump;     /* Iterator state passed to the
                                     * conntrack_*dump_*() functions. */
    struct conntrack *ct;           /* Set to 'dp->conntrack' at start. */
    struct dp_netdev *dp;           /* Datapath the dump was started on. */
};
8376
8377
static int
8378
dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
8379
                          const uint16_t *pzone, int *ptot_bkts)
8380
0
{
8381
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8382
0
    struct dp_netdev_ct_dump *dump;
8383
8384
0
    dump = xzalloc(sizeof *dump);
8385
0
    dump->dp = dp;
8386
0
    dump->ct = dp->conntrack;
8387
8388
0
    conntrack_dump_start(dp->conntrack, &dump->dump, pzone, ptot_bkts);
8389
8390
0
    *dump_ = &dump->up;
8391
8392
0
    return 0;
8393
0
}
8394
8395
static int
8396
dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
8397
                         struct ct_dpif_dump_state *dump_,
8398
                         struct ct_dpif_entry *entry)
8399
0
{
8400
0
    struct dp_netdev_ct_dump *dump;
8401
8402
0
    INIT_CONTAINER(dump, dump_, up);
8403
8404
0
    return conntrack_dump_next(&dump->dump, entry);
8405
0
}
8406
8407
static int
8408
dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
8409
                         struct ct_dpif_dump_state *dump_)
8410
0
{
8411
0
    struct dp_netdev_ct_dump *dump;
8412
0
    int err;
8413
8414
0
    INIT_CONTAINER(dump, dump_, up);
8415
8416
0
    err = conntrack_dump_done(&dump->dump);
8417
8418
0
    free(dump);
8419
8420
0
    return err;
8421
0
}
8422
8423
static int
8424
dpif_netdev_ct_exp_dump_start(struct dpif *dpif,
8425
                              struct ct_dpif_dump_state **dump_,
8426
                              const uint16_t *pzone)
8427
0
{
8428
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8429
0
    struct dp_netdev_ct_dump *dump;
8430
8431
0
    dump = xzalloc(sizeof *dump);
8432
0
    dump->dp = dp;
8433
0
    dump->ct = dp->conntrack;
8434
8435
0
    conntrack_exp_dump_start(dp->conntrack, &dump->dump, pzone);
8436
8437
0
    *dump_ = &dump->up;
8438
8439
0
    return 0;
8440
0
}
8441
8442
static int
8443
dpif_netdev_ct_exp_dump_next(struct dpif *dpif OVS_UNUSED,
8444
                             struct ct_dpif_dump_state *dump_,
8445
                             struct ct_dpif_exp *entry)
8446
0
{
8447
0
    struct dp_netdev_ct_dump *dump;
8448
8449
0
    INIT_CONTAINER(dump, dump_, up);
8450
8451
0
    return conntrack_exp_dump_next(&dump->dump, entry);
8452
0
}
8453
8454
static int
8455
dpif_netdev_ct_exp_dump_done(struct dpif *dpif OVS_UNUSED,
8456
                             struct ct_dpif_dump_state *dump_)
8457
0
{
8458
0
    struct dp_netdev_ct_dump *dump;
8459
0
    int err;
8460
8461
0
    INIT_CONTAINER(dump, dump_, up);
8462
8463
0
    err = conntrack_exp_dump_done(&dump->dump);
8464
8465
0
    free(dump);
8466
8467
0
    return err;
8468
0
}
8469
8470
static int
8471
dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
8472
                     const struct ct_dpif_tuple *tuple)
8473
0
{
8474
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8475
8476
0
    if (tuple) {
8477
0
        return conntrack_flush_tuple(dp->conntrack, tuple, zone ? *zone : 0);
8478
0
    }
8479
0
    return conntrack_flush(dp->conntrack, zone);
8480
0
}
8481
8482
static int
8483
dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns)
8484
0
{
8485
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8486
8487
0
    return conntrack_set_maxconns(dp->conntrack, maxconns);
8488
0
}
8489
8490
static int
8491
dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns)
8492
0
{
8493
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8494
8495
0
    return conntrack_get_maxconns(dp->conntrack, maxconns);
8496
0
}
8497
8498
static int
8499
dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
8500
0
{
8501
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8502
8503
0
    return conntrack_get_nconns(dp->conntrack, nconns);
8504
0
}
8505
8506
static int
8507
dpif_netdev_ct_set_tcp_seq_chk(struct dpif *dpif, bool enabled)
8508
0
{
8509
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8510
8511
0
    return conntrack_set_tcp_seq_chk(dp->conntrack, enabled);
8512
0
}
8513
8514
static int
8515
dpif_netdev_ct_get_tcp_seq_chk(struct dpif *dpif, bool *enabled)
8516
0
{
8517
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8518
0
    *enabled = conntrack_get_tcp_seq_chk(dp->conntrack);
8519
0
    return 0;
8520
0
}
8521
8522
static int
8523
dpif_netdev_ct_set_sweep_interval(struct dpif *dpif, uint32_t ms)
8524
0
{
8525
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8526
0
    return conntrack_set_sweep_interval(dp->conntrack, ms);
8527
0
}
8528
8529
static int
8530
dpif_netdev_ct_get_sweep_interval(struct dpif *dpif, uint32_t *ms)
8531
0
{
8532
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8533
0
    *ms = conntrack_get_sweep_interval(dp->conntrack);
8534
0
    return 0;
8535
0
}
8536
8537
static int
8538
dpif_netdev_ct_set_limits(struct dpif *dpif,
8539
                           const struct ovs_list *zone_limits)
8540
0
{
8541
0
    int err = 0;
8542
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8543
8544
0
    struct ct_dpif_zone_limit *zone_limit;
8545
0
    LIST_FOR_EACH (zone_limit, node, zone_limits) {
8546
0
        err = zone_limit_update(dp->conntrack, zone_limit->zone,
8547
0
                                zone_limit->limit);
8548
0
        if (err != 0) {
8549
0
            break;
8550
0
        }
8551
0
    }
8552
0
    return err;
8553
0
}
8554
8555
static int
8556
dpif_netdev_ct_get_limits(struct dpif *dpif,
8557
                           const struct ovs_list *zone_limits_request,
8558
                           struct ovs_list *zone_limits_reply)
8559
0
{
8560
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8561
0
    struct conntrack_zone_info czl;
8562
8563
0
    if (!ovs_list_is_empty(zone_limits_request)) {
8564
0
        struct ct_dpif_zone_limit *zone_limit;
8565
0
        LIST_FOR_EACH (zone_limit, node, zone_limits_request) {
8566
0
            czl = zone_limit_get(dp->conntrack, zone_limit->zone);
8567
0
            if (czl.zone == zone_limit->zone || czl.zone == DEFAULT_ZONE) {
8568
0
                ct_dpif_push_zone_limit(zone_limits_reply, zone_limit->zone,
8569
0
                                        czl.limit,
8570
0
                                        czl.count);
8571
0
            } else {
8572
0
                return EINVAL;
8573
0
            }
8574
0
        }
8575
0
    } else {
8576
0
        czl = zone_limit_get(dp->conntrack, DEFAULT_ZONE);
8577
0
        if (czl.zone == DEFAULT_ZONE) {
8578
0
            ct_dpif_push_zone_limit(zone_limits_reply, DEFAULT_ZONE,
8579
0
                                    czl.limit, 0);
8580
0
        }
8581
8582
0
        for (int z = MIN_ZONE; z <= MAX_ZONE; z++) {
8583
0
            czl = zone_limit_get(dp->conntrack, z);
8584
0
            if (czl.zone == z) {
8585
0
                ct_dpif_push_zone_limit(zone_limits_reply, z, czl.limit,
8586
0
                                        czl.count);
8587
0
            }
8588
0
        }
8589
0
    }
8590
8591
0
    return 0;
8592
0
}
8593
8594
static int
8595
dpif_netdev_ct_del_limits(struct dpif *dpif,
8596
                           const struct ovs_list *zone_limits)
8597
0
{
8598
0
    int err = 0;
8599
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8600
0
    struct ct_dpif_zone_limit *zone_limit;
8601
0
    LIST_FOR_EACH (zone_limit, node, zone_limits) {
8602
0
        err = zone_limit_delete(dp->conntrack, zone_limit->zone);
8603
0
        if (err != 0) {
8604
0
            break;
8605
0
        }
8606
0
    }
8607
8608
0
    return err;
8609
0
}
8610
8611
static int
8612
dpif_netdev_ct_get_features(struct dpif *dpif OVS_UNUSED,
8613
                            enum ct_features *features)
8614
0
{
8615
0
    if (features != NULL) {
8616
0
        *features = CONNTRACK_F_ZERO_SNAT;
8617
0
    }
8618
0
    return 0;
8619
0
}
8620
8621
static int
8622
dpif_netdev_ct_set_timeout_policy(struct dpif *dpif,
8623
                                  const struct ct_dpif_timeout_policy *dpif_tp)
8624
0
{
8625
0
    struct timeout_policy tp;
8626
0
    struct dp_netdev *dp;
8627
8628
0
    dp = get_dp_netdev(dpif);
8629
0
    memcpy(&tp.policy, dpif_tp, sizeof tp.policy);
8630
0
    return timeout_policy_update(dp->conntrack, &tp);
8631
0
}
8632
8633
static int
8634
dpif_netdev_ct_get_timeout_policy(struct dpif *dpif, uint32_t tp_id,
8635
                                  struct ct_dpif_timeout_policy *dpif_tp)
8636
0
{
8637
0
    struct timeout_policy *tp;
8638
0
    struct dp_netdev *dp;
8639
0
    int err = 0;
8640
8641
0
    dp = get_dp_netdev(dpif);
8642
0
    tp = timeout_policy_get(dp->conntrack, tp_id);
8643
0
    if (!tp) {
8644
0
        return ENOENT;
8645
0
    }
8646
0
    memcpy(dpif_tp, &tp->policy, sizeof tp->policy);
8647
0
    return err;
8648
0
}
8649
8650
static int
8651
dpif_netdev_ct_del_timeout_policy(struct dpif *dpif,
8652
                                  uint32_t tp_id)
8653
0
{
8654
0
    struct dp_netdev *dp;
8655
0
    int err = 0;
8656
8657
0
    dp = get_dp_netdev(dpif);
8658
0
    err = timeout_policy_delete(dp->conntrack, tp_id);
8659
0
    return err;
8660
0
}
8661
8662
static int
8663
dpif_netdev_ct_get_timeout_policy_name(struct dpif *dpif OVS_UNUSED,
8664
                                       uint32_t tp_id,
8665
                                       uint16_t dl_type OVS_UNUSED,
8666
                                       uint8_t nw_proto OVS_UNUSED,
8667
                                       char **tp_name, bool *is_generic)
8668
0
{
8669
0
    struct ds ds = DS_EMPTY_INITIALIZER;
8670
8671
0
    ds_put_format(&ds, "%"PRIu32, tp_id);
8672
0
    *tp_name = ds_steal_cstr(&ds);
8673
0
    *is_generic = true;
8674
0
    return 0;
8675
0
}
8676
8677
static int
8678
dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable)
8679
0
{
8680
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8681
0
    return ipf_set_enabled(conntrack_ipf_ctx(dp->conntrack), v6, enable);
8682
0
}
8683
8684
static int
8685
dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag)
8686
0
{
8687
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8688
0
    return ipf_set_min_frag(conntrack_ipf_ctx(dp->conntrack), v6, min_frag);
8689
0
}
8690
8691
static int
8692
dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags)
8693
0
{
8694
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8695
0
    return ipf_set_max_nfrags(conntrack_ipf_ctx(dp->conntrack), max_frags);
8696
0
}
8697
8698
/* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to
8699
 * diverge. */
8700
static int
8701
dpif_netdev_ipf_get_status(struct dpif *dpif,
8702
                           struct dpif_ipf_status *dpif_ipf_status)
8703
0
{
8704
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8705
0
    ipf_get_status(conntrack_ipf_ctx(dp->conntrack),
8706
0
                   (struct ipf_status *) dpif_ipf_status);
8707
0
    return 0;
8708
0
}
8709
8710
static int
8711
dpif_netdev_ipf_dump_start(struct dpif *dpif OVS_UNUSED,
8712
                           struct ipf_dump_ctx **ipf_dump_ctx)
8713
0
{
8714
0
    return ipf_dump_start(ipf_dump_ctx);
8715
0
}
8716
8717
static int
8718
dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump)
8719
0
{
8720
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8721
0
    return ipf_dump_next(conntrack_ipf_ctx(dp->conntrack), ipf_dump_ctx,
8722
0
                         dump);
8723
0
}
8724
8725
static int
8726
dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx)
8727
0
{
8728
0
    return ipf_dump_done(ipf_dump_ctx);
8729
8730
0
}
8731
8732
static int
8733
dpif_netdev_bond_add(struct dpif *dpif, uint32_t bond_id,
8734
                     odp_port_t *member_map)
8735
0
{
8736
0
    struct tx_bond *new_tx = xzalloc(sizeof *new_tx);
8737
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8738
0
    struct dp_netdev_pmd_thread *pmd;
8739
8740
    /* Prepare new bond mapping. */
8741
0
    new_tx->bond_id = bond_id;
8742
0
    for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
8743
0
        new_tx->member_buckets[bucket].member_id = member_map[bucket];
8744
0
    }
8745
8746
0
    ovs_mutex_lock(&dp->bond_mutex);
8747
    /* Check if bond already existed. */
8748
0
    struct tx_bond *old_tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
8749
0
    if (old_tx) {
8750
0
        cmap_replace(&dp->tx_bonds, &old_tx->node, &new_tx->node,
8751
0
                     hash_bond_id(bond_id));
8752
0
        ovsrcu_postpone(free, old_tx);
8753
0
    } else {
8754
0
        cmap_insert(&dp->tx_bonds, &new_tx->node, hash_bond_id(bond_id));
8755
0
    }
8756
0
    ovs_mutex_unlock(&dp->bond_mutex);
8757
8758
    /* Update all PMDs with new bond mapping. */
8759
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
8760
0
        dp_netdev_add_bond_tx_to_pmd(pmd, new_tx, true);
8761
0
    }
8762
0
    return 0;
8763
0
}
8764
8765
static int
8766
dpif_netdev_bond_del(struct dpif *dpif, uint32_t bond_id)
8767
0
{
8768
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8769
0
    struct dp_netdev_pmd_thread *pmd;
8770
0
    struct tx_bond *tx;
8771
8772
0
    ovs_mutex_lock(&dp->bond_mutex);
8773
    /* Check if bond existed. */
8774
0
    tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
8775
0
    if (tx) {
8776
0
        cmap_remove(&dp->tx_bonds, &tx->node, hash_bond_id(bond_id));
8777
0
        ovsrcu_postpone(free, tx);
8778
0
    } else {
8779
        /* Bond is not present. */
8780
0
        ovs_mutex_unlock(&dp->bond_mutex);
8781
0
        return ENOENT;
8782
0
    }
8783
0
    ovs_mutex_unlock(&dp->bond_mutex);
8784
8785
    /* Remove the bond map in all pmds. */
8786
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
8787
0
        dp_netdev_del_bond_tx_from_pmd(pmd, bond_id);
8788
0
    }
8789
0
    return 0;
8790
0
}
8791
8792
static int
8793
dpif_netdev_bond_stats_get(struct dpif *dpif, uint32_t bond_id,
8794
                           uint64_t *n_bytes)
8795
0
{
8796
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8797
0
    struct dp_netdev_pmd_thread *pmd;
8798
8799
0
    if (!tx_bond_lookup(&dp->tx_bonds, bond_id)) {
8800
0
        return ENOENT;
8801
0
    }
8802
8803
    /* Search the bond in all PMDs. */
8804
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
8805
0
        struct tx_bond *pmd_bond_entry
8806
0
            = tx_bond_lookup(&pmd->tx_bonds, bond_id);
8807
8808
0
        if (!pmd_bond_entry) {
8809
0
            continue;
8810
0
        }
8811
8812
        /* Read bond stats. */
8813
0
        for (int i = 0; i < BOND_BUCKETS; i++) {
8814
0
            uint64_t pmd_n_bytes;
8815
8816
0
            atomic_read_relaxed(&pmd_bond_entry->member_buckets[i].n_bytes,
8817
0
                                &pmd_n_bytes);
8818
0
            n_bytes[i] += pmd_n_bytes;
8819
0
        }
8820
0
    }
8821
0
    return 0;
8822
0
}
8823
8824
/* Provider table for the "netdev" datapath.
 *
 * This is a positional initializer, so the order of the entries has to
 * follow the member order of 'struct dpif_class'.  A NULL entry marks a
 * callback this provider leaves unset; the trailing comment names it.
 * dpif_dummy_register__() copies this table and overrides its 'type' to
 * create the dummy datapath classes. */
const struct dpif_class dpif_netdev_class = {
    "netdev",
    true,                       /* cleanup_required */
    dpif_netdev_init,
    dpif_netdev_enumerate,
    dpif_netdev_port_open_type,
    dpif_netdev_open,
    dpif_netdev_close,
    dpif_netdev_destroy,
    dpif_netdev_run,
    dpif_netdev_wait,
    dpif_netdev_get_stats,
    NULL,                      /* set_features */
    NULL,                      /* get_features */
    dpif_netdev_port_add,
    dpif_netdev_port_del,
    dpif_netdev_port_set_config,
    dpif_netdev_port_query_by_number,
    dpif_netdev_port_query_by_name,
    NULL,                       /* port_get_pid */
    dpif_netdev_port_dump_start,
    dpif_netdev_port_dump_next,
    dpif_netdev_port_dump_done,
    dpif_netdev_port_poll,
    dpif_netdev_port_poll_wait,
    dpif_netdev_flow_flush,
    dpif_netdev_flow_dump_create,
    dpif_netdev_flow_dump_destroy,
    dpif_netdev_flow_dump_thread_create,
    dpif_netdev_flow_dump_thread_destroy,
    dpif_netdev_flow_dump_next,
    dpif_netdev_operate,
    NULL,                       /* recv_set */
    NULL,                       /* handlers_set */
    dpif_netdev_number_handlers_required,
    dpif_netdev_set_config,
    dpif_netdev_queue_to_priority,
    NULL,                       /* recv */
    NULL,                       /* recv_wait */
    NULL,                       /* recv_purge */
    dpif_netdev_register_dp_purge_cb,
    dpif_netdev_register_upcall_cb,
    dpif_netdev_enable_upcall,
    dpif_netdev_disable_upcall,
    dpif_netdev_get_datapath_version,
    dpif_netdev_ct_dump_start,
    dpif_netdev_ct_dump_next,
    dpif_netdev_ct_dump_done,
    dpif_netdev_ct_exp_dump_start,
    dpif_netdev_ct_exp_dump_next,
    dpif_netdev_ct_exp_dump_done,
    dpif_netdev_ct_flush,
    dpif_netdev_ct_set_maxconns,
    dpif_netdev_ct_get_maxconns,
    dpif_netdev_ct_get_nconns,
    dpif_netdev_ct_set_tcp_seq_chk,
    dpif_netdev_ct_get_tcp_seq_chk,
    dpif_netdev_ct_set_sweep_interval,
    dpif_netdev_ct_get_sweep_interval,
    dpif_netdev_ct_set_limits,
    dpif_netdev_ct_get_limits,
    dpif_netdev_ct_del_limits,
    dpif_netdev_ct_set_timeout_policy,
    dpif_netdev_ct_get_timeout_policy,
    dpif_netdev_ct_del_timeout_policy,
    NULL,                       /* ct_timeout_policy_dump_start */
    NULL,                       /* ct_timeout_policy_dump_next */
    NULL,                       /* ct_timeout_policy_dump_done */
    dpif_netdev_ct_get_timeout_policy_name,
    dpif_netdev_ct_get_features,
    dpif_netdev_ipf_set_enabled,
    dpif_netdev_ipf_set_min_frag,
    dpif_netdev_ipf_set_max_nfrags,
    dpif_netdev_ipf_get_status,
    dpif_netdev_ipf_dump_start,
    dpif_netdev_ipf_dump_next,
    dpif_netdev_ipf_dump_done,
    dpif_netdev_meter_get_features,
    dpif_netdev_meter_set,
    dpif_netdev_meter_get,
    dpif_netdev_meter_del,
    dpif_netdev_bond_add,
    dpif_netdev_bond_del,
    dpif_netdev_bond_stats_get,
    NULL,                       /* cache_get_supported_levels */
    NULL,                       /* cache_get_name */
    NULL,                       /* cache_get_size */
    NULL,                       /* cache_set_size */
};
8913
8914
static void
8915
dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
8916
                              const char *argv[], void *aux OVS_UNUSED)
8917
0
{
8918
0
    struct dp_netdev_port *port;
8919
0
    struct dp_netdev *dp;
8920
0
    odp_port_t port_no;
8921
8922
0
    ovs_mutex_lock(&dp_netdev_mutex);
8923
0
    dp = shash_find_data(&dp_netdevs, argv[1]);
8924
0
    if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
8925
0
        ovs_mutex_unlock(&dp_netdev_mutex);
8926
0
        unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
8927
0
        return;
8928
0
    }
8929
0
    ovs_refcount_ref(&dp->ref_cnt);
8930
0
    ovs_mutex_unlock(&dp_netdev_mutex);
8931
8932
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
8933
0
    if (get_port_by_name(dp, argv[2], &port)) {
8934
0
        unixctl_command_reply_error(conn, "unknown port");
8935
0
        goto exit;
8936
0
    }
8937
8938
0
    port_no = u32_to_odp(atoi(argv[3]));
8939
0
    if (!port_no || port_no == ODPP_NONE) {
8940
0
        unixctl_command_reply_error(conn, "bad port number");
8941
0
        goto exit;
8942
0
    }
8943
0
    if (dp_netdev_lookup_port(dp, port_no)) {
8944
0
        unixctl_command_reply_error(conn, "port number already in use");
8945
0
        goto exit;
8946
0
    }
8947
8948
    /* Remove port. */
8949
0
    hmap_remove(&dp->ports, &port->node);
8950
0
    reconfigure_datapath(dp);
8951
8952
    /* Reinsert with new port number. */
8953
0
    port->port_no = port_no;
8954
0
    hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
8955
0
    reconfigure_datapath(dp);
8956
8957
0
    seq_change(dp->port_seq);
8958
0
    unixctl_command_reply(conn, NULL);
8959
8960
0
exit:
8961
0
    ovs_rwlock_unlock(&dp->port_rwlock);
8962
0
    dp_netdev_unref(dp);
8963
0
}
8964
8965
static void
8966
dpif_dummy_register__(const char *type)
8967
0
{
8968
0
    struct dpif_class *class;
8969
8970
0
    class = xmalloc(sizeof *class);
8971
0
    *class = dpif_netdev_class;
8972
0
    class->type = xstrdup(type);
8973
0
    dp_register_provider(class);
8974
0
}
8975
8976
/* Replaces the datapath provider 'type' with a dummy one.
 *
 * The existing provider is unregistered first.  The dummy is registered if
 * that succeeded, and also if it failed with EAFNOSUPPORT: ignoring that
 * error allows --enable-dummy=system with a userland-only build, which is
 * useful for the testsuite.  On any other error nothing is registered. */
static void
dpif_dummy_override(const char *type)
{
    int error = dp_unregister_provider(type);

    if (error && error != EAFNOSUPPORT) {
        return;
    }
    dpif_dummy_register__(type);
}
8990
8991
void
8992
dpif_dummy_register(enum dummy_level level)
8993
0
{
8994
0
    if (level == DUMMY_OVERRIDE_ALL) {
8995
0
        struct sset types;
8996
0
        const char *type;
8997
8998
0
        sset_init(&types);
8999
0
        dp_enumerate_types(&types);
9000
0
        SSET_FOR_EACH (type, &types) {
9001
0
            dpif_dummy_override(type);
9002
0
        }
9003
0
        sset_destroy(&types);
9004
0
    } else if (level == DUMMY_OVERRIDE_SYSTEM) {
9005
0
        dpif_dummy_override("system");
9006
0
    }
9007
9008
0
    dpif_dummy_register__("dummy");
9009
9010
0
    unixctl_command_register("dpif-dummy/change-port-number",
9011
0
                             "dp port new-number",
9012
0
                             3, 3, dpif_dummy_change_port_number, NULL);
9013
0
}
9014

9015
/* Datapath Classifier. */
9016
9017
static void
9018
dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable)
9019
0
{
9020
0
    cmap_destroy(&subtable->rules);
9021
0
    ovsrcu_postpone(free, subtable->mf_masks);
9022
0
    ovsrcu_postpone(free, subtable);
9023
0
}
9024
9025
/* Initializes 'cls' as a classifier that initially contains no classification
9026
 * rules. */
9027
static void
9028
dpcls_init(struct dpcls *cls)
9029
0
{
9030
0
    cmap_init(&cls->subtables_map);
9031
0
    pvector_init(&cls->subtables);
9032
0
}
9033
9034
static void
9035
dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
9036
0
{
9037
0
    VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
9038
0
    pvector_remove(&cls->subtables, subtable);
9039
0
    cmap_remove(&cls->subtables_map, &subtable->cmap_node,
9040
0
                subtable->mask.hash);
9041
0
    ovsrcu_postpone(dpcls_subtable_destroy_cb, subtable);
9042
0
}
9043
9044
/* Destroys 'cls'.  Rules within 'cls', if any, are not freed; this is the
9045
 * caller's responsibility.
9046
 * May only be called after all the readers have been terminated. */
9047
static void
9048
dpcls_destroy(struct dpcls *cls)
9049
0
{
9050
0
    if (cls) {
9051
0
        struct dpcls_subtable *subtable;
9052
9053
0
        CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
9054
0
            ovs_assert(cmap_count(&subtable->rules) == 0);
9055
0
            dpcls_destroy_subtable(cls, subtable);
9056
0
        }
9057
0
        cmap_destroy(&cls->subtables_map);
9058
0
        pvector_destroy(&cls->subtables);
9059
0
    }
9060
0
}
9061
9062
static struct dpcls_subtable *
9063
dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
9064
0
{
9065
0
    struct dpcls_subtable *subtable;
9066
9067
    /* Need to add one. */
9068
0
    subtable = xmalloc(sizeof *subtable
9069
0
                       - sizeof subtable->mask.mf + mask->len);
9070
0
    cmap_init(&subtable->rules);
9071
0
    subtable->hit_cnt = 0;
9072
0
    netdev_flow_key_clone(&subtable->mask, mask);
9073
9074
    /* The count of bits in the mask defines the space required for masks.
9075
     * Then call gen_masks() to create the appropriate masks, avoiding the cost
9076
     * of doing runtime calculations. */
9077
0
    uint32_t unit0 = count_1bits(mask->mf.map.bits[0]);
9078
0
    uint32_t unit1 = count_1bits(mask->mf.map.bits[1]);
9079
0
    subtable->mf_bits_set_unit0 = unit0;
9080
0
    subtable->mf_bits_set_unit1 = unit1;
9081
0
    subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1));
9082
0
    dpcls_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);
9083
9084
    /* Get the preferred subtable search function for this (u0,u1) subtable.
9085
     * The function is guaranteed to always return a valid implementation, and
9086
     * possibly a specialized implementation. */
9087
0
    subtable->lookup_func = dpcls_subtable_lookup_probe(unit0, unit1);
9088
9089
0
    cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
9090
    /* Add the new subtable at the end of the pvector (with no hits yet) */
9091
0
    pvector_insert(&cls->subtables, subtable, 0);
9092
0
    VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
9093
0
             cmap_count(&cls->subtables_map), subtable, cls->in_port);
9094
0
    pvector_publish(&cls->subtables);
9095
9096
0
    return subtable;
9097
0
}
9098
9099
static inline struct dpcls_subtable *
9100
dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
9101
0
{
9102
0
    struct dpcls_subtable *subtable;
9103
9104
0
    CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
9105
0
                             &cls->subtables_map) {
9106
0
        if (netdev_flow_key_equal(&subtable->mask, mask)) {
9107
0
            return subtable;
9108
0
        }
9109
0
    }
9110
0
    return dpcls_create_subtable(cls, mask);
9111
0
}
9112
9113
/* Periodically sort the dpcls subtable vectors according to hit counts */
9114
static void
9115
dpcls_sort_subtable_vector(struct dpcls *cls)
9116
0
{
9117
0
    struct pvector *pvec = &cls->subtables;
9118
0
    struct dpcls_subtable *subtable;
9119
9120
0
    PVECTOR_FOR_EACH (subtable, pvec) {
9121
0
        pvector_change_priority(pvec, subtable, subtable->hit_cnt);
9122
0
        subtable->hit_cnt = 0;
9123
0
    }
9124
0
    pvector_publish(pvec);
9125
0
}
9126
9127
static inline void
9128
dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
9129
                           struct polled_queue *poll_list, int poll_cnt)
9130
0
{
9131
0
    struct dpcls *cls;
9132
0
    uint64_t tot_idle = 0, tot_proc = 0, tot_sleep = 0;
9133
0
    unsigned int pmd_load = 0;
9134
9135
0
    if (pmd->ctx.now > pmd->next_cycle_store) {
9136
0
        uint64_t curr_tsc;
9137
0
        uint8_t rebalance_load_trigger;
9138
0
        struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb;
9139
0
        unsigned int idx;
9140
9141
0
        if (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
9142
0
                pmd->prev_stats[PMD_CYCLES_ITER_IDLE] &&
9143
0
            pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
9144
0
                pmd->prev_stats[PMD_CYCLES_ITER_BUSY]) {
9145
0
            tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
9146
0
                       pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
9147
0
            tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
9148
0
                       pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
9149
0
            tot_sleep = pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP] -
9150
0
                        pmd->prev_stats[PMD_CYCLES_SLEEP];
9151
9152
0
            if (pmd_alb->is_enabled && !pmd->isolated) {
9153
0
                if (tot_proc) {
9154
0
                    pmd_load = ((tot_proc * 100) /
9155
0
                                    (tot_idle + tot_proc + tot_sleep));
9156
0
                }
9157
9158
0
                atomic_read_relaxed(&pmd_alb->rebalance_load_thresh,
9159
0
                                    &rebalance_load_trigger);
9160
0
                if (pmd_load >= rebalance_load_trigger) {
9161
0
                    atomic_count_inc(&pmd->pmd_overloaded);
9162
0
                } else {
9163
0
                    atomic_count_set(&pmd->pmd_overloaded, 0);
9164
0
                }
9165
0
            }
9166
0
        }
9167
9168
0
        pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
9169
0
                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
9170
0
        pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
9171
0
                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
9172
0
        pmd->prev_stats[PMD_CYCLES_SLEEP] =
9173
0
                        pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP];
9174
9175
        /* Get the cycles that were used to process each queue and store. */
9176
0
        for (unsigned i = 0; i < poll_cnt; i++) {
9177
0
            uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
9178
0
                                                        RXQ_CYCLES_PROC_CURR);
9179
0
            dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
9180
0
            dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
9181
0
                                     0);
9182
0
        }
9183
0
        curr_tsc = cycles_counter_update(&pmd->perf_stats);
9184
0
        if (pmd->intrvl_tsc_prev) {
9185
            /* There is a prev timestamp, store a new intrvl cycle count. */
9186
0
            atomic_store_relaxed(&pmd->intrvl_cycles,
9187
0
                                 curr_tsc - pmd->intrvl_tsc_prev);
9188
0
        }
9189
0
        idx = atomic_count_inc(&pmd->intrvl_idx) % PMD_INTERVAL_MAX;
9190
0
        atomic_store_relaxed(&pmd->busy_cycles_intrvl[idx], tot_proc);
9191
0
        pmd->intrvl_tsc_prev = curr_tsc;
9192
        /* Start new measuring interval */
9193
0
        pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN;
9194
0
    }
9195
9196
0
    if (pmd->ctx.now > pmd->next_optimization) {
9197
        /* Try to obtain the flow lock to block out revalidator threads.
9198
         * If not possible, just try next time. */
9199
0
        if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
9200
            /* Optimize each classifier */
9201
0
            CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
9202
0
                dpcls_sort_subtable_vector(cls);
9203
0
            }
9204
0
            ovs_mutex_unlock(&pmd->flow_mutex);
9205
            /* Start new measuring interval */
9206
0
            pmd->next_optimization = pmd->ctx.now
9207
0
                                     + DPCLS_OPTIMIZATION_INTERVAL;
9208
0
        }
9209
0
    }
9210
0
}
9211
9212
/* Returns the sum of a specified number of newest to
9213
 * oldest interval values. 'cur_idx' is where the next
9214
 * write will be and wrap around needs to be handled.
9215
 */
9216
static uint64_t
9217
get_interval_values(atomic_ullong *source, atomic_count *cur_idx,
9218
0
                    int num_to_read) {
9219
0
    unsigned int i;
9220
0
    uint64_t total = 0;
9221
9222
0
    i = atomic_count_get(cur_idx) % PMD_INTERVAL_MAX;
9223
0
    for (int read = 0; read < num_to_read; read++) {
9224
0
        uint64_t interval_value;
9225
9226
0
        i = i ? i - 1 : PMD_INTERVAL_MAX - 1;
9227
0
        atomic_read_relaxed(&source[i], &interval_value);
9228
0
        total += interval_value;
9229
0
    }
9230
0
    return total;
9231
0
}
9232
9233
/* Insert 'rule' into 'cls'. */
9234
static void
9235
dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
9236
             const struct netdev_flow_key *mask)
9237
0
{
9238
0
    struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
9239
9240
    /* Refer to subtable's mask, also for later removal. */
9241
0
    rule->mask = &subtable->mask;
9242
0
    cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
9243
0
}
9244
9245
/* Removes 'rule' from 'cls', also destructing the 'rule'. */
9246
static void
9247
dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
9248
0
{
9249
0
    struct dpcls_subtable *subtable;
9250
9251
0
    ovs_assert(rule->mask);
9252
9253
    /* Get subtable from reference in rule->mask. */
9254
0
    INIT_CONTAINER(subtable, rule->mask, mask);
9255
0
    if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
9256
0
        == 0) {
9257
        /* Delete empty subtable. */
9258
0
        dpcls_destroy_subtable(cls, subtable);
9259
0
        pvector_publish(&cls->subtables);
9260
0
    }
9261
0
}
9262
9263
/* Inner loop for mask generation of a unit, see dpcls_flow_key_gen_masks.
 *
 * Walks the 1-bits of 'iter' from least to most significant.  For the i-th
 * 1-bit, 'mf_masks[i]' receives a mask with every bit below that bit set.
 * Exactly 'count' entries of 'mf_masks' are written, so the array must hold
 * at least that many.  'count' must be at least the number of 1-bits in
 * 'iter'; this is asserted after the loop. */
static inline void
dpcls_flow_key_gen_mask_unit(uint64_t iter, const uint64_t count,
                             uint64_t *mf_masks)
{
    /* Same type as 'count' so the loop bound comparison is not mixed-sign. */
    uint64_t i;

    for (i = 0; i < count; i++) {
        /* Unsigned negation isolates the least significant 1-bit. */
        uint64_t lowest_bit = (iter & -iter);

        iter &= ~lowest_bit;
        mf_masks[i] = (lowest_bit - 1);
    }
    /* Checks that count has covered all bits in the iter bitmap. */
    ovs_assert(iter == 0);
}
9277
9278
/* Generate a mask for each block in the miniflow, based on the bits set. This
9279
 * allows easily masking packets with the generated array here, without
9280
 * calculations. This replaces runtime-calculating the masks.
9281
 * @param key The table to generate the mf_masks for
9282
 * @param mf_masks Pointer to a u64 array of at least *mf_bits* in size
9283
 * @param mf_bits_total Number of bits set in the whole miniflow (both units)
9284
 * @param mf_bits_unit0 Number of bits set in unit0 of the miniflow
9285
 */
9286
void
9287
dpcls_flow_key_gen_masks(const struct netdev_flow_key *tbl,
9288
                         uint64_t *mf_masks,
9289
                         const uint32_t mf_bits_u0,
9290
                         const uint32_t mf_bits_u1)
9291
0
{
9292
0
    uint64_t iter_u0 = tbl->mf.map.bits[0];
9293
0
    uint64_t iter_u1 = tbl->mf.map.bits[1];
9294
9295
0
    dpcls_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]);
9296
0
    dpcls_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]);
9297
0
}
9298
9299
/* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
9300
 * in 'mask' the values in 'key' and 'target' are the same. */
9301
inline bool
9302
dpcls_rule_matches_key(const struct dpcls_rule *rule,
9303
                       const struct netdev_flow_key *target)
9304
0
{
9305
0
    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
9306
0
    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
9307
0
    uint64_t value;
9308
9309
0
    NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
9310
0
        if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
9311
0
            return false;
9312
0
        }
9313
0
    }
9314
0
    return true;
9315
0
}
9316
9317
/* For each miniflow in 'keys' performs a classifier lookup writing the result
9318
 * into the corresponding slot in 'rules'.  If a particular entry in 'keys' is
9319
 * NULL it is skipped.
9320
 *
9321
 * This function is optimized for use in the userspace datapath and therefore
9322
 * does not implement a lot of features available in the standard
9323
 * classifier_lookup() function.  Specifically, it does not implement
9324
 * priorities, instead returning any rule which matches the flow.
9325
 *
9326
 * Returns true if all miniflows found a corresponding rule. */
9327
bool
9328
dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
9329
             struct dpcls_rule **rules, const size_t cnt,
9330
             int *num_lookups_p)
9331
0
{
9332
    /* The received 'cnt' miniflows are the search-keys that will be processed
9333
     * to find a matching entry into the available subtables.
9334
     * The number of bits in map_type is equal to NETDEV_MAX_BURST. */
9335
0
#define MAP_BITS (sizeof(uint32_t) * CHAR_BIT)
9336
0
    BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
9337
9338
0
    struct dpcls_subtable *subtable;
9339
0
    uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */
9340
9341
0
    if (cnt != MAP_BITS) {
9342
0
        keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
9343
0
    }
9344
0
    memset(rules, 0, cnt * sizeof *rules);
9345
9346
0
    int lookups_match = 0, subtable_pos = 1;
9347
0
    uint32_t found_map;
9348
9349
    /* The Datapath classifier - aka dpcls - is composed of subtables.
9350
     * Subtables are dynamically created as needed when new rules are inserted.
9351
     * Each subtable collects rules with matches on a specific subset of packet
9352
     * fields as defined by the subtable's mask.  We proceed to process every
9353
     * search-key against each subtable, but when a match is found for a
9354
     * search-key, the search for that key can stop because the rules are
9355
     * non-overlapping. */
9356
0
    PVECTOR_FOR_EACH (subtable, &cls->subtables) {
9357
        /* Call the subtable specific lookup function. */
9358
0
        found_map = subtable->lookup_func(subtable, keys_map, keys, rules);
9359
9360
        /* Count the number of subtables searched for this packet match. This
9361
         * estimates the "spread" of subtables looked at per matched packet. */
9362
0
        uint32_t pkts_matched = count_1bits(found_map);
9363
0
        lookups_match += pkts_matched * subtable_pos;
9364
9365
        /* Clear the found rules, and return early if all packets are found. */
9366
0
        keys_map &= ~found_map;
9367
0
        if (!keys_map) {
9368
0
            if (num_lookups_p) {
9369
0
                *num_lookups_p = lookups_match;
9370
0
            }
9371
0
            return true;
9372
0
        }
9373
0
        subtable_pos++;
9374
0
    }
9375
9376
0
    if (num_lookups_p) {
9377
0
        *num_lookups_p = lookups_match;
9378
0
    }
9379
    return false;
9380
0
}