Coverage Report

Created: 2026-01-31 06:26

/src/openvswitch/lib/dpif-netdev.c
Line | Count | Source
1
/*
2
 * Copyright (c) 2009-2014, 2016-2018 Nicira, Inc.
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at:
7
 *
8
 *     http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16
17
#include <config.h>
18
#include "dpif-netdev.h"
19
#include "dpif-netdev-private.h"
20
#include "dpif-netdev-private-dfc.h"
21
#include "dpif-offload.h"
22
23
#include <ctype.h>
24
#include <errno.h>
25
#include <fcntl.h>
26
#include <inttypes.h>
27
#include <net/if.h>
28
#include <sys/types.h>
29
#include <netinet/in.h>
30
#include <stdint.h>
31
#include <stdlib.h>
32
#include <string.h>
33
#include <sys/ioctl.h>
34
#include <sys/socket.h>
35
#include <sys/stat.h>
36
#include <unistd.h>
37
38
#include "bitmap.h"
39
#include "ccmap.h"
40
#include "cmap.h"
41
#include "conntrack.h"
42
#include "conntrack-tp.h"
43
#include "coverage.h"
44
#include "ct-dpif.h"
45
#include "csum.h"
46
#include "dp-packet.h"
47
#include "dpif.h"
48
#include "dpif-netdev-lookup.h"
49
#include "dpif-netdev-perf.h"
50
#include "dpif-netdev-private-extract.h"
51
#include "dpif-provider.h"
52
#include "dummy.h"
53
#include "fat-rwlock.h"
54
#include "flow.h"
55
#include "hmapx.h"
56
#include "id-fpool.h"
57
#include "id-pool.h"
58
#include "ipf.h"
59
#include "mov-avg.h"
60
#include "mpsc-queue.h"
61
#include "netdev.h"
62
#include "netdev-provider.h"
63
#include "netdev-vport.h"
64
#include "netlink.h"
65
#include "odp-execute.h"
66
#include "odp-util.h"
67
#include "openvswitch/dynamic-string.h"
68
#include "openvswitch/list.h"
69
#include "openvswitch/match.h"
70
#include "openvswitch/ofp-parse.h"
71
#include "openvswitch/ofp-print.h"
72
#include "openvswitch/ofpbuf.h"
73
#include "openvswitch/shash.h"
74
#include "openvswitch/vlog.h"
75
#include "ovs-numa.h"
76
#include "ovs-rcu.h"
77
#include "packets.h"
78
#include "openvswitch/poll-loop.h"
79
#include "pvector.h"
80
#include "random.h"
81
#include "seq.h"
82
#include "smap.h"
83
#include "sset.h"
84
#include "timeval.h"
85
#include "tnl-neigh-cache.h"
86
#include "tnl-ports.h"
87
#include "unixctl.h"
88
#include "util.h"
89
#include "uuid.h"
90
91
VLOG_DEFINE_THIS_MODULE(dpif_netdev);
92
93
/* Auto Load Balancing Defaults */
94
0
#define ALB_IMPROVEMENT_THRESHOLD    25
95
0
#define ALB_LOAD_THRESHOLD           95
96
0
#define ALB_REBALANCE_INTERVAL       1     /* 1 Min */
97
0
#define MAX_ALB_REBALANCE_INTERVAL   20000 /* 20000 Min */
98
0
#define MIN_TO_MSEC                  60000
99
100
#define FLOW_DUMP_MAX_BATCH 50
101
/* Use per thread recirc_depth to prevent recirculation loop. */
102
0
#define MAX_RECIRC_DEPTH 8
103
DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
104
105
/* Use instant packet send by default. */
106
0
#define DEFAULT_TX_FLUSH_INTERVAL 0
107
108
/* Configuration parameters. */
109
enum { MAX_METERS = 1 << 18 };  /* Maximum number of meters. */
110
enum { MAX_BANDS = 8 };         /* Maximum number of bands / meter. */
111
112
COVERAGE_DEFINE(datapath_drop_meter);
113
COVERAGE_DEFINE(datapath_drop_upcall_error);
114
COVERAGE_DEFINE(datapath_drop_lock_error);
115
COVERAGE_DEFINE(datapath_drop_userspace_action_error);
116
COVERAGE_DEFINE(datapath_drop_tunnel_push_error);
117
COVERAGE_DEFINE(datapath_drop_tunnel_pop_error);
118
COVERAGE_DEFINE(datapath_drop_recirc_error);
119
COVERAGE_DEFINE(datapath_drop_invalid_port);
120
COVERAGE_DEFINE(datapath_drop_invalid_bond);
121
COVERAGE_DEFINE(datapath_drop_invalid_tnl_port);
122
COVERAGE_DEFINE(datapath_drop_rx_invalid_packet);
123
COVERAGE_DEFINE(datapath_drop_hw_post_process);
124
COVERAGE_DEFINE(datapath_drop_hw_post_process_consumed);
125
126
/* Protects against changes to 'dp_netdevs'. */
127
struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
128
129
/* Contains all 'struct dp_netdev's. */
130
static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
131
    = SHASH_INITIALIZER(&dp_netdevs);
132
133
static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
134
135
0
#define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
136
0
                                     | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
137
0
                                     | CS_SRC_NAT | CS_DST_NAT)
138
0
#define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
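A minimal sketch (not part of dpif-netdev.c; the function name is hypothetical) showing how the masks above are typically used to reject unsupported connection-tracking state bits:

/* Illustrative only: a ct_state value is acceptable when it contains no bits
 * outside DP_NETDEV_CS_SUPPORTED_MASK. */
static bool
example_ct_state_supported(uint32_t ct_state)
{
    return !(ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK);
}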
139
140
static struct odp_support dp_netdev_support = {
141
    .max_vlan_headers = SIZE_MAX,
142
    .max_mpls_depth = SIZE_MAX,
143
    .recirc = true,
144
    .ct_state = true,
145
    .ct_zone = true,
146
    .ct_mark = true,
147
    .ct_label = true,
148
    .ct_state_nat = true,
149
    .ct_orig_tuple = true,
150
    .ct_orig_tuple6 = true,
151
};
152
153

154
/* Simple non-wildcarding single-priority classifier. */
155
156
/* Time in microseconds between successive optimizations of the dpcls
157
 * subtable vector */
158
0
#define DPCLS_OPTIMIZATION_INTERVAL 1000000LL
159
160
/* Time in microseconds of the interval in which rxq processing cycles used
161
 * in rxq to pmd assignments is measured and stored. */
162
0
#define PMD_INTERVAL_LEN 5000000LL
163
/* For converting PMD_INTERVAL_LEN to secs. */
164
0
#define INTERVAL_USEC_TO_SEC 1000000LL
165
166
/* Number of intervals for which cycles are stored
167
 * and used during rxq to pmd assignment. */
168
0
#define PMD_INTERVAL_MAX 12
169
170
/* Time in microseconds to try RCU quiescing. */
171
0
#define PMD_RCU_QUIESCE_INTERVAL 10000LL
172
173
/* Timer resolution for PMD threads in nanoseconds. */
174
0
#define PMD_TIMER_RES_NS 1000
175
176
/* Number of pkts Rx on an interface that will stop pmd thread sleeping. */
177
0
#define PMD_SLEEP_THRESH (NETDEV_MAX_BURST / 2)
178
/* Time in uS to increment a pmd thread sleep time. */
179
0
#define PMD_SLEEP_INC_US 1
180
181
struct pmd_sleep {
182
    unsigned core_id;
183
    uint64_t max_sleep;
184
};
185
186
struct dpcls {
187
    struct cmap_node node;      /* Within dp_netdev_pmd_thread.classifiers */
188
    odp_port_t in_port;
189
    struct cmap subtables_map;
190
    struct pvector subtables;
191
};
192
193
/* Data structure to keep packet order till fastpath processing. */
194
struct dp_packet_flow_map {
195
    struct dp_packet *packet;
196
    struct dp_netdev_flow *flow;
197
    uint16_t tcp_flags;
198
};
199
200
static void dpcls_init(struct dpcls *);
201
static void dpcls_destroy(struct dpcls *);
202
static void dpcls_sort_subtable_vector(struct dpcls *);
203
static uint32_t dpcls_subtable_lookup_reprobe(struct dpcls *cls);
204
static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
205
                         const struct netdev_flow_key *mask);
206
static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
207
208
/* Set of supported meter flags */
209
#define DP_SUPPORTED_METER_FLAGS_MASK \
210
0
    (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)
211
212
/* Set of supported meter band types */
213
#define DP_SUPPORTED_METER_BAND_TYPES           \
214
0
    ( 1 << OFPMBT13_DROP )
215
216
struct dp_meter_band {
217
    uint32_t rate;
218
    uint32_t burst_size;
219
    atomic_uint64_t bucket;          /* In 1/1000 packets for PKTPS,
220
                                      * or in bits for KBPS. */
221
    atomic_uint64_t packet_count;
222
    atomic_uint64_t byte_count;
223
};
224
225
struct dp_meter {
226
    struct cmap_node node;
227
    uint32_t id;
228
    uint16_t flags;
229
    uint16_t n_bands;
230
    uint32_t max_delta_t;
231
    atomic_uint64_t used;  /* Time of a last use in milliseconds. */
232
    atomic_uint64_t packet_count;
233
    atomic_uint64_t byte_count;
234
    struct dp_meter_band bands[];
235
};
236
237
struct pmd_auto_lb {
238
    bool do_dry_run;
239
    bool recheck_config;
240
    bool is_enabled;            /* Current status of Auto load balancing. */
241
    uint64_t rebalance_intvl;
242
    uint64_t rebalance_poll_timer;
243
    uint8_t rebalance_improve_thresh;
244
    atomic_uint8_t rebalance_load_thresh;
245
};
246
247
enum sched_assignment_type {
248
    SCHED_ROUNDROBIN,
249
    SCHED_CYCLES, /* Default. */
250
    SCHED_GROUP
251
};
252
253
/* Datapath based on the network device interface from netdev.h.
254
 *
255
 *
256
 * Thread-safety
257
 * =============
258
 *
259
 * Some members, marked 'const', are immutable.  Accessing other members
260
 * requires synchronization, as noted in more detail below.
261
 *
262
 * Acquisition order is, from outermost to innermost:
263
 *
264
 *    dp_netdev_mutex (global)
265
 *    port_rwlock
266
 *    bond_mutex
267
 *    non_pmd_mutex
268
 */
269
struct dp_netdev {
270
    const struct dpif_class *const class;
271
    const char *const name;
272
    const char *const full_name;
273
    struct ovs_refcount ref_cnt;
274
    atomic_flag destroyed;
275
276
    /* Ports.
277
     *
278
     * Any lookup into 'ports' or any access to the dp_netdev_ports found
279
     * through 'ports' requires taking 'port_rwlock'. */
280
    struct ovs_rwlock port_rwlock;
281
    struct hmap ports;
282
    struct seq *port_seq;       /* Incremented whenever a port changes. */
283
284
    /* The time that a packet can wait in output batch for sending. */
285
    atomic_uint32_t tx_flush_interval;
286
287
    /* Meters. */
288
    struct ovs_mutex meters_lock;
289
    struct cmap meters;
290
291
    /* Probability of EMC insertions is a factor of 'emc_insert_min'.*/
292
    atomic_uint32_t emc_insert_min;
293
    /* Enable collection of PMD performance metrics. */
294
    atomic_bool pmd_perf_metrics;
295
    /* Default max load based sleep request. */
296
    uint64_t pmd_max_sleep_default;
297
    /* Enable the SMC cache from ovsdb config */
298
    atomic_bool smc_enable_db;
299
300
    /* Protects access to ofproto-dpif-upcall interface during revalidator
301
     * thread synchronization. */
302
    struct fat_rwlock upcall_rwlock;
303
    upcall_callback *upcall_cb;  /* Callback function for executing upcalls. */
304
    void *upcall_aux;
305
306
    /* Callback function for notifying the purging of dp flows (during
307
     * resetting pmd deletion). */
308
    dp_purge_callback *dp_purge_cb;
309
    void *dp_purge_aux;
310
311
    /* Stores all 'struct dp_netdev_pmd_thread's. */
312
    struct cmap poll_threads;
313
    /* id pool for per thread static_tx_qid. */
314
    struct id_pool *tx_qid_pool;
315
    struct ovs_mutex tx_qid_pool_mutex;
316
    /* Rxq to pmd assignment type. */
317
    enum sched_assignment_type pmd_rxq_assign_type;
318
    bool pmd_iso;
319
320
    /* Protects the access of the 'struct dp_netdev_pmd_thread'
321
     * instance for non-pmd thread. */
322
    struct ovs_mutex non_pmd_mutex;
323
324
    /* Each pmd thread will store its pointer to
325
     * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
326
    ovsthread_key_t per_pmd_key;
327
328
    struct seq *reconfigure_seq;
329
    uint64_t last_reconfigure_seq;
330
331
    /* Cpu mask for pin of pmd threads. */
332
    char *pmd_cmask;
333
334
    /* PMD max load based sleep request user string. */
335
    char *max_sleep_list;
336
337
    uint64_t last_tnl_conf_seq;
338
339
    struct conntrack *conntrack;
340
    struct pmd_auto_lb pmd_alb;
341
342
    /* Bonds. */
343
    struct ovs_mutex bond_mutex; /* Protects updates of 'tx_bonds'. */
344
    struct cmap tx_bonds; /* Contains 'struct tx_bond'. */
345
};
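A minimal sketch (not part of dpif-netdev.c; the function name is hypothetical) of the lock acquisition order documented above, assuming the standard OVS ovs_mutex/ovs_rwlock helpers:

/* Illustrative only: acquire in the documented order
 * dp_netdev_mutex -> port_rwlock -> bond_mutex, and release inner to outer. */
static void
example_walk_ports_and_bonds(struct dp_netdev *dp)
{
    ovs_mutex_lock(&dp_netdev_mutex);       /* Global, outermost. */
    ovs_rwlock_rdlock(&dp->port_rwlock);    /* Then the port lock. */
    ovs_mutex_lock(&dp->bond_mutex);        /* Then the bond lock. */

    /* ... read 'dp->ports' and 'dp->tx_bonds' here ... */

    ovs_mutex_unlock(&dp->bond_mutex);
    ovs_rwlock_unlock(&dp->port_rwlock);
    ovs_mutex_unlock(&dp_netdev_mutex);
}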
346
347
static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
348
                                                    odp_port_t)
349
    OVS_REQ_RDLOCK(dp->port_rwlock);
350
351
enum rxq_cycles_counter_type {
352
    RXQ_CYCLES_PROC_CURR,       /* Cycles spent successfully polling and
353
                                   processing packets during the current
354
                                   interval. */
355
    RXQ_CYCLES_PROC_HIST,       /* Total cycles of all intervals that are used
356
                                   during rxq to pmd assignment. */
357
    RXQ_N_CYCLES
358
};
359
360
0
#define XPS_TIMEOUT 500000LL    /* In microseconds. */
361
362
/* Contained by struct dp_netdev_port's 'rxqs' member.  */
363
struct dp_netdev_rxq {
364
    struct dp_netdev_port *port;
365
    struct netdev_rxq *rx;
366
    unsigned core_id;                  /* Core to which this queue should be
367
                                          pinned. OVS_CORE_UNSPEC if the
368
                                          queue doesn't need to be pinned to a
369
                                          particular core. */
370
    atomic_count intrvl_idx;           /* Write index for 'cycles_intrvl'. */
371
    struct dp_netdev_pmd_thread *pmd;  /* pmd thread that polls this queue. */
372
    bool is_vhost;                     /* Is rxq of a vhost port. */
373
374
    /* Counters of cycles spent successfully polling and processing pkts. */
375
    atomic_ullong cycles[RXQ_N_CYCLES];
376
    /* We store PMD_INTERVAL_MAX intervals of data for an rxq and then
377
       sum them to yield the cycles used for an rxq. */
378
    atomic_ullong cycles_intrvl[PMD_INTERVAL_MAX];
379
};
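A hedged sketch (not from this file; the helper name is hypothetical) of how the PMD_INTERVAL_MAX stored intervals can be summed to yield the cycles used by an rxq, assuming the ovs-atomic atomic_read_relaxed() helper:

/* Illustrative only: sum the stored per-interval cycle counts of one rxq. */
static uint64_t
example_rxq_interval_sum(struct dp_netdev_rxq *rxq)
{
    uint64_t total = 0;

    for (int i = 0; i < PMD_INTERVAL_MAX; i++) {
        uint64_t val;

        atomic_read_relaxed(&rxq->cycles_intrvl[i], &val);
        total += val;
    }
    return total;
}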
380
381
enum txq_req_mode {
382
    TXQ_REQ_MODE_THREAD,
383
    TXQ_REQ_MODE_HASH,
384
};
385
386
enum txq_mode {
387
    TXQ_MODE_STATIC,
388
    TXQ_MODE_XPS,
389
    TXQ_MODE_XPS_HASH,
390
};
391
392
/* A port in a netdev-based datapath. */
393
struct dp_netdev_port {
394
    odp_port_t port_no;
395
    enum txq_mode txq_mode;     /* static, XPS, XPS_HASH. */
396
    bool need_reconfigure;      /* True if we should reconfigure netdev. */
397
    struct netdev *netdev;
398
    struct hmap_node node;      /* Node in dp_netdev's 'ports'. */
399
    struct netdev_saved_flags *sf;
400
    struct dp_netdev_rxq *rxqs;
401
    unsigned n_rxq;             /* Number of elements in 'rxqs' */
402
    unsigned *txq_used;         /* Number of threads that use each tx queue. */
403
    struct ovs_mutex txq_used_mutex;
404
    bool emc_enabled;           /* If true EMC will be used. */
405
    char *type;                 /* Port type as requested by user. */
406
    char *rxq_affinity_list;    /* Requested affinity of rx queues. */
407
    enum txq_req_mode txq_requested_mode;
408
};
409
410
static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
411
static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
412
                                         struct flow *, bool);
413
414
struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
415
                                                   size_t);
416
struct dp_netdev_actions *dp_netdev_flow_get_actions(
417
    const struct dp_netdev_flow *);
418
static void dp_netdev_actions_free(struct dp_netdev_actions *);
419
420
struct polled_queue {
421
    struct dp_netdev_rxq *rxq;
422
    odp_port_t port_no;
423
    bool emc_enabled;
424
    bool rxq_enabled;
425
    uint64_t change_seq;
426
};
427
428
/* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
429
struct rxq_poll {
430
    struct dp_netdev_rxq *rxq;
431
    struct hmap_node node;
432
};
433
434
/* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
435
 * 'tnl_port_cache' or 'tx_ports'. */
436
struct tx_port {
437
    struct dp_netdev_port *port;
438
    int qid;
439
    long long last_used;
440
    struct hmap_node node;
441
    long long flush_time;
442
    struct dp_packet_batch output_pkts;
443
    struct dp_packet_batch *txq_pkts; /* Only for hash mode. */
444
    struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
445
};
446
447
/* Contained by struct tx_bond 'member_buckets'. */
448
struct member_entry {
449
    odp_port_t member_id;
450
    atomic_ullong n_packets;
451
    atomic_ullong n_bytes;
452
};
453
454
/* Contained by struct dp_netdev_pmd_thread's 'tx_bonds'. */
455
struct tx_bond {
456
    struct cmap_node node;
457
    uint32_t bond_id;
458
    struct member_entry member_buckets[BOND_BUCKETS];
459
};
460
461
/* Interface to netdev-based datapath. */
462
struct dpif_netdev {
463
    struct dpif dpif;
464
    struct dp_netdev *dp;
465
    uint64_t last_port_seq;
466
};
467
468
static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
469
                              struct dp_netdev_port **portp)
470
    OVS_REQ_RDLOCK(dp->port_rwlock);
471
static int get_port_by_name(struct dp_netdev *dp, const char *devname,
472
                            struct dp_netdev_port **portp)
473
    OVS_REQ_RDLOCK(dp->port_rwlock);
474
static void dp_netdev_free(struct dp_netdev *)
475
    OVS_REQUIRES(dp_netdev_mutex);
476
static int do_add_port(struct dp_netdev *dp, const char *devname,
477
                       const char *type, odp_port_t port_no)
478
    OVS_REQ_WRLOCK(dp->port_rwlock);
479
static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
480
    OVS_REQ_WRLOCK(dp->port_rwlock);
481
static int dpif_netdev_open(const struct dpif_class *, const char *name,
482
                            bool create, struct dpif **);
483
static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
484
                                      struct dp_packet_batch *,
485
                                      bool should_steal,
486
                                      const struct flow *flow,
487
                                      const struct nlattr *actions,
488
                                      size_t actions_len);
489
static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
490
                                  struct dp_packet_batch *);
491
492
static void dp_netdev_disable_upcall(struct dp_netdev *);
493
static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
494
static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
495
                                    struct dp_netdev *dp, unsigned core_id,
496
                                    int numa_id);
497
static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
498
static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
499
    OVS_REQ_WRLOCK(dp->port_rwlock);
500
501
static void *pmd_thread_main(void *);
502
static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
503
                                                      unsigned core_id);
504
static struct dp_netdev_pmd_thread *
505
dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
506
static void dp_netdev_del_pmd(struct dp_netdev *dp,
507
                              struct dp_netdev_pmd_thread *pmd);
508
static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
509
static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
510
static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
511
                                         struct dp_netdev_port *port)
512
    OVS_REQUIRES(pmd->port_mutex);
513
static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
514
                                           struct tx_port *tx)
515
    OVS_REQUIRES(pmd->port_mutex);
516
static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
517
                                     struct dp_netdev_rxq *rxq)
518
    OVS_REQUIRES(pmd->port_mutex);
519
static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
520
                                       struct rxq_poll *poll)
521
    OVS_REQUIRES(pmd->port_mutex);
522
static int
523
dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
524
                                   bool force);
525
static void dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
526
                                         struct tx_bond *bond, bool update)
527
    OVS_EXCLUDED(pmd->bond_mutex);
528
static void dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
529
                                           uint32_t bond_id)
530
    OVS_EXCLUDED(pmd->bond_mutex);
531
532
static void reconfigure_datapath(struct dp_netdev *dp)
533
    OVS_REQ_RDLOCK(dp->port_rwlock);
534
static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
535
static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
536
static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
537
static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
538
    OVS_REQUIRES(pmd->port_mutex);
539
static inline void
540
dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
541
                           struct polled_queue *poll_list, int poll_cnt);
542
static void
543
dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
544
                         enum rxq_cycles_counter_type type,
545
                         unsigned long long cycles);
546
static uint64_t
547
dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
548
                         enum rxq_cycles_counter_type type);
549
static void
550
dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
551
                           unsigned long long cycles);
552
static uint64_t
553
dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
554
static uint64_t
555
get_interval_values(atomic_ullong *source, atomic_count *cur_idx,
556
                    int num_to_read);
557
static void
558
dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
559
                               bool purge);
560
static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
561
                                      struct tx_port *tx);
562
inline struct dpcls *
563
dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
564
                           odp_port_t in_port);
565
566
static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
567
static inline bool
568
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);
569
570
static void dp_netdev_simple_match_insert(struct dp_netdev_pmd_thread *pmd,
571
                                          struct dp_netdev_flow *flow)
572
    OVS_REQUIRES(pmd->flow_mutex);
573
static void dp_netdev_simple_match_remove(struct dp_netdev_pmd_thread *pmd,
574
                                          struct dp_netdev_flow *flow)
575
    OVS_REQUIRES(pmd->flow_mutex);
576
577
static bool dp_netdev_flow_is_simple_match(const struct match *);
578
579
/* Updates the time in PMD threads context and should be called in three cases:
580
 *
581
 *     1. PMD structure initialization:
582
 *         - dp_netdev_configure_pmd()
583
 *
584
 *     2. Before processing of the new packet batch:
585
 *         - dpif_netdev_execute()
586
 *         - dp_netdev_process_rxq_port()
587
 *
588
 *     3. At least once per polling iteration in main polling threads if no
589
 *        packets received on current iteration:
590
 *         - dpif_netdev_run()
591
 *         - pmd_thread_main()
592
 *
593
 * 'pmd->ctx.now' should be used without update in all other cases if possible.
594
 */
595
static inline void
596
pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
597
0
{
598
0
    pmd->ctx.now = time_usec();
599
0
}
600
601
/* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
602
bool
603
dpif_is_netdev(const struct dpif *dpif)
604
0
{
605
0
    return dpif->dpif_class->open == dpif_netdev_open;
606
0
}
607
608
static struct dpif_netdev *
609
dpif_netdev_cast(const struct dpif *dpif)
610
0
{
611
0
    ovs_assert(dpif_is_netdev(dpif));
612
0
    return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
613
0
}
614
615
static struct dp_netdev *
616
get_dp_netdev(const struct dpif *dpif)
617
0
{
618
0
    return dpif_netdev_cast(dpif)->dp;
619
0
}
620

621
enum pmd_info_type {
622
    PMD_INFO_SHOW_STATS,  /* Show how cpu cycles are spent. */
623
    PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
624
    PMD_INFO_SHOW_RXQ,    /* Show poll lists of pmd threads. */
625
    PMD_INFO_PERF_SHOW,   /* Show pmd performance details. */
626
    PMD_INFO_SLEEP_SHOW,  /* Show max sleep configuration details. */
627
};
628
629
static void
630
format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
631
0
{
632
0
    ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
633
0
                        ? "main thread" : "pmd thread");
634
0
    if (pmd->numa_id != OVS_NUMA_UNSPEC) {
635
0
        ds_put_format(reply, " numa_id %d", pmd->numa_id);
636
0
    }
637
0
    if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
638
0
        ds_put_format(reply, " core_id %u", pmd->core_id);
639
0
    }
640
0
    ds_put_cstr(reply, ":\n");
641
0
}
642
643
static void
644
pmd_info_show_stats(struct ds *reply,
645
                    struct dp_netdev_pmd_thread *pmd)
646
0
{
647
0
    uint64_t stats[PMD_N_STATS];
648
0
    uint64_t total_cycles, total_packets;
649
0
    double passes_per_pkt = 0;
650
0
    double lookups_per_hit = 0;
651
0
    double packets_per_batch = 0;
652
653
0
    pmd_perf_read_counters(&pmd->perf_stats, stats);
654
0
    total_cycles = stats[PMD_CYCLES_ITER_IDLE]
655
0
                         + stats[PMD_CYCLES_ITER_BUSY];
656
0
    total_packets = stats[PMD_STAT_RECV];
657
658
0
    format_pmd_thread(reply, pmd);
659
660
0
    if (total_packets > 0) {
661
0
        passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC])
662
0
                            / (double) total_packets;
663
0
    }
664
0
    if (stats[PMD_STAT_MASKED_HIT] > 0) {
665
0
        lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP]
666
0
                            / (double) stats[PMD_STAT_MASKED_HIT];
667
0
    }
668
0
    if (stats[PMD_STAT_SENT_BATCHES] > 0) {
669
0
        packets_per_batch = stats[PMD_STAT_SENT_PKTS]
670
0
                            / (double) stats[PMD_STAT_SENT_BATCHES];
671
0
    }
672
673
0
    ds_put_format(reply,
674
0
                  "  packets received: %"PRIu64"\n"
675
0
                  "  packet recirculations: %"PRIu64"\n"
676
0
                  "  avg. datapath passes per packet: %.02f\n"
677
0
                  "  phwol hits: %"PRIu64"\n"
678
0
                  "  mfex opt hits: %"PRIu64"\n"
679
0
                  "  simple match hits: %"PRIu64"\n"
680
0
                  "  emc hits: %"PRIu64"\n"
681
0
                  "  smc hits: %"PRIu64"\n"
682
0
                  "  megaflow hits: %"PRIu64"\n"
683
0
                  "  avg. subtable lookups per megaflow hit: %.02f\n"
684
0
                  "  miss with success upcall: %"PRIu64"\n"
685
0
                  "  miss with failed upcall: %"PRIu64"\n"
686
0
                  "  avg. packets per output batch: %.02f\n",
687
0
                  total_packets, stats[PMD_STAT_RECIRC],
688
0
                  passes_per_pkt, stats[PMD_STAT_PHWOL_HIT],
689
0
                  stats[PMD_STAT_MFEX_OPT_HIT],
690
0
                  stats[PMD_STAT_SIMPLE_HIT],
691
0
                  stats[PMD_STAT_EXACT_HIT],
692
0
                  stats[PMD_STAT_SMC_HIT],
693
0
                  stats[PMD_STAT_MASKED_HIT],
694
0
                  lookups_per_hit, stats[PMD_STAT_MISS], stats[PMD_STAT_LOST],
695
0
                  packets_per_batch);
696
697
0
    if (total_cycles == 0) {
698
0
        return;
699
0
    }
700
701
0
    ds_put_format(reply,
702
0
                  "  idle cycles: %"PRIu64" (%.02f%%)\n"
703
0
                  "  processing cycles: %"PRIu64" (%.02f%%)\n",
704
0
                  stats[PMD_CYCLES_ITER_IDLE],
705
0
                  stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100,
706
0
                  stats[PMD_CYCLES_ITER_BUSY],
707
0
                  stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100);
708
709
0
    if (total_packets == 0) {
710
0
        return;
711
0
    }
712
713
0
    ds_put_format(reply,
714
0
                  "  avg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n",
715
0
                  total_cycles / (double) total_packets,
716
0
                  total_cycles, total_packets);
717
718
0
    ds_put_format(reply,
719
0
                  "  avg processing cycles per packet: "
720
0
                  "%.02f (%"PRIu64"/%"PRIu64")\n",
721
0
                  stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets,
722
0
                  stats[PMD_CYCLES_ITER_BUSY], total_packets);
723
0
}
724
725
static void
726
pmd_info_show_perf(struct ds *reply,
727
                   struct dp_netdev_pmd_thread *pmd,
728
                   struct pmd_perf_params *par)
729
0
{
730
0
    if (pmd->core_id != NON_PMD_CORE_ID) {
731
0
        char *time_str =
732
0
                xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true);
733
0
        long long now = time_msec();
734
0
        double duration = (now - pmd->perf_stats.start_ms) / 1000.0;
735
736
0
        ds_put_cstr(reply, "\n");
737
0
        ds_put_format(reply, "Time: %s\n", time_str);
738
0
        ds_put_format(reply, "Measurement duration: %.3f s\n", duration);
739
0
        ds_put_cstr(reply, "\n");
740
0
        format_pmd_thread(reply, pmd);
741
0
        ds_put_cstr(reply, "\n");
742
0
        pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration);
743
0
        if (pmd_perf_metrics_enabled(pmd)) {
744
            /* Prevent parallel clearing of perf metrics. */
745
0
            ovs_mutex_lock(&pmd->perf_stats.clear_mutex);
746
0
            if (par->histograms) {
747
0
                ds_put_cstr(reply, "\n");
748
0
                pmd_perf_format_histograms(reply, &pmd->perf_stats);
749
0
            }
750
0
            if (par->iter_hist_len > 0) {
751
0
                ds_put_cstr(reply, "\n");
752
0
                pmd_perf_format_iteration_history(reply, &pmd->perf_stats,
753
0
                        par->iter_hist_len);
754
0
            }
755
0
            if (par->ms_hist_len > 0) {
756
0
                ds_put_cstr(reply, "\n");
757
0
                pmd_perf_format_ms_history(reply, &pmd->perf_stats,
758
0
                        par->ms_hist_len);
759
0
            }
760
0
            ovs_mutex_unlock(&pmd->perf_stats.clear_mutex);
761
0
        }
762
0
        free(time_str);
763
0
    }
764
0
}
765
766
static int
767
compare_poll_list(const void *a_, const void *b_)
768
0
{
769
0
    const struct rxq_poll *a = a_;
770
0
    const struct rxq_poll *b = b_;
771
772
0
    const char *namea = netdev_rxq_get_name(a->rxq->rx);
773
0
    const char *nameb = netdev_rxq_get_name(b->rxq->rx);
774
775
0
    int cmp = strcmp(namea, nameb);
776
0
    if (!cmp) {
777
0
        return netdev_rxq_get_queue_id(a->rxq->rx)
778
0
               - netdev_rxq_get_queue_id(b->rxq->rx);
779
0
    } else {
780
0
        return cmp;
781
0
    }
782
0
}
783
784
static void
785
sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
786
                 size_t *n)
787
    OVS_REQUIRES(pmd->port_mutex)
788
0
{
789
0
    struct rxq_poll *ret, *poll;
790
0
    size_t i;
791
792
0
    *n = hmap_count(&pmd->poll_list);
793
0
    if (!*n) {
794
0
        ret = NULL;
795
0
    } else {
796
0
        ret = xcalloc(*n, sizeof *ret);
797
0
        i = 0;
798
0
        HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
799
0
            ret[i] = *poll;
800
0
            i++;
801
0
        }
802
0
        ovs_assert(i == *n);
803
0
        qsort(ret, *n, sizeof *ret, compare_poll_list);
804
0
    }
805
806
0
    *list = ret;
807
0
}
808
809
static void
810
pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd,
811
                  int secs)
812
0
{
813
0
    if (pmd->core_id != NON_PMD_CORE_ID) {
814
0
        struct rxq_poll *list;
815
0
        size_t n_rxq;
816
0
        uint64_t total_pmd_cycles = 0;
817
0
        uint64_t busy_pmd_cycles = 0;
818
0
        uint64_t total_rxq_proc_cycles = 0;
819
0
        unsigned int intervals;
820
821
0
        ds_put_format(reply,
822
0
                      "pmd thread numa_id %d core_id %u:\n  isolated : %s\n",
823
0
                      pmd->numa_id, pmd->core_id, (pmd->isolated)
824
0
                                                  ? "true" : "false");
825
826
0
        ovs_mutex_lock(&pmd->port_mutex);
827
0
        sorted_poll_list(pmd, &list, &n_rxq);
828
829
        /* Get the total pmd cycles for an interval. */
830
0
        atomic_read_relaxed(&pmd->intrvl_cycles, &total_pmd_cycles);
831
        /* Calculate how many intervals are to be used. */
832
0
        intervals = DIV_ROUND_UP(secs,
833
0
                                 PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC);
834
        /* Estimate the cycles to cover all intervals. */
835
0
        total_pmd_cycles *= intervals;
836
0
        busy_pmd_cycles = get_interval_values(pmd->busy_cycles_intrvl,
837
0
                                              &pmd->intrvl_idx,
838
0
                                              intervals);
839
0
        if (busy_pmd_cycles > total_pmd_cycles) {
840
0
            busy_pmd_cycles = total_pmd_cycles;
841
0
        }
842
843
0
        for (int i = 0; i < n_rxq; i++) {
844
0
            struct dp_netdev_rxq *rxq = list[i].rxq;
845
0
            const char *name = netdev_rxq_get_name(rxq->rx);
846
0
            uint64_t rxq_proc_cycles = 0;
847
848
0
            rxq_proc_cycles = get_interval_values(rxq->cycles_intrvl,
849
0
                                                  &rxq->intrvl_idx,
850
0
                                                  intervals);
851
0
            total_rxq_proc_cycles += rxq_proc_cycles;
852
0
            ds_put_format(reply, "  port: %-16s  queue-id: %2d", name,
853
0
                          netdev_rxq_get_queue_id(list[i].rxq->rx));
854
0
            ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx)
855
0
                                        ? "(enabled) " : "(disabled)");
856
0
            ds_put_format(reply, "  pmd usage: ");
857
0
            if (total_pmd_cycles) {
858
0
                ds_put_format(reply, "%2.0f %%",
859
0
                              (double) (rxq_proc_cycles * 100) /
860
0
                              total_pmd_cycles);
861
0
            } else {
862
0
                ds_put_format(reply, "%s", "NOT AVAIL");
863
0
            }
864
0
            ds_put_cstr(reply, "\n");
865
0
        }
866
867
0
        if (n_rxq > 0) {
868
0
            ds_put_cstr(reply, "  overhead: ");
869
0
            if (total_pmd_cycles) {
870
0
                uint64_t overhead_cycles = 0;
871
872
0
                if (total_rxq_proc_cycles < busy_pmd_cycles) {
873
0
                    overhead_cycles = busy_pmd_cycles - total_rxq_proc_cycles;
874
0
                }
875
876
0
                ds_put_format(reply, "%2.0f %%",
877
0
                              (double) (overhead_cycles * 100) /
878
0
                              total_pmd_cycles);
879
0
            } else {
880
0
                ds_put_cstr(reply, "NOT AVAIL");
881
0
            }
882
0
            ds_put_cstr(reply, "\n");
883
0
        }
884
885
0
        ovs_mutex_unlock(&pmd->port_mutex);
886
0
        free(list);
887
0
    }
888
0
}
889
890
static int
891
compare_poll_thread_list(const void *a_, const void *b_)
892
0
{
893
0
    const struct dp_netdev_pmd_thread *a, *b;
894
895
0
    a = *(struct dp_netdev_pmd_thread **)a_;
896
0
    b = *(struct dp_netdev_pmd_thread **)b_;
897
898
0
    if (a->core_id < b->core_id) {
899
0
        return -1;
900
0
    }
901
0
    if (a->core_id > b->core_id) {
902
0
        return 1;
903
0
    }
904
0
    return 0;
905
0
}
906
907
/* Create a sorted list of pmds from the dp->poll_threads cmap. We can use
908
 * this list, as long as we do not go to quiescent state. */
909
static void
910
sorted_poll_thread_list(struct dp_netdev *dp,
911
                        struct dp_netdev_pmd_thread ***list,
912
                        size_t *n)
913
0
{
914
0
    struct dp_netdev_pmd_thread *pmd;
915
0
    struct dp_netdev_pmd_thread **pmd_list;
916
0
    size_t k = 0, n_pmds;
917
918
0
    n_pmds = cmap_count(&dp->poll_threads);
919
0
    pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
920
921
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
922
0
        if (k >= n_pmds) {
923
0
            break;
924
0
        }
925
0
        pmd_list[k++] = pmd;
926
0
    }
927
928
0
    qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list);
929
930
0
    *list = pmd_list;
931
0
    *n = k;
932
0
}
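A hedged usage sketch (illustrative fragment, assuming a 'struct dp_netdev *dp' in scope) mirroring how later callers in this file consume the sorted list: build it, walk it without entering a quiescent state, then free it:

/* Illustrative only: typical caller pattern for sorted_poll_thread_list(). */
struct dp_netdev_pmd_thread **pmd_list;
size_t n;

sorted_poll_thread_list(dp, &pmd_list, &n);
for (size_t i = 0; i < n; i++) {
    struct dp_netdev_pmd_thread *pmd = pmd_list[i];

    if (pmd->core_id == NON_PMD_CORE_ID) {
        continue;               /* Skip the non-PMD (main) thread. */
    }
    /* ... inspect 'pmd' ... */
}
free(pmd_list);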
933
934
static void
935
dpif_netdev_subtable_lookup_get(struct unixctl_conn *conn, int argc OVS_UNUSED,
936
                                const char *argv[] OVS_UNUSED,
937
                                void *aux OVS_UNUSED)
938
0
{
939
0
    struct ds reply = DS_EMPTY_INITIALIZER;
940
941
0
    dpcls_impl_print_stats(&reply);
942
0
    unixctl_command_reply(conn, ds_cstr(&reply));
943
0
    ds_destroy(&reply);
944
0
}
945
946
static void
947
dpif_netdev_subtable_lookup_set(struct unixctl_conn *conn, int argc OVS_UNUSED,
948
                                const char *argv[], void *aux OVS_UNUSED)
949
0
{
950
    /* This function requires 2 parameters (argv[1] and argv[2]) to execute.
951
     *   argv[1] is subtable name
952
     *   argv[2] is priority
953
     */
954
0
    const char *func_name = argv[1];
955
956
0
    errno = 0;
957
0
    char *err_char;
958
0
    uint32_t new_prio = strtoul(argv[2], &err_char, 10);
959
0
    uint32_t lookup_dpcls_changed = 0;
960
0
    uint32_t lookup_subtable_changed = 0;
961
0
    struct shash_node *node;
962
0
    if (errno != 0 || new_prio > UINT8_MAX) {
963
0
        unixctl_command_reply_error(conn,
964
0
            "error converting priority, use integer in range 0-255\n");
965
0
        return;
966
0
    }
967
968
0
    int32_t err = dpcls_subtable_set_prio(func_name, new_prio);
969
0
    if (err) {
970
0
        unixctl_command_reply_error(conn,
971
0
            "error, subtable lookup function not found\n");
972
0
        return;
973
0
    }
974
975
0
    ovs_mutex_lock(&dp_netdev_mutex);
976
0
    SHASH_FOR_EACH (node, &dp_netdevs) {
977
0
        struct dp_netdev *dp = node->data;
978
979
        /* Get PMD threads list, required to get DPCLS instances. */
980
0
        size_t n;
981
0
        struct dp_netdev_pmd_thread **pmd_list;
982
0
        sorted_poll_thread_list(dp, &pmd_list, &n);
983
984
        /* take port mutex as HMAP iters over them. */
985
0
        ovs_rwlock_rdlock(&dp->port_rwlock);
986
987
0
        for (size_t i = 0; i < n; i++) {
988
0
            struct dp_netdev_pmd_thread *pmd = pmd_list[i];
989
0
            if (pmd->core_id == NON_PMD_CORE_ID) {
990
0
                continue;
991
0
            }
992
993
0
            struct dp_netdev_port *port = NULL;
994
0
            HMAP_FOR_EACH (port, node, &dp->ports) {
995
0
                odp_port_t in_port = port->port_no;
996
0
                struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
997
0
                if (!cls) {
998
0
                    continue;
999
0
                }
1000
0
                ovs_mutex_lock(&pmd->flow_mutex);
1001
0
                uint32_t subtbl_changes = dpcls_subtable_lookup_reprobe(cls);
1002
0
                ovs_mutex_unlock(&pmd->flow_mutex);
1003
0
                if (subtbl_changes) {
1004
0
                    lookup_dpcls_changed++;
1005
0
                    lookup_subtable_changed += subtbl_changes;
1006
0
                }
1007
0
            }
1008
0
        }
1009
1010
        /* release port mutex before netdev mutex. */
1011
0
        ovs_rwlock_unlock(&dp->port_rwlock);
1012
0
        free(pmd_list);
1013
0
    }
1014
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1015
1016
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1017
0
    ds_put_format(&reply,
1018
0
        "Lookup priority change affected %d dpcls ports and %d subtables.\n",
1019
0
        lookup_dpcls_changed, lookup_subtable_changed);
1020
0
    const char *reply_str = ds_cstr(&reply);
1021
0
    unixctl_command_reply(conn, reply_str);
1022
0
    VLOG_INFO("%s", reply_str);
1023
0
    ds_destroy(&reply);
1024
0
}
1025
1026
static void
1027
dpif_netdev_impl_get(struct unixctl_conn *conn, int argc OVS_UNUSED,
1028
                     const char *argv[] OVS_UNUSED, void *aux OVS_UNUSED)
1029
0
{
1030
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1031
0
    struct shash_node *node;
1032
1033
0
    ovs_mutex_lock(&dp_netdev_mutex);
1034
0
    SHASH_FOR_EACH (node, &dp_netdevs) {
1035
0
        struct dp_netdev_pmd_thread **pmd_list;
1036
0
        struct dp_netdev *dp = node->data;
1037
0
        size_t n;
1038
1039
        /* Get PMD threads list, required to get the DPIF impl used by each PMD
1040
         * thread. */
1041
0
        sorted_poll_thread_list(dp, &pmd_list, &n);
1042
0
        dp_netdev_impl_get(&reply, pmd_list, n);
1043
0
        free(pmd_list);
1044
0
    }
1045
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1046
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1047
0
    ds_destroy(&reply);
1048
0
}
1049
1050
static void
1051
dpif_netdev_impl_set(struct unixctl_conn *conn, int argc OVS_UNUSED,
1052
                     const char *argv[], void *aux OVS_UNUSED)
1053
0
{
1054
    /* This function requires just one parameter, the DPIF name. */
1055
0
    const char *dpif_name = argv[1];
1056
0
    struct shash_node *node;
1057
1058
0
    static const char *error_description[2] = {
1059
0
        "Unknown DPIF implementation",
1060
0
        "CPU doesn't support the required instruction for",
1061
0
    };
1062
1063
0
    ovs_mutex_lock(&dp_netdev_mutex);
1064
0
    int32_t err = dp_netdev_impl_set_default_by_name(dpif_name);
1065
1066
0
    if (err) {
1067
0
        struct ds reply = DS_EMPTY_INITIALIZER;
1068
0
        ds_put_format(&reply, "DPIF implementation not available: %s %s.\n",
1069
0
                      error_description[ (err == -ENOTSUP) ], dpif_name);
1070
0
        const char *reply_str = ds_cstr(&reply);
1071
0
        unixctl_command_reply_error(conn, reply_str);
1072
0
        VLOG_ERR("%s", reply_str);
1073
0
        ds_destroy(&reply);
1074
0
        ovs_mutex_unlock(&dp_netdev_mutex);
1075
0
        return;
1076
0
    }
1077
1078
0
    SHASH_FOR_EACH (node, &dp_netdevs) {
1079
0
        struct dp_netdev *dp = node->data;
1080
1081
        /* Get PMD threads list, required to get DPCLS instances. */
1082
0
        size_t n;
1083
0
        struct dp_netdev_pmd_thread **pmd_list;
1084
0
        sorted_poll_thread_list(dp, &pmd_list, &n);
1085
1086
0
        for (size_t i = 0; i < n; i++) {
1087
0
            struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1088
0
            if (pmd->core_id == NON_PMD_CORE_ID) {
1089
0
                continue;
1090
0
            }
1091
1092
            /* Initialize DPIF function pointer to the newly configured
1093
             * default. */
1094
0
            atomic_store_relaxed(&pmd->netdev_input_func,
1095
0
                                 dp_netdev_impl_get_default());
1096
0
        };
1097
1098
0
        free(pmd_list);
1099
0
    }
1100
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1101
1102
    /* Reply with success to command. */
1103
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1104
0
    ds_put_format(&reply, "DPIF implementation set to %s.\n", dpif_name);
1105
0
    const char *reply_str = ds_cstr(&reply);
1106
0
    unixctl_command_reply(conn, reply_str);
1107
0
    VLOG_INFO("%s", reply_str);
1108
0
    ds_destroy(&reply);
1109
0
}
1110
1111
static void
1112
dpif_miniflow_extract_impl_get(struct unixctl_conn *conn, int argc OVS_UNUSED,
1113
                               const char *argv[] OVS_UNUSED,
1114
                               void *aux OVS_UNUSED)
1115
0
{
1116
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1117
0
    struct shash_node *node;
1118
1119
0
    ovs_mutex_lock(&dp_netdev_mutex);
1120
0
    SHASH_FOR_EACH (node, &dp_netdevs) {
1121
0
        struct dp_netdev_pmd_thread **pmd_list;
1122
0
        struct dp_netdev *dp = node->data;
1123
0
        size_t n;
1124
1125
        /* Get PMD threads list, required to get the DPIF impl used by each PMD
1126
         * thread. */
1127
0
        sorted_poll_thread_list(dp, &pmd_list, &n);
1128
0
        dp_mfex_impl_get(&reply, pmd_list, n);
1129
0
        free(pmd_list);
1130
0
    }
1131
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1132
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1133
0
    ds_destroy(&reply);
1134
0
}
1135
1136
static void
1137
dpif_miniflow_extract_impl_set(struct unixctl_conn *conn, int argc,
1138
                               const char *argv[], void *aux OVS_UNUSED)
1139
0
{
1140
    /* This command takes some optional and mandatory arguments. The function
1141
     * here first parses all of the options, saving results in local variables.
1142
     * Then the parsed values are acted on.
1143
     */
1144
0
    unsigned int pmd_thread_to_change = NON_PMD_CORE_ID;
1145
0
    unsigned int study_count = MFEX_MAX_PKT_COUNT;
1146
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1147
0
    bool pmd_thread_update_done = false;
1148
0
    bool mfex_name_is_study = false;
1149
0
    const char *mfex_name = NULL;
1150
0
    const char *reply_str = NULL;
1151
0
    struct shash_node *node;
1152
0
    int err;
1153
1154
0
    while (argc > 1) {
1155
        /* Optional argument "-pmd" limits the command's actions to just this
1156
         * PMD thread.
1157
         */
1158
0
        if ((!strcmp(argv[1], "-pmd") && !mfex_name)) {
1159
0
            if (argc < 3) {
1160
0
                ds_put_format(&reply,
1161
0
                              "Error: -pmd option requires a thread id"
1162
0
                              " argument.\n");
1163
0
                goto error;
1164
0
            }
1165
1166
            /* Ensure argument can be parsed to an integer. */
1167
0
            if (!str_to_uint(argv[2], 10, &pmd_thread_to_change) ||
1168
0
                (pmd_thread_to_change == NON_PMD_CORE_ID)) {
1169
0
                ds_put_format(&reply,
1170
0
                              "Error: miniflow extract parser not changed,"
1171
0
                              " PMD thread passed is not valid: '%s'."
1172
0
                              " Pass a valid pmd thread ID.\n",
1173
0
                              argv[2]);
1174
0
                goto error;
1175
0
            }
1176
1177
0
            argc -= 2;
1178
0
            argv += 2;
1179
1180
0
        } else if (!mfex_name) {
1181
            /* Name of MFEX impl requested by user. */
1182
0
            mfex_name = argv[1];
1183
0
            mfex_name_is_study = strcmp("study", mfex_name) == 0;
1184
0
            argc -= 1;
1185
0
            argv += 1;
1186
1187
        /* If name is study and more args exist, parse study_count value. */
1188
0
        } else if (mfex_name && mfex_name_is_study) {
1189
0
            if (!str_to_uint(argv[1], 10, &study_count) ||
1190
0
                (study_count == 0)) {
1191
0
                ds_put_format(&reply,
1192
0
                              "Error: invalid study_pkt_cnt value: %s.\n",
1193
0
                              argv[1]);
1194
0
                goto error;
1195
0
            }
1196
1197
0
            argc -= 1;
1198
0
            argv += 1;
1199
0
        } else {
1200
0
            ds_put_format(&reply, "Error: unknown argument %s.\n", argv[1]);
1201
0
            goto error;
1202
0
        }
1203
0
    }
1204
1205
    /* Ensure user passed an MFEX name. */
1206
0
    if (!mfex_name) {
1207
0
        ds_put_format(&reply, "Error: no miniflow extract name provided."
1208
0
                      " Output of miniflow-parser-get shows implementation"
1209
0
                      " list.\n");
1210
0
        goto error;
1211
0
    }
1212
1213
    /* If the MFEX name is "study", set the study packet count. */
1214
0
    if (mfex_name_is_study) {
1215
0
        err = mfex_set_study_pkt_cnt(study_count, mfex_name);
1216
0
        if (err) {
1217
0
            ds_put_format(&reply, "Error: failed to set study count %d for"
1218
0
                          " miniflow extract implementation %s.\n",
1219
0
                          study_count, mfex_name);
1220
0
            goto error;
1221
0
        }
1222
0
    }
1223
1224
    /* Set the default MFEX impl only if the command was applied to all PMD
1225
     * threads. If a PMD thread was selected, do NOT update the default.
1226
     */
1227
0
    if (pmd_thread_to_change == NON_PMD_CORE_ID) {
1228
0
        err = dp_mfex_impl_set_default_by_name(mfex_name);
1229
0
        if (err == -ENODEV) {
1230
0
            ds_put_format(&reply,
1231
0
                          "Error: miniflow extract not available due to CPU"
1232
0
                          " ISA requirements: %s",
1233
0
                          mfex_name);
1234
0
            goto error;
1235
0
        } else if (err) {
1236
0
            ds_put_format(&reply,
1237
0
                          "Error: unknown miniflow extract implementation %s.",
1238
0
                          mfex_name);
1239
0
            goto error;
1240
0
        }
1241
0
    }
1242
1243
    /* Get the desired MFEX function pointer and error check its usage. */
1244
0
    miniflow_extract_func mfex_func = NULL;
1245
0
    err = dp_mfex_impl_get_by_name(mfex_name, &mfex_func);
1246
0
    if (err) {
1247
0
        if (err == -ENODEV) {
1248
0
            ds_put_format(&reply,
1249
0
                          "Error: miniflow extract not available due to CPU"
1250
0
                          " ISA requirements: %s", mfex_name);
1251
0
        } else {
1252
0
            ds_put_format(&reply,
1253
0
                          "Error: unknown miniflow extract implementation %s.",
1254
0
                          mfex_name);
1255
0
        }
1256
0
        goto error;
1257
0
    }
1258
1259
    /* Apply the MFEX pointer to each pmd thread in each netdev, filtering
1260
     * by the user's "-pmd" argument if required.
1261
     */
1262
0
    ovs_mutex_lock(&dp_netdev_mutex);
1263
1264
0
    SHASH_FOR_EACH (node, &dp_netdevs) {
1265
0
        struct dp_netdev_pmd_thread **pmd_list;
1266
0
        struct dp_netdev *dp = node->data;
1267
0
        size_t n;
1268
1269
0
        sorted_poll_thread_list(dp, &pmd_list, &n);
1270
1271
0
        for (size_t i = 0; i < n; i++) {
1272
0
            struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1273
0
            if (pmd->core_id == NON_PMD_CORE_ID) {
1274
0
                continue;
1275
0
            }
1276
1277
            /* If -pmd specified, skip all other pmd threads. */
1278
0
            if ((pmd_thread_to_change != NON_PMD_CORE_ID) &&
1279
0
                (pmd->core_id != pmd_thread_to_change)) {
1280
0
                continue;
1281
0
            }
1282
1283
0
            pmd_thread_update_done = true;
1284
0
            atomic_store_relaxed(&pmd->miniflow_extract_opt, mfex_func);
1285
0
        };
1286
1287
0
        free(pmd_list);
1288
0
    }
1289
1290
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1291
1292
    /* If PMD thread was specified, but it wasn't found, return error. */
1293
0
    if (pmd_thread_to_change != NON_PMD_CORE_ID && !pmd_thread_update_done) {
1294
0
        ds_put_format(&reply,
1295
0
                      "Error: miniflow extract parser not changed, "
1296
0
                      "PMD thread %d not in use, pass a valid pmd"
1297
0
                      " thread ID.\n", pmd_thread_to_change);
1298
0
        goto error;
1299
0
    }
1300
1301
    /* Reply with success to command. */
1302
0
    ds_put_format(&reply, "Miniflow extract implementation set to %s",
1303
0
                  mfex_name);
1304
0
    if (pmd_thread_to_change != NON_PMD_CORE_ID) {
1305
0
        ds_put_format(&reply, ", on pmd thread %d", pmd_thread_to_change);
1306
0
    }
1307
0
    if (mfex_name_is_study) {
1308
0
        ds_put_format(&reply, ", studying %d packets", study_count);
1309
0
    }
1310
0
    ds_put_format(&reply, ".\n");
1311
1312
0
    reply_str = ds_cstr(&reply);
1313
0
    VLOG_INFO("%s", reply_str);
1314
0
    unixctl_command_reply(conn, reply_str);
1315
0
    ds_destroy(&reply);
1316
0
    return;
1317
1318
0
error:
1319
0
    reply_str = ds_cstr(&reply);
1320
0
    VLOG_ERR("%s", reply_str);
1321
0
    unixctl_command_reply_error(conn, reply_str);
1322
0
    ds_destroy(&reply);
1323
0
}
1324
1325
static void
1326
dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
1327
                          const char *argv[], void *aux OVS_UNUSED)
1328
0
{
1329
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1330
0
    struct dp_netdev *dp = NULL;
1331
1332
0
    ovs_mutex_lock(&dp_netdev_mutex);
1333
1334
0
    if (argc == 2) {
1335
0
        dp = shash_find_data(&dp_netdevs, argv[1]);
1336
0
    } else if (shash_count(&dp_netdevs) == 1) {
1337
        /* There's only one datapath */
1338
0
        dp = shash_first(&dp_netdevs)->data;
1339
0
    }
1340
1341
0
    if (!dp) {
1342
0
        ovs_mutex_unlock(&dp_netdev_mutex);
1343
0
        unixctl_command_reply_error(conn,
1344
0
                                    "please specify an existing datapath");
1345
0
        return;
1346
0
    }
1347
1348
0
    dp_netdev_request_reconfigure(dp);
1349
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1350
0
    ds_put_cstr(&reply, "pmd rxq rebalance requested.\n");
1351
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1352
0
    ds_destroy(&reply);
1353
0
}
1354
1355
static void
1356
pmd_info_show_sleep(struct ds *reply, unsigned core_id, int numa_id,
1357
                    uint64_t pmd_max_sleep)
1358
0
{
1359
0
    if (core_id == NON_PMD_CORE_ID) {
1360
0
        return;
1361
0
    }
1362
0
    ds_put_format(reply,
1363
0
                  "pmd thread numa_id %d core_id %d:\n"
1364
0
                  "  max sleep: %4"PRIu64" us\n",
1365
0
                  numa_id, core_id, pmd_max_sleep);
1366
0
}
1367
1368
static void
1369
dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
1370
                     void *aux)
1371
0
{
1372
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1373
0
    struct dp_netdev_pmd_thread **pmd_list;
1374
0
    struct dp_netdev *dp = NULL;
1375
0
    enum pmd_info_type type = *(enum pmd_info_type *) aux;
1376
0
    unsigned int core_id;
1377
0
    bool filter_on_pmd = false;
1378
0
    size_t n;
1379
0
    unsigned int secs = 0;
1380
0
    unsigned long long max_secs = (PMD_INTERVAL_LEN * PMD_INTERVAL_MAX)
1381
0
                                      / INTERVAL_USEC_TO_SEC;
1382
0
    bool show_header = true;
1383
0
    uint64_t max_sleep;
1384
1385
0
    ovs_mutex_lock(&dp_netdev_mutex);
1386
1387
0
    while (argc > 1) {
1388
0
        if (!strcmp(argv[1], "-pmd") && argc > 2) {
1389
0
            if (str_to_uint(argv[2], 10, &core_id)) {
1390
0
                filter_on_pmd = true;
1391
0
            }
1392
0
            argc -= 2;
1393
0
            argv += 2;
1394
0
        } else if (type == PMD_INFO_SHOW_RXQ &&
1395
0
                       !strcmp(argv[1], "-secs") &&
1396
0
                       argc > 2) {
1397
0
            if (!str_to_uint(argv[2], 10, &secs)) {
1398
0
                secs = max_secs;
1399
0
            }
1400
0
            argc -= 2;
1401
0
            argv += 2;
1402
0
        } else {
1403
0
            dp = shash_find_data(&dp_netdevs, argv[1]);
1404
0
            argc -= 1;
1405
0
            argv += 1;
1406
0
        }
1407
0
    }
1408
1409
0
    if (!dp) {
1410
0
        if (shash_count(&dp_netdevs) == 1) {
1411
            /* There's only one datapath */
1412
0
            dp = shash_first(&dp_netdevs)->data;
1413
0
        } else {
1414
0
            ovs_mutex_unlock(&dp_netdev_mutex);
1415
0
            unixctl_command_reply_error(conn,
1416
0
                                        "please specify an existing datapath");
1417
0
            return;
1418
0
        }
1419
0
    }
1420
1421
0
    sorted_poll_thread_list(dp, &pmd_list, &n);
1422
0
    for (size_t i = 0; i < n; i++) {
1423
0
        struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1424
0
        if (!pmd) {
1425
0
            break;
1426
0
        }
1427
0
        if (filter_on_pmd && pmd->core_id != core_id) {
1428
0
            continue;
1429
0
        }
1430
0
        if (type == PMD_INFO_SHOW_RXQ) {
1431
0
            if (show_header) {
1432
0
                if (!secs || secs > max_secs) {
1433
0
                    secs = max_secs;
1434
0
                } else {
1435
0
                    secs = ROUND_UP(secs,
1436
0
                                    PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC);
1437
0
                }
1438
0
                ds_put_format(&reply, "Displaying last %u seconds "
1439
0
                              "pmd usage %%\n", secs);
1440
0
                show_header = false;
1441
0
            }
1442
0
            pmd_info_show_rxq(&reply, pmd, secs);
1443
0
        } else if (type == PMD_INFO_CLEAR_STATS) {
1444
0
            pmd_perf_stats_clear(&pmd->perf_stats);
1445
0
        } else if (type == PMD_INFO_SHOW_STATS) {
1446
0
            pmd_info_show_stats(&reply, pmd);
1447
0
        } else if (type == PMD_INFO_PERF_SHOW) {
1448
0
            pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux);
1449
0
        } else if (type == PMD_INFO_SLEEP_SHOW) {
1450
0
            if (show_header) {
1451
0
                ds_put_format(&reply, "Default max sleep: %4"PRIu64" us\n",
1452
0
                              dp->pmd_max_sleep_default);
1453
0
                show_header = false;
1454
0
            }
1455
0
            atomic_read_relaxed(&pmd->max_sleep, &max_sleep);
1456
0
            pmd_info_show_sleep(&reply, pmd->core_id, pmd->numa_id,
1457
0
                                max_sleep);
1458
0
        }
1459
0
    }
1460
0
    free(pmd_list);
1461
1462
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1463
1464
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1465
0
    ds_destroy(&reply);
1466
0
}
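
A minimal standalone sketch of the option-consuming loop used above: each
recognized flag shifts argc/argv past itself (and its value), and anything
else is treated as the trailing datapath name.  The flags mirror the -pmd and
-secs options handled above, but the program itself is only illustrative.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(int argc, char *argv[])
{
    const char *dp_name = NULL;
    long core = -1;
    long secs = 0;

    while (argc > 1) {
        if (!strcmp(argv[1], "-pmd") && argc > 2) {
            core = strtol(argv[2], NULL, 10);   /* Consume "-pmd <core>". */
            argc -= 2;
            argv += 2;
        } else if (!strcmp(argv[1], "-secs") && argc > 2) {
            secs = strtol(argv[2], NULL, 10);   /* Consume "-secs <secs>". */
            argc -= 2;
            argv += 2;
        } else {
            dp_name = argv[1];                  /* Last non-flag wins. */
            argc -= 1;
            argv += 1;
        }
    }
    printf("pmd=%ld secs=%ld dp=%s\n", core, secs,
           dp_name ? dp_name : "(default)");
    return 0;
}

Run as, for example, "./parse -pmd 3 -secs 10 my-dp", it prints
"pmd=3 secs=10 dp=my-dp".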
1467
1468
static void
1469
pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
1470
                          const char *argv[],
1471
                          void *aux OVS_UNUSED)
1472
0
{
1473
0
    struct pmd_perf_params par;
1474
0
    long int it_hist = 0, ms_hist = 0;
1475
0
    par.histograms = true;
1476
1477
0
    while (argc > 1) {
1478
0
        if (!strcmp(argv[1], "-nh")) {
1479
0
            par.histograms = false;
1480
0
            argc -= 1;
1481
0
            argv += 1;
1482
0
        } else if (!strcmp(argv[1], "-it") && argc > 2) {
1483
0
            it_hist = strtol(argv[2], NULL, 10);
1484
0
            if (it_hist < 0) {
1485
0
                it_hist = 0;
1486
0
            } else if (it_hist > HISTORY_LEN) {
1487
0
                it_hist = HISTORY_LEN;
1488
0
            }
1489
0
            argc -= 2;
1490
0
            argv += 2;
1491
0
        } else if (!strcmp(argv[1], "-ms") && argc > 2) {
1492
0
            ms_hist = strtol(argv[2], NULL, 10);
1493
0
            if (ms_hist < 0) {
1494
0
                ms_hist = 0;
1495
0
            } else if (ms_hist > HISTORY_LEN) {
1496
0
                ms_hist = HISTORY_LEN;
1497
0
            }
1498
0
            argc -= 2;
1499
0
            argv += 2;
1500
0
        } else {
1501
0
            break;
1502
0
        }
1503
0
    }
1504
0
    par.iter_hist_len = it_hist;
1505
0
    par.ms_hist_len = ms_hist;
1506
0
    par.command_type = PMD_INFO_PERF_SHOW;
1507
0
    dpif_netdev_pmd_info(conn, argc, argv, &par);
1508
0
}
1509
1510
static void
1511
dpif_netdev_bond_show(struct unixctl_conn *conn, int argc,
1512
                      const char *argv[], void *aux OVS_UNUSED)
1513
0
{
1514
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1515
0
    struct dp_netdev *dp = NULL;
1516
1517
0
    ovs_mutex_lock(&dp_netdev_mutex);
1518
0
    if (argc == 2) {
1519
0
        dp = shash_find_data(&dp_netdevs, argv[1]);
1520
0
    } else if (shash_count(&dp_netdevs) == 1) {
1521
        /* There's only one datapath. */
1522
0
        dp = shash_first(&dp_netdevs)->data;
1523
0
    }
1524
0
    if (!dp) {
1525
0
        ovs_mutex_unlock(&dp_netdev_mutex);
1526
0
        unixctl_command_reply_error(conn,
1527
0
                                    "please specify an existing datapath");
1528
0
        return;
1529
0
    }
1530
1531
0
    if (cmap_count(&dp->tx_bonds) > 0) {
1532
0
        struct tx_bond *dp_bond_entry;
1533
1534
0
        ds_put_cstr(&reply, "Bonds:\n");
1535
0
        CMAP_FOR_EACH (dp_bond_entry, node, &dp->tx_bonds) {
1536
0
            ds_put_format(&reply, "  bond-id %"PRIu32":\n",
1537
0
                          dp_bond_entry->bond_id);
1538
0
            for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
1539
0
                uint32_t member_id = odp_to_u32(
1540
0
                    dp_bond_entry->member_buckets[bucket].member_id);
1541
0
                ds_put_format(&reply,
1542
0
                              "    bucket %d - member %"PRIu32"\n",
1543
0
                              bucket, member_id);
1544
0
            }
1545
0
        }
1546
0
    }
1547
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1548
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1549
0
    ds_destroy(&reply);
1550
0
}
1551
1552

1553
static int
1554
dpif_netdev_init(void)
1555
0
{
1556
0
    static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
1557
0
                              clear_aux = PMD_INFO_CLEAR_STATS,
1558
0
                              poll_aux = PMD_INFO_SHOW_RXQ,
1559
0
                              sleep_aux = PMD_INFO_SLEEP_SHOW;
1560
1561
0
    unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]",
1562
0
                             0, 3, dpif_netdev_pmd_info,
1563
0
                             (void *)&show_aux);
1564
0
    unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
1565
0
                             0, 3, dpif_netdev_pmd_info,
1566
0
                             (void *)&clear_aux);
1567
0
    unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] "
1568
0
                             "[-secs secs] [dp]",
1569
0
                             0, 5, dpif_netdev_pmd_info,
1570
0
                             (void *)&poll_aux);
1571
0
    unixctl_command_register("dpif-netdev/pmd-sleep-show", "[dp]",
1572
0
                             0, 1, dpif_netdev_pmd_info,
1573
0
                             (void *)&sleep_aux);
1574
0
    unixctl_command_register("dpif-netdev/pmd-perf-show",
1575
0
                             "[-nh] [-it iter-history-len]"
1576
0
                             " [-ms ms-history-len]"
1577
0
                             " [-pmd core] [dp]",
1578
0
                             0, 8, pmd_perf_show_cmd,
1579
0
                             NULL);
1580
0
    unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
1581
0
                             0, 1, dpif_netdev_pmd_rebalance,
1582
0
                             NULL);
1583
0
    unixctl_command_register("dpif-netdev/pmd-perf-log-set",
1584
0
                             "on|off [-b before] [-a after] [-e|-ne] "
1585
0
                             "[-us usec] [-q qlen]",
1586
0
                             0, 10, pmd_perf_log_set_cmd,
1587
0
                             NULL);
1588
0
    unixctl_command_register("dpif-netdev/bond-show", "[dp]",
1589
0
                             0, 1, dpif_netdev_bond_show,
1590
0
                             NULL);
1591
0
    unixctl_command_register("dpif-netdev/subtable-lookup-prio-set",
1592
0
                             "[lookup_func] [prio]",
1593
0
                             2, 2, dpif_netdev_subtable_lookup_set,
1594
0
                             NULL);
1595
0
    unixctl_command_register("dpif-netdev/subtable-lookup-info-get", "",
1596
0
                             0, 0, dpif_netdev_subtable_lookup_get,
1597
0
                             NULL);
1598
0
    unixctl_command_register("dpif-netdev/subtable-lookup-prio-get", NULL,
1599
0
                             0, 0, dpif_netdev_subtable_lookup_get,
1600
0
                             NULL);
1601
0
    unixctl_command_register("dpif-netdev/dpif-impl-set",
1602
0
                             "dpif_implementation_name",
1603
0
                             1, 1, dpif_netdev_impl_set,
1604
0
                             NULL);
1605
0
    unixctl_command_register("dpif-netdev/dpif-impl-get", "",
1606
0
                             0, 0, dpif_netdev_impl_get,
1607
0
                             NULL);
1608
0
    unixctl_command_register("dpif-netdev/miniflow-parser-set",
1609
0
                             "[-pmd core] miniflow_implementation_name"
1610
0
                             " [study_pkt_cnt]",
1611
0
                             1, 5, dpif_miniflow_extract_impl_set,
1612
0
                             NULL);
1613
0
    unixctl_command_register("dpif-netdev/miniflow-parser-get", "",
1614
0
                             0, 0, dpif_miniflow_extract_impl_get,
1615
0
                             NULL);
1616
0
    return 0;
1617
0
}
1618
1619
static int
1620
dpif_netdev_enumerate(struct sset *all_dps,
1621
                      const struct dpif_class *dpif_class)
1622
0
{
1623
0
    struct shash_node *node;
1624
1625
0
    ovs_mutex_lock(&dp_netdev_mutex);
1626
0
    SHASH_FOR_EACH(node, &dp_netdevs) {
1627
0
        struct dp_netdev *dp = node->data;
1628
0
        if (dpif_class != dp->class) {
1629
            /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
1630
             * If the class doesn't match, skip this dpif. */
1631
0
             continue;
1632
0
        }
1633
0
        sset_add(all_dps, node->name);
1634
0
    }
1635
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1636
1637
0
    return 0;
1638
0
}
1639
1640
static bool
1641
dpif_netdev_class_is_dummy(const struct dpif_class *class)
1642
0
{
1643
0
    return class != &dpif_netdev_class;
1644
0
}
1645
1646
static const char *
1647
dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
1648
0
{
1649
0
    return strcmp(type, "internal") ? type
1650
0
                  : dpif_netdev_class_is_dummy(class) ? "dummy-internal"
1651
0
                  : "tap";
1652
0
}
1653
1654
static struct dpif *
1655
create_dpif_netdev(struct dp_netdev *dp)
1656
0
{
1657
0
    uint16_t netflow_id = hash_string(dp->name, 0);
1658
0
    struct dpif_netdev *dpif;
1659
1660
0
    ovs_refcount_ref(&dp->ref_cnt);
1661
1662
0
    dpif = xmalloc(sizeof *dpif);
1663
0
    dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
1664
0
    dpif->dp = dp;
1665
0
    dpif->last_port_seq = seq_read(dp->port_seq);
1666
1667
0
    return &dpif->dpif;
1668
0
}
1669
1670
/* Choose an unused, non-zero port number and return it on success.
1671
 * Return ODPP_NONE on failure. */
1672
static odp_port_t
1673
choose_port(struct dp_netdev *dp, const char *name)
1674
    OVS_REQ_RDLOCK(dp->port_rwlock)
1675
0
{
1676
0
    uint32_t port_no;
1677
1678
0
    if (dp->class != &dpif_netdev_class) {
1679
0
        const char *p;
1680
0
        int start_no = 0;
1681
1682
        /* If the port name begins with "br", start the number search at
1683
         * 100 to make writing tests easier. */
1684
0
        if (!strncmp(name, "br", 2)) {
1685
0
            start_no = 100;
1686
0
        }
1687
1688
        /* If the port name contains a number, try to assign that port number.
1689
         * This can make writing unit tests easier because port numbers are
1690
         * predictable. */
1691
0
        for (p = name; *p != '\0'; p++) {
1692
0
            if (isdigit((unsigned char) *p)) {
1693
0
                port_no = start_no + strtol(p, NULL, 10);
1694
0
                if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
1695
0
                    && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1696
0
                    return u32_to_odp(port_no);
1697
0
                }
1698
0
                break;
1699
0
            }
1700
0
        }
1701
0
    }
1702
1703
0
    for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
1704
0
        if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1705
0
            return u32_to_odp(port_no);
1706
0
        }
1707
0
    }
1708
1709
0
    return ODPP_NONE;
1710
0
}
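
A minimal standalone sketch of the name-based heuristic above: skip to the
first digit in the port name, parse it with strtol(), and add the offset of
100 for "br"-prefixed names.  The helper name is hypothetical and the
duplicate-port lookup is omitted.

#include <ctype.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static uint32_t
guess_port_no(const char *name)
{
    uint32_t start_no = !strncmp(name, "br", 2) ? 100 : 0;
    const char *p;

    for (p = name; *p != '\0'; p++) {
        if (isdigit((unsigned char) *p)) {
            /* First digit found: derive the port number from it. */
            return start_no + (uint32_t) strtol(p, NULL, 10);
        }
    }
    return 0;   /* No digit in the name; the caller falls back to a scan. */
}

int
main(void)
{
    printf("%u %u %u\n", guess_port_no("br0"), guess_port_no("p7"),
           guess_port_no("eth"));   /* Prints: 100 7 0 */
    return 0;
}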
1711
1712
static uint32_t
1713
dp_meter_hash(uint32_t meter_id)
1714
0
{
1715
    /* In the ofproto-dpif layer, the id-pool allocates meter ids sequentially
1716
     * (e.g. 1, 2, ... N), which already provides a good hash
1717
     * distribution.  Use them directly instead of a hash_xxx() function to
1718
     * achieve higher performance. */
1719
0
    return meter_id;
1720
0
}
1721
1722
static void
1723
dp_netdev_meter_destroy(struct dp_netdev *dp)
1724
0
{
1725
0
    struct dp_meter *m;
1726
1727
0
    ovs_mutex_lock(&dp->meters_lock);
1728
0
    CMAP_FOR_EACH (m, node, &dp->meters) {
1729
0
        cmap_remove(&dp->meters, &m->node, dp_meter_hash(m->id));
1730
0
        ovsrcu_postpone(free, m);
1731
0
    }
1732
1733
0
    cmap_destroy(&dp->meters);
1734
0
    ovs_mutex_unlock(&dp->meters_lock);
1735
0
    ovs_mutex_destroy(&dp->meters_lock);
1736
0
}
1737
1738
static struct dp_meter *
1739
dp_meter_lookup(struct cmap *meters, uint32_t meter_id)
1740
0
{
1741
0
    uint32_t hash = dp_meter_hash(meter_id);
1742
0
    struct dp_meter *m;
1743
1744
0
    CMAP_FOR_EACH_WITH_HASH (m, node, hash, meters) {
1745
0
        if (m->id == meter_id) {
1746
0
            return m;
1747
0
        }
1748
0
    }
1749
1750
0
    return NULL;
1751
0
}
1752
1753
static void
1754
dp_meter_detach_free(struct cmap *meters, uint32_t meter_id)
1755
0
{
1756
0
    struct dp_meter *m = dp_meter_lookup(meters, meter_id);
1757
1758
0
    if (m) {
1759
0
        cmap_remove(meters, &m->node, dp_meter_hash(meter_id));
1760
0
        ovsrcu_postpone(free, m);
1761
0
    }
1762
0
}
1763
1764
static void
1765
dp_meter_attach(struct cmap *meters, struct dp_meter *meter)
1766
0
{
1767
0
    cmap_insert(meters, &meter->node, dp_meter_hash(meter->id));
1768
0
}
1769
1770
static int
1771
create_dp_netdev(const char *name, const struct dpif_class *class,
1772
                 struct dp_netdev **dpp)
1773
    OVS_REQUIRES(dp_netdev_mutex)
1774
0
{
1775
0
    static struct ovsthread_once tsc_freq_check = OVSTHREAD_ONCE_INITIALIZER;
1776
0
    struct dp_netdev *dp;
1777
0
    int error;
1778
1779
    /* Avoid estimating TSC frequency for dummy datapath to not slow down
1780
     * unit tests. */
1781
0
    if (!dpif_netdev_class_is_dummy(class)
1782
0
        && ovsthread_once_start(&tsc_freq_check)) {
1783
0
        pmd_perf_estimate_tsc_frequency();
1784
0
        ovsthread_once_done(&tsc_freq_check);
1785
0
    }
1786
1787
0
    dp = xzalloc(sizeof *dp);
1788
0
    shash_add(&dp_netdevs, name, dp);
1789
1790
0
    *CONST_CAST(const struct dpif_class **, &dp->class) = class;
1791
0
    *CONST_CAST(const char **, &dp->name) = xstrdup(name);
1792
0
    *CONST_CAST(const char **, &dp->full_name) = xasprintf("%s@%s",
1793
0
                                                           class->type, name);
1794
0
    ovs_refcount_init(&dp->ref_cnt);
1795
0
    atomic_flag_clear(&dp->destroyed);
1796
1797
0
    ovs_rwlock_init(&dp->port_rwlock);
1798
0
    hmap_init(&dp->ports);
1799
0
    dp->port_seq = seq_create();
1800
0
    ovs_mutex_init(&dp->bond_mutex);
1801
0
    cmap_init(&dp->tx_bonds);
1802
1803
0
    fat_rwlock_init(&dp->upcall_rwlock);
1804
1805
0
    dp->reconfigure_seq = seq_create();
1806
0
    dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
1807
1808
    /* Init meter resources. */
1809
0
    cmap_init(&dp->meters);
1810
0
    ovs_mutex_init(&dp->meters_lock);
1811
1812
    /* Disable upcalls by default. */
1813
0
    dp_netdev_disable_upcall(dp);
1814
0
    dp->upcall_aux = NULL;
1815
0
    dp->upcall_cb = NULL;
1816
1817
0
    dp->conntrack = conntrack_init();
1818
1819
0
    dpif_miniflow_extract_init();
1820
1821
0
    atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
1822
0
    atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
1823
1824
0
    cmap_init(&dp->poll_threads);
1825
0
    dp->pmd_rxq_assign_type = SCHED_CYCLES;
1826
1827
0
    ovs_mutex_init(&dp->tx_qid_pool_mutex);
1828
    /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
1829
0
    dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
1830
1831
0
    ovs_mutex_init_recursive(&dp->non_pmd_mutex);
1832
0
    ovsthread_key_create(&dp->per_pmd_key, NULL);
1833
1834
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
1835
    /* non-PMD will be created before all other threads and will
1836
     * allocate static_tx_qid = 0. */
1837
0
    dp_netdev_set_nonpmd(dp);
1838
1839
0
    error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
1840
0
                                                             "internal"),
1841
0
                        ODPP_LOCAL);
1842
0
    ovs_rwlock_unlock(&dp->port_rwlock);
1843
0
    if (error) {
1844
0
        dp_netdev_free(dp);
1845
0
        return error;
1846
0
    }
1847
1848
0
    dp->max_sleep_list = NULL;
1849
1850
0
    dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
1851
0
    *dpp = dp;
1852
0
    return 0;
1853
0
}
1854
1855
static void
1856
dp_netdev_request_reconfigure(struct dp_netdev *dp)
1857
0
{
1858
0
    seq_change(dp->reconfigure_seq);
1859
0
}
1860
1861
static bool
1862
dp_netdev_is_reconf_required(struct dp_netdev *dp)
1863
0
{
1864
0
    return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
1865
0
}
1866
1867
static int
1868
dpif_netdev_open(const struct dpif_class *class, const char *name,
1869
                 bool create, struct dpif **dpifp)
1870
0
{
1871
0
    struct dp_netdev *dp;
1872
0
    int error;
1873
1874
0
    ovs_mutex_lock(&dp_netdev_mutex);
1875
0
    dp = shash_find_data(&dp_netdevs, name);
1876
0
    if (!dp) {
1877
0
        error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
1878
0
    } else {
1879
0
        error = (dp->class != class ? EINVAL
1880
0
                 : create ? EEXIST
1881
0
                 : 0);
1882
0
    }
1883
0
    if (!error) {
1884
0
        *dpifp = create_dpif_netdev(dp);
1885
0
    }
1886
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1887
1888
0
    return error;
1889
0
}
1890
1891
static void
1892
dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
1893
    OVS_NO_THREAD_SAFETY_ANALYSIS
1894
0
{
1895
    /* Check that upcalls are disabled, i.e. that the rwlock is taken */
1896
0
    ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));
1897
1898
    /* Before freeing a lock we should release it */
1899
0
    fat_rwlock_unlock(&dp->upcall_rwlock);
1900
0
    fat_rwlock_destroy(&dp->upcall_rwlock);
1901
0
}
1902
1903
static uint32_t
1904
hash_bond_id(uint32_t bond_id)
1905
0
{
1906
0
    return hash_int(bond_id, 0);
1907
0
}
1908
1909
/* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
1910
 * through the 'dp_netdevs' shash while freeing 'dp'. */
1911
static void
1912
dp_netdev_free(struct dp_netdev *dp)
1913
    OVS_REQUIRES(dp_netdev_mutex)
1914
0
{
1915
0
    struct dp_netdev_port *port;
1916
0
    struct tx_bond *bond;
1917
1918
0
    shash_find_and_delete(&dp_netdevs, dp->name);
1919
1920
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
1921
0
    HMAP_FOR_EACH_SAFE (port, node, &dp->ports) {
1922
0
        do_del_port(dp, port);
1923
0
    }
1924
0
    ovs_rwlock_unlock(&dp->port_rwlock);
1925
1926
0
    ovs_mutex_lock(&dp->bond_mutex);
1927
0
    CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
1928
0
        cmap_remove(&dp->tx_bonds, &bond->node, hash_bond_id(bond->bond_id));
1929
0
        ovsrcu_postpone(free, bond);
1930
0
    }
1931
0
    ovs_mutex_unlock(&dp->bond_mutex);
1932
1933
0
    dp_netdev_destroy_all_pmds(dp, true);
1934
0
    cmap_destroy(&dp->poll_threads);
1935
1936
0
    ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
1937
0
    id_pool_destroy(dp->tx_qid_pool);
1938
1939
0
    ovs_mutex_destroy(&dp->non_pmd_mutex);
1940
0
    ovsthread_key_delete(dp->per_pmd_key);
1941
1942
0
    conntrack_destroy(dp->conntrack);
1943
1944
1945
0
    seq_destroy(dp->reconfigure_seq);
1946
1947
0
    seq_destroy(dp->port_seq);
1948
0
    hmap_destroy(&dp->ports);
1949
0
    ovs_rwlock_destroy(&dp->port_rwlock);
1950
1951
0
    cmap_destroy(&dp->tx_bonds);
1952
0
    ovs_mutex_destroy(&dp->bond_mutex);
1953
1954
    /* Upcalls must be disabled at this point */
1955
0
    dp_netdev_destroy_upcall_lock(dp);
1956
1957
0
    dp_netdev_meter_destroy(dp);
1958
1959
0
    free(dp->max_sleep_list);
1960
0
    free(dp->pmd_cmask);
1961
0
    free(CONST_CAST(char *, dp->name));
1962
0
    free(CONST_CAST(char *, dp->full_name));
1963
0
    free(dp);
1964
0
}
1965
1966
static void
1967
dp_netdev_unref(struct dp_netdev *dp)
1968
0
{
1969
0
    if (dp) {
1970
        /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1971
         * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1972
0
        ovs_mutex_lock(&dp_netdev_mutex);
1973
0
        if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1974
0
            dp_netdev_free(dp);
1975
0
        }
1976
0
        ovs_mutex_unlock(&dp_netdev_mutex);
1977
0
    }
1978
0
}
1979
1980
static void
1981
dpif_netdev_close(struct dpif *dpif)
1982
0
{
1983
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
1984
1985
0
    dp_netdev_unref(dp);
1986
0
    free(dpif);
1987
0
}
1988
1989
static int
1990
dpif_netdev_destroy(struct dpif *dpif)
1991
0
{
1992
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
1993
1994
0
    if (!atomic_flag_test_and_set(&dp->destroyed)) {
1995
0
        if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1996
            /* Can't happen: 'dpif' still owns a reference to 'dp'. */
1997
0
            OVS_NOT_REACHED();
1998
0
        }
1999
0
    }
2000
2001
0
    return 0;
2002
0
}
2003
2004
/* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
2005
 * load/store semantics.  While the increment is not atomic, the load and
2006
 * store operations are, making it impossible to read inconsistent values.
2007
 *
2008
 * This is used to update thread local stats counters. */
2009
static void
2010
non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
2011
0
{
2012
0
    unsigned long long tmp;
2013
2014
0
    atomic_read_relaxed(var, &tmp);
2015
0
    tmp += n;
2016
0
    atomic_store_relaxed(var, tmp);
2017
0
}
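
The same pattern reduced to a standalone C11 sketch: a relaxed load, a plain
increment, and a relaxed store.  This is only safe under the usage model
described above, where a single thread writes a given counter while other
threads may read it.

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long long counter;

static void
counter_add(unsigned long long n)
{
    unsigned long long tmp = atomic_load_explicit(&counter,
                                                  memory_order_relaxed);

    tmp += n;                       /* The increment itself is not atomic... */
    atomic_store_explicit(&counter, tmp, memory_order_relaxed);
    /* ...but loads and stores are, so readers never see a torn value. */
}

int
main(void)
{
    counter_add(2);
    counter_add(3);
    printf("%llu\n", atomic_load_explicit(&counter, memory_order_relaxed));
    return 0;                       /* Prints: 5 */
}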
2018
2019
static int
2020
dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
2021
0
{
2022
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2023
0
    struct dp_netdev_pmd_thread *pmd;
2024
0
    uint64_t pmd_stats[PMD_N_STATS];
2025
2026
0
    stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
2027
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2028
0
        stats->n_flows += cmap_count(&pmd->flow_table);
2029
0
        pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
2030
0
        stats->n_hit += pmd_stats[PMD_STAT_PHWOL_HIT];
2031
0
        stats->n_hit += pmd_stats[PMD_STAT_SIMPLE_HIT];
2032
0
        stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
2033
0
        stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT];
2034
0
        stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
2035
0
        stats->n_missed += pmd_stats[PMD_STAT_MISS];
2036
0
        stats->n_lost += pmd_stats[PMD_STAT_LOST];
2037
0
    }
2038
0
    stats->n_masks = UINT32_MAX;
2039
0
    stats->n_mask_hit = UINT64_MAX;
2040
0
    stats->n_cache_hit = UINT64_MAX;
2041
2042
0
    return 0;
2043
0
}
2044
2045
static void
2046
dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
2047
0
{
2048
0
    if (pmd->core_id == NON_PMD_CORE_ID) {
2049
0
        ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
2050
0
        ovs_mutex_lock(&pmd->port_mutex);
2051
0
        pmd_load_cached_ports(pmd);
2052
0
        ovs_mutex_unlock(&pmd->port_mutex);
2053
0
        ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
2054
0
        return;
2055
0
    }
2056
2057
0
    seq_change(pmd->reload_seq);
2058
0
    atomic_store_explicit(&pmd->reload, true, memory_order_release);
2059
0
}
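
A standalone sketch of the release-store signalling used above, reduced to
plain C11 atomics with hypothetical names.  The producer publishes its data
before the release store; a consumer pairs it with an acquire load (the PMD
side of that handshake is outside this excerpt).

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool reload_flag;
static int shared_config;            /* Written before the release store. */

static void
request_reload(int new_config)
{
    shared_config = new_config;      /* Publish the data first. */
    atomic_store_explicit(&reload_flag, true, memory_order_release);
}

static bool
poll_reload(int *config)
{
    if (atomic_load_explicit(&reload_flag, memory_order_acquire)) {
        *config = shared_config;     /* The acquire load makes this safe. */
        atomic_store_explicit(&reload_flag, false, memory_order_relaxed);
        return true;
    }
    return false;
}

int
main(void)
{
    int cfg;

    request_reload(42);
    printf("%d\n", poll_reload(&cfg) ? cfg : -1);   /* Prints: 42 */
    return 0;
}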
2060
2061
static uint32_t
2062
hash_port_no(odp_port_t port_no)
2063
0
{
2064
0
    return hash_int(odp_to_u32(port_no), 0);
2065
0
}
2066
2067
static int
2068
port_create(const char *devname, const char *type,
2069
            odp_port_t port_no, struct dp_netdev_port **portp)
2070
0
{
2071
0
    struct dp_netdev_port *port;
2072
0
    enum netdev_flags flags;
2073
0
    struct netdev *netdev;
2074
0
    int error;
2075
2076
0
    *portp = NULL;
2077
2078
    /* Open and validate network device. */
2079
0
    error = netdev_open(devname, type, &netdev);
2080
0
    if (error) {
2081
0
        return error;
2082
0
    }
2083
    /* XXX reject non-Ethernet devices */
2084
2085
0
    netdev_get_flags(netdev, &flags);
2086
0
    if (flags & NETDEV_LOOPBACK) {
2087
0
        VLOG_ERR("%s: cannot add a loopback device", devname);
2088
0
        error = EINVAL;
2089
0
        goto out;
2090
0
    }
2091
2092
0
    port = xzalloc(sizeof *port);
2093
0
    port->port_no = port_no;
2094
0
    port->netdev = netdev;
2095
0
    port->type = xstrdup(type);
2096
0
    port->sf = NULL;
2097
0
    port->emc_enabled = true;
2098
0
    port->need_reconfigure = true;
2099
0
    ovs_mutex_init(&port->txq_used_mutex);
2100
2101
0
    *portp = port;
2102
2103
0
    return 0;
2104
2105
0
out:
2106
0
    netdev_close(netdev);
2107
0
    return error;
2108
0
}
2109
2110
static int
2111
do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
2112
            odp_port_t port_no)
2113
    OVS_REQ_WRLOCK(dp->port_rwlock)
2114
0
{
2115
0
    struct netdev_saved_flags *sf;
2116
0
    struct dp_netdev_port *port;
2117
0
    int error;
2118
2119
    /* Reject devices already in 'dp'. */
2120
0
    if (!get_port_by_name(dp, devname, &port)) {
2121
0
        return EEXIST;
2122
0
    }
2123
2124
0
    error = port_create(devname, type, port_no, &port);
2125
0
    if (error) {
2126
0
        return error;
2127
0
    }
2128
2129
0
    hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
2130
0
    seq_change(dp->port_seq);
2131
2132
0
    reconfigure_datapath(dp);
2133
2134
    /* Check that port was successfully configured. */
2135
0
    if (!dp_netdev_lookup_port(dp, port_no)) {
2136
0
        return EINVAL;
2137
0
    }
2138
2139
    /* Updating device flags triggers an if_notifier, which triggers a bridge
2140
     * reconfiguration and another attempt to add this port, leading to an
2141
     * infinite loop if the device is configured incorrectly and cannot be
2142
     * added.  Set the promisc mode only after a successful reconfiguration,
2143
     * since by then we know that the device is properly configured. */
2144
0
    error = netdev_turn_flags_on(port->netdev, NETDEV_PROMISC, &sf);
2145
0
    if (error) {
2146
0
        VLOG_ERR("%s: cannot set promisc flag", devname);
2147
0
        do_del_port(dp, port);
2148
0
        return error;
2149
0
    }
2150
0
    port->sf = sf;
2151
2152
0
    return 0;
2153
0
}
2154
2155
static int
2156
dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
2157
                     odp_port_t *port_nop)
2158
0
{
2159
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2160
0
    char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
2161
0
    const char *dpif_port;
2162
0
    odp_port_t port_no;
2163
0
    int error;
2164
2165
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
2166
0
    dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
2167
0
    if (*port_nop != ODPP_NONE) {
2168
0
        port_no = *port_nop;
2169
0
        error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
2170
0
    } else {
2171
0
        port_no = choose_port(dp, dpif_port);
2172
0
        error = port_no == ODPP_NONE ? EFBIG : 0;
2173
0
    }
2174
0
    if (!error) {
2175
0
        *port_nop = port_no;
2176
0
        error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
2177
0
    }
2178
0
    ovs_rwlock_unlock(&dp->port_rwlock);
2179
2180
0
    return error;
2181
0
}
2182
2183
static int
2184
dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
2185
0
{
2186
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2187
0
    int error;
2188
2189
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
2190
0
    if (port_no == ODPP_LOCAL) {
2191
0
        error = EINVAL;
2192
0
    } else {
2193
0
        struct dp_netdev_port *port;
2194
2195
0
        error = get_port_by_number(dp, port_no, &port);
2196
0
        if (!error) {
2197
0
            do_del_port(dp, port);
2198
0
        }
2199
0
    }
2200
0
    ovs_rwlock_unlock(&dp->port_rwlock);
2201
2202
0
    return error;
2203
0
}
2204
2205
static bool
2206
is_valid_port_number(odp_port_t port_no)
2207
0
{
2208
0
    return port_no != ODPP_NONE;
2209
0
}
2210
2211
static struct dp_netdev_port *
2212
dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
2213
    OVS_REQ_RDLOCK(dp->port_rwlock)
2214
0
{
2215
0
    struct dp_netdev_port *port;
2216
2217
0
    HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
2218
0
        if (port->port_no == port_no) {
2219
0
            return port;
2220
0
        }
2221
0
    }
2222
0
    return NULL;
2223
0
}
2224
2225
static int
2226
get_port_by_number(struct dp_netdev *dp,
2227
                   odp_port_t port_no, struct dp_netdev_port **portp)
2228
    OVS_REQ_RDLOCK(dp->port_rwlock)
2229
0
{
2230
0
    if (!is_valid_port_number(port_no)) {
2231
0
        *portp = NULL;
2232
0
        return EINVAL;
2233
0
    } else {
2234
0
        *portp = dp_netdev_lookup_port(dp, port_no);
2235
0
        return *portp ? 0 : ENODEV;
2236
0
    }
2237
0
}
2238
2239
static void
2240
port_destroy(struct dp_netdev_port *port)
2241
0
{
2242
0
    if (!port) {
2243
0
        return;
2244
0
    }
2245
2246
0
    netdev_close(port->netdev);
2247
0
    netdev_restore_flags(port->sf);
2248
2249
0
    for (unsigned i = 0; i < port->n_rxq; i++) {
2250
0
        netdev_rxq_close(port->rxqs[i].rx);
2251
0
    }
2252
0
    ovs_mutex_destroy(&port->txq_used_mutex);
2253
0
    free(port->rxq_affinity_list);
2254
0
    free(port->txq_used);
2255
0
    free(port->rxqs);
2256
0
    free(port->type);
2257
0
    free(port);
2258
0
}
2259
2260
static int
2261
get_port_by_name(struct dp_netdev *dp,
2262
                 const char *devname, struct dp_netdev_port **portp)
2263
    OVS_REQ_RDLOCK(dp->port_rwlock)
2264
0
{
2265
0
    struct dp_netdev_port *port;
2266
2267
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
2268
0
        if (!strcmp(netdev_get_name(port->netdev), devname)) {
2269
0
            *portp = port;
2270
0
            return 0;
2271
0
        }
2272
0
    }
2273
2274
    /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a
2275
     * non-existent port. */
2276
0
    return ENODEV;
2277
0
}
2278
2279
/* Returns 'true' if there is a port with pmd netdev. */
2280
static bool
2281
has_pmd_port(struct dp_netdev *dp)
2282
    OVS_REQ_RDLOCK(dp->port_rwlock)
2283
0
{
2284
0
    struct dp_netdev_port *port;
2285
2286
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
2287
0
        if (netdev_is_pmd(port->netdev)) {
2288
0
            return true;
2289
0
        }
2290
0
    }
2291
2292
0
    return false;
2293
0
}
2294
2295
static void
2296
do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
2297
    OVS_REQ_WRLOCK(dp->port_rwlock)
2298
0
{
2299
0
    hmap_remove(&dp->ports, &port->node);
2300
0
    seq_change(dp->port_seq);
2301
2302
0
    reconfigure_datapath(dp);
2303
0
    port_destroy(port);
2304
0
}
2305
2306
static void
2307
answer_port_query(const struct dp_netdev_port *port,
2308
                  struct dpif_port *dpif_port)
2309
0
{
2310
0
    dpif_port->name = xstrdup(netdev_get_name(port->netdev));
2311
0
    dpif_port->type = xstrdup(port->type);
2312
0
    dpif_port->port_no = port->port_no;
2313
0
}
2314
2315
static int
2316
dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
2317
                                 struct dpif_port *dpif_port)
2318
0
{
2319
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2320
0
    struct dp_netdev_port *port;
2321
0
    int error;
2322
2323
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
2324
0
    error = get_port_by_number(dp, port_no, &port);
2325
0
    if (!error && dpif_port) {
2326
0
        answer_port_query(port, dpif_port);
2327
0
    }
2328
0
    ovs_rwlock_unlock(&dp->port_rwlock);
2329
2330
0
    return error;
2331
0
}
2332
2333
static int
2334
dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
2335
                               struct dpif_port *dpif_port)
2336
0
{
2337
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2338
0
    struct dp_netdev_port *port;
2339
0
    int error;
2340
2341
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
2342
0
    error = get_port_by_name(dp, devname, &port);
2343
0
    if (!error && dpif_port) {
2344
0
        answer_port_query(port, dpif_port);
2345
0
    }
2346
0
    ovs_rwlock_unlock(&dp->port_rwlock);
2347
2348
0
    return error;
2349
0
}
2350
2351
static void
2352
dp_netdev_flow_free(struct dp_netdev_flow *flow)
2353
0
{
2354
0
    dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
2355
0
    free(flow->dp_extra_info);
2356
0
    free(flow);
2357
0
}
2358
2359
void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
2360
0
{
2361
0
    if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
2362
0
        ovsrcu_postpone(dp_netdev_flow_free, flow);
2363
0
    }
2364
0
}
2365
2366
inline struct dpcls *
2367
dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
2368
                           odp_port_t in_port)
2369
0
{
2370
0
    struct dpcls *cls;
2371
0
    uint32_t hash = hash_port_no(in_port);
2372
0
    CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
2373
0
        if (cls->in_port == in_port) {
2374
            /* Port classifier exists already */
2375
0
            return cls;
2376
0
        }
2377
0
    }
2378
0
    return NULL;
2379
0
}
2380
2381
static inline struct dpcls *
2382
dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
2383
                         odp_port_t in_port)
2384
    OVS_REQUIRES(pmd->flow_mutex)
2385
0
{
2386
0
    struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2387
2388
0
    if (!cls) {
2389
0
        uint32_t hash = hash_port_no(in_port);
2390
2391
        /* Create new classifier for in_port */
2392
0
        cls = xmalloc(sizeof(*cls));
2393
0
        dpcls_init(cls);
2394
0
        cls->in_port = in_port;
2395
0
        cmap_insert(&pmd->classifiers, &cls->node, hash);
2396
0
        VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
2397
0
    }
2398
0
    return cls;
2399
0
}
2400
2401
static void
2402
log_netdev_flow_change(const struct dp_netdev_flow *flow,
2403
                       const struct match *match,
2404
                       const struct dp_netdev_actions *old_actions,
2405
                       const struct nlattr *actions,
2406
                       size_t actions_len)
2407
0
{
2408
0
    struct ds ds = DS_EMPTY_INITIALIZER;
2409
0
    struct ofpbuf key_buf, mask_buf;
2410
0
    struct odp_flow_key_parms odp_parms = {
2411
0
        .flow = &match->flow,
2412
0
        .mask = &match->wc.masks,
2413
0
        .support = dp_netdev_support,
2414
0
    };
2415
2416
0
    if (OVS_LIKELY(VLOG_DROP_DBG((&upcall_rl)))) {
2417
0
        return;
2418
0
    }
2419
2420
0
    ofpbuf_init(&key_buf, 0);
2421
0
    ofpbuf_init(&mask_buf, 0);
2422
2423
0
    odp_flow_key_from_flow(&odp_parms, &key_buf);
2424
0
    odp_parms.key_buf = &key_buf;
2425
0
    odp_flow_key_from_mask(&odp_parms, &mask_buf);
2426
2427
0
    if (old_actions) {
2428
0
        ds_put_cstr(&ds, "flow_mod: ");
2429
0
    } else {
2430
0
        ds_put_cstr(&ds, "flow_add: ");
2431
0
    }
2432
0
    odp_format_ufid(&flow->ufid, &ds);
2433
0
    ds_put_cstr(&ds, " mega_");
2434
0
    odp_format_ufid(&flow->mega_ufid, &ds);
2435
0
    ds_put_cstr(&ds, " ");
2436
0
    odp_flow_format(key_buf.data, key_buf.size,
2437
0
                    mask_buf.data, mask_buf.size,
2438
0
                    NULL, &ds, false, true);
2439
0
    if (old_actions) {
2440
0
        ds_put_cstr(&ds, ", old_actions:");
2441
0
        format_odp_actions(&ds, old_actions->actions, old_actions->size,
2442
0
                           NULL);
2443
0
    }
2444
0
    ds_put_cstr(&ds, ", actions:");
2445
0
    format_odp_actions(&ds, actions, actions_len, NULL);
2446
2447
0
    VLOG_DBG("%s", ds_cstr(&ds));
2448
2449
0
    ofpbuf_uninit(&key_buf);
2450
0
    ofpbuf_uninit(&mask_buf);
2451
2452
    /* Add a printout of the actual match installed. */
2453
0
    struct match m;
2454
0
    ds_clear(&ds);
2455
0
    ds_put_cstr(&ds, "flow match: ");
2456
0
    miniflow_expand(&flow->cr.flow.mf, &m.flow);
2457
0
    miniflow_expand(&flow->cr.mask->mf, &m.wc.masks);
2458
0
    memset(&m.tun_md, 0, sizeof m.tun_md);
2459
0
    match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY);
2460
2461
0
    VLOG_DBG("%s", ds_cstr(&ds));
2462
2463
0
    ds_destroy(&ds);
2464
0
}
2465
2466
/* Offloaded flows can be handled asynchronously, so we do not always know
2467
 * whether a specific flow is offloaded or not.  It might still be pending;
2468
 * in fact, multiple modifications can be pending, and the actual offload
2469
 * state depends on the completion of each modification.
2470
 *
2471
 * To correctly determine whether a flow is offloaded when it is being
2472
 * destroyed (and therefore requires cleanup), we must ensure that all
2473
 * operations have completed.  To achieve this, we track the number of
2474
 * outstanding offloaded flow modifications. */
2475
static bool
2476
offload_queue_inc(struct dp_netdev_flow *flow)
2477
0
{
2478
0
    int current;
2479
2480
0
    while (true) {
2481
0
        atomic_read(&flow->offload_queue_depth, &current);
2482
0
        if (current < 0) {
2483
            /* We are cleaning up, so don't enqueue any new operations. */
2484
0
            return false;
2485
0
        }
2486
2487
        /* Here we try to atomically increase the value.  If we do not succeed,
2488
         * someone else has modified it, and we need to check again for a
2489
         * current negative value. */
2490
0
        if (atomic_compare_exchange_strong(&flow->offload_queue_depth,
2491
0
                                           &current, current + 1)) {
2492
0
            return true;
2493
0
        }
2494
0
    }
2495
0
}
2496
2497
static bool
2498
offload_queue_dec(struct dp_netdev_flow *flow)
2499
0
{
2500
0
    int old;
2501
2502
0
    atomic_sub(&flow->offload_queue_depth, 1, &old);
2503
0
    ovs_assert(old >= 1);
2504
2505
0
    if (old == 1) {
2506
        /* Note that this only indicates that the queue might be empty. */
2507
0
        return true;
2508
0
    }
2509
0
    return false;
2510
0
}
2511
2512
static bool
2513
offload_queue_complete(struct dp_netdev_flow *flow)
2514
0
{
2515
    /* This function returns false if the queue is still in use.
2516
     * If the queue is empty, it will attempt to atomically mark it as
2517
     * 'not in use' by making the queue depth negative.  This prevents
2518
     * other flow operations from being added.  If successful, it returns
2519
     * true. */
2520
0
     int expected_val = 0;
2521
2522
0
    return atomic_compare_exchange_strong(&flow->offload_queue_depth,
2523
0
                                          &expected_val, -1);
2524
0
}
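
A standalone sketch of the pending-operation counter implemented by the three
helpers above, using plain C11 atomics and hypothetical names; the driver in
main() is single-threaded purely for illustration.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int queue_depth;       /* >= 0: in use, -1: closed for good. */

static bool
queue_inc(void)
{
    int cur = atomic_load(&queue_depth);

    while (cur >= 0) {
        if (atomic_compare_exchange_strong(&queue_depth, &cur, cur + 1)) {
            return true;             /* Slot reserved. */
        }
        /* CAS failure reloaded 'cur'; retry unless it went negative. */
    }
    return false;                    /* Queue already marked complete. */
}

static bool
queue_dec(void)
{
    return atomic_fetch_sub(&queue_depth, 1) == 1;  /* Possibly empty now. */
}

static bool
queue_complete(void)
{
    int expected = 0;

    /* Succeeds only when nothing is outstanding; -1 blocks new entries. */
    return atomic_compare_exchange_strong(&queue_depth, &expected, -1);
}

int
main(void)
{
    bool a = queue_inc();            /* true: slot reserved.         */
    bool b = queue_dec();            /* true: queue possibly empty.  */
    bool c = queue_complete();       /* true: queue is now closed.   */
    bool d = queue_inc();            /* false: no new work accepted. */

    printf("%d %d %d %d\n", a, b, c, d);   /* Prints: 1 1 1 0 */
    return 0;
}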
2525
2526
static void
2527
offload_flow_reference_unreference_cb(unsigned pmd_id OVS_UNUSED,
2528
                                      void *flow_reference_)
2529
0
{
2530
0
    struct dp_netdev_flow *flow_reference = flow_reference_;
2531
2532
0
    if (flow_reference) {
2533
0
        flow_reference->offloaded = false;
2534
0
        dp_netdev_flow_unref(flow_reference);
2535
0
    }
2536
0
}
2537
2538
static void
2539
offload_flow_del_resume(struct dp_netdev_flow *flow_reference,
2540
                        int error)
2541
0
{
2542
0
    if (error == EINPROGRESS) {
2543
0
        return;
2544
0
    }
2545
2546
0
    if (error) {
2547
0
        odp_port_t in_port = flow_reference->flow.in_port.odp_port;
2548
2549
0
        VLOG_DBG(
2550
0
            "Failed removing offload flow ufid " UUID_FMT " from port %d: %d",
2551
0
            UUID_ARGS((struct uuid *)&flow_reference->mega_ufid), in_port,
2552
0
            error);
2553
0
    } else {
2554
        /* Release because we successfully removed the reference. */
2555
0
        dp_netdev_flow_unref(flow_reference);
2556
0
    }
2557
2558
    /* Release as we took a reference in offload_flow_del(). */
2559
0
    dp_netdev_flow_unref(flow_reference);
2560
0
}
2561
2562
static void
2563
offload_flow_del_resume_cb(void *aux OVS_UNUSED,
2564
                           struct dpif_flow_stats *stats OVS_UNUSED,
2565
                           unsigned pmd_id OVS_UNUSED,
2566
                           void *flow_reference,
2567
                           void *previous_flow_reference OVS_UNUSED, int error)
2568
0
{
2569
0
    offload_flow_del_resume(flow_reference, error);
2570
0
}
2571
2572
static void
2573
offload_flow_del(struct dp_netdev *dp, unsigned pmd_id,
2574
                 struct dp_netdev_flow *flow)
2575
0
{
2576
0
    odp_port_t in_port = flow->flow.in_port.odp_port;
2577
0
    struct dpif_offload_flow_del del = {
2578
0
        .in_port = in_port,
2579
0
        .pmd_id = pmd_id,
2580
0
        .ufid = CONST_CAST(ovs_u128 *, &flow->mega_ufid),
2581
0
        .flow_reference = flow,
2582
0
        .stats = NULL,
2583
0
        .cb_data = { .callback = offload_flow_del_resume_cb },
2584
0
    };
2585
0
    int error;
2586
2587
0
    if (!dpif_offload_enabled()) {
2588
0
        return;
2589
0
    }
2590
2591
    /* This offload flow delete is only called when the actual flow is
2592
     * destructed.  However, we can only trust the state of flow->offloaded
2593
     * if no more flow_put operations are pending.  Below, we check whether
2594
     * the queue can be marked as complete, and then determine if we need
2595
     * to schedule a removal.  If not, the delete will be rescheduled later
2596
     * in the last offload_flow_put_resume_cb() callback. */
2597
0
    ovs_assert(flow->dead);
2598
0
    if (!offload_queue_complete(flow) || !flow->offloaded) {
2599
0
        return;
2600
0
    }
2601
2602
0
    flow->offloaded = false;
2603
0
    dp_netdev_flow_ref(flow);
2604
2605
    /* It's the responsibility of the offload provider to remove the
2606
     * actual rule from hardware only if none of the other PMD threads
2607
     * have the rule installed in hardware. */
2608
0
    error = dpif_offload_datapath_flow_del(dp->full_name, &del);
2609
0
    offload_flow_del_resume(flow, error);
2610
0
}
2611
2612
static void
2613
dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
2614
                          struct dp_netdev_flow *flow)
2615
    OVS_REQUIRES(pmd->flow_mutex)
2616
0
{
2617
0
    struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
2618
0
    struct dpcls *cls;
2619
0
    odp_port_t in_port = flow->flow.in_port.odp_port;
2620
2621
0
    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2622
0
    ovs_assert(cls != NULL);
2623
0
    dpcls_remove(cls, &flow->cr);
2624
0
    dp_netdev_simple_match_remove(pmd, flow);
2625
0
    cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
2626
0
    ccmap_dec(&pmd->n_flows, odp_to_u32(in_port));
2627
0
    flow->dead = true;
2628
0
    offload_flow_del(pmd->dp, pmd->core_id, flow);
2629
2630
0
    dp_netdev_flow_unref(flow);
2631
0
}
2632
2633
static void
2634
dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
2635
0
{
2636
0
    struct dp_netdev_flow *netdev_flow;
2637
2638
0
    ovs_mutex_lock(&pmd->flow_mutex);
2639
0
    CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
2640
0
        dp_netdev_pmd_remove_flow(pmd, netdev_flow);
2641
0
    }
2642
0
    ovs_mutex_unlock(&pmd->flow_mutex);
2643
0
}
2644
2645
static int
2646
dpif_netdev_flow_flush(struct dpif *dpif)
2647
0
{
2648
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2649
0
    struct dp_netdev_pmd_thread *pmd;
2650
2651
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2652
0
        dp_netdev_pmd_flow_flush(pmd);
2653
0
    }
2654
2655
0
    return 0;
2656
0
}
2657
2658
struct dp_netdev_port_state {
2659
    struct hmap_position position;
2660
    char *name;
2661
};
2662
2663
static int
2664
dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
2665
0
{
2666
0
    *statep = xzalloc(sizeof(struct dp_netdev_port_state));
2667
0
    return 0;
2668
0
}
2669
2670
static int
2671
dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
2672
                           struct dpif_port *dpif_port)
2673
0
{
2674
0
    struct dp_netdev_port_state *state = state_;
2675
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2676
0
    struct hmap_node *node;
2677
0
    int retval;
2678
2679
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
2680
0
    node = hmap_at_position(&dp->ports, &state->position);
2681
0
    if (node) {
2682
0
        struct dp_netdev_port *port;
2683
2684
0
        port = CONTAINER_OF(node, struct dp_netdev_port, node);
2685
2686
0
        free(state->name);
2687
0
        state->name = xstrdup(netdev_get_name(port->netdev));
2688
0
        dpif_port->name = state->name;
2689
0
        dpif_port->type = port->type;
2690
0
        dpif_port->port_no = port->port_no;
2691
2692
0
        retval = 0;
2693
0
    } else {
2694
0
        retval = EOF;
2695
0
    }
2696
0
    ovs_rwlock_unlock(&dp->port_rwlock);
2697
2698
0
    return retval;
2699
0
}
2700
2701
static int
2702
dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
2703
0
{
2704
0
    struct dp_netdev_port_state *state = state_;
2705
0
    free(state->name);
2706
0
    free(state);
2707
0
    return 0;
2708
0
}
2709
2710
static int
2711
dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
2712
0
{
2713
0
    struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2714
0
    uint64_t new_port_seq;
2715
0
    int error;
2716
2717
0
    new_port_seq = seq_read(dpif->dp->port_seq);
2718
0
    if (dpif->last_port_seq != new_port_seq) {
2719
0
        dpif->last_port_seq = new_port_seq;
2720
0
        error = ENOBUFS;
2721
0
    } else {
2722
0
        error = EAGAIN;
2723
0
    }
2724
2725
0
    return error;
2726
0
}
2727
2728
static void
2729
dpif_netdev_port_poll_wait(const struct dpif *dpif_)
2730
0
{
2731
0
    struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2732
2733
0
    seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
2734
0
}
2735
2736
static struct dp_netdev_flow *
2737
dp_netdev_flow_cast(const struct dpcls_rule *cr)
2738
0
{
2739
0
    return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
2740
0
}
2741
2742
static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
2743
0
{
2744
0
    return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
2745
0
}
2746
2747
/* netdev_flow_key utilities.
2748
 *
2749
 * netdev_flow_key is basically a miniflow.  We use these functions
2750
 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
2751
 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
2752
 *
2753
 * - Since we are dealing exclusively with miniflows created by
2754
 *   miniflow_extract(), if the map is different the miniflow is different.
2755
 *   Therefore we can be faster by comparing the map and the miniflow in a
2756
 *   single memcmp().
2757
 * - These functions can be inlined by the compiler. */
2758
2759
static inline bool
2760
netdev_flow_key_equal(const struct netdev_flow_key *a,
2761
                      const struct netdev_flow_key *b)
2762
0
{
2763
    /* 'b->len' may be not set yet. */
2764
0
    return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
2765
0
}
2766
2767
static inline void
2768
netdev_flow_key_clone(struct netdev_flow_key *dst,
2769
                      const struct netdev_flow_key *src)
2770
0
{
2771
0
    memcpy(dst, src,
2772
0
           offsetof(struct netdev_flow_key, mf) + src->len);
2773
0
}
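
A standalone sketch of the idea described above: keep the hash and the number
of bytes actually used next to the data, compare with a single memcmp() over
that prefix, and clone by copying only the header plus the used bytes.  The
layout and names are toy stand-ins, not the real netdev_flow_key.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_key {
    uint32_t hash;    /* Hash over the used portion of 'data'. */
    uint32_t len;     /* Bytes of 'data' that are actually in use. */
    uint8_t data[64];
};

static bool
toy_key_equal(const struct toy_key *a, const struct toy_key *b)
{
    /* As above, 'b->len' may not be set yet, so compare 'a->len' bytes. */
    return a->hash == b->hash && !memcmp(a->data, b->data, a->len);
}

static void
toy_key_clone(struct toy_key *dst, const struct toy_key *src)
{
    /* Copy the header plus only the used bytes, not the whole array. */
    memcpy(dst, src, offsetof(struct toy_key, data) + src->len);
}

int
main(void)
{
    struct toy_key a = { .hash = 7, .len = 3, .data = "abc" };
    struct toy_key b;

    toy_key_clone(&b, &a);
    printf("%d\n", toy_key_equal(&a, &b));   /* Prints: 1 */
    return 0;
}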
2774
2775
/* Initialize a netdev_flow_key 'mask' from 'match'. */
2776
static inline void
2777
netdev_flow_mask_init(struct netdev_flow_key *mask,
2778
                      const struct match *match)
2779
0
{
2780
0
    uint64_t *dst = miniflow_values(&mask->mf);
2781
0
    struct flowmap fmap;
2782
0
    uint32_t hash = 0;
2783
0
    size_t idx;
2784
2785
    /* Only check masks that make sense for the flow. */
2786
0
    flow_wc_map(&match->flow, &fmap);
2787
0
    flowmap_init(&mask->mf.map);
2788
2789
0
    FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
2790
0
        uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);
2791
2792
0
        if (mask_u64) {
2793
0
            flowmap_set(&mask->mf.map, idx, 1);
2794
0
            *dst++ = mask_u64;
2795
0
            hash = hash_add64(hash, mask_u64);
2796
0
        }
2797
0
    }
2798
2799
0
    map_t map;
2800
2801
0
    FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
2802
0
        hash = hash_add64(hash, map);
2803
0
    }
2804
2805
0
    size_t n = dst - miniflow_get_values(&mask->mf);
2806
2807
0
    mask->hash = hash_finish(hash, n * 8);
2808
0
    mask->len = netdev_flow_key_size(n);
2809
0
}
2810
2811
/* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
2812
static inline void
2813
netdev_flow_key_init_masked(struct netdev_flow_key *dst,
2814
                            const struct flow *flow,
2815
                            const struct netdev_flow_key *mask)
2816
0
{
2817
0
    uint64_t *dst_u64 = miniflow_values(&dst->mf);
2818
0
    const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
2819
0
    uint32_t hash = 0;
2820
0
    uint64_t value;
2821
2822
0
    dst->len = mask->len;
2823
0
    dst->mf = mask->mf;   /* Copy maps. */
2824
2825
0
    FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
2826
0
        *dst_u64 = value & *mask_u64++;
2827
0
        hash = hash_add64(hash, *dst_u64++);
2828
0
    }
2829
0
    dst->hash = hash_finish(hash,
2830
0
                            (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
2831
0
}
2832
2833
/* Initializes 'key' as a copy of 'flow'. */
2834
static inline void
2835
netdev_flow_key_init(struct netdev_flow_key *key,
2836
                     const struct flow *flow)
2837
0
{
2838
0
    uint32_t hash = 0;
2839
0
    uint64_t value;
2840
2841
0
    miniflow_map_init(&key->mf, flow);
2842
0
    miniflow_init(&key->mf, flow);
2843
2844
0
    size_t n = miniflow_n_values(&key->mf);
2845
2846
0
    FLOW_FOR_EACH_IN_MAPS (value, flow, key->mf.map) {
2847
0
        hash = hash_add64(hash, value);
2848
0
    }
2849
2850
0
    key->hash = hash_finish(hash, n * 8);
2851
0
    key->len = netdev_flow_key_size(n);
2852
0
}
2853
2854
static inline void
2855
emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
2856
                 const struct netdev_flow_key *key)
2857
0
{
2858
0
    if (ce->flow != flow) {
2859
0
        if (ce->flow) {
2860
0
            dp_netdev_flow_unref(ce->flow);
2861
0
        }
2862
2863
0
        if (dp_netdev_flow_ref(flow)) {
2864
0
            ce->flow = flow;
2865
0
        } else {
2866
0
            ce->flow = NULL;
2867
0
        }
2868
0
    }
2869
0
    if (key) {
2870
0
        netdev_flow_key_clone(&ce->key, key);
2871
0
    }
2872
0
}
2873
2874
static inline void
2875
emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
2876
           struct dp_netdev_flow *flow)
2877
0
{
2878
0
    struct emc_entry *to_be_replaced = NULL;
2879
0
    struct emc_entry *current_entry;
2880
2881
0
    EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2882
0
        if (netdev_flow_key_equal(&current_entry->key, key)) {
2883
            /* We found the entry with the 'mf' miniflow */
2884
0
            emc_change_entry(current_entry, flow, NULL);
2885
0
            return;
2886
0
        }
2887
2888
        /* Replacement policy: put the flow in an empty (not alive) entry, or
2889
         * failing that, in the entry with the lowest key hash seen so far. */
2890
0
        if (!to_be_replaced
2891
0
            || (emc_entry_alive(to_be_replaced)
2892
0
                && !emc_entry_alive(current_entry))
2893
0
            || current_entry->key.hash < to_be_replaced->key.hash) {
2894
0
            to_be_replaced = current_entry;
2895
0
        }
2896
0
    }
2897
    /* We didn't find the miniflow in the cache.
2898
     * The 'to_be_replaced' entry is where the new flow will be stored */
2899
2900
0
    emc_change_entry(to_be_replaced, flow, key);
2901
0
}
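
A standalone sketch of that replacement policy with a toy two-way bucket:
update a matching entry in place, otherwise prefer a dead entry, otherwise
evict the candidate with the lowest key hash.  Hash equality stands in for the
full key comparison, and there is no flow reference counting.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_EMC_WAYS 2

struct toy_emc_entry {
    uint32_t key_hash;
    bool alive;
    int flow_id;
};

static void
toy_emc_insert(struct toy_emc_entry bucket[TOY_EMC_WAYS], uint32_t key_hash,
               int flow_id)
{
    struct toy_emc_entry *victim = NULL;

    for (int i = 0; i < TOY_EMC_WAYS; i++) {
        struct toy_emc_entry *e = &bucket[i];

        if (e->alive && e->key_hash == key_hash) {
            e->flow_id = flow_id;    /* Same key: just update the flow. */
            return;
        }
        if (!victim
            || (victim->alive && !e->alive)
            || e->key_hash < victim->key_hash) {
            victim = e;              /* Best replacement candidate so far. */
        }
    }
    victim->key_hash = key_hash;
    victim->flow_id = flow_id;
    victim->alive = true;
}

int
main(void)
{
    struct toy_emc_entry bucket[TOY_EMC_WAYS] = {
        { 50, true, 1 }, { 10, true, 2 },
    };

    toy_emc_insert(bucket, 30, 3);   /* Evicts the entry with hash 10. */
    printf("%u:%d %u:%d\n", bucket[0].key_hash, bucket[0].flow_id,
           bucket[1].key_hash, bucket[1].flow_id);   /* Prints: 50:1 30:3 */
    return 0;
}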
2902
2903
static inline void
2904
emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
2905
                         const struct netdev_flow_key *key,
2906
                         struct dp_netdev_flow *flow)
2907
0
{
2908
    /* Insert an entry into the EMC based on probability value 'min'. By
2909
     * default the value is UINT32_MAX / 100 which yields an insertion
2910
     * probability of 1/100, i.e. 1%. */
2911
2912
0
    uint32_t min = pmd->ctx.emc_insert_min;
2913
2914
0
    if (min && random_uint32() <= min) {
2915
0
        emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
2916
0
    }
2917
0
}
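
A standalone sketch of the probabilistic check: with min = UINT32_MAX / 100,
roughly one call in a hundred performs the insertion.  A simple xorshift PRNG
stands in for the datapath's random_uint32().

#include <stdint.h>
#include <stdio.h>

static uint32_t
xorshift32(uint32_t *state)
{
    uint32_t x = *state;

    x ^= x << 13;
    x ^= x >> 17;
    x ^= x << 5;
    return *state = x;
}

int
main(void)
{
    const uint32_t min = UINT32_MAX / 100;   /* ~1% insertion probability. */
    uint32_t state = 12345;
    int inserts = 0;

    for (int i = 0; i < 100000; i++) {
        if (min && xorshift32(&state) <= min) {
            inserts++;              /* This is where emc_insert() would run. */
        }
    }
    printf("inserted %d of 100000 lookups\n", inserts);   /* Roughly 1000. */
    return 0;
}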
2918
2919
static inline const struct cmap_node *
2920
smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash)
2921
0
{
2922
0
    struct smc_cache *cache = &(pmd->flow_cache).smc_cache;
2923
0
    struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK];
2924
0
    uint16_t sig = hash >> 16;
2925
0
    uint16_t index = UINT16_MAX;
2926
2927
0
    for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2928
0
        if (bucket->sig[i] == sig) {
2929
0
            index = bucket->flow_idx[i];
2930
0
            break;
2931
0
        }
2932
0
    }
2933
0
    if (index != UINT16_MAX) {
2934
0
        return cmap_find_by_index(&pmd->flow_table, index);
2935
0
    }
2936
0
    return NULL;
2937
0
}
2938
2939
/* Insert the flow_table index into SMC. Insertion may fail when 1) SMC is
2940
 * turned off, 2) the flow_table index is larger than uint16_t can handle.
2941
 * If there is already an SMC entry with the same signature, its index will be
2942
 * updated. If there is no such entry but an empty entry is available, the
2943
 * empty entry will be taken. If there is neither an empty entry nor one with
2944
 * the same signature, a random entry from the hashed bucket will be picked. */
2945
static inline void
2946
smc_insert(struct dp_netdev_pmd_thread *pmd,
2947
           const struct netdev_flow_key *key,
2948
           uint32_t hash)
2949
0
{
2950
0
    struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache;
2951
0
    struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK];
2952
0
    uint16_t index;
2953
0
    uint32_t cmap_index;
2954
0
    int i;
2955
2956
0
    if (!pmd->ctx.smc_enable_db) {
2957
0
        return;
2958
0
    }
2959
2960
0
    cmap_index = cmap_find_index(&pmd->flow_table, hash);
2961
0
    index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index;
2962
2963
    /* If the index is larger than SMC can handle (uint16_t), we don't
2964
     * insert */
2965
0
    if (index == UINT16_MAX) {
2966
0
        return;
2967
0
    }
2968
2969
    /* If an entry with same signature already exists, update the index */
2970
0
    uint16_t sig = key->hash >> 16;
2971
0
    for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2972
0
        if (bucket->sig[i] == sig) {
2973
0
            bucket->flow_idx[i] = index;
2974
0
            return;
2975
0
        }
2976
0
    }
2977
    /* If there is an empty entry, occupy it. */
2978
0
    for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2979
0
        if (bucket->flow_idx[i] == UINT16_MAX) {
2980
0
            bucket->sig[i] = sig;
2981
0
            bucket->flow_idx[i] = index;
2982
0
            return;
2983
0
        }
2984
0
    }
2985
    /* Otherwise, pick a random entry. */
2986
0
    i = random_uint32() % SMC_ENTRY_PER_BUCKET;
2987
0
    bucket->sig[i] = sig;
2988
0
    bucket->flow_idx[i] = index;
2989
0
}
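
A standalone sketch of that insertion policy with toy sizes and hypothetical
names: the bucket is chosen from the low hash bits (not shown here), the
signature is the high 16 bits, and insertion updates a matching signature,
then takes an empty slot, then evicts a random entry.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TOY_ENTRIES_PER_BUCKET 4
#define TOY_EMPTY UINT16_MAX

struct toy_bucket {
    uint16_t sig[TOY_ENTRIES_PER_BUCKET];
    uint16_t flow_idx[TOY_ENTRIES_PER_BUCKET];
};

static void
toy_smc_insert(struct toy_bucket *b, uint32_t key_hash, uint16_t index)
{
    uint16_t sig = key_hash >> 16;
    int i;

    for (i = 0; i < TOY_ENTRIES_PER_BUCKET; i++) {
        if (b->sig[i] == sig) {
            b->flow_idx[i] = index;  /* Same signature: update in place. */
            return;
        }
    }
    for (i = 0; i < TOY_ENTRIES_PER_BUCKET; i++) {
        if (b->flow_idx[i] == TOY_EMPTY) {
            b->sig[i] = sig;         /* Take the first empty slot. */
            b->flow_idx[i] = index;
            return;
        }
    }
    i = rand() % TOY_ENTRIES_PER_BUCKET;
    b->sig[i] = sig;                 /* Otherwise evict a random entry. */
    b->flow_idx[i] = index;
}

int
main(void)
{
    struct toy_bucket b;

    for (int i = 0; i < TOY_ENTRIES_PER_BUCKET; i++) {
        b.sig[i] = 0;
        b.flow_idx[i] = TOY_EMPTY;
    }
    toy_smc_insert(&b, 0xabcd0001, 7);
    printf("sig=0x%x idx=%u\n", b.sig[0], b.flow_idx[0]);
    /* Prints: sig=0xabcd idx=7 */
    return 0;
}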
2990
2991
inline void
2992
emc_probabilistic_insert_batch(struct dp_netdev_pmd_thread *pmd,
2993
                               const struct netdev_flow_key *keys,
2994
                               struct dpcls_rule **rules,
2995
                               uint32_t emc_insert_mask)
2996
0
{
2997
0
    while (emc_insert_mask) {
2998
0
        uint32_t i = raw_ctz(emc_insert_mask);
2999
0
        emc_insert_mask &= emc_insert_mask - 1;
3000
        /* Get the required parameters for EMC/SMC from the rule. */
3001
0
        struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
3002
        /* Insert the key into EMC/SMC. */
3003
0
        emc_probabilistic_insert(pmd, &keys[i], flow);
3004
0
    }
3005
0
}
3006
3007
inline void
3008
smc_insert_batch(struct dp_netdev_pmd_thread *pmd,
3009
                 const struct netdev_flow_key *keys,
3010
                 struct dpcls_rule **rules,
3011
                 uint32_t smc_insert_mask)
3012
0
{
3013
0
    while (smc_insert_mask) {
3014
0
        uint32_t i = raw_ctz(smc_insert_mask);
3015
0
        smc_insert_mask &= smc_insert_mask - 1;
3016
        /* Get the required parameters for EMC/SMC from the rule. */
3017
0
        struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
3018
0
        uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
3019
        /* Insert the key into EMC/SMC. */
3020
0
        smc_insert(pmd, &keys[i], hash);
3021
0
    }
3022
0
}
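/* A minimal sketch (not from the original file) of the set-bit walk used by
 * the two batch helpers above: raw_ctz() yields the index of the lowest set
 * bit and "mask &= mask - 1" clears it.  The helper name and the log call are
 * illustrative; for mask 0x25 (binary 100101) it visits indexes 0, 2 and 5. */
static inline void
walk_set_bits_example(uint32_t mask)
{
    while (mask) {
        uint32_t i = raw_ctz(mask);   /* Index of the lowest set bit. */

        mask &= mask - 1;             /* Clear that bit. */
        VLOG_DBG("visiting index %"PRIu32, i);
    }
}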
3023
3024
static struct dp_netdev_flow *
3025
dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
3026
                          const struct netdev_flow_key *key,
3027
                          int *lookup_num_p)
3028
0
{
3029
0
    struct dpcls *cls;
3030
0
    struct dpcls_rule *rule = NULL;
3031
0
    odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
3032
0
                                                     in_port.odp_port));
3033
0
    struct dp_netdev_flow *netdev_flow = NULL;
3034
3035
0
    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
3036
0
    if (OVS_LIKELY(cls)) {
3037
0
        dpcls_lookup(cls, &key, &rule, 1, lookup_num_p);
3038
0
        netdev_flow = dp_netdev_flow_cast(rule);
3039
0
    }
3040
0
    return netdev_flow;
3041
0
}
3042
3043
static struct dp_netdev_flow *
3044
dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
3045
                        const ovs_u128 *ufidp, const struct nlattr *key,
3046
                        size_t key_len)
3047
0
{
3048
0
    struct dp_netdev_flow *netdev_flow;
3049
0
    struct flow flow;
3050
0
    ovs_u128 ufid;
3051
3052
    /* If a UFID is not provided, determine one based on the key. */
3053
0
    if (!ufidp && key && key_len
3054
0
        && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) {
3055
0
        odp_flow_key_hash(&flow, sizeof flow, &ufid);
3056
0
        ufidp = &ufid;
3057
0
    }
3058
3059
0
    if (ufidp) {
3060
0
        CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
3061
0
                                 &pmd->flow_table) {
3062
0
            if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
3063
0
                return netdev_flow;
3064
0
            }
3065
0
        }
3066
0
    }
3067
3068
0
    return NULL;
3069
0
}
3070
3071
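/* Fills 'stats' (and optionally 'attrs') for 'netdev_flow_'.  If an offload
 * provider also reports statistics for the flow, they are folded in: packet
 * and byte counts are summed, 'used' keeps the most recent timestamp, and
 * the TCP flags are OR'ed together. */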
static void
3072
get_dpif_flow_status(const struct dp_netdev *dp,
3073
                     const struct dp_netdev_flow *netdev_flow_,
3074
                     struct dpif_flow_stats *stats,
3075
                     struct dpif_flow_attrs *attrs)
3076
0
{
3077
0
    struct dpif_flow_stats offload_stats;
3078
0
    struct dpif_flow_attrs offload_attrs;
3079
0
    struct dp_netdev_flow *netdev_flow;
3080
0
    unsigned long long n;
3081
0
    long long used;
3082
0
    uint16_t flags;
3083
3084
0
    netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);
3085
3086
0
    atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
3087
0
    stats->n_packets = n;
3088
0
    atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
3089
0
    stats->n_bytes = n;
3090
0
    atomic_read_relaxed(&netdev_flow->stats.used, &used);
3091
0
    stats->used = used;
3092
0
    atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
3093
0
    stats->tcp_flags = flags;
3094
3095
0
    if (dpif_offload_datapath_flow_stats(dp->full_name,
3096
0
                                         netdev_flow->flow.in_port.odp_port,
3097
0
                                         &netdev_flow->mega_ufid,
3098
0
                                         &offload_stats, &offload_attrs)) {
3099
0
        stats->n_packets += offload_stats.n_packets;
3100
0
        stats->n_bytes += offload_stats.n_bytes;
3101
0
        stats->used = MAX(stats->used, offload_stats.used);
3102
0
        stats->tcp_flags |= offload_stats.tcp_flags;
3103
0
        if (attrs) {
3104
0
            attrs->offloaded = offload_attrs.offloaded;
3105
0
            attrs->dp_layer = offload_attrs.dp_layer;
3106
0
        }
3107
0
    } else if (attrs) {
3108
0
        attrs->offloaded = false;
3109
0
        attrs->dp_layer = "ovs";
3110
0
    }
3111
0
}
3112
3113
/* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
3114
 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
3115
 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
3116
 * protect them. */
3117
static void
3118
dp_netdev_flow_to_dpif_flow(const struct dp_netdev *dp,
3119
                            const struct dp_netdev_flow *netdev_flow,
3120
                            struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
3121
                            struct dpif_flow *flow, bool terse)
3122
0
{
3123
0
    if (terse) {
3124
0
        memset(flow, 0, sizeof *flow);
3125
0
    } else {
3126
0
        struct flow_wildcards wc;
3127
0
        struct dp_netdev_actions *actions;
3128
0
        size_t offset;
3129
0
        struct odp_flow_key_parms odp_parms = {
3130
0
            .flow = &netdev_flow->flow,
3131
0
            .mask = &wc.masks,
3132
0
            .support = dp_netdev_support,
3133
0
        };
3134
3135
0
        miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
3136
        /* in_port is exact matched, but we have left it out from the mask for
3137
         * optimization reasons. Add in_port back to the mask. */
3138
0
        wc.masks.in_port.odp_port = ODPP_NONE;
3139
3140
        /* Key */
3141
0
        offset = key_buf->size;
3142
0
        flow->key = ofpbuf_tail(key_buf);
3143
0
        odp_flow_key_from_flow(&odp_parms, key_buf);
3144
0
        flow->key_len = key_buf->size - offset;
3145
3146
        /* Mask */
3147
0
        offset = mask_buf->size;
3148
0
        flow->mask = ofpbuf_tail(mask_buf);
3149
0
        odp_parms.key_buf = key_buf;
3150
0
        odp_flow_key_from_mask(&odp_parms, mask_buf);
3151
0
        flow->mask_len = mask_buf->size - offset;
3152
3153
        /* Actions */
3154
0
        actions = dp_netdev_flow_get_actions(netdev_flow);
3155
0
        flow->actions = actions->actions;
3156
0
        flow->actions_len = actions->size;
3157
0
    }
3158
3159
0
    flow->ufid = netdev_flow->ufid;
3160
0
    flow->ufid_present = true;
3161
0
    flow->pmd_id = netdev_flow->pmd_id;
3162
3163
0
    get_dpif_flow_status(dp, netdev_flow, &flow->stats, &flow->attrs);
3164
0
    flow->attrs.dp_extra_info = netdev_flow->dp_extra_info;
3165
0
}
3166
3167
static int
3168
dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3169
                              const struct nlattr *mask_key,
3170
                              uint32_t mask_key_len, const struct flow *flow,
3171
                              struct flow_wildcards *wc, bool probe)
3172
0
{
3173
0
    enum odp_key_fitness fitness;
3174
3175
0
    fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow, NULL);
3176
0
    if (fitness) {
3177
0
        if (!probe) {
3178
            /* This should not happen: it indicates that
3179
             * odp_flow_key_from_mask() and odp_flow_key_to_mask()
3180
             * disagree on the acceptable form of a mask.  Log the problem
3181
             * as an error, with enough details to enable debugging. */
3182
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3183
3184
0
            if (!VLOG_DROP_ERR(&rl)) {
3185
0
                struct ds s;
3186
3187
0
                ds_init(&s);
3188
0
                odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
3189
0
                                true, true);
3190
0
                VLOG_ERR("internal error parsing flow mask %s (%s)",
3191
0
                ds_cstr(&s), odp_key_fitness_to_string(fitness));
3192
0
                ds_destroy(&s);
3193
0
            }
3194
0
        }
3195
3196
0
        return EINVAL;
3197
0
    }
3198
3199
0
    return 0;
3200
0
}
3201
3202
static int
3203
dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3204
                              struct flow *flow, bool probe)
3205
0
{
3206
0
    if (odp_flow_key_to_flow(key, key_len, flow, NULL)) {
3207
0
        if (!probe) {
3208
            /* This should not happen: it indicates that
3209
             * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on
3210
             * the acceptable form of a flow.  Log the problem as an error,
3211
             * with enough details to enable debugging. */
3212
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3213
3214
0
            if (!VLOG_DROP_ERR(&rl)) {
3215
0
                struct ds s;
3216
3217
0
                ds_init(&s);
3218
0
                odp_flow_format(key, key_len, NULL, 0, NULL, &s, true, false);
3219
0
                VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
3220
0
                ds_destroy(&s);
3221
0
            }
3222
0
        }
3223
3224
0
        return EINVAL;
3225
0
    }
3226
3227
0
    if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
3228
0
        return EINVAL;
3229
0
    }
3230
3231
0
    return 0;
3232
0
}
3233
3234
static int
3235
dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
3236
0
{
3237
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
3238
0
    struct dp_netdev_flow *netdev_flow;
3239
0
    struct dp_netdev_pmd_thread *pmd;
3240
0
    struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
3241
0
    struct hmapx_node *node;
3242
0
    int error = EINVAL;
3243
3244
0
    if (get->pmd_id == PMD_ID_NULL) {
3245
0
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3246
0
            if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
3247
0
                dp_netdev_pmd_unref(pmd);
3248
0
            }
3249
0
        }
3250
0
    } else {
3251
0
        pmd = dp_netdev_get_pmd(dp, get->pmd_id);
3252
0
        if (!pmd) {
3253
0
            goto out;
3254
0
        }
3255
0
        hmapx_add(&to_find, pmd);
3256
0
    }
3257
3258
0
    if (!hmapx_count(&to_find)) {
3259
0
        goto out;
3260
0
    }
3261
3262
0
    HMAPX_FOR_EACH (node, &to_find) {
3263
0
        pmd = (struct dp_netdev_pmd_thread *) node->data;
3264
0
        netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
3265
0
                                              get->key_len);
3266
0
        if (netdev_flow) {
3267
0
            dp_netdev_flow_to_dpif_flow(dp, netdev_flow, get->buffer,
3268
0
                                        get->buffer, get->flow, false);
3269
0
            error = 0;
3270
0
            break;
3271
0
        } else {
3272
0
            error = ENOENT;
3273
0
        }
3274
0
    }
3275
3276
0
    HMAPX_FOR_EACH (node, &to_find) {
3277
0
        pmd = (struct dp_netdev_pmd_thread *) node->data;
3278
0
        dp_netdev_pmd_unref(pmd);
3279
0
    }
3280
0
out:
3281
0
    hmapx_destroy(&to_find);
3282
0
    return error;
3283
0
}
3284
3285
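/* Computes the "mega UFID": a hash of the flow with its wildcard mask
 * applied, so all datapath flows that collapse onto the same megaflow share
 * it.  offload_flow_put() below uses it as the key passed to the offload
 * provider. */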
static void
3286
dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid)
3287
0
{
3288
0
    struct flow masked_flow;
3289
0
    size_t i;
3290
3291
0
    for (i = 0; i < sizeof(struct flow); i++) {
3292
0
        ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] &
3293
0
                                       ((uint8_t *)&match->wc)[i];
3294
0
    }
3295
0
    odp_flow_key_hash(&masked_flow, sizeof masked_flow, mega_ufid);
3296
0
}
3297
3298
uint64_t
3299
dp_netdev_simple_match_mark(odp_port_t in_port, ovs_be16 dl_type,
3300
                            uint8_t nw_frag, ovs_be16 vlan_tci)
3301
0
{
3302
    /* Simple Match Mark:
3303
     *
3304
     * BE:
3305
     * +-----------------+-------------++---------+---+-----------+
3306
     * |     in_port     |   dl_type   || nw_frag |CFI|  VID(12)  |
3307
     * +-----------------+-------------++---------+---+-----------+
3308
     * 0                 32          47 49         51  52     63
3309
     *
3310
     * LE:
3311
     * +-----------------+-------------+------++-------+---+------+
3312
     * |     in_port     |   dl_type   |VID(8)||nw_frag|CFI|VID(4)|
3313
     * +-----------------+-------------+------++-------+---+------+
3314
     * 0                 32          47 48  55  57   59 60  61   63
3315
     *
3316
     *         Big Endian              Little Endian
3317
     * in_port : 32 bits [ 0..31]  in_port : 32 bits [ 0..31]
3318
     * dl_type : 16 bits [32..47]  dl_type : 16 bits [32..47]
3319
     * <empty> :  1 bit  [48..48]  vlan VID:  8 bits [48..55]
3320
     * nw_frag :  2 bits [49..50]  <empty> :  1 bit  [56..56]
3321
     * vlan CFI:  1 bit  [51..51]  nw_frag :  2 bits [57..59]
3322
     * vlan VID: 12 bits [52..63]  vlan CFI:  1 bit  [60..60]
3323
     *                             vlan VID:  4 bits [61..63]
3324
     *
3325
     * Layout is different for LE and BE in order to save a couple of
3326
     * network to host translations.
3327
     * */
3328
0
    return ((uint64_t) odp_to_u32(in_port) << 32)
3329
0
           | ((OVS_FORCE uint32_t) dl_type << 16)
3330
#if WORDS_BIGENDIAN
3331
           | (((uint16_t) nw_frag & FLOW_NW_FRAG_MASK) << VLAN_PCP_SHIFT)
3332
#else
3333
0
           | ((nw_frag & FLOW_NW_FRAG_MASK) << (VLAN_PCP_SHIFT - 8))
3334
0
#endif
3335
0
           | (OVS_FORCE uint16_t) (vlan_tci & htons(VLAN_VID_MASK | VLAN_CFI));
3336
0
}
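/* A minimal sketch (not from the original file): composing a mark for a
 * hypothetical untagged IPv4 packet received on ODP port 3 and hashing it the
 * same way dp_netdev_simple_match_lookup() below does.  The helper name, port
 * number and EtherType are example values only. */
static inline void
simple_match_mark_example(void)
{
    uint64_t mark = dp_netdev_simple_match_mark(u32_to_odp(3),
                                                htons(ETH_TYPE_IP),
                                                0, htons(0));
    uint32_t hash = hash_uint64(mark);    /* Selects the cmap bucket. */

    VLOG_DBG("example simple-match mark 0x%016"PRIx64", hash 0x%08"PRIx32,
             mark, hash);
}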
3337
3338
struct dp_netdev_flow *
3339
dp_netdev_simple_match_lookup(const struct dp_netdev_pmd_thread *pmd,
3340
                              odp_port_t in_port, ovs_be16 dl_type,
3341
                              uint8_t nw_frag, ovs_be16 vlan_tci)
3342
0
{
3343
0
    uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type,
3344
0
                                                nw_frag, vlan_tci);
3345
0
    uint32_t hash = hash_uint64(mark);
3346
0
    struct dp_netdev_flow *flow;
3347
0
    bool found = false;
3348
3349
0
    CMAP_FOR_EACH_WITH_HASH (flow, simple_match_node,
3350
0
                             hash, &pmd->simple_match_table) {
3351
0
        if (flow->simple_match_mark == mark) {
3352
0
            found = true;
3353
0
            break;
3354
0
        }
3355
0
    }
3356
0
    return found ? flow : NULL;
3357
0
}
3358
3359
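/* Simple match can be used for an input port only when every datapath flow
 * installed on that port is a simple-match flow, i.e. when the two per-port
 * counters compared below are equal. */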
bool
3360
dp_netdev_simple_match_enabled(const struct dp_netdev_pmd_thread *pmd,
3361
                               odp_port_t in_port)
3362
0
{
3363
0
    return ccmap_find(&pmd->n_flows, odp_to_u32(in_port))
3364
0
           == ccmap_find(&pmd->n_simple_flows, odp_to_u32(in_port));
3365
0
}
3366
3367
static void
3368
dp_netdev_simple_match_insert(struct dp_netdev_pmd_thread *pmd,
3369
                              struct dp_netdev_flow *dp_flow)
3370
    OVS_REQUIRES(pmd->flow_mutex)
3371
0
{
3372
0
    odp_port_t in_port = dp_flow->flow.in_port.odp_port;
3373
0
    ovs_be16 vlan_tci = dp_flow->flow.vlans[0].tci;
3374
0
    ovs_be16 dl_type = dp_flow->flow.dl_type;
3375
0
    uint8_t nw_frag = dp_flow->flow.nw_frag;
3376
3377
0
    if (!dp_netdev_flow_ref(dp_flow)) {
3378
0
        return;
3379
0
    }
3380
3381
    /* Avoid double insertion.  Should not happen in practice. */
3382
0
    dp_netdev_simple_match_remove(pmd, dp_flow);
3383
3384
0
    uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type,
3385
0
                                                nw_frag, vlan_tci);
3386
0
    uint32_t hash = hash_uint64(mark);
3387
3388
0
    dp_flow->simple_match_mark = mark;
3389
0
    cmap_insert(&pmd->simple_match_table,
3390
0
                CONST_CAST(struct cmap_node *, &dp_flow->simple_match_node),
3391
0
                hash);
3392
0
    ccmap_inc(&pmd->n_simple_flows, odp_to_u32(in_port));
3393
3394
0
    VLOG_DBG("Simple match insert: "
3395
0
             "core_id(%d),in_port(%"PRIu32"),mark(0x%016"PRIx64").",
3396
0
             pmd->core_id, in_port, mark);
3397
0
}
3398
3399
static void
3400
dp_netdev_simple_match_remove(struct dp_netdev_pmd_thread *pmd,
3401
                               struct dp_netdev_flow *dp_flow)
3402
    OVS_REQUIRES(pmd->flow_mutex)
3403
0
{
3404
0
    odp_port_t in_port = dp_flow->flow.in_port.odp_port;
3405
0
    ovs_be16 vlan_tci = dp_flow->flow.vlans[0].tci;
3406
0
    ovs_be16 dl_type = dp_flow->flow.dl_type;
3407
0
    uint8_t nw_frag = dp_flow->flow.nw_frag;
3408
0
    struct dp_netdev_flow *flow;
3409
0
    uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type,
3410
0
                                                nw_frag, vlan_tci);
3411
0
    uint32_t hash = hash_uint64(mark);
3412
3413
0
    flow = dp_netdev_simple_match_lookup(pmd, in_port, dl_type,
3414
0
                                         nw_frag, vlan_tci);
3415
0
    if (flow == dp_flow) {
3416
0
        VLOG_DBG("Simple match remove: "
3417
0
                 "core_id(%d),in_port(%"PRIu32"),mark(0x%016"PRIx64").",
3418
0
                 pmd->core_id, in_port, mark);
3419
0
        cmap_remove(&pmd->simple_match_table,
3420
0
                    CONST_CAST(struct cmap_node *, &flow->simple_match_node),
3421
0
                    hash);
3422
0
        ccmap_dec(&pmd->n_simple_flows, odp_to_u32(in_port));
3423
0
        dp_netdev_flow_unref(flow);
3424
0
    }
3425
0
}
3426
3427
static bool
3428
dp_netdev_flow_is_simple_match(const struct match *match)
3429
0
{
3430
0
    const struct flow *flow = &match->flow;
3431
0
    const struct flow_wildcards *wc = &match->wc;
3432
3433
0
    if (flow->recirc_id || flow->packet_type != htonl(PT_ETH)) {
3434
0
        return false;
3435
0
    }
3436
3437
    /* Check that the flow matches only the minimal set of fields that are always set.
3438
     * Also checking that VLAN VID+CFI is an exact match, because these
3439
     * are not mandatory and could be masked. */
3440
0
    struct flow_wildcards *minimal = xmalloc(sizeof *minimal);
3441
0
    ovs_be16 vlan_tci_mask = htons(VLAN_VID_MASK | VLAN_CFI);
3442
3443
0
    flow_wildcards_init_catchall(minimal);
3444
    /* 'dpif-netdev' always has the following in exact match:
3445
     *   - recirc_id                   <-- recirc_id == 0 checked on input.
3446
     *   - in_port                     <-- Will be checked on input.
3447
     *   - packet_type                 <-- Assuming all packets are PT_ETH.
3448
     *   - dl_type                     <-- Need to match with.
3449
     *   - vlan_tci                    <-- Need to match with.
3450
     *   - and nw_frag for ip packets. <-- Need to match with.
3451
     */
3452
0
    WC_MASK_FIELD(minimal, recirc_id);
3453
0
    WC_MASK_FIELD(minimal, in_port);
3454
0
    WC_MASK_FIELD(minimal, packet_type);
3455
0
    WC_MASK_FIELD(minimal, dl_type);
3456
0
    WC_MASK_FIELD_MASK(minimal, vlans[0].tci, vlan_tci_mask);
3457
0
    WC_MASK_FIELD_MASK(minimal, nw_frag, FLOW_NW_FRAG_MASK);
3458
3459
0
    if (flow_wildcards_has_extra(minimal, wc)
3460
0
        || wc->masks.vlans[0].tci != vlan_tci_mask) {
3461
0
        free(minimal);
3462
0
        return false;
3463
0
    }
3464
0
    free(minimal);
3465
3466
0
    return true;
3467
0
}
3468
3469
static void
3470
offload_flow_put_resume(struct dp_netdev *dp, struct dp_netdev_flow *flow,
3471
                        struct dp_netdev_flow *previous_flow_reference,
3472
                        unsigned pmd_id, int error)
3473
0
{
3474
0
    if (error == EINPROGRESS) {
3475
0
        return;
3476
0
    }
3477
3478
0
    if (!error) {
3479
0
        flow->offloaded = true;
3480
0
    } else {
3481
        /* If the flow was already offloaded, the new action set can no
3482
         * longer be offloaded.  In theory, we should disassociate the
3483
         * offload from all PMDs that have this flow marked as offloaded.
3484
         * Unfortunately, there is no mechanism to inform other PMDs, so
3485
         * we cannot explicitly mark such flows.  This situation typically
3486
         * occurs when the revalidator modifies the flow, so it is safe to
3487
         * assume it will update all affected flows and that the offload
3488
         * will subsequently fail. */
3489
0
        flow->offloaded = false;
3490
3491
        /* On error, the flow reference was not stored by the offload provider,
3492
         * so we should decrease the reference. */
3493
0
        dp_netdev_flow_unref(flow);
3494
0
    }
3495
3496
0
    if (offload_queue_dec(flow) && flow->dead) {
3497
        /* If flows are processed asynchronously, modifications might
3498
         * still be queued up while the flow is being removed.  If this
3499
         * was the last flow in the queue on a dead flow, we try again
3500
         * to see if we need to remove this flow. */
3501
0
        offload_flow_del(dp, pmd_id, flow);
3502
0
    }
3503
3504
0
    if (previous_flow_reference) {
3505
0
        dp_netdev_flow_unref(previous_flow_reference);
3506
0
        if (previous_flow_reference != flow) {
3507
0
            VLOG_DBG("Updated flow reference was from outdated flow");
3508
0
        }
3509
0
    }
3510
0
}
3511
3512
static void
3513
offload_flow_put_resume_cb(void *aux, struct dpif_flow_stats *stats OVS_UNUSED,
3514
                           unsigned pmd_id, void *flow_reference_,
3515
                           void *old_flow_reference_,
3516
                           int error)
3517
0
{
3518
0
    struct dp_netdev *dp = aux;
3519
0
    struct dp_netdev_flow *flow_reference = flow_reference_;
3520
0
    struct dp_netdev_flow *old_flow_reference = old_flow_reference_;
3521
3522
0
    offload_flow_put_resume(dp, flow_reference, old_flow_reference,
3523
0
                            pmd_id, error);
3524
0
}
3525
3526
static void
3527
offload_flow_put(struct dp_netdev_pmd_thread *pmd, struct dp_netdev_flow *flow,
3528
                 struct match *match, const struct nlattr *actions,
3529
                 size_t actions_len)
3530
0
{
3531
0
    struct dpif_offload_flow_put put = {
3532
0
        .in_port = match->flow.in_port.odp_port,
3533
0
        .orig_in_port = flow->orig_in_port,
3534
0
        .pmd_id = pmd->core_id,
3535
0
        .ufid = CONST_CAST(ovs_u128 *, &flow->mega_ufid),
3536
0
        .match = match,
3537
0
        .actions = actions,
3538
0
        .actions_len = actions_len,
3539
0
        .stats = NULL,
3540
0
        .flow_reference = flow,
3541
0
        .cb_data = {
3542
0
            .callback = offload_flow_put_resume_cb,
3543
0
            .callback_aux = pmd->dp,
3544
0
        },
3545
0
    };
3546
0
    void *previous_flow_reference = NULL;
3547
0
    int error;
3548
3549
0
    if (!dpif_offload_enabled() || flow->dead || !offload_queue_inc(flow)) {
3550
0
        return;
3551
0
    }
3552
3553
0
    dp_netdev_flow_ref(flow);
3554
3555
0
    error = dpif_offload_datapath_flow_put(pmd->dp->full_name, &put,
3556
0
                                           &previous_flow_reference);
3557
0
    offload_flow_put_resume(pmd->dp, put.flow_reference,
3558
0
                            previous_flow_reference,
3559
0
                            pmd->core_id, error);
3560
0
}
3561
3562
static struct dp_netdev_flow *
3563
dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
3564
                   struct match *match, const ovs_u128 *ufid,
3565
                   const struct nlattr *actions, size_t actions_len,
3566
                   odp_port_t orig_in_port)
3567
    OVS_REQUIRES(pmd->flow_mutex)
3568
0
{
3569
0
    struct ds extra_info = DS_EMPTY_INITIALIZER;
3570
0
    struct dp_netdev_flow *flow;
3571
0
    struct netdev_flow_key mask;
3572
0
    struct dpcls *cls;
3573
0
    size_t unit;
3574
3575
    /* Make sure in_port is exact matched before we read it. */
3576
0
    ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
3577
0
    odp_port_t in_port = match->flow.in_port.odp_port;
3578
3579
    /* As we select the dpcls based on the port number, each netdev flow
3580
     * belonging to the same dpcls will have the same odp_port value.
3581
     * For performance reasons we wildcard odp_port here in the mask.  In the
3582
     * typical case dp_hash is also wildcarded, and the resulting 8-byte
3583
     * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
3584
     * will not be part of the subtable mask.
3585
     * This will speed up the hash computation during dpcls_lookup() because
3586
     * there is one less call to hash_add64() in this case. */
3587
0
    match->wc.masks.in_port.odp_port = 0;
3588
0
    netdev_flow_mask_init(&mask, match);
3589
0
    match->wc.masks.in_port.odp_port = ODPP_NONE;
3590
3591
    /* Make sure wc does not have metadata. */
3592
0
    ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
3593
0
               && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));
3594
3595
    /* Do not allocate extra space. */
3596
0
    flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
3597
0
    memset(&flow->stats, 0, sizeof flow->stats);
3598
0
    flow->dead = false;
3599
0
    flow->offloaded = false;
3600
0
    atomic_init(&flow->offload_queue_depth, 0);
3601
0
    flow->batch = NULL;
3602
0
    flow->orig_in_port = orig_in_port;
3603
0
    *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
3604
0
    *CONST_CAST(struct flow *, &flow->flow) = match->flow;
3605
0
    *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
3606
0
    ovs_refcount_init(&flow->ref_cnt);
3607
0
    ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));
3608
3609
0
    dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid));
3610
0
    netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);
3611
3612
    /* Select dpcls for in_port. Relies on in_port to be exact match. */
3613
0
    cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
3614
0
    dpcls_insert(cls, &flow->cr, &mask);
3615
3616
0
    ds_put_cstr(&extra_info, "miniflow_bits(");
3617
0
    FLOWMAP_FOR_EACH_UNIT (unit) {
3618
0
        if (unit) {
3619
0
            ds_put_char(&extra_info, ',');
3620
0
        }
3621
0
        ds_put_format(&extra_info, "%d",
3622
0
                      count_1bits(flow->cr.mask->mf.map.bits[unit]));
3623
0
    }
3624
0
    ds_put_char(&extra_info, ')');
3625
0
    flow->dp_extra_info = ds_steal_cstr(&extra_info);
3626
0
    ds_destroy(&extra_info);
3627
3628
0
    cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
3629
0
                dp_netdev_flow_hash(&flow->ufid));
3630
0
    ccmap_inc(&pmd->n_flows, odp_to_u32(in_port));
3631
3632
0
    if (dp_netdev_flow_is_simple_match(match)) {
3633
0
        dp_netdev_simple_match_insert(pmd, flow);
3634
0
    }
3635
3636
0
    offload_flow_put(pmd, flow, match, actions, actions_len);
3637
0
    log_netdev_flow_change(flow, match, NULL, actions, actions_len);
3638
3639
0
    return flow;
3640
0
}
3641
3642
static int
3643
flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
3644
                struct netdev_flow_key *key,
3645
                struct match *match,
3646
                ovs_u128 *ufid,
3647
                const struct dpif_flow_put *put,
3648
                struct dpif_flow_stats *stats)
3649
0
{
3650
0
    struct dp_netdev_flow *netdev_flow = NULL;
3651
0
    int error = 0;
3652
3653
0
    if (stats) {
3654
0
        memset(stats, 0, sizeof *stats);
3655
0
    }
3656
3657
0
    ovs_mutex_lock(&pmd->flow_mutex);
3658
0
    if (put->ufid) {
3659
0
        netdev_flow = dp_netdev_pmd_find_flow(pmd, put->ufid,
3660
0
                                              put->key, put->key_len);
3661
0
    } else {
3662
        /* Use key instead of the locally generated ufid
3663
         * to search netdev_flow. */
3664
0
        netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
3665
0
    }
3666
3667
0
    if (put->flags & DPIF_FP_CREATE) {
3668
0
        if (!netdev_flow) {
3669
0
            dp_netdev_flow_add(pmd, match, ufid,
3670
0
                               put->actions, put->actions_len, ODPP_NONE);
3671
0
        } else {
3672
0
            error = EEXIST;
3673
0
        }
3674
0
        goto exit;
3675
0
    }
3676
3677
0
    if (put->flags & DPIF_FP_MODIFY) {
3678
0
        if (!netdev_flow) {
3679
0
            error = ENOENT;
3680
0
        } else {
3681
0
            if (!put->ufid && !flow_equal(&match->flow, &netdev_flow->flow)) {
3682
                /* Overlapping flow. */
3683
0
                error = EINVAL;
3684
0
                goto exit;
3685
0
            }
3686
3687
0
            struct dp_netdev_actions *new_actions;
3688
0
            struct dp_netdev_actions *old_actions;
3689
3690
0
            new_actions = dp_netdev_actions_create(put->actions,
3691
0
                                                   put->actions_len);
3692
3693
0
            old_actions = dp_netdev_flow_get_actions(netdev_flow);
3694
0
            ovsrcu_set(&netdev_flow->actions, new_actions);
3695
3696
0
            offload_flow_put(pmd, netdev_flow, match, put->actions,
3697
0
                             put->actions_len);
3698
0
            log_netdev_flow_change(netdev_flow, match, old_actions,
3699
0
                                   put->actions, put->actions_len);
3700
3701
0
            if (stats) {
3702
0
                get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
3703
0
            }
3704
0
            if (put->flags & DPIF_FP_ZERO_STATS) {
3705
                /* XXX: The userspace datapath uses thread local statistics
3706
                 * (for flows), which should be updated only by the owning
3707
                 * thread.  Since we cannot write on stats memory here,
3708
                 * we choose not to support this flag.  Please note:
3709
                 * - This feature is currently used only by dpctl commands with
3710
                 *   option --clear.
3711
                 * - Should the need arise, this operation can be implemented
3712
                 *   by keeping a base value (to be updated here) for each
3713
                 *   counter, and subtracting it before outputting the stats */
3714
0
                error = EOPNOTSUPP;
3715
0
            }
3716
0
            ovsrcu_postpone(dp_netdev_actions_free, old_actions);
3717
0
        }
3718
0
    }
3719
3720
0
exit:
3721
0
    ovs_mutex_unlock(&pmd->flow_mutex);
3722
0
    return error;
3723
0
}
3724
3725
static int
3726
dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
3727
0
{
3728
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
3729
0
    struct netdev_flow_key key;
3730
0
    struct dp_netdev_pmd_thread *pmd;
3731
0
    struct match match;
3732
0
    ovs_u128 ufid;
3733
0
    int error;
3734
0
    bool probe = put->flags & DPIF_FP_PROBE;
3735
3736
0
    if (put->stats) {
3737
0
        memset(put->stats, 0, sizeof *put->stats);
3738
0
    }
3739
0
    error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
3740
0
                                          probe);
3741
0
    if (error) {
3742
0
        return error;
3743
0
    }
3744
0
    error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
3745
0
                                          put->mask, put->mask_len,
3746
0
                                          &match.flow, &match.wc, probe);
3747
0
    if (error) {
3748
0
        return error;
3749
0
    }
3750
3751
0
    if (match.wc.masks.in_port.odp_port != ODPP_NONE) {
3752
0
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3753
3754
0
        VLOG_ERR_RL(&rl, "failed to put%s flow: in_port is not an exact match",
3755
0
                    (put->flags & DPIF_FP_CREATE) ? "[create]"
3756
0
                    : (put->flags & DPIF_FP_MODIFY) ? "[modify]" : "[zero]");
3757
0
        return EINVAL;
3758
0
    }
3759
3760
0
    if (put->ufid) {
3761
0
        ufid = *put->ufid;
3762
0
    } else {
3763
0
        odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
3764
0
    }
3765
3766
    /* The Netlink encoding of datapath flow keys cannot express
3767
     * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
3768
     * tag is interpreted as exact match on the fact that there is no
3769
     * VLAN.  Unless we refactor a lot of code that translates between
3770
     * Netlink and struct flow representations, we have to do the same
3771
     * here.  This must be in sync with 'match' in handle_packet_upcall(). */
3772
0
    if (!match.wc.masks.vlans[0].tci) {
3773
0
        match.wc.masks.vlans[0].tci = htons(VLAN_VID_MASK | VLAN_CFI);
3774
0
    }
3775
3776
    /* Must produce a netdev_flow_key for lookup.
3777
     * Use the same method as employed to create the key when adding
3778
     * the flow to the dpcls to make sure they match.
3779
     * We need to put in the unmasked key as flow_put_on_pmd() will first try
3780
     * to see if an entry exists doing a packet type lookup. As masked-out
3781
     * fields are interpreted as zeros, they could falsely match a wider IP
3782
     * address mask. Installation of the flow will use the match variable. */
3783
0
    netdev_flow_key_init(&key, &match.flow);
3784
3785
0
    if (put->pmd_id == PMD_ID_NULL) {
3786
0
        if (cmap_count(&dp->poll_threads) == 0) {
3787
0
            return EINVAL;
3788
0
        }
3789
0
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3790
0
            struct dpif_flow_stats pmd_stats;
3791
0
            int pmd_error;
3792
3793
0
            pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
3794
0
                                        &pmd_stats);
3795
0
            if (pmd_error) {
3796
0
                error = pmd_error;
3797
0
            } else if (put->stats) {
3798
0
                put->stats->n_packets += pmd_stats.n_packets;
3799
0
                put->stats->n_bytes += pmd_stats.n_bytes;
3800
0
                put->stats->used = MAX(put->stats->used, pmd_stats.used);
3801
0
                put->stats->tcp_flags |= pmd_stats.tcp_flags;
3802
0
            }
3803
0
        }
3804
0
    } else {
3805
0
        pmd = dp_netdev_get_pmd(dp, put->pmd_id);
3806
0
        if (!pmd) {
3807
0
            return EINVAL;
3808
0
        }
3809
0
        error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
3810
0
        dp_netdev_pmd_unref(pmd);
3811
0
    }
3812
3813
0
    return error;
3814
0
}
3815
3816
static int
3817
flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
3818
                struct dpif_flow_stats *stats,
3819
                const struct dpif_flow_del *del)
3820
0
{
3821
0
    struct dp_netdev_flow *netdev_flow;
3822
0
    int error = 0;
3823
3824
0
    ovs_mutex_lock(&pmd->flow_mutex);
3825
0
    netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
3826
0
                                          del->key_len);
3827
0
    if (netdev_flow) {
3828
0
        if (stats) {
3829
0
            get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
3830
0
        }
3831
0
        dp_netdev_pmd_remove_flow(pmd, netdev_flow);
3832
0
    } else {
3833
0
        error = ENOENT;
3834
0
    }
3835
0
    ovs_mutex_unlock(&pmd->flow_mutex);
3836
3837
0
    return error;
3838
0
}
3839
3840
static int
3841
dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
3842
0
{
3843
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
3844
0
    struct dp_netdev_pmd_thread *pmd;
3845
0
    int error = 0;
3846
3847
0
    if (del->stats) {
3848
0
        memset(del->stats, 0, sizeof *del->stats);
3849
0
    }
3850
3851
0
    if (del->pmd_id == PMD_ID_NULL) {
3852
0
        if (cmap_count(&dp->poll_threads) == 0) {
3853
0
            return EINVAL;
3854
0
        }
3855
0
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
3856
0
            struct dpif_flow_stats pmd_stats;
3857
0
            int pmd_error;
3858
3859
0
            pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
3860
0
            if (pmd_error) {
3861
0
                error = pmd_error;
3862
0
            } else if (del->stats) {
3863
0
                del->stats->n_packets += pmd_stats.n_packets;
3864
0
                del->stats->n_bytes += pmd_stats.n_bytes;
3865
0
                del->stats->used = MAX(del->stats->used, pmd_stats.used);
3866
0
                del->stats->tcp_flags |= pmd_stats.tcp_flags;
3867
0
            }
3868
0
        }
3869
0
    } else {
3870
0
        pmd = dp_netdev_get_pmd(dp, del->pmd_id);
3871
0
        if (!pmd) {
3872
0
            return EINVAL;
3873
0
        }
3874
0
        error = flow_del_on_pmd(pmd, del->stats, del);
3875
0
        dp_netdev_pmd_unref(pmd);
3876
0
    }
3877
3878
3879
0
    return error;
3880
0
}
3881
3882
struct dpif_netdev_flow_dump {
3883
    struct dpif_flow_dump up;
3884
    struct cmap_position poll_thread_pos;
3885
    struct cmap_position flow_pos;
3886
    struct dp_netdev_pmd_thread *cur_pmd;
3887
    int status;
3888
    struct ovs_mutex mutex;
3889
};
3890
3891
static struct dpif_netdev_flow_dump *
3892
dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
3893
0
{
3894
0
    return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
3895
0
}
3896
3897
static struct dpif_flow_dump *
3898
dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse,
3899
                             struct dpif_flow_dump_types *types)
3900
0
{
3901
0
    struct dpif_netdev_flow_dump *dump;
3902
3903
0
    dump = xzalloc(sizeof *dump);
3904
0
    dpif_flow_dump_init(&dump->up, dpif_, terse, types);
3905
0
    ovs_mutex_init(&dump->mutex);
3906
3907
0
    return &dump->up;
3908
0
}
3909
3910
static int
3911
dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
3912
0
{
3913
0
    struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3914
3915
0
    ovs_mutex_destroy(&dump->mutex);
3916
0
    free(dump);
3917
0
    return 0;
3918
0
}
3919
3920
struct dpif_netdev_flow_dump_thread {
3921
    struct dpif_flow_dump_thread up;
3922
    struct dpif_netdev_flow_dump *dump;
3923
    struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
3924
    struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
3925
};
3926
3927
static struct dpif_netdev_flow_dump_thread *
3928
dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
3929
0
{
3930
0
    return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
3931
0
}
3932
3933
static struct dpif_flow_dump_thread *
3934
dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
3935
0
{
3936
0
    struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3937
0
    struct dpif_netdev_flow_dump_thread *thread;
3938
3939
0
    thread = xmalloc(sizeof *thread);
3940
0
    dpif_flow_dump_thread_init(&thread->up, &dump->up);
3941
0
    thread->dump = dump;
3942
0
    return &thread->up;
3943
0
}
3944
3945
static void
3946
dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
3947
0
{
3948
0
    struct dpif_netdev_flow_dump_thread *thread
3949
0
        = dpif_netdev_flow_dump_thread_cast(thread_);
3950
3951
0
    free(thread);
3952
0
}
3953
3954
static int
3955
dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
3956
                           struct dpif_flow *flows, int max_flows)
3957
0
{
3958
0
    struct dpif_netdev_flow_dump_thread *thread
3959
0
        = dpif_netdev_flow_dump_thread_cast(thread_);
3960
0
    struct dpif_netdev_flow_dump *dump = thread->dump;
3961
0
    struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
3962
0
    struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dump->dpif);
3963
0
    struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
3964
0
    int n_flows = 0;
3965
0
    int i;
3966
3967
0
    ovs_mutex_lock(&dump->mutex);
3968
0
    if (!dump->status) {
3969
0
        struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
3970
0
        int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
3971
3972
        /* The first call to dump_next() extracts the first pmd thread.
3973
         * If there is no pmd thread, returns immediately. */
3974
0
        if (!pmd) {
3975
0
            pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
3976
0
            if (!pmd) {
3977
0
                ovs_mutex_unlock(&dump->mutex);
3978
0
                return n_flows;
3979
3980
0
            }
3981
0
        }
3982
3983
0
        do {
3984
0
            for (n_flows = 0; n_flows < flow_limit; n_flows++) {
3985
0
                struct cmap_node *node;
3986
3987
0
                node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
3988
0
                if (!node) {
3989
0
                    break;
3990
0
                }
3991
0
                netdev_flows[n_flows] = CONTAINER_OF(node,
3992
0
                                                     struct dp_netdev_flow,
3993
0
                                                     node);
3994
0
            }
3995
            /* When we finish dumping the current pmd thread, move on to
3996
             * the next one. */
3997
0
            if (n_flows < flow_limit) {
3998
0
                memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
3999
0
                dp_netdev_pmd_unref(pmd);
4000
0
                pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
4001
0
                if (!pmd) {
4002
0
                    dump->status = EOF;
4003
0
                    break;
4004
0
                }
4005
0
            }
4006
            /* Keep the reference for the next caller. */
4007
0
            dump->cur_pmd = pmd;
4008
4009
            /* If the current dump is empty, do not exit the loop, since the
4010
             * remaining pmds could have flows to be dumped.  Just dump again
4011
             * on the new 'pmd'. */
4012
0
        } while (!n_flows);
4013
0
    }
4014
0
    ovs_mutex_unlock(&dump->mutex);
4015
4016
0
    for (i = 0; i < n_flows; i++) {
4017
0
        struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
4018
0
        struct odputil_keybuf *keybuf = &thread->keybuf[i];
4019
0
        struct dp_netdev_flow *netdev_flow = netdev_flows[i];
4020
0
        struct dpif_flow *f = &flows[i];
4021
0
        struct ofpbuf key, mask;
4022
4023
0
        ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
4024
0
        ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
4025
0
        dp_netdev_flow_to_dpif_flow(dp, netdev_flow, &key, &mask, f,
4026
0
                                    dump->up.terse);
4027
0
    }
4028
4029
0
    return n_flows;
4030
0
}
4031
4032
static int
4033
dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
4034
    OVS_NO_THREAD_SAFETY_ANALYSIS
4035
0
{
4036
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
4037
0
    struct dp_netdev_pmd_thread *pmd;
4038
0
    struct dp_packet_batch pp;
4039
4040
0
    if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
4041
0
        dp_packet_size(execute->packet) > UINT16_MAX) {
4042
0
        return EINVAL;
4043
0
    }
4044
4045
    /* Tries finding the 'pmd'.  If NULL is returned, that means
4046
     * the current thread is a non-pmd thread and should use
4047
     * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
4048
0
    pmd = ovsthread_getspecific(dp->per_pmd_key);
4049
0
    if (!pmd) {
4050
0
        pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
4051
0
        if (!pmd) {
4052
0
            return EBUSY;
4053
0
        }
4054
0
    }
4055
4056
0
    if (execute->probe) {
4057
        /* If this is part of a probe, drop the packet, since executing
4058
         * the action may actually cause spurious packets to be sent into
4059
         * the network. */
4060
0
        if (pmd->core_id == NON_PMD_CORE_ID) {
4061
0
            dp_netdev_pmd_unref(pmd);
4062
0
        }
4063
0
        return 0;
4064
0
    }
4065
4066
    /* If the current thread is a non-pmd thread, acquire
4067
     * the 'non_pmd_mutex'. */
4068
0
    if (pmd->core_id == NON_PMD_CORE_ID) {
4069
0
        ovs_mutex_lock(&dp->non_pmd_mutex);
4070
0
    }
4071
4072
    /* Update current time in PMD context. We don't care about EMC insertion
4073
     * probability, because we are on a slow path. */
4074
0
    pmd_thread_ctx_time_update(pmd);
4075
4076
    /* The action processing expects the RSS hash to be valid, because
4077
     * it's always initialized at the beginning of datapath processing.
4078
     * In this case, though, 'execute->packet' may not have gone through
4079
     * the datapath at all, it may have been generated by the upper layer
4080
     * (OpenFlow packet-out, BFD frame, ...). */
4081
0
    if (!dp_packet_rss_valid(execute->packet)) {
4082
0
        dp_packet_set_rss_hash(execute->packet,
4083
0
                               flow_hash_5tuple(execute->flow, 0));
4084
0
    }
4085
4086
    /* Making a copy because the packet might be stolen during the execution
4087
     * and the caller might still need it. */
4088
0
    struct dp_packet *packet_clone = dp_packet_clone(execute->packet);
4089
0
    dp_packet_batch_init_packet(&pp, packet_clone);
4090
0
    dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
4091
0
                              execute->actions, execute->actions_len);
4092
0
    dp_netdev_pmd_flush_output_packets(pmd, true);
4093
4094
0
    if (pmd->core_id == NON_PMD_CORE_ID) {
4095
0
        ovs_mutex_unlock(&dp->non_pmd_mutex);
4096
0
        dp_netdev_pmd_unref(pmd);
4097
0
    }
4098
4099
0
    if (dp_packet_batch_size(&pp) == 1) {
4100
        /* Packet wasn't dropped during the execution.  Swapping content with
4101
         * the original packet, because the caller might expect actions to
4102
         * modify it.  Using the packet from the batch instead of 'packet_clone'
4103
         * because it may be stolen and replaced by another packet, e.g. by
4104
         * the fragmentation engine. */
4105
0
        dp_packet_swap(execute->packet, pp.packets[0]);
4106
0
        dp_packet_delete_batch(&pp, true);
4107
0
    } else if (dp_packet_batch_size(&pp)) {
4108
        /* FIXME: We have more packets than expected.  Likely, we got IP
4109
         * fragments of the reassembled packet.  Dropping them here as we have
4110
         * no way to get them to the caller.  It might be that all the required
4111
         * actions with them are already executed, but it also might not be the
4112
         * case, e.g. if dpif_netdev_execute() was called to execute a single
4113
         * tunnel push. */
4114
0
        dp_packet_delete_batch(&pp, true);
4115
0
    }
4116
4117
0
    return 0;
4118
0
}
4119
4120
static void
4121
dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
4122
0
{
4123
0
    size_t i;
4124
4125
0
    for (i = 0; i < n_ops; i++) {
4126
0
        struct dpif_op *op = ops[i];
4127
4128
0
        switch (op->type) {
4129
0
        case DPIF_OP_FLOW_PUT:
4130
0
            op->error = dpif_netdev_flow_put(dpif, &op->flow_put);
4131
0
            break;
4132
4133
0
        case DPIF_OP_FLOW_DEL:
4134
0
            op->error = dpif_netdev_flow_del(dpif, &op->flow_del);
4135
0
            break;
4136
4137
0
        case DPIF_OP_EXECUTE:
4138
0
            op->error = dpif_netdev_execute(dpif, &op->execute);
4139
0
            break;
4140
4141
0
        case DPIF_OP_FLOW_GET:
4142
0
            op->error = dpif_netdev_flow_get(dpif, &op->flow_get);
4143
0
            break;
4144
0
        }
4145
0
    }
4146
0
}
4147
4148
/* Enable or disable PMD auto load balancing. */
4149
static void
4150
set_pmd_auto_lb(struct dp_netdev *dp, bool state, bool always_log)
4151
0
{
4152
0
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
4153
4154
0
    if (pmd_alb->is_enabled != state || always_log) {
4155
0
        pmd_alb->is_enabled = state;
4156
0
        if (pmd_alb->is_enabled) {
4157
0
            uint8_t rebalance_load_thresh;
4158
4159
0
            atomic_read_relaxed(&pmd_alb->rebalance_load_thresh,
4160
0
                                &rebalance_load_thresh);
4161
0
            VLOG_INFO("PMD auto load balance is enabled, "
4162
0
                      "interval %"PRIu64" mins, "
4163
0
                      "pmd load threshold %"PRIu8"%%, "
4164
0
                      "improvement threshold %"PRIu8"%%.",
4165
0
                       pmd_alb->rebalance_intvl / MIN_TO_MSEC,
4166
0
                       rebalance_load_thresh,
4167
0
                       pmd_alb->rebalance_improve_thresh);
4168
0
        } else {
4169
0
            pmd_alb->rebalance_poll_timer = 0;
4170
0
            VLOG_INFO("PMD auto load balance is disabled.");
4171
0
        }
4172
0
    }
4173
0
}
4174
4175
static int
4176
parse_pmd_sleep_list(const char *max_sleep_list,
4177
                     struct pmd_sleep **pmd_sleeps)
4178
0
{
4179
0
    char *list, *copy, *key, *value;
4180
0
    int num_vals = 0;
4181
4182
0
    if (!max_sleep_list) {
4183
0
        return num_vals;
4184
0
    }
4185
4186
0
    list = copy = xstrdup(max_sleep_list);
4187
4188
0
    while (ofputil_parse_key_value(&list, &key, &value)) {
4189
0
        uint64_t temp, pmd_max_sleep;
4190
0
        char *error = NULL;
4191
0
        unsigned core;
4192
0
        int i;
4193
4194
0
        error = str_to_u64(key, &temp);
4195
0
        if (error) {
4196
0
            free(error);
4197
0
            continue;
4198
0
        }
4199
4200
0
        if (value[0] == '\0') {
4201
            /* No value specified; the key is the datapath-wide default. */
4202
0
            core = UINT_MAX;
4203
0
            pmd_max_sleep = temp;
4204
0
        } else {
4205
0
            error = str_to_u64(value, &pmd_max_sleep);
4206
0
            if (!error && temp < UINT_MAX) {
4207
                /* Key is pmd core id. */
4208
0
                core = (unsigned) temp;
4209
0
            } else {
4210
0
                free(error);
4211
0
                continue;
4212
0
            }
4213
0
        }
4214
4215
        /* Detect duplicate max sleep values. */
4216
0
        for (i = 0; i < num_vals; i++) {
4217
0
            if ((*pmd_sleeps)[i].core_id == core) {
4218
0
                break;
4219
0
            }
4220
0
        }
4221
0
        if (i == num_vals) {
4222
            /* Not a duplicate; add a new entry. */
4223
0
            *pmd_sleeps = xrealloc(*pmd_sleeps,
4224
0
                                   (num_vals + 1) * sizeof **pmd_sleeps);
4225
0
            num_vals++;
4226
0
        }
4227
4228
0
        pmd_max_sleep = MIN(PMD_RCU_QUIESCE_INTERVAL, pmd_max_sleep);
4229
4230
0
        (*pmd_sleeps)[i].core_id = core;
4231
0
        (*pmd_sleeps)[i].max_sleep = pmd_max_sleep;
4232
0
    }
4233
4234
0
    free(copy);
4235
0
    return num_vals;
4236
0
}
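/* A minimal sketch (not from the original file) exercising the parser above
 * with the per-core "core:usecs" form accepted for pmd-sleep-max.  The input
 * string, helper name and log format are illustrative only. */
static inline void
pmd_sleep_list_example(void)
{
    struct pmd_sleep *sleeps = NULL;
    int n = parse_pmd_sleep_list("1:200,3:400", &sleeps);

    for (int i = 0; i < n; i++) {
        /* A core_id of UINT_MAX means "datapath-wide default". */
        VLOG_DBG("core %u -> max sleep %"PRIu64" us",
                 sleeps[i].core_id, sleeps[i].max_sleep);
    }
    free(sleeps);
}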
4237
4238
static void
4239
log_pmd_sleep(unsigned core_id, int numa_id, uint64_t pmd_max_sleep)
4240
0
{
4241
0
    if (core_id == NON_PMD_CORE_ID) {
4242
0
        return;
4243
0
    }
4244
0
    VLOG_INFO("PMD thread on numa_id: %d, core id: %2d, "
4245
0
              "max sleep: %4"PRIu64" us.", numa_id, core_id, pmd_max_sleep);
4246
0
}
4247
4248
static void
4249
pmd_init_max_sleep(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
4250
0
{
4251
0
    uint64_t max_sleep = dp->pmd_max_sleep_default;
4252
0
    struct pmd_sleep *pmd_sleeps = NULL;
4253
0
    int num_vals;
4254
4255
0
    num_vals = parse_pmd_sleep_list(dp->max_sleep_list, &pmd_sleeps);
4256
4257
    /* Check if the user has set a specific value for this pmd. */
4258
0
    for (int i = 0; i < num_vals; i++) {
4259
0
        if (pmd_sleeps[i].core_id == pmd->core_id) {
4260
0
            max_sleep = pmd_sleeps[i].max_sleep;
4261
0
            break;
4262
0
        }
4263
0
    }
4264
0
    atomic_init(&pmd->max_sleep, max_sleep);
4265
0
    log_pmd_sleep(pmd->core_id, pmd->numa_id, max_sleep);
4266
0
    free(pmd_sleeps);
4267
0
}
4268
4269
static bool
4270
assign_sleep_values_to_pmds(struct dp_netdev *dp, int num_vals,
4271
                            struct pmd_sleep *pmd_sleeps)
4272
0
{
4273
0
    struct dp_netdev_pmd_thread *pmd;
4274
0
    bool value_changed = false;
4275
4276
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4277
0
        uint64_t new_max_sleep, cur_pmd_max_sleep;
4278
4279
0
        if (pmd->core_id == NON_PMD_CORE_ID) {
4280
0
            continue;
4281
0
        }
4282
4283
        /* Default to global value. */
4284
0
        new_max_sleep = dp->pmd_max_sleep_default;
4285
4286
        /* Check for a pmd-specific value. */
4287
0
        for (int i = 0;  i < num_vals; i++) {
4288
0
            if (pmd->core_id == pmd_sleeps[i].core_id) {
4289
0
                new_max_sleep = pmd_sleeps[i].max_sleep;
4290
0
                break;
4291
0
            }
4292
0
        }
4293
0
        atomic_read_relaxed(&pmd->max_sleep, &cur_pmd_max_sleep);
4294
0
        if (new_max_sleep != cur_pmd_max_sleep) {
4295
0
            atomic_store_relaxed(&pmd->max_sleep, new_max_sleep);
4296
0
            value_changed = true;
4297
0
        }
4298
0
    }
4299
0
    return value_changed;
4300
0
}
4301
4302
static void
4303
log_all_pmd_sleeps(struct dp_netdev *dp)
4304
0
{
4305
0
    struct dp_netdev_pmd_thread **pmd_list = NULL;
4306
0
    struct dp_netdev_pmd_thread *pmd;
4307
0
    size_t n;
4308
4309
0
    VLOG_INFO("Default PMD thread max sleep: %4"PRIu64" us.",
4310
0
              dp->pmd_max_sleep_default);
4311
4312
0
    sorted_poll_thread_list(dp, &pmd_list, &n);
4313
4314
0
    for (size_t i = 0; i < n; i++) {
4315
0
        uint64_t cur_pmd_max_sleep;
4316
4317
0
        pmd = pmd_list[i];
4318
0
        atomic_read_relaxed(&pmd->max_sleep, &cur_pmd_max_sleep);
4319
0
        log_pmd_sleep(pmd->core_id, pmd->numa_id, cur_pmd_max_sleep);
4320
0
    }
4321
0
    free(pmd_list);
4322
0
}
4323
4324
static bool
4325
set_all_pmd_max_sleeps(struct dp_netdev *dp, const struct smap *config)
4326
0
{
4327
0
    const char *max_sleep_list = smap_get(config, "pmd-sleep-max");
4328
0
    struct pmd_sleep *pmd_sleeps = NULL;
4329
0
    uint64_t default_max_sleep = 0;
4330
0
    bool default_changed = false;
4331
0
    bool pmd_changed = false;
4332
0
    uint64_t pmd_maxsleep;
4333
0
    int num_vals = 0;
4334
4335
    /* Check for deprecated 'pmd-maxsleep' value. */
4336
0
    pmd_maxsleep = smap_get_ullong(config, "pmd-maxsleep", UINT64_MAX);
4337
0
    if (pmd_maxsleep != UINT64_MAX && !max_sleep_list) {
4338
0
        VLOG_WARN_ONCE("pmd-maxsleep is deprecated. "
4339
0
                       "Please use pmd-sleep-max instead.");
4340
0
        default_max_sleep = pmd_maxsleep;
4341
0
    }
4342
4343
    /* If neither the string nor the value changed, there is nothing to do. */
4344
0
    if (!!dp->max_sleep_list == !!max_sleep_list) {
4345
0
        if (max_sleep_list
4346
0
            ? nullable_string_is_equal(max_sleep_list, dp->max_sleep_list)
4347
0
            : default_max_sleep == dp->pmd_max_sleep_default) {
4348
0
            return false;
4349
0
        }
4350
0
    }
4351
4352
    /* Free existing string and copy new one (if any). */
4353
0
    free(dp->max_sleep_list);
4354
0
    dp->max_sleep_list = nullable_xstrdup(max_sleep_list);
4355
4356
0
    if (max_sleep_list) {
4357
0
        num_vals = parse_pmd_sleep_list(max_sleep_list, &pmd_sleeps);
4358
4359
        /* Check if the user has set a global value. */
4360
0
        for (int i = 0; i < num_vals; i++) {
4361
0
            if (pmd_sleeps[i].core_id == UINT_MAX) {
4362
0
                default_max_sleep = pmd_sleeps[i].max_sleep;
4363
0
                break;
4364
0
            }
4365
0
        }
4366
0
    }
4367
4368
0
    if (dp->pmd_max_sleep_default != default_max_sleep) {
4369
0
        dp->pmd_max_sleep_default = default_max_sleep;
4370
0
        default_changed = true;
4371
0
    }
4372
0
    pmd_changed = assign_sleep_values_to_pmds(dp, num_vals, pmd_sleeps);
4373
4374
0
    free(pmd_sleeps);
4375
0
    return default_changed || pmd_changed;
4376
0
}
4377
4378
/* Applies datapath configuration from the database. Some of the changes are
4379
 * actually applied in dpif_netdev_run(). */
4380
static int
4381
dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
4382
0
{
4383
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
4384
0
    const char *cmask = smap_get(other_config, "pmd-cpu-mask");
4385
0
    const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign",
4386
0
                                             "cycles");
4387
0
    unsigned long long insert_prob =
4388
0
        smap_get_ullong(other_config, "emc-insert-inv-prob",
4389
0
                        DEFAULT_EM_FLOW_INSERT_INV_PROB);
4390
0
    uint32_t insert_min, cur_min;
4391
0
    uint32_t tx_flush_interval, cur_tx_flush_interval;
4392
0
    uint64_t rebalance_intvl;
4393
0
    uint8_t cur_rebalance_load;
4394
0
    uint32_t rebalance_load, rebalance_improve;
4395
0
    bool log_autolb = false;
4396
0
    enum sched_assignment_type pmd_rxq_assign_type;
4397
0
    static bool first_set_config = true;
4398
4399
0
    tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
4400
0
                                     DEFAULT_TX_FLUSH_INTERVAL);
4401
0
    atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
4402
0
    if (tx_flush_interval != cur_tx_flush_interval) {
4403
0
        atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
4404
0
        VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
4405
0
                  tx_flush_interval);
4406
0
    }
4407
4408
0
    if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
4409
0
        free(dp->pmd_cmask);
4410
0
        dp->pmd_cmask = nullable_xstrdup(cmask);
4411
0
        dp_netdev_request_reconfigure(dp);
4412
0
    }
4413
4414
0
    atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
4415
0
    if (insert_prob <= UINT32_MAX) {
4416
0
        insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
4417
0
    } else {
4418
0
        insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
4419
0
        insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
4420
0
    }
4421
4422
0
    if (insert_min != cur_min) {
4423
0
        atomic_store_relaxed(&dp->emc_insert_min, insert_min);
4424
0
        if (insert_min == 0) {
4425
0
            VLOG_INFO("EMC insertion probability changed to zero");
4426
0
        } else {
4427
0
            VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
4428
0
                      insert_prob, (100 / (float)insert_prob));
4429
0
        }
4430
0
    }
4431
4432
0
    bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false);
4433
0
    bool cur_perf_enabled;
4434
0
    atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled);
4435
0
    if (perf_enabled != cur_perf_enabled) {
4436
0
        atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled);
4437
0
        if (perf_enabled) {
4438
0
            VLOG_INFO("PMD performance metrics collection enabled");
4439
0
        } else {
4440
0
            VLOG_INFO("PMD performance metrics collection disabled");
4441
0
        }
4442
0
    }
4443
4444
0
    bool smc_enable = smap_get_bool(other_config, "smc-enable", false);
4445
0
    bool cur_smc;
4446
0
    atomic_read_relaxed(&dp->smc_enable_db, &cur_smc);
4447
0
    if (smc_enable != cur_smc) {
4448
0
        atomic_store_relaxed(&dp->smc_enable_db, smc_enable);
4449
0
        if (smc_enable) {
4450
0
            VLOG_INFO("SMC cache is enabled");
4451
0
        } else {
4452
0
            VLOG_INFO("SMC cache is disabled");
4453
0
        }
4454
0
    }
4455
4456
0
    if (!strcmp(pmd_rxq_assign, "roundrobin")) {
4457
0
        pmd_rxq_assign_type = SCHED_ROUNDROBIN;
4458
0
    } else if (!strcmp(pmd_rxq_assign, "cycles")) {
4459
0
        pmd_rxq_assign_type = SCHED_CYCLES;
4460
0
    } else if (!strcmp(pmd_rxq_assign, "group")) {
4461
0
        pmd_rxq_assign_type = SCHED_GROUP;
4462
0
    } else {
4463
        /* Default. */
4464
0
        VLOG_WARN("Unsupported rx queue to PMD assignment mode in "
4465
0
                  "pmd-rxq-assign. Defaulting to 'cycles'.");
4466
0
        pmd_rxq_assign_type = SCHED_CYCLES;
4467
0
        pmd_rxq_assign = "cycles";
4468
0
    }
4469
0
    if (dp->pmd_rxq_assign_type != pmd_rxq_assign_type) {
4470
0
        dp->pmd_rxq_assign_type = pmd_rxq_assign_type;
4471
0
        VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.",
4472
0
                  pmd_rxq_assign);
4473
0
        dp_netdev_request_reconfigure(dp);
4474
0
    }
4475
4476
0
    bool pmd_iso = smap_get_bool(other_config, "pmd-rxq-isolate", true);
4477
4478
0
    if (pmd_rxq_assign_type != SCHED_GROUP && pmd_iso == false) {
4479
        /* Invalid combination. */
4480
0
        VLOG_WARN("pmd-rxq-isolate can only be set false "
4481
0
                  "when using pmd-rxq-assign=group");
4482
0
        pmd_iso = true;
4483
0
    }
4484
0
    if (dp->pmd_iso != pmd_iso) {
4485
0
        dp->pmd_iso = pmd_iso;
4486
0
        if (pmd_iso) {
4487
0
            VLOG_INFO("pmd-rxq-affinity isolates PMD core");
4488
0
        } else {
4489
0
            VLOG_INFO("pmd-rxq-affinity does not isolate PMD core");
4490
0
        }
4491
0
        dp_netdev_request_reconfigure(dp);
4492
0
    }
4493
4494
0
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
4495
4496
0
    rebalance_intvl = smap_get_ullong(other_config,
4497
0
                                      "pmd-auto-lb-rebal-interval",
4498
0
                                      ALB_REBALANCE_INTERVAL);
4499
0
    if (rebalance_intvl > MAX_ALB_REBALANCE_INTERVAL) {
4500
0
        rebalance_intvl = ALB_REBALANCE_INTERVAL;
4501
0
    }
4502
4503
    /* Input is in min, convert it to msec. */
4504
0
    rebalance_intvl =
4505
0
        rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC;
4506
4507
0
    if (pmd_alb->rebalance_intvl != rebalance_intvl) {
4508
0
        pmd_alb->rebalance_intvl = rebalance_intvl;
4509
0
        VLOG_INFO("PMD auto load balance interval set to "
4510
0
                  "%"PRIu64" mins\n", rebalance_intvl / MIN_TO_MSEC);
4511
0
        log_autolb = true;
4512
0
    }
4513
4514
0
    rebalance_improve = smap_get_uint(other_config,
4515
0
                                      "pmd-auto-lb-improvement-threshold",
4516
0
                                      ALB_IMPROVEMENT_THRESHOLD);
4517
0
    if (rebalance_improve > 100) {
4518
0
        rebalance_improve = ALB_IMPROVEMENT_THRESHOLD;
4519
0
    }
4520
0
    if (rebalance_improve != pmd_alb->rebalance_improve_thresh) {
4521
0
        pmd_alb->rebalance_improve_thresh = rebalance_improve;
4522
0
        VLOG_INFO("PMD auto load balance improvement threshold set to "
4523
0
                  "%"PRIu32"%%", rebalance_improve);
4524
0
        log_autolb = true;
4525
0
    }
4526
4527
0
    rebalance_load = smap_get_uint(other_config, "pmd-auto-lb-load-threshold",
4528
0
                                   ALB_LOAD_THRESHOLD);
4529
0
    if (rebalance_load > 100) {
4530
0
        rebalance_load = ALB_LOAD_THRESHOLD;
4531
0
    }
4532
0
    atomic_read_relaxed(&pmd_alb->rebalance_load_thresh, &cur_rebalance_load);
4533
0
    if (rebalance_load != cur_rebalance_load) {
4534
0
        atomic_store_relaxed(&pmd_alb->rebalance_load_thresh,
4535
0
                             rebalance_load);
4536
0
        VLOG_INFO("PMD auto load balance load threshold set to %"PRIu32"%%",
4537
0
                  rebalance_load);
4538
0
        log_autolb = true;
4539
0
    }
4540
4541
0
    bool autolb_state = smap_get_bool(other_config, "pmd-auto-lb", false);
4542
4543
0
    set_pmd_auto_lb(dp, autolb_state, log_autolb);
4544
4545
0
    bool sleep_changed = set_all_pmd_max_sleeps(dp, other_config);
4546
0
    if (first_set_config || sleep_changed) {
4547
0
        log_all_pmd_sleeps(dp);
4548
0
    }
4549
4550
0
    if (first_set_config) {
4551
0
        dpif_offload_datapath_register_flow_unreference_cb(
4552
0
            dpif, offload_flow_reference_unreference_cb);
4553
0
    }
4554
4555
0
    first_set_config = false;
4556
0
    return 0;
4557
0
}
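
For reference, the emc-insert-inv-prob handling above turns an inverse probability N into a threshold with insert_min = UINT32_MAX / N, so that comparing a fresh 32-bit random value against the threshold succeeds roughly once in N packets, and N == 0 disables EMC insertion. A small standalone sketch of that arithmetic, assuming no OVS code:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    /* An inverse probability N becomes a threshold so that a 32-bit random
     * value below the threshold fires roughly once in N packets.
     * N == 0 disables insertion entirely (threshold 0). */
    unsigned long long inv_probs[] = { 0, 1, 100, 1000 };

    for (size_t i = 0; i < sizeof inv_probs / sizeof inv_probs[0]; i++) {
        unsigned long long n = inv_probs[i];
        uint32_t threshold = n ? UINT32_MAX / n : 0;

        printf("1/%llu -> threshold %"PRIu32" (~%.2f%%)\n",
               n, threshold, n ? 100.0 / n : 0.0);
    }
    return 0;
}
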
4558
4559
static bool
4560
dpif_netdev_number_handlers_required(struct dpif *dpif_ OVS_UNUSED,
4561
                                     uint32_t *n_handlers)
4562
0
{
4563
0
    *n_handlers = 0;
4564
0
    return true;
4565
0
}
4566
4567
/* Parses affinity list and returns result in 'core_ids'. */
4568
static int
4569
parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
4570
0
{
4571
0
    unsigned i;
4572
0
    char *list, *copy, *key, *value;
4573
0
    int error = 0;
4574
4575
0
    for (i = 0; i < n_rxq; i++) {
4576
0
        core_ids[i] = OVS_CORE_UNSPEC;
4577
0
    }
4578
4579
0
    if (!affinity_list) {
4580
0
        return 0;
4581
0
    }
4582
4583
0
    list = copy = xstrdup(affinity_list);
4584
4585
0
    while (ofputil_parse_key_value(&list, &key, &value)) {
4586
0
        int rxq_id, core_id;
4587
4588
0
        if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
4589
0
            || !str_to_int(value, 0, &core_id) || core_id < 0) {
4590
0
            error = EINVAL;
4591
0
            break;
4592
0
        }
4593
4594
0
        if (rxq_id < n_rxq) {
4595
0
            core_ids[rxq_id] = core_id;
4596
0
        }
4597
0
    }
4598
4599
0
    free(copy);
4600
0
    return error;
4601
0
}
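
A compact standalone sketch of the same affinity-list contract as parse_affinity_list() above: unlisted queues default to an "unspecified core" sentinel, out-of-range queue ids are ignored, and a malformed entry yields an error. CORE_UNSPEC and fill_affinity are illustrative stand-ins rather than OVS symbols, and sscanf-based parsing is a simplification of ofputil_parse_key_value().

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define CORE_UNSPEC UINT_MAX   /* Stand-in for OVS_CORE_UNSPEC. */

/* Fill 'core_ids[0..n_rxq)' from a "rxq:core,rxq:core" list.  Unlisted
 * queues keep CORE_UNSPEC and out-of-range queue ids are ignored.
 * Returns 0 on success, nonzero on a malformed entry. */
static int
fill_affinity(const char *list, unsigned *core_ids, int n_rxq)
{
    char *copy, *save = NULL;
    int error = 0;

    for (int i = 0; i < n_rxq; i++) {
        core_ids[i] = CORE_UNSPEC;
    }
    if (!list) {
        return 0;
    }

    copy = strdup(list);
    for (char *tok = strtok_r(copy, ",", &save); tok;
         tok = strtok_r(NULL, ",", &save)) {
        int rxq, core;

        if (sscanf(tok, "%d:%d", &rxq, &core) != 2 || rxq < 0 || core < 0) {
            error = 1;
            break;
        }
        if (rxq < n_rxq) {
            core_ids[rxq] = core;
        }
    }
    free(copy);
    return error;
}

int
main(void)
{
    unsigned core_ids[4];

    if (!fill_affinity("0:3,2:7,9:1", core_ids, 4)) {
        for (int i = 0; i < 4; i++) {
            printf("rxq %d -> core %u\n", i, core_ids[i]);
        }
    }
    return 0;
}
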
4602
4603
/* Parses 'affinity_list' and applies configuration if it is valid. */
4604
static int
4605
dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
4606
                                  const char *affinity_list)
4607
0
{
4608
0
    unsigned *core_ids, i;
4609
0
    int error = 0;
4610
4611
0
    core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
4612
0
    if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
4613
0
        error = EINVAL;
4614
0
        goto exit;
4615
0
    }
4616
4617
0
    for (i = 0; i < port->n_rxq; i++) {
4618
0
        port->rxqs[i].core_id = core_ids[i];
4619
0
    }
4620
4621
0
exit:
4622
0
    free(core_ids);
4623
0
    return error;
4624
0
}
4625
4626
/* Returns 'true' if one of the 'port's RX queues exists in 'poll_list'
4627
 * of given PMD thread. */
4628
static bool
4629
dpif_netdev_pmd_polls_port(struct dp_netdev_pmd_thread *pmd,
4630
                           struct dp_netdev_port *port)
4631
    OVS_EXCLUDED(pmd->port_mutex)
4632
0
{
4633
0
    struct rxq_poll *poll;
4634
0
    bool found = false;
4635
4636
0
    ovs_mutex_lock(&pmd->port_mutex);
4637
0
    HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
4638
0
        if (port == poll->rxq->port) {
4639
0
            found = true;
4640
0
            break;
4641
0
        }
4642
0
    }
4643
0
    ovs_mutex_unlock(&pmd->port_mutex);
4644
0
    return found;
4645
0
}
4646
4647
/* Updates port configuration from the database.  The changes are actually
4648
 * applied in dpif_netdev_run(). */
4649
static int
4650
dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
4651
                            const struct smap *cfg)
4652
0
{
4653
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
4654
0
    struct dp_netdev_port *port;
4655
0
    int error = 0;
4656
0
    const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
4657
0
    bool emc_enabled = smap_get_bool(cfg, "emc-enable", true);
4658
0
    const char *tx_steering_mode = smap_get(cfg, "tx-steering");
4659
0
    enum txq_req_mode txq_mode;
4660
4661
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
4662
0
    error = get_port_by_number(dp, port_no, &port);
4663
0
    if (error) {
4664
0
        goto unlock;
4665
0
    }
4666
4667
0
    if (emc_enabled != port->emc_enabled) {
4668
0
        struct dp_netdev_pmd_thread *pmd;
4669
0
        struct ds ds = DS_EMPTY_INITIALIZER;
4670
0
        uint32_t cur_min, insert_prob;
4671
4672
0
        port->emc_enabled = emc_enabled;
4673
        /* Mark all the threads that poll this port for reload and request
4674
         * a reconfiguration so that the threads are actually reloaded. */
4675
0
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4676
0
            if (dpif_netdev_pmd_polls_port(pmd, port)) {
4677
0
                pmd->need_reload = true;
4678
0
            }
4679
0
        }
4680
0
        dp_netdev_request_reconfigure(dp);
4681
4682
0
        ds_put_format(&ds, "%s: EMC has been %s.",
4683
0
                      netdev_get_name(port->netdev),
4684
0
                      (emc_enabled) ? "enabled" : "disabled");
4685
0
        if (emc_enabled) {
4686
0
            ds_put_cstr(&ds, " Current insertion probability is ");
4687
0
            atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
4688
0
            if (!cur_min) {
4689
0
                ds_put_cstr(&ds, "zero.");
4690
0
            } else {
4691
0
                insert_prob = UINT32_MAX / cur_min;
4692
0
                ds_put_format(&ds, "1/%"PRIu32" (~%.2f%%).",
4693
0
                              insert_prob, 100 / (float) insert_prob);
4694
0
            }
4695
0
        }
4696
0
        VLOG_INFO("%s", ds_cstr(&ds));
4697
0
        ds_destroy(&ds);
4698
0
    }
4699
4700
    /* Checking for RXq affinity changes. */
4701
0
    if (netdev_is_pmd(port->netdev)
4702
0
        && !nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {
4703
4704
0
        error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
4705
0
        if (error) {
4706
0
            goto unlock;
4707
0
        }
4708
0
        free(port->rxq_affinity_list);
4709
0
        port->rxq_affinity_list = nullable_xstrdup(affinity_list);
4710
4711
0
        dp_netdev_request_reconfigure(dp);
4712
0
    }
4713
4714
0
    if (nullable_string_is_equal(tx_steering_mode, "hash")) {
4715
0
        txq_mode = TXQ_REQ_MODE_HASH;
4716
0
    } else {
4717
0
        txq_mode = TXQ_REQ_MODE_THREAD;
4718
0
    }
4719
4720
0
    if (txq_mode != port->txq_requested_mode) {
4721
0
        port->txq_requested_mode = txq_mode;
4722
0
        VLOG_INFO("%s: Tx packet steering mode has been set to '%s'.",
4723
0
                  netdev_get_name(port->netdev),
4724
0
                  (txq_mode == TXQ_REQ_MODE_THREAD) ? "thread" : "hash");
4725
0
        dp_netdev_request_reconfigure(dp);
4726
0
    }
4727
4728
0
unlock:
4729
0
    ovs_rwlock_unlock(&dp->port_rwlock);
4730
0
    return error;
4731
0
}
4732
4733
static int
4734
dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
4735
                              uint32_t queue_id, uint32_t *priority)
4736
0
{
4737
0
    *priority = queue_id;
4738
0
    return 0;
4739
0
}
4740
4741

4742
/* Creates and returns a new 'struct dp_netdev_actions', whose actions are
4743
 * a copy of the 'size' bytes of 'actions' input parameters. */
4744
struct dp_netdev_actions *
4745
dp_netdev_actions_create(const struct nlattr *actions, size_t size)
4746
0
{
4747
0
    struct dp_netdev_actions *netdev_actions;
4748
4749
0
    netdev_actions = xmalloc(sizeof *netdev_actions + size);
4750
0
    netdev_actions->size = size;
4751
0
    if (size) {
4752
0
        memcpy(netdev_actions->actions, actions, size);
4753
0
    }
4754
4755
0
    return netdev_actions;
4756
0
}
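
dp_netdev_actions_create() above relies on the common C pattern of a header plus a trailing flexible buffer allocated in a single block, so one allocation and one copy cover both. A minimal standalone illustration of that pattern; struct blob and blob_create are invented names for the sketch.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Header plus trailing payload allocated as one block. */
struct blob {
    size_t size;
    unsigned char data[];   /* C99 flexible array member. */
};

static struct blob *
blob_create(const void *src, size_t size)
{
    struct blob *b = malloc(sizeof *b + size);

    if (b) {
        b->size = size;
        if (size) {
            memcpy(b->data, src, size);
        }
    }
    return b;
}

int
main(void)
{
    const char payload[] = "example action payload";
    struct blob *b = blob_create(payload, sizeof payload);

    if (b) {
        printf("copied %zu bytes: %s\n", b->size, (char *) b->data);
        free(b);   /* One free() releases header and payload together. */
    }
    return 0;
}
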
4757
4758
struct dp_netdev_actions *
4759
dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
4760
0
{
4761
0
    return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
4762
0
}
4763
4764
static void
4765
dp_netdev_actions_free(struct dp_netdev_actions *actions)
4766
0
{
4767
0
    free(actions);
4768
0
}
4769

4770
static void
4771
dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
4772
                         enum rxq_cycles_counter_type type,
4773
                         unsigned long long cycles)
4774
0
{
4775
0
    atomic_store_relaxed(&rx->cycles[type], cycles);
4776
0
}
4777
4778
static void
4779
dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
4780
                         enum rxq_cycles_counter_type type,
4781
                         unsigned long long cycles)
4782
0
{
4783
0
    non_atomic_ullong_add(&rx->cycles[type], cycles);
4784
0
}
4785
4786
static uint64_t
4787
dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
4788
                         enum rxq_cycles_counter_type type)
4789
0
{
4790
0
    unsigned long long processing_cycles;
4791
0
    atomic_read_relaxed(&rx->cycles[type], &processing_cycles);
4792
0
    return processing_cycles;
4793
0
}
4794
4795
static void
4796
dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
4797
                                unsigned long long cycles)
4798
0
{
4799
0
    unsigned int idx = atomic_count_inc(&rx->intrvl_idx) % PMD_INTERVAL_MAX;
4800
0
    atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
4801
0
}
4802
4803
static uint64_t
4804
dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
4805
0
{
4806
0
    unsigned long long processing_cycles;
4807
0
    atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
4808
0
    return processing_cycles;
4809
0
}
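
The interval helpers above keep the last PMD_INTERVAL_MAX per-interval cycle samples in a small ring indexed by a monotonically increasing counter taken modulo the ring size. A standalone sketch of that ring with the atomics omitted for brevity; cycle_ring, ring_store and ring_sum are illustrative names and INTERVAL_MAX is an assumed stand-in for PMD_INTERVAL_MAX.

#include <stdio.h>

#define INTERVAL_MAX 12   /* Stand-in for PMD_INTERVAL_MAX. */

/* Ring of the last INTERVAL_MAX per-interval samples. */
struct cycle_ring {
    unsigned long long samples[INTERVAL_MAX];
    unsigned idx;
};

static void
ring_store(struct cycle_ring *r, unsigned long long cycles)
{
    /* The counter keeps growing; modulo picks the slot to overwrite. */
    r->samples[r->idx++ % INTERVAL_MAX] = cycles;
}

static unsigned long long
ring_sum(const struct cycle_ring *r)
{
    unsigned long long sum = 0;

    for (int i = 0; i < INTERVAL_MAX; i++) {
        sum += r->samples[i];
    }
    return sum;
}

int
main(void)
{
    struct cycle_ring r = { { 0 }, 0 };

    for (int i = 1; i <= 30; i++) {
        ring_store(&r, 1000 * i);   /* Older samples get overwritten. */
    }
    printf("sum of last %d intervals: %llu\n", INTERVAL_MAX, ring_sum(&r));
    return 0;
}
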
4810
4811
#if ATOMIC_ALWAYS_LOCK_FREE_8B
4812
static inline bool
4813
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
4814
0
{
4815
0
    bool pmd_perf_enabled;
4816
0
    atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled);
4817
0
    return pmd_perf_enabled;
4818
0
}
4819
#else
4820
/* If stores and reads of 64-bit integers are not atomic, the full PMD
4821
 * performance metrics are not available as locked access to 64 bit
4822
 * integers would be prohibitively expensive. */
4823
static inline bool
4824
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED)
4825
{
4826
    return false;
4827
}
4828
#endif
4829
4830
static int
4831
dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
4832
                                   struct tx_port *p)
4833
0
{
4834
0
    int i;
4835
0
    int tx_qid;
4836
0
    int output_cnt;
4837
0
    bool concurrent_txqs;
4838
0
    struct cycle_timer timer;
4839
0
    uint64_t cycles;
4840
0
    uint32_t tx_flush_interval;
4841
4842
0
    cycle_timer_start(&pmd->perf_stats, &timer);
4843
4844
0
    output_cnt = dp_packet_batch_size(&p->output_pkts);
4845
0
    ovs_assert(output_cnt > 0);
4846
4847
0
    if (p->port->txq_mode == TXQ_MODE_XPS_HASH) {
4848
0
        int n_txq = netdev_n_txq(p->port->netdev);
4849
4850
        /* Re-batch per txq based on packet hash. */
4851
0
        struct dp_packet *packet;
4852
0
        DP_PACKET_BATCH_FOR_EACH (j, packet, &p->output_pkts) {
4853
0
            uint32_t hash;
4854
4855
0
            if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
4856
0
                hash = dp_packet_get_rss_hash(packet);
4857
0
            } else {
4858
0
                struct flow flow;
4859
4860
0
                flow_extract(packet, &flow);
4861
0
                hash = flow_hash_5tuple(&flow, 0);
4862
0
            }
4863
0
            dp_packet_batch_add(&p->txq_pkts[hash % n_txq], packet);
4864
0
        }
4865
4866
        /* Flush the batch of each Tx queue. */
4867
0
        for (i = 0; i < n_txq; i++) {
4868
0
            if (dp_packet_batch_is_empty(&p->txq_pkts[i])) {
4869
0
                continue;
4870
0
            }
4871
0
            netdev_send(p->port->netdev, i, &p->txq_pkts[i], true);
4872
0
            dp_packet_batch_init(&p->txq_pkts[i]);
4873
0
        }
4874
0
    } else {
4875
0
        if (p->port->txq_mode == TXQ_MODE_XPS) {
4876
0
            tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p);
4877
0
            concurrent_txqs = true;
4878
0
        } else {
4879
0
            tx_qid = pmd->static_tx_qid;
4880
0
            concurrent_txqs = false;
4881
0
        }
4882
0
        netdev_send(p->port->netdev, tx_qid, &p->output_pkts, concurrent_txqs);
4883
0
    }
4884
0
    dp_packet_batch_init(&p->output_pkts);
4885
4886
    /* Update time of the next flush. */
4887
0
    atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
4888
0
    p->flush_time = pmd->ctx.now + tx_flush_interval;
4889
4890
0
    ovs_assert(pmd->n_output_batches > 0);
4891
0
    pmd->n_output_batches--;
4892
4893
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
4894
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
4895
4896
    /* Distribute send cycles evenly among transmitted packets and assign to
4897
     * their respective rx queues. */
4898
0
    cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
4899
0
    for (i = 0; i < output_cnt; i++) {
4900
0
        if (p->output_pkts_rxqs[i]) {
4901
0
            dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
4902
0
                                     RXQ_CYCLES_PROC_CURR, cycles);
4903
0
        }
4904
0
    }
4905
4906
0
    return output_cnt;
4907
0
}
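
In the TXQ_MODE_XPS_HASH branch above each packet is re-batched to a Tx queue chosen as hash % n_txq. A tiny standalone demonstration of that bucketing; the hash values and queue count below are made up for the example.

#include <stdio.h>

#define N_TXQ 4

int
main(void)
{
    unsigned hashes[] = { 0x1a2b, 0x33, 0x9d4f, 0x700, 0xbeef, 0x41 };
    int counts[N_TXQ] = { 0 };

    for (size_t i = 0; i < sizeof hashes / sizeof hashes[0]; i++) {
        int txq = hashes[i] % N_TXQ;   /* Same mapping as hash % n_txq. */

        counts[txq]++;
        printf("packet with hash 0x%x -> txq %d\n", hashes[i], txq);
    }
    for (int q = 0; q < N_TXQ; q++) {
        printf("txq %d: %d packet(s)\n", q, counts[q]);
    }
    return 0;
}
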
4908
4909
static int
4910
dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
4911
                                   bool force)
4912
0
{
4913
0
    struct tx_port *p;
4914
0
    int output_cnt = 0;
4915
4916
0
    if (!pmd->n_output_batches) {
4917
0
        return 0;
4918
0
    }
4919
4920
0
    HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
4921
0
        if (!dp_packet_batch_is_empty(&p->output_pkts)
4922
0
            && (force || pmd->ctx.now >= p->flush_time)) {
4923
0
            output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
4924
0
        }
4925
0
    }
4926
0
    return output_cnt;
4927
0
}
4928
4929
static int
4930
dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
4931
                           struct dp_netdev_rxq *rxq,
4932
                           odp_port_t port_no)
4933
0
{
4934
0
    struct pmd_perf_stats *s = &pmd->perf_stats;
4935
0
    struct dp_packet_batch batch;
4936
0
    struct cycle_timer timer;
4937
0
    int error;
4938
0
    int batch_cnt = 0;
4939
0
    int rem_qlen = 0, *qlen_p = NULL;
4940
0
    uint64_t cycles;
4941
4942
    /* Measure duration for polling and processing rx burst. */
4943
0
    cycle_timer_start(&pmd->perf_stats, &timer);
4944
4945
0
    pmd->ctx.last_rxq = rxq;
4946
0
    dp_packet_batch_init(&batch);
4947
4948
    /* Fetch the rx queue length only for vhostuser ports. */
4949
0
    if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) {
4950
0
        qlen_p = &rem_qlen;
4951
0
    }
4952
4953
0
    error = netdev_rxq_recv(rxq->rx, &batch, qlen_p);
4954
0
    if (!error) {
4955
        /* At least one packet received. */
4956
0
        *recirc_depth_get() = 0;
4957
0
        pmd_thread_ctx_time_update(pmd);
4958
0
        batch_cnt = dp_packet_batch_size(&batch);
4959
0
        if (pmd_perf_metrics_enabled(pmd)) {
4960
            /* Update batch histogram. */
4961
0
            s->current.batches++;
4962
0
            histogram_add_sample(&s->pkts_per_batch, batch_cnt);
4963
            /* Update the maximum vhost rx queue fill level. */
4964
0
            if (rxq->is_vhost && rem_qlen >= 0) {
4965
0
                uint32_t qfill = batch_cnt + rem_qlen;
4966
0
                if (qfill > s->current.max_vhost_qfill) {
4967
0
                    s->current.max_vhost_qfill = qfill;
4968
0
                }
4969
0
            }
4970
0
        }
4971
4972
        /* Process packet batch. */
4973
0
        int ret = pmd->netdev_input_func(pmd, &batch, port_no);
4974
0
        if (ret) {
4975
0
            dp_netdev_input(pmd, &batch, port_no);
4976
0
        }
4977
4978
        /* Assign processing cycles to rx queue. */
4979
0
        cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
4980
0
        dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
4981
4982
0
        dp_netdev_pmd_flush_output_packets(pmd, false);
4983
0
    } else {
4984
        /* Discard cycles. */
4985
0
        cycle_timer_stop(&pmd->perf_stats, &timer);
4986
0
        if (error != EAGAIN && error != EOPNOTSUPP) {
4987
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
4988
4989
0
            VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
4990
0
                    netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
4991
0
        }
4992
0
    }
4993
4994
0
    pmd->ctx.last_rxq = NULL;
4995
4996
0
    return batch_cnt;
4997
0
}
4998
4999
static struct tx_port *
5000
tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
5001
0
{
5002
0
    struct tx_port *tx;
5003
5004
0
    HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
5005
0
        if (tx->port->port_no == port_no) {
5006
0
            return tx;
5007
0
        }
5008
0
    }
5009
5010
0
    return NULL;
5011
0
}
5012
5013
static struct tx_bond *
5014
tx_bond_lookup(const struct cmap *tx_bonds, uint32_t bond_id)
5015
0
{
5016
0
    uint32_t hash = hash_bond_id(bond_id);
5017
0
    struct tx_bond *tx;
5018
5019
0
    CMAP_FOR_EACH_WITH_HASH (tx, node, hash, tx_bonds) {
5020
0
        if (tx->bond_id == bond_id) {
5021
0
            return tx;
5022
0
        }
5023
0
    }
5024
0
    return NULL;
5025
0
}
5026
5027
static int
5028
port_reconfigure(struct dp_netdev_port *port)
5029
0
{
5030
0
    struct netdev *netdev = port->netdev;
5031
0
    int i, err;
5032
5033
    /* Closes the existing 'rxq's. */
5034
0
    for (i = 0; i < port->n_rxq; i++) {
5035
0
        netdev_rxq_close(port->rxqs[i].rx);
5036
0
        port->rxqs[i].rx = NULL;
5037
0
    }
5038
0
    unsigned last_nrxq = port->n_rxq;
5039
0
    port->n_rxq = 0;
5040
5041
    /* Allows 'netdev' to apply the pending configuration changes. */
5042
0
    if (netdev_is_reconf_required(netdev) || port->need_reconfigure) {
5043
0
        err = netdev_reconfigure(netdev);
5044
0
        if (err && (err != EOPNOTSUPP)) {
5045
0
            VLOG_ERR("Failed to set interface %s new configuration",
5046
0
                     netdev_get_name(netdev));
5047
0
            return err;
5048
0
        }
5049
0
    }
5050
    /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
5051
0
    port->rxqs = xrealloc(port->rxqs,
5052
0
                          sizeof *port->rxqs * netdev_n_rxq(netdev));
5053
    /* Realloc 'used' counters for tx queues. */
5054
0
    free(port->txq_used);
5055
0
    port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);
5056
5057
0
    for (i = 0; i < netdev_n_rxq(netdev); i++) {
5058
0
        bool new_queue = i >= last_nrxq;
5059
0
        if (new_queue) {
5060
0
            memset(&port->rxqs[i], 0, sizeof port->rxqs[i]);
5061
0
        }
5062
5063
0
        port->rxqs[i].port = port;
5064
0
        port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9);
5065
5066
0
        err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
5067
0
        if (err) {
5068
0
            return err;
5069
0
        }
5070
0
        port->n_rxq++;
5071
0
    }
5072
5073
    /* Parse affinity list to apply configuration for new queues. */
5074
0
    dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);
5075
5076
    /* If reconfiguration was successful, mark it as such so it can be used. */
5077
0
    port->need_reconfigure = false;
5078
5079
0
    return 0;
5080
0
}
5081
5082
struct sched_numa_list {
5083
    struct hmap numas;  /* Contains 'struct sched_numa'. */
5084
};
5085
5086
/* Metadata for out-of-place pmd rxq assignments. */
5087
struct sched_pmd {
5088
    struct sched_numa *numa;
5089
    /* Associated PMD thread. */
5090
    struct dp_netdev_pmd_thread *pmd;
5091
    uint64_t pmd_proc_cycles;
5092
    struct dp_netdev_rxq **rxqs;
5093
    unsigned n_rxq;
5094
    bool isolated;
5095
};
5096
5097
struct sched_numa {
5098
    struct hmap_node node;
5099
    int numa_id;
5100
    /* PMDs on numa node. */
5101
    struct sched_pmd *pmds;
5102
    /* Num of PMDs on numa node. */
5103
    unsigned n_pmds;
5104
    /* Num of isolated PMDs on numa node. */
5105
    unsigned n_isolated;
5106
    int rr_cur_index;
5107
    bool rr_idx_inc;
5108
};
5109
5110
static size_t
5111
sched_numa_list_count(struct sched_numa_list *numa_list)
5112
0
{
5113
0
    return hmap_count(&numa_list->numas);
5114
0
}
5115
5116
static struct sched_numa *
5117
sched_numa_list_next(struct sched_numa_list *numa_list,
5118
                     const struct sched_numa *numa)
5119
0
{
5120
0
    struct hmap_node *node = NULL;
5121
5122
0
    if (numa) {
5123
0
        node = hmap_next(&numa_list->numas, &numa->node);
5124
0
    }
5125
0
    if (!node) {
5126
0
        node = hmap_first(&numa_list->numas);
5127
0
    }
5128
5129
0
    return (node) ? CONTAINER_OF(node, struct sched_numa, node) : NULL;
5130
0
}
5131
5132
static struct sched_numa *
5133
sched_numa_list_lookup(struct sched_numa_list *numa_list, int numa_id)
5134
0
{
5135
0
    struct sched_numa *numa;
5136
5137
0
    HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0),
5138
0
                             &numa_list->numas) {
5139
0
        if (numa->numa_id == numa_id) {
5140
0
            return numa;
5141
0
        }
5142
0
    }
5143
0
    return NULL;
5144
0
}
5145
5146
static int
5147
compare_sched_pmd_list(const void *a_, const void *b_)
5148
0
{
5149
0
    struct sched_pmd *a, *b;
5150
5151
0
    a = (struct sched_pmd *) a_;
5152
0
    b = (struct sched_pmd *) b_;
5153
5154
0
    return compare_poll_thread_list(&a->pmd, &b->pmd);
5155
0
}
5156
5157
static void
5158
sort_numa_list_pmds(struct sched_numa_list *numa_list)
5159
0
{
5160
0
    struct sched_numa *numa;
5161
5162
0
    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
5163
0
        if (numa->n_pmds > 1) {
5164
0
            qsort(numa->pmds, numa->n_pmds, sizeof *numa->pmds,
5165
0
                  compare_sched_pmd_list);
5166
0
        }
5167
0
    }
5168
0
}
5169
5170
/* Populate numas and pmds on those numas. */
5171
static void
5172
sched_numa_list_populate(struct sched_numa_list *numa_list,
5173
                         struct dp_netdev *dp)
5174
0
{
5175
0
    struct dp_netdev_pmd_thread *pmd;
5176
5177
0
    hmap_init(&numa_list->numas);
5178
5179
    /* For each pmd on this datapath. */
5180
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5181
0
        struct sched_numa *numa;
5182
0
        struct sched_pmd *sched_pmd;
5183
0
        if (pmd->core_id == NON_PMD_CORE_ID) {
5184
0
            continue;
5185
0
        }
5186
5187
        /* Get the numa of the PMD. */
5188
0
        numa = sched_numa_list_lookup(numa_list, pmd->numa_id);
5189
        /* Create a new numa node for it if not already created. */
5190
0
        if (!numa) {
5191
0
            numa = xzalloc(sizeof *numa);
5192
0
            numa->numa_id = pmd->numa_id;
5193
0
            hmap_insert(&numa_list->numas, &numa->node,
5194
0
                        hash_int(pmd->numa_id, 0));
5195
0
        }
5196
5197
        /* Create a sched_pmd on this numa for the pmd. */
5198
0
        numa->n_pmds++;
5199
0
        numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
5200
0
        sched_pmd = &numa->pmds[numa->n_pmds - 1];
5201
0
        memset(sched_pmd, 0, sizeof *sched_pmd);
5202
0
        sched_pmd->numa = numa;
5203
0
        sched_pmd->pmd = pmd;
5204
        /* At least one pmd is present so initialize curr_idx and idx_inc. */
5205
0
        numa->rr_cur_index = 0;
5206
0
        numa->rr_idx_inc = true;
5207
0
    }
5208
0
    sort_numa_list_pmds(numa_list);
5209
0
}
5210
5211
static void
5212
sched_numa_list_free_entries(struct sched_numa_list *numa_list)
5213
0
{
5214
0
    struct sched_numa *numa;
5215
5216
0
    HMAP_FOR_EACH_POP (numa, node, &numa_list->numas) {
5217
0
        for (unsigned i = 0; i < numa->n_pmds; i++) {
5218
0
            struct sched_pmd *sched_pmd;
5219
5220
0
            sched_pmd = &numa->pmds[i];
5221
0
            sched_pmd->n_rxq = 0;
5222
0
            free(sched_pmd->rxqs);
5223
0
        }
5224
0
        numa->n_pmds = 0;
5225
0
        free(numa->pmds);
5226
0
        free(numa);
5227
0
    }
5228
0
    hmap_destroy(&numa_list->numas);
5229
0
}
5230
5231
static struct sched_pmd *
5232
sched_pmd_find_by_pmd(struct sched_numa_list *numa_list,
5233
                      struct dp_netdev_pmd_thread *pmd)
5234
0
{
5235
0
    struct sched_numa *numa;
5236
5237
0
    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
5238
0
        for (unsigned i = 0; i < numa->n_pmds; i++) {
5239
0
            struct sched_pmd *sched_pmd;
5240
5241
0
            sched_pmd = &numa->pmds[i];
5242
0
            if (pmd == sched_pmd->pmd) {
5243
0
                return sched_pmd;
5244
0
            }
5245
0
        }
5246
0
    }
5247
0
    return NULL;
5248
0
}
5249
5250
static void
5251
sched_pmd_add_rxq(struct sched_pmd *sched_pmd, struct dp_netdev_rxq *rxq,
5252
                  uint64_t cycles)
5253
0
{
5254
    /* Since sched_pmd is allocated outside this function, do not assume
5255
     * that 'rxqs' is initialized to NULL. */
5256
0
    if (sched_pmd->n_rxq == 0) {
5257
0
        sched_pmd->rxqs = xmalloc(sizeof *sched_pmd->rxqs);
5258
0
    } else {
5259
0
        sched_pmd->rxqs = xrealloc(sched_pmd->rxqs, (sched_pmd->n_rxq + 1) *
5260
0
                                                    sizeof *sched_pmd->rxqs);
5261
0
    }
5262
5263
0
    sched_pmd->rxqs[sched_pmd->n_rxq++] = rxq;
5264
0
    sched_pmd->pmd_proc_cycles += cycles;
5265
0
}
5266
5267
static void
5268
sched_numa_list_assignments(struct sched_numa_list *numa_list,
5269
                            struct dp_netdev *dp)
5270
    OVS_REQ_RDLOCK(dp->port_rwlock)
5271
0
{
5272
0
    struct dp_netdev_port *port;
5273
5274
    /* For each port. */
5275
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
5276
0
        if (!netdev_is_pmd(port->netdev)) {
5277
0
            continue;
5278
0
        }
5279
        /* For each rxq on the port. */
5280
0
        for (unsigned qid = 0; qid < port->n_rxq; qid++) {
5281
0
            struct dp_netdev_rxq *rxq = &port->rxqs[qid];
5282
0
            struct sched_pmd *sched_pmd;
5283
0
            uint64_t proc_cycles = 0;
5284
5285
0
            for (int i = 0; i < PMD_INTERVAL_MAX; i++) {
5286
0
                proc_cycles  += dp_netdev_rxq_get_intrvl_cycles(rxq, i);
5287
0
            }
5288
5289
0
            sched_pmd = sched_pmd_find_by_pmd(numa_list, rxq->pmd);
5290
0
            if (sched_pmd) {
5291
0
                if (rxq->core_id != OVS_CORE_UNSPEC && dp->pmd_iso) {
5292
0
                    sched_pmd->isolated = true;
5293
0
                }
5294
0
                sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles);
5295
0
            }
5296
0
        }
5297
0
    }
5298
0
}
5299
5300
static void
5301
sched_numa_list_put_in_place(struct sched_numa_list *numa_list)
5302
0
{
5303
0
    struct sched_numa *numa;
5304
5305
    /* For each numa. */
5306
0
    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
5307
        /* For each pmd. */
5308
0
        for (int i = 0; i < numa->n_pmds; i++) {
5309
0
            struct sched_pmd *sched_pmd;
5310
5311
0
            sched_pmd = &numa->pmds[i];
5312
0
            sched_pmd->pmd->isolated = sched_pmd->isolated;
5313
            /* For each rxq. */
5314
0
            for (unsigned k = 0; k < sched_pmd->n_rxq; k++) {
5315
                /* Store the new pmd from the out-of-place sched_numa_list
5316
                 * struct into the dp_netdev_rxq struct. */
5317
0
                sched_pmd->rxqs[k]->pmd = sched_pmd->pmd;
5318
0
            }
5319
0
        }
5320
0
    }
5321
0
}
5322
5323
/* Returns 'true' if OVS rxq scheduling algorithm assigned any unpinned rxq to
5324
 * a PMD thread core on a non-local numa node. */
5325
static bool
5326
sched_numa_list_cross_numa_polling(struct sched_numa_list *numa_list)
5327
0
{
5328
0
    struct sched_numa *numa;
5329
5330
0
    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
5331
0
        for (int i = 0; i < numa->n_pmds; i++) {
5332
0
            struct sched_pmd *sched_pmd;
5333
5334
0
            sched_pmd = &numa->pmds[i];
5335
0
            if (sched_pmd->isolated) {
5336
                /* All rxqs on this PMD thread core are pinned. */
5337
0
                continue;
5338
0
            }
5339
0
            for (unsigned k = 0; k < sched_pmd->n_rxq; k++) {
5340
0
                struct dp_netdev_rxq *rxq = sched_pmd->rxqs[k];
5341
                /* Check if the rxq is not pinned to a specific PMD thread core
5342
                 * by the user AND the PMD thread core that OVS assigned is
5343
                 * non-local to the rxq port. */
5344
0
                if (rxq->core_id == OVS_CORE_UNSPEC &&
5345
0
                    rxq->pmd->numa_id !=
5346
0
                        netdev_get_numa_id(rxq->port->netdev)) {
5347
0
                    return true;
5348
0
                }
5349
0
            }
5350
0
        }
5351
0
    }
5352
0
    return false;
5353
0
}
5354
5355
static unsigned
5356
sched_numa_noniso_pmd_count(struct sched_numa *numa)
5357
0
{
5358
0
    if (numa->n_pmds > numa->n_isolated) {
5359
0
        return numa->n_pmds - numa->n_isolated;
5360
0
    }
5361
0
    return 0;
5362
0
}
5363
5364
/* Sort Rx Queues by the processing cycles they are consuming. */
5365
static int
5366
compare_rxq_cycles(const void *a, const void *b)
5367
0
{
5368
0
    struct dp_netdev_rxq *qa;
5369
0
    struct dp_netdev_rxq *qb;
5370
0
    uint64_t cycles_qa, cycles_qb;
5371
5372
0
    qa = *(struct dp_netdev_rxq **) a;
5373
0
    qb = *(struct dp_netdev_rxq **) b;
5374
5375
0
    cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
5376
0
    cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
5377
5378
0
    if (cycles_qa != cycles_qb) {
5379
0
        return (cycles_qa < cycles_qb) ? 1 : -1;
5380
0
    } else {
5381
        /* Cycles are the same so tiebreak on port/queue id.
5382
         * Tiebreaking (as opposed to return 0) ensures consistent
5383
         * sort results across multiple OS's. */
5384
0
        uint32_t port_qa = odp_to_u32(qa->port->port_no);
5385
0
        uint32_t port_qb = odp_to_u32(qb->port->port_no);
5386
0
        if (port_qa != port_qb) {
5387
0
            return port_qa > port_qb ? 1 : -1;
5388
0
        } else {
5389
0
            return netdev_rxq_get_queue_id(qa->rx)
5390
0
                    - netdev_rxq_get_queue_id(qb->rx);
5391
0
        }
5392
0
    }
5393
0
}
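
compare_rxq_cycles() above sorts the busiest queues first and deliberately tiebreaks on port/queue id so the qsort() result is identical across platforms. A standalone sketch of the same comparator shape; struct q and compare_q are illustrative names.

#include <stdio.h>
#include <stdlib.h>

struct q {
    unsigned id;
    unsigned long long cycles;
};

static int
compare_q(const void *a_, const void *b_)
{
    const struct q *a = a_;
    const struct q *b = b_;

    if (a->cycles != b->cycles) {
        return a->cycles < b->cycles ? 1 : -1;   /* Busiest first. */
    }
    return a->id < b->id ? -1 : (a->id > b->id); /* Deterministic tiebreak. */
}

int
main(void)
{
    struct q qs[] = { { 3, 500 }, { 1, 900 }, { 2, 500 }, { 0, 100 } };

    qsort(qs, 4, sizeof qs[0], compare_q);
    for (int i = 0; i < 4; i++) {
        printf("q%u: %llu cycles\n", qs[i].id, qs[i].cycles);
    }
    return 0;
}
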
5394
5395
static bool
5396
sched_pmd_new_lowest(struct sched_pmd *current_lowest, struct sched_pmd *pmd,
5397
                     bool has_proc)
5398
0
{
5399
0
    uint64_t current_num, pmd_num;
5400
5401
0
    if (current_lowest == NULL) {
5402
0
        return true;
5403
0
    }
5404
5405
0
    if (has_proc) {
5406
0
        current_num = current_lowest->pmd_proc_cycles;
5407
0
        pmd_num = pmd->pmd_proc_cycles;
5408
0
    } else {
5409
0
        current_num = current_lowest->n_rxq;
5410
0
        pmd_num = pmd->n_rxq;
5411
0
    }
5412
5413
0
    if (pmd_num < current_num) {
5414
0
        return true;
5415
0
    }
5416
0
    return false;
5417
0
}
5418
5419
static struct sched_pmd *
5420
sched_pmd_get_lowest(struct sched_numa *numa, bool has_cyc)
5421
0
{
5422
0
    struct sched_pmd *lowest_sched_pmd = NULL;
5423
5424
0
    for (unsigned i = 0; i < numa->n_pmds; i++) {
5425
0
        struct sched_pmd *sched_pmd;
5426
5427
0
        sched_pmd = &numa->pmds[i];
5428
0
        if (sched_pmd->isolated) {
5429
0
            continue;
5430
0
        }
5431
0
        if (sched_pmd_new_lowest(lowest_sched_pmd, sched_pmd, has_cyc)) {
5432
0
            lowest_sched_pmd = sched_pmd;
5433
0
        }
5434
0
    }
5435
0
    return lowest_sched_pmd;
5436
0
}
5437
5438
/*
5439
 * Returns the next pmd from the numa node.
5440
 *
5441
 * If 'updown' is 'true' it will alternate between selecting the next pmd in
5442
 * either an up or down walk, switching between up/down when the first or last
5443
 * core is reached. e.g. 1,2,3,3,2,1,1,2...
5444
 *
5445
 * If 'updown' is 'false' it will select the next pmd wrapping around when
5446
 * last core reached. e.g. 1,2,3,1,2,3,1,2...
5447
 */
5448
static struct sched_pmd *
5449
sched_pmd_next_rr(struct sched_numa *numa, bool updown)
5450
0
{
5451
0
    int numa_idx = numa->rr_cur_index;
5452
5453
0
    if (numa->rr_idx_inc == true) {
5454
        /* Incrementing through list of pmds. */
5455
0
        if (numa->rr_cur_index == numa->n_pmds - 1) {
5456
            /* Reached the last pmd. */
5457
0
            if (updown) {
5458
0
                numa->rr_idx_inc = false;
5459
0
            } else {
5460
0
                numa->rr_cur_index = 0;
5461
0
            }
5462
0
        } else {
5463
0
            numa->rr_cur_index++;
5464
0
        }
5465
0
    } else {
5466
        /* Decrementing through list of pmds. */
5467
0
        if (numa->rr_cur_index == 0) {
5468
            /* Reached the first pmd. */
5469
0
            numa->rr_idx_inc = true;
5470
0
        } else {
5471
0
            numa->rr_cur_index--;
5472
0
        }
5473
0
    }
5474
0
    return &numa->pmds[numa_idx];
5475
0
}
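
A standalone sketch of the up/down round-robin walk implemented by sched_pmd_next_rr() above: with three entries and 'updown' true it yields the index pattern 0,1,2,2,1,0,0,1,... and with 'updown' false it simply wraps. struct rr and rr_next are illustrative names.

#include <stdbool.h>
#include <stdio.h>

struct rr {
    int n;      /* Number of entries. */
    int cur;    /* Current index. */
    bool inc;   /* Walking up (true) or down (false). */
};

static int
rr_next(struct rr *rr, bool updown)
{
    int idx = rr->cur;

    if (rr->inc) {
        if (rr->cur == rr->n - 1) {
            if (updown) {
                rr->inc = false;   /* Reverse direction at the last entry. */
            } else {
                rr->cur = 0;       /* Plain wrap-around. */
            }
        } else {
            rr->cur++;
        }
    } else {
        if (rr->cur == 0) {
            rr->inc = true;        /* Reverse direction at the first entry. */
        } else {
            rr->cur--;
        }
    }
    return idx;
}

int
main(void)
{
    struct rr rr = { 3, 0, true };

    for (int i = 0; i < 8; i++) {
        printf("%d ", rr_next(&rr, true));   /* Prints: 0 1 2 2 1 0 0 1 */
    }
    printf("\n");
    return 0;
}
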
5476
5477
static struct sched_pmd *
5478
sched_pmd_next_noniso_rr(struct sched_numa *numa, bool updown)
5479
0
{
5480
0
    struct sched_pmd *sched_pmd = NULL;
5481
5482
    /* sched_pmd_next_rr() may return duplicate PMDs before all PMDs have been
5483
     * returned depending on updown. Call it more than n_pmds to ensure all
5484
     * PMDs can be searched for the next non-isolated PMD. */
5485
0
    for (unsigned i = 0; i < numa->n_pmds * 2; i++) {
5486
0
        sched_pmd = sched_pmd_next_rr(numa, updown);
5487
0
        if (!sched_pmd->isolated) {
5488
0
            break;
5489
0
        }
5490
0
        sched_pmd = NULL;
5491
0
    }
5492
0
    return sched_pmd;
5493
0
}
5494
5495
static struct sched_pmd *
5496
sched_pmd_next(struct sched_numa *numa, enum sched_assignment_type algo,
5497
               bool has_proc)
5498
0
{
5499
0
    if (algo == SCHED_GROUP) {
5500
0
        return sched_pmd_get_lowest(numa, has_proc);
5501
0
    }
5502
5503
    /* By default RR the PMDs. */
5504
0
    return sched_pmd_next_noniso_rr(numa, algo == SCHED_CYCLES ? true : false);
5505
0
}
5506
5507
static const char *
5508
get_assignment_type_string(enum sched_assignment_type algo)
5509
0
{
5510
0
    switch (algo) {
5511
0
    case SCHED_ROUNDROBIN: return "roundrobin";
5512
0
    case SCHED_CYCLES: return "cycles";
5513
0
    case SCHED_GROUP: return "group";
5514
0
    default: return "Unknown";
5515
0
    }
5516
0
}
5517
5518
0
#define MAX_RXQ_CYC_TEXT 40
5519
0
#define MAX_RXQ_CYC_STRLEN (INT_STRLEN(uint64_t) + MAX_RXQ_CYC_TEXT)
5520
5521
static char *
5522
get_rxq_cyc_log(char *a, enum sched_assignment_type algo, uint64_t cycles)
5523
0
{
5524
0
    int ret = 0;
5525
5526
0
    if (algo != SCHED_ROUNDROBIN) {
5527
0
        ret = snprintf(a, MAX_RXQ_CYC_STRLEN,
5528
0
                       " (measured processing cycles %"PRIu64")", cycles);
5529
0
    }
5530
5531
0
    if (algo == SCHED_ROUNDROBIN || ret <= 0) {
5532
0
        a[0] = '\0';
5533
0
    }
5534
0
    return a;
5535
0
}
5536
5537
static void
5538
sched_numa_list_schedule(struct sched_numa_list *numa_list,
5539
                         struct dp_netdev *dp,
5540
                         enum sched_assignment_type algo,
5541
                         enum vlog_level level)
5542
    OVS_REQ_RDLOCK(dp->port_rwlock)
5543
0
{
5544
0
    struct dp_netdev_port *port;
5545
0
    struct dp_netdev_rxq **rxqs = NULL;
5546
0
    struct sched_numa *last_cross_numa;
5547
0
    unsigned n_rxqs = 0;
5548
0
    bool start_logged = false;
5549
0
    size_t n_numa;
5550
5551
    /* For each port. */
5552
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
5553
0
        if (!netdev_is_pmd(port->netdev)) {
5554
0
            continue;
5555
0
        }
5556
5557
        /* For each rxq on the port. */
5558
0
        for (int qid = 0; qid < port->n_rxq; qid++) {
5559
0
            struct dp_netdev_rxq *rxq = &port->rxqs[qid];
5560
5561
0
            if (algo != SCHED_ROUNDROBIN) {
5562
0
                uint64_t cycle_hist = 0;
5563
5564
                /* Sum the queue intervals and store the cycle history. */
5565
0
                for (unsigned i = 0; i < PMD_INTERVAL_MAX; i++) {
5566
0
                    cycle_hist += dp_netdev_rxq_get_intrvl_cycles(rxq, i);
5567
0
                }
5568
0
                dp_netdev_rxq_set_cycles(rxq, RXQ_CYCLES_PROC_HIST,
5569
0
                                         cycle_hist);
5570
0
            }
5571
5572
            /* Check if this rxq is pinned. */
5573
0
            if (rxq->core_id != OVS_CORE_UNSPEC) {
5574
0
                struct sched_pmd *sched_pmd;
5575
0
                struct dp_netdev_pmd_thread *pmd;
5576
0
                struct sched_numa *numa;
5577
0
                bool iso = dp->pmd_iso;
5578
0
                uint64_t proc_cycles;
5579
0
                char rxq_cyc_log[MAX_RXQ_CYC_STRLEN];
5580
5581
                /* This rxq should be pinned, pin it now. */
5582
0
                pmd = dp_netdev_get_pmd(dp, rxq->core_id);
5583
0
                sched_pmd = sched_pmd_find_by_pmd(numa_list, pmd);
5584
0
                dp_netdev_pmd_unref(pmd);
5585
0
                if (!sched_pmd) {
5586
                    /* Cannot find the PMD.  Cannot pin this rxq. */
5587
0
                    VLOG(level == VLL_DBG ? VLL_DBG : VLL_WARN,
5588
0
                            "Core %2u cannot be pinned with "
5589
0
                            "port \'%s\' rx queue %d. Use pmd-cpu-mask to "
5590
0
                            "enable a pmd on core %u. An alternative core "
5591
0
                            "will be assigned.",
5592
0
                            rxq->core_id,
5593
0
                            netdev_rxq_get_name(rxq->rx),
5594
0
                            netdev_rxq_get_queue_id(rxq->rx),
5595
0
                            rxq->core_id);
5596
0
                    rxqs = xrealloc(rxqs, (n_rxqs + 1) * sizeof *rxqs);
5597
0
                    rxqs[n_rxqs++] = rxq;
5598
0
                    continue;
5599
0
                }
5600
0
                if (iso) {
5601
                    /* Mark PMD as isolated if not done already. */
5602
0
                    if (sched_pmd->isolated == false) {
5603
0
                        sched_pmd->isolated = true;
5604
0
                        numa = sched_pmd->numa;
5605
0
                        numa->n_isolated++;
5606
0
                    }
5607
0
                }
5608
0
                proc_cycles = dp_netdev_rxq_get_cycles(rxq,
5609
0
                                                       RXQ_CYCLES_PROC_HIST);
5610
0
                VLOG(level, "Core %2u on numa node %d is pinned with "
5611
0
                            "port \'%s\' rx queue %d%s",
5612
0
                            sched_pmd->pmd->core_id, sched_pmd->pmd->numa_id,
5613
0
                            netdev_rxq_get_name(rxq->rx),
5614
0
                            netdev_rxq_get_queue_id(rxq->rx),
5615
0
                            get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles));
5616
0
                sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles);
5617
0
            } else {
5618
0
                rxqs = xrealloc(rxqs, (n_rxqs + 1) * sizeof *rxqs);
5619
0
                rxqs[n_rxqs++] = rxq;
5620
0
            }
5621
0
        }
5622
0
    }
5623
5624
0
    if (n_rxqs > 1 && algo != SCHED_ROUNDROBIN) {
5625
        /* Sort the queues in order of the processing cycles
5626
         * they consumed during their last pmd interval. */
5627
0
        qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
5628
0
    }
5629
5630
0
    last_cross_numa = NULL;
5631
0
    n_numa = sched_numa_list_count(numa_list);
5632
0
    for (unsigned i = 0; i < n_rxqs; i++) {
5633
0
        struct dp_netdev_rxq *rxq = rxqs[i];
5634
0
        struct sched_pmd *sched_pmd = NULL;
5635
0
        struct sched_numa *numa;
5636
0
        int port_numa_id;
5637
0
        uint64_t proc_cycles;
5638
0
        char rxq_cyc_log[MAX_RXQ_CYC_STRLEN];
5639
5640
0
        if (start_logged == false && level != VLL_DBG) {
5641
0
            VLOG(level, "Performing pmd to rx queue assignment using %s "
5642
0
                        "algorithm.", get_assignment_type_string(algo));
5643
0
            start_logged = true;
5644
0
        }
5645
5646
        /* Store the cycles for this rxq as we will log these later. */
5647
0
        proc_cycles = dp_netdev_rxq_get_cycles(rxq, RXQ_CYCLES_PROC_HIST);
5648
5649
0
        port_numa_id = netdev_get_numa_id(rxq->port->netdev);
5650
5651
        /* Select numa. */
5652
0
        numa = sched_numa_list_lookup(numa_list, port_numa_id);
5653
5654
        /* Check if numa has no PMDs or no non-isolated PMDs. */
5655
0
        if (!numa || !sched_numa_noniso_pmd_count(numa)) {
5656
            /* Unable to use this numa to find a PMD. */
5657
0
            numa = NULL;
5658
            /* Find any numa with available PMDs. */
5659
0
            for (int j = 0; j < n_numa; j++) {
5660
0
                numa = sched_numa_list_next(numa_list, last_cross_numa);
5661
0
                last_cross_numa = numa;
5662
0
                if (sched_numa_noniso_pmd_count(numa)) {
5663
0
                    break;
5664
0
                }
5665
0
                numa = NULL;
5666
0
            }
5667
0
        }
5668
5669
0
        if (numa) {
5670
            /* Select the PMD that should be used for this rxq. */
5671
0
            sched_pmd = sched_pmd_next(numa, algo,
5672
0
                                       proc_cycles ? true : false);
5673
0
        }
5674
5675
        /* Check that a pmd has been selected. */
5676
0
        if (sched_pmd) {
5677
0
            int pmd_numa_id;
5678
5679
0
            pmd_numa_id = sched_pmd->numa->numa_id;
5680
            /* Check if selected pmd numa matches port numa. */
5681
0
            if (pmd_numa_id != port_numa_id) {
5682
0
                VLOG(level, "There's no available (non-isolated) pmd thread "
5683
0
                            "on numa node %d. Port \'%s\' rx queue %d will "
5684
0
                            "be assigned to a pmd on numa node %d. "
5685
0
                            "This may lead to reduced performance.",
5686
0
                            port_numa_id, netdev_rxq_get_name(rxq->rx),
5687
0
                            netdev_rxq_get_queue_id(rxq->rx), pmd_numa_id);
5688
0
            }
5689
0
            VLOG(level, "Core %2u on numa node %d assigned port \'%s\' "
5690
0
                        "rx queue %d%s.",
5691
0
                        sched_pmd->pmd->core_id, sched_pmd->pmd->numa_id,
5692
0
                        netdev_rxq_get_name(rxq->rx),
5693
0
                        netdev_rxq_get_queue_id(rxq->rx),
5694
0
                        get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles));
5695
0
            sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles);
5696
0
        } else  {
5697
0
            VLOG(level == VLL_DBG ? level : VLL_WARN,
5698
0
                 "No non-isolated pmd on any numa available for "
5699
0
                 "port \'%s\' rx queue %d%s. "
5700
0
                 "This rx queue will not be polled.",
5701
0
                 netdev_rxq_get_name(rxq->rx),
5702
0
                 netdev_rxq_get_queue_id(rxq->rx),
5703
0
                 get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles));
5704
0
        }
5705
0
    }
5706
0
    free(rxqs);
5707
0
}
5708
5709
static void
5710
rxq_scheduling(struct dp_netdev *dp)
5711
    OVS_REQ_RDLOCK(dp->port_rwlock)
5712
0
{
5713
0
    struct sched_numa_list numa_list;
5714
0
    enum sched_assignment_type algo = dp->pmd_rxq_assign_type;
5715
5716
0
    sched_numa_list_populate(&numa_list, dp);
5717
0
    sched_numa_list_schedule(&numa_list, dp, algo, VLL_INFO);
5718
0
    sched_numa_list_put_in_place(&numa_list);
5719
5720
0
    sched_numa_list_free_entries(&numa_list);
5721
0
}
5722
5723
static uint64_t variance(uint64_t a[], int n);
5724
5725
static uint64_t
5726
sched_numa_variance(struct sched_numa *numa)
5727
0
{
5728
0
    uint64_t *percent_busy = NULL;
5729
0
    int n_proc = 0;
5730
0
    uint64_t var;
5731
5732
0
    percent_busy = xmalloc(numa->n_pmds * sizeof *percent_busy);
5733
5734
0
    for (unsigned i = 0; i < numa->n_pmds; i++) {
5735
0
        struct sched_pmd *sched_pmd;
5736
0
        uint64_t total_cycles = 0;
5737
5738
0
        sched_pmd = &numa->pmds[i];
5739
        /* Exclude isolated PMDs from variance calculations. */
5740
0
        if (sched_pmd->isolated == true) {
5741
0
            continue;
5742
0
        }
5743
        /* Get the total pmd cycles for an interval. */
5744
0
        atomic_read_relaxed(&sched_pmd->pmd->intrvl_cycles, &total_cycles);
5745
5746
0
        if (total_cycles) {
5747
            /* Estimate the cycles to cover all intervals. */
5748
0
            total_cycles *= PMD_INTERVAL_MAX;
5749
0
            percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100)
5750
0
                                            / total_cycles;
5751
0
        } else {
5752
0
            percent_busy[n_proc++] = 0;
5753
0
        }
5754
0
    }
5755
0
    var = variance(percent_busy, n_proc);
5756
0
    free(percent_busy);
5757
0
    return var;
5758
0
}
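
variance() is only forward-declared near this function, so as a reading aid here is a plain population-variance sketch over the per-PMD percent-busy figures computed above. variance_sketch is an illustrative name and integer arithmetic is used for simplicity; the helper defined elsewhere in this file may differ in detail.

#include <stdio.h>

static unsigned long long
variance_sketch(const unsigned long long a[], int n)
{
    unsigned long long mean = 0, sqdiff = 0;

    if (n == 0) {
        return 0;
    }
    for (int i = 0; i < n; i++) {
        mean += a[i];
    }
    mean /= n;
    for (int i = 0; i < n; i++) {
        unsigned long long d = a[i] > mean ? a[i] - mean : mean - a[i];

        sqdiff += d * d;
    }
    return sqdiff / n;
}

int
main(void)
{
    /* Two PMDs at 90% and 10% busy are far less balanced than 55% and 45%. */
    unsigned long long unbalanced[] = { 90, 10 };
    unsigned long long balanced[] = { 55, 45 };

    printf("variance unbalanced: %llu\n", variance_sketch(unbalanced, 2));
    printf("variance balanced:   %llu\n", variance_sketch(balanced, 2));
    return 0;
}
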
5759
5760
/*
5761
 * This function checks that some basic conditions needed for a rebalance to be
5762
 * effective are met, such as the Rxq scheduling assignment type, more than
5763
 * one PMD, and more than two Rxqs on a PMD. If there was no reconfiguration
5764
 * change since the last check, it reuses the last result.
5765
 *
5766
 * It is not intended to be an inclusive check of every condition that may make
5767
 * a rebalance ineffective. It is done as a quick check so a full
5768
 * pmd_rebalance_dry_run() can be avoided when it is not needed.
5769
 */
5770
static bool
5771
pmd_rebalance_dry_run_needed(struct dp_netdev *dp)
5772
    OVS_REQ_RDLOCK(dp->port_rwlock)
5773
0
{
5774
0
    struct dp_netdev_pmd_thread *pmd;
5775
0
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
5776
0
    unsigned int cnt = 0;
5777
0
    bool multi_rxq = false;
5778
5779
    /* Check if there was no reconfiguration since last check. */
5780
0
    if (!pmd_alb->recheck_config) {
5781
0
        if (!pmd_alb->do_dry_run) {
5782
0
            VLOG_DBG("PMD auto load balance nothing to do, "
5783
0
                     "no configuration changes since last check.");
5784
0
            return false;
5785
0
        }
5786
0
        return true;
5787
0
    }
5788
0
    pmd_alb->recheck_config = false;
5789
5790
    /* Check for incompatible assignment type. */
5791
0
    if (dp->pmd_rxq_assign_type == SCHED_ROUNDROBIN) {
5792
0
        VLOG_DBG("PMD auto load balance nothing to do, "
5793
0
                 "pmd-rxq-assign=roundrobin assignment type configured.");
5794
0
        return pmd_alb->do_dry_run = false;
5795
0
    }
5796
5797
    /* Check that there is at least 2 non-isolated PMDs and
5798
     * one of them is polling more than one rxq. */
5799
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5800
0
        if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
5801
0
            continue;
5802
0
        }
5803
5804
0
        if (hmap_count(&pmd->poll_list) > 1) {
5805
0
            multi_rxq = true;
5806
0
        }
5807
0
        if (cnt && multi_rxq) {
5808
0
            return pmd_alb->do_dry_run = true;
5809
0
        }
5810
0
        cnt++;
5811
0
    }
5812
5813
0
    VLOG_DBG("PMD auto load balance nothing to do, "
5814
0
             "not enough non-isolated PMDs or RxQs.");
5815
0
    return pmd_alb->do_dry_run = false;
5816
0
}
5817
5818
static bool
5819
pmd_rebalance_dry_run(struct dp_netdev *dp)
5820
    OVS_REQ_RDLOCK(dp->port_rwlock)
5821
0
{
5822
0
    struct sched_numa_list numa_list_cur;
5823
0
    struct sched_numa_list numa_list_est;
5824
0
    bool thresh_met = false;
5825
5826
0
    VLOG_DBG("PMD auto load balance performing dry run.");
5827
5828
    /* Populate current assignments. */
5829
0
    sched_numa_list_populate(&numa_list_cur, dp);
5830
0
    sched_numa_list_assignments(&numa_list_cur, dp);
5831
5832
    /* Populate estimated assignments. */
5833
0
    sched_numa_list_populate(&numa_list_est, dp);
5834
0
    sched_numa_list_schedule(&numa_list_est, dp,
5835
0
                             dp->pmd_rxq_assign_type, VLL_DBG);
5836
5837
    /* Proceed if there is no cross-numa polling or only one numa has PMDs. */
5838
0
    if (!sched_numa_list_cross_numa_polling(&numa_list_est) ||
5839
0
            sched_numa_list_count(&numa_list_est) == 1) {
5840
0
        struct sched_numa *numa_cur;
5841
5842
        /* Calculate variances. */
5843
0
        HMAP_FOR_EACH (numa_cur, node, &numa_list_cur.numas) {
5844
0
            uint64_t current_var, estimate_var;
5845
0
            struct sched_numa *numa_est;
5846
0
            uint64_t improvement = 0;
5847
5848
0
            numa_est = sched_numa_list_lookup(&numa_list_est,
5849
0
                                              numa_cur->numa_id);
5850
0
            if (!numa_est) {
5851
0
                continue;
5852
0
            }
5853
0
            current_var = sched_numa_variance(numa_cur);
5854
0
            estimate_var = sched_numa_variance(numa_est);
5855
0
            if (estimate_var < current_var) {
5856
0
                improvement = ((current_var - estimate_var) * 100)
5857
0
                              / current_var;
5858
0
            }
5859
0
            VLOG_DBG("Numa node %d. Current variance %"PRIu64" Estimated "
5860
0
                     "variance %"PRIu64". Variance improvement %"PRIu64"%%.",
5861
0
                     numa_cur->numa_id, current_var,
5862
0
                     estimate_var, improvement);
5863
0
            if (improvement >= dp->pmd_alb.rebalance_improve_thresh) {
5864
0
                thresh_met = true;
5865
0
            }
5866
0
        }
5867
0
        VLOG_DBG("PMD load variance improvement threshold %u%% is %s.",
5868
0
                 dp->pmd_alb.rebalance_improve_thresh,
5869
0
                 thresh_met ? "met" : "not met");
5870
0
    } else {
5871
0
        VLOG_DBG("PMD auto load balance detected cross-numa polling with "
5872
0
                 "multiple numa nodes. Unable to accurately estimate.");
5873
0
    }
5874
5875
0
    sched_numa_list_free_entries(&numa_list_cur);
5876
0
    sched_numa_list_free_entries(&numa_list_est);
5877
5878
0
    return thresh_met;
5879
0
}
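
A short worked example of the dry-run decision above: the improvement percentage is (current_var - estimate_var) * 100 / current_var and a rebalance is only considered worthwhile when it meets pmd-auto-lb-improvement-threshold. The numbers below are made up for illustration.

#include <stdio.h>

int
main(void)
{
    unsigned long long current_var = 1600, estimate_var = 400;
    unsigned threshold = 25;   /* Matches the documented default of 25%. */
    unsigned long long improvement = 0;

    if (estimate_var < current_var) {
        improvement = (current_var - estimate_var) * 100 / current_var;
    }
    printf("improvement %llu%% -> threshold %s\n", improvement,
           improvement >= threshold ? "met" : "not met");
    return 0;
}
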
5880
5881
static void
5882
reload_affected_pmds(struct dp_netdev *dp)
5883
0
{
5884
0
    struct dp_netdev_pmd_thread *pmd;
5885
5886
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5887
0
        if (pmd->need_reload) {
5888
0
            dp_netdev_reload_pmd__(pmd);
5889
0
        }
5890
0
    }
5891
5892
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5893
0
        if (pmd->need_reload) {
5894
0
            if (pmd->core_id != NON_PMD_CORE_ID) {
5895
0
                bool reload;
5896
5897
0
                do {
5898
0
                    atomic_read_explicit(&pmd->reload, &reload,
5899
0
                                         memory_order_acquire);
5900
0
                } while (reload);
5901
0
            }
5902
0
            pmd->need_reload = false;
5903
0
        }
5904
0
    }
5905
0
}
5906
5907
static void
5908
reconfigure_pmd_threads(struct dp_netdev *dp)
5909
    OVS_REQ_RDLOCK(dp->port_rwlock)
5910
0
{
5911
0
    struct dp_netdev_pmd_thread *pmd;
5912
0
    struct ovs_numa_dump *pmd_cores;
5913
0
    struct ovs_numa_info_core *core;
5914
0
    struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
5915
0
    struct hmapx_node *node;
5916
0
    bool changed = false;
5917
0
    bool need_to_adjust_static_tx_qids = false;
5918
5919
    /* The pmd threads should be started only if there's a pmd port in the
5920
     * datapath.  If the user didn't provide any "pmd-cpu-mask", we start
5921
     * NR_PMD_THREADS per numa node. */
5922
0
    if (!has_pmd_port(dp)) {
5923
0
        pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
5924
0
    } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
5925
0
        pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
5926
0
    } else {
5927
0
        pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
5928
0
    }
5929
5930
    /* We need to adjust 'static_tx_qid's only if we're reducing number of
5931
     * PMD threads. Otherwise, new threads will allocate all the freed ids. */
5932
0
    if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
5933
        /* Adjustment is required to keep 'static_tx_qid's sequential and
5934
         * avoid possible issues, for example, imbalanced tx queue usage
5935
         * and unnecessary locking caused by remapping on netdev level. */
5936
0
        need_to_adjust_static_tx_qids = true;
5937
0
    }
5938
5939
    /* Check for unwanted pmd threads */
5940
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5941
0
        if (pmd->core_id == NON_PMD_CORE_ID) {
5942
0
            continue;
5943
0
        }
5944
0
        if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
5945
0
                                                    pmd->core_id)) {
5946
0
            hmapx_add(&to_delete, pmd);
5947
0
        } else if (need_to_adjust_static_tx_qids) {
5948
0
            atomic_store_relaxed(&pmd->reload_tx_qid, true);
5949
0
            pmd->need_reload = true;
5950
0
        }
5951
0
    }
5952
5953
0
    HMAPX_FOR_EACH (node, &to_delete) {
5954
0
        pmd = (struct dp_netdev_pmd_thread *) node->data;
5955
0
        VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
5956
0
                  pmd->numa_id, pmd->core_id);
5957
0
        dp_netdev_del_pmd(dp, pmd);
5958
0
    }
5959
0
    changed = !hmapx_is_empty(&to_delete);
5960
0
    hmapx_destroy(&to_delete);
5961
5962
0
    if (need_to_adjust_static_tx_qids) {
5963
        /* 'static_tx_qid's are not sequential now.
5964
         * Reload remaining threads to fix this. */
5965
0
        reload_affected_pmds(dp);
5966
0
    }
5967
5968
    /* Check for required new pmd threads */
5969
0
    FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
5970
0
        pmd = dp_netdev_get_pmd(dp, core->core_id);
5971
0
        if (!pmd) {
5972
0
            struct ds name = DS_EMPTY_INITIALIZER;
5973
5974
0
            pmd = xzalloc(sizeof *pmd);
5975
0
            dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
5976
5977
0
            ds_put_format(&name, "pmd-c%02d/id:", core->core_id);
5978
0
            pmd->thread = ovs_thread_create(ds_cstr(&name),
5979
0
                                            pmd_thread_main, pmd);
5980
0
            ds_destroy(&name);
5981
5982
0
            VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
5983
0
                      pmd->numa_id, pmd->core_id);
5984
0
            changed = true;
5985
0
        } else {
5986
0
            dp_netdev_pmd_unref(pmd);
5987
0
        }
5988
0
    }
5989
5990
0
    if (changed) {
5991
0
        struct ovs_numa_info_numa *numa;
5992
5993
        /* Log the number of pmd threads per numa node. */
5994
0
        FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
5995
0
            VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
5996
0
                      numa->n_cores, numa->numa_id);
5997
0
        }
5998
0
    }
5999
6000
0
    ovs_numa_dump_destroy(pmd_cores);
6001
0
}
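
One subtlety in the shrink check above: 'dp->poll_threads' also holds the non-PMD thread, which is why the wanted core count is compared against 'cmap_count(&dp->poll_threads) - 1'. A tiny hedged sketch of that comparison with made-up numbers (illustrative only):

/* Illustrative sketch: the "- 1" accounts for the non-PMD thread. */
#include <stdbool.h>
#include <stdio.h>

int
main(void)
{
    unsigned int poll_threads = 5;   /* 4 PMD threads plus the non-PMD thread. */
    unsigned int wanted_cores = 2;   /* Cores selected by the new pmd-cpu-mask. */
    bool shrinking = wanted_cores < poll_threads - 1;

    /* Only a shrinking PMD set leaves gaps in the static_tx_qid space. */
    printf("need_to_adjust_static_tx_qids = %s\n", shrinking ? "true" : "false");
    return 0;
}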
6002
6003
static void
6004
pmd_remove_stale_ports(struct dp_netdev *dp,
6005
                       struct dp_netdev_pmd_thread *pmd)
6006
    OVS_EXCLUDED(pmd->port_mutex)
6007
    OVS_REQ_RDLOCK(dp->port_rwlock)
6008
0
{
6009
0
    struct rxq_poll *poll;
6010
0
    struct tx_port *tx;
6011
6012
0
    ovs_mutex_lock(&pmd->port_mutex);
6013
0
    HMAP_FOR_EACH_SAFE (poll, node, &pmd->poll_list) {
6014
0
        struct dp_netdev_port *port = poll->rxq->port;
6015
6016
0
        if (port->need_reconfigure
6017
0
            || !hmap_contains(&dp->ports, &port->node)) {
6018
0
            dp_netdev_del_rxq_from_pmd(pmd, poll);
6019
0
        }
6020
0
    }
6021
0
    HMAP_FOR_EACH_SAFE (tx, node, &pmd->tx_ports) {
6022
0
        struct dp_netdev_port *port = tx->port;
6023
6024
0
        if (port->need_reconfigure
6025
0
            || !hmap_contains(&dp->ports, &port->node)) {
6026
0
            dp_netdev_del_port_tx_from_pmd(pmd, tx);
6027
0
        }
6028
0
    }
6029
0
    ovs_mutex_unlock(&pmd->port_mutex);
6030
0
}
6031
6032
/* Must be called each time a port is added/removed or the cmask changes.
6033
 * This creates and destroys pmd threads, reconfigures ports, opens their
6034
 * rxqs and assigns all rxqs/txqs to pmd threads. */
6035
static void
6036
reconfigure_datapath(struct dp_netdev *dp)
6037
    OVS_REQ_RDLOCK(dp->port_rwlock)
6038
0
{
6039
0
    struct hmapx busy_threads = HMAPX_INITIALIZER(&busy_threads);
6040
0
    struct dp_netdev_pmd_thread *pmd;
6041
0
    struct dp_netdev_port *port;
6042
0
    int wanted_txqs;
6043
6044
0
    dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
6045
6046
    /* Step 1: Adjust the pmd threads based on the datapath ports, the cores
6047
     * on the system and the user configuration. */
6048
0
    reconfigure_pmd_threads(dp);
6049
6050
0
    wanted_txqs = cmap_count(&dp->poll_threads);
6051
6052
    /* The number of pmd threads might have changed, or a port can be new:
6053
     * adjust the txqs. */
6054
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
6055
0
        netdev_set_tx_multiq(port->netdev, wanted_txqs);
6056
0
    }
6057
6058
    /* Step 2: Remove from the pmd threads ports that have been removed or
6059
     * need reconfiguration. */
6060
6061
    /* Check for all the ports that need reconfiguration.  We cache this in
6062
     * 'port->need_reconfigure', because netdev_is_reconf_required() can
6063
     * change at any time.
6064
     * Also mark for reconfiguration all ports which will likely change their
6065
     * 'txq_mode' parameter.  It's required to stop using them before
6066
     * changing this setting and it's simpler to mark ports here and allow
6067
     * 'pmd_remove_stale_ports' to remove them from threads.  There will be
6068
     * no actual reconfiguration in 'port_reconfigure' because it's
6069
     * unnecessary.  */
6070
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
6071
0
        if (netdev_is_reconf_required(port->netdev)
6072
0
            || ((port->txq_mode == TXQ_MODE_XPS)
6073
0
                != (netdev_n_txq(port->netdev) < wanted_txqs))
6074
0
            || ((port->txq_mode == TXQ_MODE_XPS_HASH)
6075
0
                != (port->txq_requested_mode == TXQ_REQ_MODE_HASH
6076
0
                    && netdev_n_txq(port->netdev) > 1))) {
6077
0
            port->need_reconfigure = true;
6078
0
        }
6079
0
    }
6080
6081
    /* Remove from the pmd threads all the ports that have been deleted or
6082
     * need reconfiguration. */
6083
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6084
0
        pmd_remove_stale_ports(dp, pmd);
6085
0
    }
6086
6087
    /* Reload affected pmd threads.  We must wait for the pmd threads before
6088
     * reconfiguring the ports, because a port cannot be reconfigured while
6089
     * it's being used. */
6090
0
    reload_affected_pmds(dp);
6091
6092
    /* Step 3: Reconfigure ports. */
6093
6094
    /* We only reconfigure the ports that we determined above, because they're
6095
     * not being used by any pmd thread at the moment.  If a port fails to
6096
     * reconfigure we remove it from the datapath. */
6097
0
    HMAP_FOR_EACH_SAFE (port, node, &dp->ports) {
6098
0
        int err;
6099
6100
0
        if (!port->need_reconfigure) {
6101
0
            continue;
6102
0
        }
6103
6104
0
        err = port_reconfigure(port);
6105
0
        if (err) {
6106
0
            hmap_remove(&dp->ports, &port->node);
6107
0
            seq_change(dp->port_seq);
6108
0
            port_destroy(port);
6109
0
        } else {
6110
            /* With a single queue, there is no point in using hash mode. */
6111
0
            if (port->txq_requested_mode == TXQ_REQ_MODE_HASH &&
6112
0
                netdev_n_txq(port->netdev) > 1) {
6113
0
                port->txq_mode = TXQ_MODE_XPS_HASH;
6114
0
            } else if (netdev_n_txq(port->netdev) < wanted_txqs) {
6115
0
                port->txq_mode = TXQ_MODE_XPS;
6116
0
            } else {
6117
0
                port->txq_mode = TXQ_MODE_STATIC;
6118
0
            }
6119
0
        }
6120
0
    }
6121
6122
    /* Step 4: Compute new rxq scheduling.  We don't touch the pmd threads
6123
     * for now, we just update the 'pmd' pointer in each rxq to point to the
6124
     * wanted thread according to the scheduling policy. */
6125
6126
    /* Reset all the pmd threads to non isolated. */
6127
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6128
0
        pmd->isolated = false;
6129
0
    }
6130
6131
    /* Reset all the queues to unassigned */
6132
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
6133
0
        for (int i = 0; i < port->n_rxq; i++) {
6134
0
            port->rxqs[i].pmd = NULL;
6135
0
        }
6136
0
    }
6137
0
    rxq_scheduling(dp);
6138
6139
    /* Step 5: Remove queues not compliant with new scheduling. */
6140
6141
    /* Count all the threads that will have at least one queue to poll. */
6142
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
6143
0
        for (int qid = 0; qid < port->n_rxq; qid++) {
6144
0
            struct dp_netdev_rxq *q = &port->rxqs[qid];
6145
6146
0
            if (q->pmd) {
6147
0
                hmapx_add(&busy_threads, q->pmd);
6148
0
            }
6149
0
        }
6150
0
    }
6151
6152
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6153
0
        struct rxq_poll *poll;
6154
6155
0
        ovs_mutex_lock(&pmd->port_mutex);
6156
0
        HMAP_FOR_EACH_SAFE (poll, node, &pmd->poll_list) {
6157
0
            if (poll->rxq->pmd != pmd) {
6158
0
                dp_netdev_del_rxq_from_pmd(pmd, poll);
6159
6160
                /* This pmd might sleep after this step if it has no rxq
6161
                 * remaining. Tell it to busy wait for new assignment if it
6162
                 * has at least one scheduled queue. */
6163
0
                if (hmap_count(&pmd->poll_list) == 0 &&
6164
0
                    hmapx_contains(&busy_threads, pmd)) {
6165
0
                    atomic_store_relaxed(&pmd->wait_for_reload, true);
6166
0
                }
6167
0
            }
6168
0
        }
6169
0
        ovs_mutex_unlock(&pmd->port_mutex);
6170
0
    }
6171
6172
0
    hmapx_destroy(&busy_threads);
6173
6174
    /* Reload affected pmd threads.  We must wait for the pmd threads to remove
6175
     * the old queues before re-adding them, otherwise a queue can be polled by
6176
     * two threads at the same time. */
6177
0
    reload_affected_pmds(dp);
6178
6179
    /* Step 6: Add queues from scheduling, if they're not there already. */
6180
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
6181
0
        if (!netdev_is_pmd(port->netdev)) {
6182
0
            continue;
6183
0
        }
6184
6185
0
        for (int qid = 0; qid < port->n_rxq; qid++) {
6186
0
            struct dp_netdev_rxq *q = &port->rxqs[qid];
6187
6188
0
            if (q->pmd) {
6189
0
                ovs_mutex_lock(&q->pmd->port_mutex);
6190
0
                dp_netdev_add_rxq_to_pmd(q->pmd, q);
6191
0
                ovs_mutex_unlock(&q->pmd->port_mutex);
6192
0
            }
6193
0
        }
6194
0
    }
6195
6196
    /* Add every port and bond to the tx port and bond caches of
6197
     * every pmd thread, if it's not there already and if this pmd
6198
     * has at least one rxq to poll.
6199
     */
6200
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6201
0
        ovs_mutex_lock(&pmd->port_mutex);
6202
0
        if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
6203
0
            struct tx_bond *bond;
6204
6205
0
            HMAP_FOR_EACH (port, node, &dp->ports) {
6206
0
                dp_netdev_add_port_tx_to_pmd(pmd, port);
6207
0
            }
6208
6209
0
            CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
6210
0
                dp_netdev_add_bond_tx_to_pmd(pmd, bond, false);
6211
0
            }
6212
0
        }
6213
0
        ovs_mutex_unlock(&pmd->port_mutex);
6214
0
    }
6215
6216
    /* Reload affected pmd threads. */
6217
0
    reload_affected_pmds(dp);
6218
6219
    /* PMD ALB will need to recheck if dry run needed. */
6220
0
    dp->pmd_alb.recheck_config = true;
6221
0
}
6222
6223
/* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
6224
static bool
6225
ports_require_restart(const struct dp_netdev *dp)
6226
    OVS_REQ_RDLOCK(dp->port_rwlock)
6227
0
{
6228
0
    struct dp_netdev_port *port;
6229
6230
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
6231
0
        if (netdev_is_reconf_required(port->netdev)) {
6232
0
            return true;
6233
0
        }
6234
0
    }
6235
6236
0
    return false;
6237
0
}
6238
6239
/* Calculates variance in the values stored in array 'a'. 'n' is the number
6240
 * of elements in the array to be considered for calculating variance.
6241
 * Usage example: data array 'a' contains the processing load of each pmd and
6242
 * 'n' is the number of PMDs. It returns the variance in processing load of
6243
 * PMDs. */
6244
static uint64_t
6245
variance(uint64_t a[], int n)
6246
0
{
6247
    /* Compute mean (average of elements). */
6248
0
    uint64_t sum = 0;
6249
0
    uint64_t mean = 0;
6250
0
    uint64_t sqDiff = 0;
6251
6252
0
    if (!n) {
6253
0
        return 0;
6254
0
    }
6255
6256
0
    for (int i = 0; i < n; i++) {
6257
0
        sum += a[i];
6258
0
    }
6259
6260
0
    if (sum) {
6261
0
        mean = sum / n;
6262
6263
        /* Compute sum squared differences with mean. */
6264
0
        for (int i = 0; i < n; i++) {
6265
0
            sqDiff += (a[i] - mean)*(a[i] - mean);
6266
0
        }
6267
0
    }
6268
0
    return (sqDiff ? (sqDiff / n) : 0);
6269
0
}
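
A worked example of the integer variance above, as a standalone sketch with made-up per-PMD loads (illustrative only):

/* Illustrative sketch: same integer variance math on a made-up load array. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t load[] = { 90, 50, 70, 30 };   /* Hypothetical per-PMD busy %. */
    int n = 4;
    uint64_t sum = 0, mean, sq_diff = 0;

    for (int i = 0; i < n; i++) {
        sum += load[i];
    }
    mean = sum / n;                          /* 240 / 4 = 60. */

    for (int i = 0; i < n; i++) {
        /* Unsigned wraparound cancels out when squaring, as in variance(). */
        sq_diff += (load[i] - mean) * (load[i] - mean);
    }
    /* (900 + 100 + 100 + 900) / 4 = 500. */
    printf("variance = %"PRIu64"\n", sq_diff / n);
    return 0;
}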
6270
6271
/* Return true if needs to revalidate datapath flows. */
6272
static bool
6273
dpif_netdev_run(struct dpif *dpif)
6274
0
{
6275
0
    struct dp_netdev_port *port;
6276
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
6277
0
    struct dp_netdev_pmd_thread *non_pmd;
6278
0
    uint64_t new_tnl_seq;
6279
0
    bool need_to_flush = true;
6280
0
    bool pmd_rebalance = false;
6281
0
    long long int now = time_msec();
6282
0
    struct dp_netdev_pmd_thread *pmd;
6283
6284
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
6285
0
    non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
6286
0
    if (non_pmd) {
6287
0
        ovs_mutex_lock(&dp->non_pmd_mutex);
6288
6289
0
        atomic_read_relaxed(&dp->smc_enable_db, &non_pmd->ctx.smc_enable_db);
6290
6291
0
        HMAP_FOR_EACH (port, node, &dp->ports) {
6292
0
            if (!netdev_is_pmd(port->netdev)) {
6293
0
                int i;
6294
6295
0
                if (port->emc_enabled) {
6296
0
                    atomic_read_relaxed(&dp->emc_insert_min,
6297
0
                                        &non_pmd->ctx.emc_insert_min);
6298
0
                } else {
6299
0
                    non_pmd->ctx.emc_insert_min = 0;
6300
0
                }
6301
6302
0
                for (i = 0; i < port->n_rxq; i++) {
6303
6304
0
                    if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
6305
0
                        continue;
6306
0
                    }
6307
6308
0
                    if (dp_netdev_process_rxq_port(non_pmd,
6309
0
                                                   &port->rxqs[i],
6310
0
                                                   port->port_no)) {
6311
0
                        need_to_flush = false;
6312
0
                    }
6313
0
                }
6314
0
            }
6315
0
        }
6316
0
        if (need_to_flush) {
6317
            /* We didn't receive anything in the process loop.
6318
             * Check if we need to send something.
6319
             * There were no time updates in the current iteration. */
6320
0
            pmd_thread_ctx_time_update(non_pmd);
6321
0
            dp_netdev_pmd_flush_output_packets(non_pmd, false);
6322
0
        }
6323
6324
0
        dpif_netdev_xps_revalidate_pmd(non_pmd, false);
6325
0
        ovs_mutex_unlock(&dp->non_pmd_mutex);
6326
6327
0
        dp_netdev_pmd_unref(non_pmd);
6328
0
    }
6329
6330
0
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
6331
0
    if (pmd_alb->is_enabled) {
6332
0
        if (!pmd_alb->rebalance_poll_timer) {
6333
0
            pmd_alb->rebalance_poll_timer = now;
6334
0
        } else if ((pmd_alb->rebalance_poll_timer +
6335
0
                   pmd_alb->rebalance_intvl) < now) {
6336
0
            pmd_alb->rebalance_poll_timer = now;
6337
0
            CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
6338
0
                if (atomic_count_get(&pmd->pmd_overloaded) >=
6339
0
                                    PMD_INTERVAL_MAX) {
6340
0
                    pmd_rebalance = true;
6341
0
                    break;
6342
0
                }
6343
0
            }
6344
6345
0
            if (pmd_rebalance &&
6346
0
                !dp_netdev_is_reconf_required(dp) &&
6347
0
                !ports_require_restart(dp) &&
6348
0
                pmd_rebalance_dry_run_needed(dp) &&
6349
0
                pmd_rebalance_dry_run(dp)) {
6350
0
                VLOG_INFO("PMD auto load balance dry run. "
6351
0
                          "Requesting datapath reconfigure.");
6352
0
                dp_netdev_request_reconfigure(dp);
6353
0
            }
6354
0
        }
6355
0
    }
6356
6357
0
    if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
6358
0
        reconfigure_datapath(dp);
6359
0
    }
6360
0
    ovs_rwlock_unlock(&dp->port_rwlock);
6361
6362
0
    tnl_neigh_cache_run();
6363
0
    tnl_port_map_run();
6364
0
    new_tnl_seq = seq_read(tnl_conf_seq);
6365
6366
0
    if (dp->last_tnl_conf_seq != new_tnl_seq) {
6367
0
        dp->last_tnl_conf_seq = new_tnl_seq;
6368
0
        return true;
6369
0
    }
6370
0
    return false;
6371
0
}
6372
6373
static void
6374
dpif_netdev_wait(struct dpif *dpif)
6375
0
{
6376
0
    struct dp_netdev_port *port;
6377
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
6378
6379
0
    ovs_mutex_lock(&dp_netdev_mutex);
6380
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
6381
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
6382
0
        netdev_wait_reconf_required(port->netdev);
6383
0
        if (!netdev_is_pmd(port->netdev)) {
6384
0
            int i;
6385
6386
0
            for (i = 0; i < port->n_rxq; i++) {
6387
0
                netdev_rxq_wait(port->rxqs[i].rx);
6388
0
            }
6389
0
        }
6390
0
    }
6391
0
    ovs_rwlock_unlock(&dp->port_rwlock);
6392
0
    ovs_mutex_unlock(&dp_netdev_mutex);
6393
0
    seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
6394
0
}
6395
6396
static void
6397
pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
6398
0
{
6399
0
    struct tx_port *tx_port_cached;
6400
6401
    /* Flush all the queued packets. */
6402
0
    dp_netdev_pmd_flush_output_packets(pmd, true);
6403
    /* Free all used tx queue ids. */
6404
0
    dpif_netdev_xps_revalidate_pmd(pmd, true);
6405
6406
0
    HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
6407
0
        free(tx_port_cached->txq_pkts);
6408
0
        free(tx_port_cached);
6409
0
    }
6410
0
    HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
6411
0
        free(tx_port_cached->txq_pkts);
6412
0
        free(tx_port_cached);
6413
0
    }
6414
0
}
6415
6416
/* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
6417
 * thread-local copies. Copy to 'pmd->tnl_port_cache' if it is a tunnel
6418
 * device, otherwise to 'pmd->send_port_cache' if the port has at least
6419
 * one txq. */
6420
static void
6421
pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
6422
    OVS_REQUIRES(pmd->port_mutex)
6423
0
{
6424
0
    struct tx_port *tx_port, *tx_port_cached;
6425
6426
0
    pmd_free_cached_ports(pmd);
6427
0
    hmap_shrink(&pmd->send_port_cache);
6428
0
    hmap_shrink(&pmd->tnl_port_cache);
6429
6430
0
    HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
6431
0
        int n_txq = netdev_n_txq(tx_port->port->netdev);
6432
0
        struct dp_packet_batch *txq_pkts_cached;
6433
6434
0
        if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
6435
0
            tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
6436
0
            if (tx_port->txq_pkts) {
6437
0
                txq_pkts_cached = xmemdup(tx_port->txq_pkts,
6438
0
                                          n_txq * sizeof *tx_port->txq_pkts);
6439
0
                tx_port_cached->txq_pkts = txq_pkts_cached;
6440
0
            }
6441
0
            hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
6442
0
                        hash_port_no(tx_port_cached->port->port_no));
6443
0
        }
6444
6445
0
        if (n_txq) {
6446
0
            tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
6447
0
            if (tx_port->txq_pkts) {
6448
0
                txq_pkts_cached = xmemdup(tx_port->txq_pkts,
6449
0
                                          n_txq * sizeof *tx_port->txq_pkts);
6450
0
                tx_port_cached->txq_pkts = txq_pkts_cached;
6451
0
            }
6452
0
            hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
6453
0
                        hash_port_no(tx_port_cached->port->port_no));
6454
0
        }
6455
0
    }
6456
0
}
6457
6458
static void
6459
pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
6460
0
{
6461
0
    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
6462
0
    if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
6463
0
        VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
6464
0
                   ", numa_id %d.", pmd->core_id, pmd->numa_id);
6465
0
    }
6466
0
    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
6467
6468
0
    VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
6469
0
             ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
6470
0
}
6471
6472
static void
6473
pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
6474
0
{
6475
0
    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
6476
0
    id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
6477
0
    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
6478
0
}
6479
6480
static int
6481
pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
6482
                          struct polled_queue **ppoll_list)
6483
0
{
6484
0
    struct polled_queue *poll_list = *ppoll_list;
6485
0
    struct rxq_poll *poll;
6486
0
    int i;
6487
6488
0
    ovs_mutex_lock(&pmd->port_mutex);
6489
0
    poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
6490
0
                                    * sizeof *poll_list);
6491
6492
0
    i = 0;
6493
0
    HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
6494
0
        poll_list[i].rxq = poll->rxq;
6495
0
        poll_list[i].port_no = poll->rxq->port->port_no;
6496
0
        poll_list[i].emc_enabled = poll->rxq->port->emc_enabled;
6497
0
        poll_list[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx);
6498
0
        poll_list[i].change_seq =
6499
0
                     netdev_get_change_seq(poll->rxq->port->netdev);
6500
0
        i++;
6501
0
    }
6502
6503
0
    pmd_load_cached_ports(pmd);
6504
6505
0
    ovs_mutex_unlock(&pmd->port_mutex);
6506
6507
0
    *ppoll_list = poll_list;
6508
0
    return i;
6509
0
}
6510
6511
static void *
6512
pmd_thread_main(void *f_)
6513
0
{
6514
0
    struct dp_netdev_pmd_thread *pmd = f_;
6515
0
    struct pmd_perf_stats *s = &pmd->perf_stats;
6516
0
    unsigned int lc = 0;
6517
0
    struct polled_queue *poll_list;
6518
0
    bool wait_for_reload = false;
6519
0
    bool dpdk_attached;
6520
0
    bool reload_tx_qid;
6521
0
    bool exiting;
6522
0
    bool reload;
6523
0
    int poll_cnt;
6524
0
    int i;
6525
0
    int process_packets = 0;
6526
0
    uint64_t sleep_time = 0;
6527
6528
0
    poll_list = NULL;
6529
6530
    /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
6531
0
    ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
6532
0
    ovs_numa_thread_setaffinity_core(pmd->core_id);
6533
0
    dpdk_attached = dpdk_attach_thread(pmd->core_id);
6534
0
    poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
6535
0
    dfc_cache_init(&pmd->flow_cache);
6536
0
    pmd_alloc_static_tx_qid(pmd);
6537
0
    set_timer_resolution(PMD_TIMER_RES_NS);
6538
6539
0
reload:
6540
0
    atomic_count_init(&pmd->pmd_overloaded, 0);
6541
6542
0
    pmd->intrvl_tsc_prev = 0;
6543
0
    atomic_store_relaxed(&pmd->intrvl_cycles, 0);
6544
6545
0
    if (!dpdk_attached) {
6546
0
        dpdk_attached = dpdk_attach_thread(pmd->core_id);
6547
0
    }
6548
6549
    /* List port/core affinity */
6550
0
    for (i = 0; i < poll_cnt; i++) {
6551
0
       VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
6552
0
                pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
6553
0
                netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
6554
       /* Reset the rxq current cycles counter. */
6555
0
       dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
6556
0
       for (int j = 0; j < PMD_INTERVAL_MAX; j++) {
6557
0
           dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, 0);
6558
0
       }
6559
0
    }
6560
6561
0
    if (!poll_cnt) {
6562
0
        if (wait_for_reload) {
6563
            /* Don't sleep, control thread will ask for a reload shortly. */
6564
0
            do {
6565
0
                atomic_read_explicit(&pmd->reload, &reload,
6566
0
                                     memory_order_acquire);
6567
0
            } while (!reload);
6568
0
        } else {
6569
0
            while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
6570
0
                seq_wait(pmd->reload_seq, pmd->last_reload_seq);
6571
0
                poll_block();
6572
0
            }
6573
0
        }
6574
0
    }
6575
6576
0
    for (i = 0; i < PMD_INTERVAL_MAX; i++) {
6577
0
        atomic_store_relaxed(&pmd->busy_cycles_intrvl[i], 0);
6578
0
    }
6579
0
    atomic_count_set(&pmd->intrvl_idx, 0);
6580
0
    cycles_counter_update(s);
6581
6582
0
    pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
6583
6584
    /* Protect pmd stats from external clearing while polling. */
6585
0
    ovs_mutex_lock(&pmd->perf_stats.stats_mutex);
6586
0
    for (;;) {
6587
0
        uint64_t rx_packets = 0, tx_packets = 0;
6588
0
        uint64_t time_slept = 0;
6589
0
        uint64_t max_sleep;
6590
6591
0
        pmd_perf_start_iteration(s);
6592
6593
0
        atomic_read_relaxed(&pmd->dp->smc_enable_db, &pmd->ctx.smc_enable_db);
6594
0
        atomic_read_relaxed(&pmd->max_sleep, &max_sleep);
6595
6596
0
        for (i = 0; i < poll_cnt; i++) {
6597
6598
0
            if (!poll_list[i].rxq_enabled) {
6599
0
                continue;
6600
0
            }
6601
6602
0
            if (poll_list[i].emc_enabled) {
6603
0
                atomic_read_relaxed(&pmd->dp->emc_insert_min,
6604
0
                                    &pmd->ctx.emc_insert_min);
6605
0
            } else {
6606
0
                pmd->ctx.emc_insert_min = 0;
6607
0
            }
6608
6609
0
            process_packets =
6610
0
                dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
6611
0
                                           poll_list[i].port_no);
6612
0
            rx_packets += process_packets;
6613
0
            if (process_packets >= PMD_SLEEP_THRESH) {
6614
0
                sleep_time = 0;
6615
0
            }
6616
0
        }
6617
6618
0
        if (!rx_packets) {
6619
            /* We didn't receive anything in the process loop.
6620
             * Check if we need to send something.
6621
             * There were no time updates in the current iteration. */
6622
0
            pmd_thread_ctx_time_update(pmd);
6623
0
            tx_packets = dp_netdev_pmd_flush_output_packets(pmd,
6624
0
                                                   max_sleep && sleep_time
6625
0
                                                   ? true : false);
6626
0
        }
6627
6628
0
        if (max_sleep) {
6629
            /* Check if a sleep should happen on this iteration. */
6630
0
            if (sleep_time) {
6631
0
                struct cycle_timer sleep_timer;
6632
6633
0
                cycle_timer_start(&pmd->perf_stats, &sleep_timer);
6634
0
                xnanosleep_no_quiesce(sleep_time * 1000);
6635
0
                time_slept = cycle_timer_stop(&pmd->perf_stats, &sleep_timer);
6636
0
                pmd_thread_ctx_time_update(pmd);
6637
0
            }
6638
0
            if (sleep_time < max_sleep) {
6639
                /* Increase sleep time for next iteration. */
6640
0
                sleep_time += PMD_SLEEP_INC_US;
6641
0
            } else {
6642
0
                sleep_time = max_sleep;
6643
0
            }
6644
0
        } else {
6645
            /* Reset sleep time as max sleep policy may have been changed. */
6646
0
            sleep_time = 0;
6647
0
        }
6648
6649
        /* Do RCU synchronization at a fixed interval.  This ensures that
6650
         * synchronization is not delayed for long, even under a high load of
6651
         * packet processing. */
6652
0
        if (pmd->ctx.now > pmd->next_rcu_quiesce) {
6653
0
            if (!ovsrcu_try_quiesce()) {
6654
0
                pmd->next_rcu_quiesce =
6655
0
                    pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
6656
0
            }
6657
0
        }
6658
6659
0
        if (lc++ > 1024) {
6660
0
            lc = 0;
6661
6662
0
            coverage_try_clear();
6663
0
            dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
6664
0
            if (!ovsrcu_try_quiesce()) {
6665
0
                emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache));
6666
0
                pmd->next_rcu_quiesce =
6667
0
                    pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
6668
0
            }
6669
6670
0
            for (i = 0; i < poll_cnt; i++) {
6671
0
                uint64_t current_seq =
6672
0
                         netdev_get_change_seq(poll_list[i].rxq->port->netdev);
6673
0
                if (poll_list[i].change_seq != current_seq) {
6674
0
                    poll_list[i].change_seq = current_seq;
6675
0
                    poll_list[i].rxq_enabled =
6676
0
                                 netdev_rxq_enabled(poll_list[i].rxq->rx);
6677
0
                }
6678
0
            }
6679
0
        }
6680
6681
0
        atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire);
6682
0
        if (OVS_UNLIKELY(reload)) {
6683
0
            break;
6684
0
        }
6685
6686
0
        pmd_perf_end_iteration(s, rx_packets, tx_packets, time_slept,
6687
0
                               pmd_perf_metrics_enabled(pmd));
6688
0
    }
6689
0
    ovs_mutex_unlock(&pmd->perf_stats.stats_mutex);
6690
6691
0
    poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
6692
0
    atomic_read_relaxed(&pmd->wait_for_reload, &wait_for_reload);
6693
0
    atomic_read_relaxed(&pmd->reload_tx_qid, &reload_tx_qid);
6694
0
    atomic_read_relaxed(&pmd->exit, &exiting);
6695
    /* Signal here to make sure the pmd finishes
6696
     * reloading the updated configuration. */
6697
0
    dp_netdev_pmd_reload_done(pmd);
6698
6699
0
    if (reload_tx_qid) {
6700
0
        pmd_free_static_tx_qid(pmd);
6701
0
        pmd_alloc_static_tx_qid(pmd);
6702
0
    }
6703
6704
0
    if (!exiting) {
6705
0
        goto reload;
6706
0
    }
6707
6708
0
    pmd_free_static_tx_qid(pmd);
6709
0
    dfc_cache_uninit(&pmd->flow_cache);
6710
0
    free(poll_list);
6711
0
    pmd_free_cached_ports(pmd);
6712
0
    if (dpdk_attached) {
6713
0
        dpdk_detach_thread();
6714
0
    }
6715
0
    return NULL;
6716
0
}
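
The quiet-iteration sleep logic in the loop above amounts to a simple backoff: grow the requested sleep by a fixed step while traffic stays light, cap it at the configured maximum, and drop it back to zero as soon as a queue delivers a busy burst. A rough standalone sketch follows; the step and threshold values are placeholders, not the real PMD_SLEEP_INC_US / PMD_SLEEP_THRESH definitions (illustrative only):

/* Illustrative sketch: load-based sleep backoff with placeholder constants. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define SLEEP_INC_US 10      /* Placeholder for PMD_SLEEP_INC_US. */
#define SLEEP_THRESH 16      /* Placeholder for PMD_SLEEP_THRESH. */

int
main(void)
{
    uint64_t max_sleep = 50;                    /* Configured cap, in usec. */
    uint64_t sleep_time = 0;
    int rx_per_iter[] = { 0, 0, 0, 32, 0, 0 };  /* Packets seen per iteration. */

    for (int i = 0; i < 6; i++) {
        if (rx_per_iter[i] >= SLEEP_THRESH) {
            sleep_time = 0;                     /* Busy burst: stop sleeping. */
        }
        printf("iteration %d: sleep %2"PRIu64" us\n", i, sleep_time);
        if (sleep_time < max_sleep) {
            sleep_time += SLEEP_INC_US;         /* Quiet: back off a bit more. */
        } else {
            sleep_time = max_sleep;
        }
    }
    return 0;
}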
6717
6718
static void
6719
dp_netdev_disable_upcall(struct dp_netdev *dp)
6720
    OVS_ACQUIRES(dp->upcall_rwlock)
6721
0
{
6722
0
    fat_rwlock_wrlock(&dp->upcall_rwlock);
6723
0
}
6724
6725

6726
/* Meters */
6727
static void
6728
dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
6729
                               struct ofputil_meter_features *features)
6730
0
{
6731
0
    features->max_meters = MAX_METERS;
6732
0
    features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
6733
0
    features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
6734
0
    features->max_bands = MAX_BANDS;
6735
0
    features->max_color = 0;
6736
0
}
6737
6738
/* Tries to atomically add 'n' to 'value' in terms of saturation arithmetic,
6739
 * i.e., if the result would be larger than 'max_value', it will store 'max_value'
6740
 * instead. */
6741
static void
6742
atomic_sat_add(atomic_uint64_t *value, uint64_t n, uint64_t max_value)
6743
0
{
6744
0
    uint64_t current, new_value;
6745
6746
0
    atomic_read_relaxed(value, &current);
6747
0
    do {
6748
0
        new_value = current + n;
6749
0
        new_value = MIN(new_value, max_value);
6750
0
    } while (!atomic_compare_exchange_weak_relaxed(value, &current,
6751
0
                                                   new_value));
6752
0
}
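
For readers unfamiliar with the OVS atomic wrappers, the same saturating-add pattern can be sketched with plain C11 atomics; this is an illustrative stand-in, not the wrapper implementation:

/* Illustrative sketch: saturating add via a C11 compare-exchange loop. */
#include <inttypes.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static void
sat_add(_Atomic uint64_t *value, uint64_t n, uint64_t max_value)
{
    uint64_t current = atomic_load_explicit(value, memory_order_relaxed);
    uint64_t new_value;

    do {
        new_value = current + n;
        if (new_value > max_value) {
            new_value = max_value;
        }
    } while (!atomic_compare_exchange_weak_explicit(value, &current, new_value,
                                                    memory_order_relaxed,
                                                    memory_order_relaxed));
}

int
main(void)
{
    _Atomic uint64_t bucket = 90;

    sat_add(&bucket, 25, 100);   /* 90 + 25 saturates at the cap of 100. */
    printf("bucket = %"PRIu64"\n", atomic_load(&bucket));
    return 0;
}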
6753
6754
/* Tries to atomically subtract 'n' from 'value'.  Does not perform the
6755
 * operation and returns 'false' if the result would be less than 'min_value'.
6756
 * Otherwise, stores the result and returns 'true'. */
6757
static bool
6758
atomic_bound_sub(atomic_uint64_t *value, uint64_t n, uint64_t min_value)
6759
0
{
6760
0
    uint64_t current;
6761
6762
0
    atomic_read_relaxed(value, &current);
6763
0
    do {
6764
0
        if (current < min_value + n) {
6765
0
            return false;
6766
0
        }
6767
0
    } while (!atomic_compare_exchange_weak_relaxed(value, &current,
6768
0
                                                   current - n));
6769
0
    return true;
6770
0
}
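
The bounded subtraction has the same shape; again a plain C11 sketch rather than the OVS wrappers (illustrative only):

/* Illustrative sketch: subtract 'n' unless the result would fall below 'min'. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
bound_sub(_Atomic uint64_t *value, uint64_t n, uint64_t min_value)
{
    uint64_t current = atomic_load_explicit(value, memory_order_relaxed);

    do {
        if (current < min_value + n) {
            return false;                     /* Not enough left: no change. */
        }
    } while (!atomic_compare_exchange_weak_explicit(value, &current,
                                                    current - n,
                                                    memory_order_relaxed,
                                                    memory_order_relaxed));
    return true;
}

int
main(void)
{
    _Atomic uint64_t bucket = 100;

    printf("%d\n", bound_sub(&bucket, 40, 0));   /* 1: bucket drops to 60. */
    printf("%d\n", bound_sub(&bucket, 80, 0));   /* 0: would go below zero. */
    return 0;
}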
6771
6772
/* Applies the meter identified by 'meter_id' to 'packets_'.  Packets
6773
 * that exceed a band are dropped in-place. */
6774
static void
6775
dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
6776
                    uint32_t meter_id, long long int now_ms)
6777
0
{
6778
0
    const size_t cnt = dp_packet_batch_size(packets_);
6779
0
    uint32_t exceeded_rate[NETDEV_MAX_BURST];
6780
0
    uint32_t exceeded_band[NETDEV_MAX_BURST];
6781
0
    uint64_t bytes, volume, meter_used, old;
6782
0
    uint64_t band_packets[MAX_BANDS];
6783
0
    uint64_t band_bytes[MAX_BANDS];
6784
0
    struct dp_meter_band *band;
6785
0
    struct dp_packet *packet;
6786
0
    struct dp_meter *meter;
6787
0
    bool exceeded = false;
6788
6789
0
    if (meter_id >= MAX_METERS) {
6790
0
        return;
6791
0
    }
6792
6793
0
    meter = dp_meter_lookup(&dp->meters, meter_id);
6794
0
    if (!meter) {
6795
0
        return;
6796
0
    }
6797
6798
    /* Initialize to UINT32_MAX, i.e. no exceeded band yet. */
6799
0
    memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
6800
    /* Initialize as zeroes. */
6801
0
    memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
6802
6803
0
    atomic_read_relaxed(&meter->used, &meter_used);
6804
0
    do {
6805
0
        if (meter_used >= now_ms) {
6806
            /* The '>' condition means that we have several threads hitting the
6807
             * same meter, and the other one already advanced the time. */
6808
0
            meter_used = now_ms;
6809
0
            break;
6810
0
        }
6811
0
    } while (!atomic_compare_exchange_weak_relaxed(&meter->used,
6812
0
                                                   &meter_used, now_ms));
6813
6814
    /* Refill all buckets right away, since other threads may use them. */
6815
0
    if (meter_used < now_ms) {
6816
        /* All packets will hit the meter at the same time. */
6817
0
        uint64_t delta_t = now_ms - meter_used;
6818
6819
        /* Make sure delta_t will not be too large, so that bucket will not
6820
         * wrap around below. */
6821
0
        delta_t = MIN(delta_t, meter->max_delta_t);
6822
6823
0
        for (int m = 0; m < meter->n_bands; m++) {
6824
0
            band = &meter->bands[m];
6825
            /* Update band's bucket.  We can't just use atomic add here,
6826
             * because we should never add above the max capacity. */
6827
0
            atomic_sat_add(&band->bucket, delta_t * band->rate,
6828
0
                           band->burst_size * 1000ULL);
6829
0
        }
6830
0
    }
6831
6832
    /* Update meter stats. */
6833
0
    atomic_add_relaxed(&meter->packet_count, cnt, &old);
6834
0
    bytes = 0;
6835
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6836
0
        bytes += dp_packet_size(packet);
6837
0
    }
6838
0
    atomic_add_relaxed(&meter->byte_count, bytes, &old);
6839
6840
    /* Meters can operate in terms of packets per second or kilobits per
6841
     * second. */
6842
0
    if (meter->flags & OFPMF13_PKTPS) {
6843
        /* Rate in packets/second, bucket 1/1000 packets.
6844
         * msec * packets/sec = 1/1000 packets. */
6845
0
        volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
6846
0
    } else {
6847
        /* Rate in kbps, bucket in bits.
6848
         * msec * kbps = bits */
6849
0
        volume = bytes * 8;
6850
0
    }
6851
6852
    /* Find the band hit with the highest rate for each packet (if any). */
6853
0
    for (int m = 0; m < meter->n_bands; m++) {
6854
0
        band = &meter->bands[m];
6855
6856
        /* Drain the bucket for all the packets, if possible. */
6857
0
        if (atomic_bound_sub(&band->bucket, volume, 0)) {
6858
0
            continue;
6859
0
        }
6860
6861
        /* Band limit hit, must process packet-by-packet. */
6862
0
        DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
6863
0
            uint64_t packet_volume = (meter->flags & OFPMF13_PKTPS)
6864
0
                                     ? 1000 : (dp_packet_size(packet) * 8);
6865
6866
0
            if (!atomic_bound_sub(&band->bucket, packet_volume, 0)) {
6867
                /* Update the exceeding band for the exceeding packet.
6868
                 * Only one band will be fired by a packet, and that can
6869
                 * be different for each packet. */
6870
0
                if (band->rate > exceeded_rate[i]) {
6871
0
                    exceeded_rate[i] = band->rate;
6872
0
                    exceeded_band[i] = m;
6873
0
                    exceeded = true;
6874
0
                }
6875
0
            }
6876
0
        }
6877
0
    }
6878
6879
    /* No need to iterate over packets if there are no drops. */
6880
0
    if (!exceeded) {
6881
0
        return;
6882
0
    }
6883
6884
    /* Fire the highest rate band exceeded by each packet, and drop
6885
     * packets if needed. */
6886
6887
0
    memset(band_packets, 0, sizeof band_packets);
6888
0
    memset(band_bytes,   0, sizeof band_bytes);
6889
6890
0
    size_t j;
6891
0
    DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
6892
0
        uint32_t m = exceeded_band[j];
6893
6894
0
        if (m != UINT32_MAX) {
6895
            /* Meter drop packet. */
6896
0
            band_packets[m]++;
6897
0
            band_bytes[m] += dp_packet_size(packet);
6898
0
            dp_packet_delete(packet);
6899
0
        } else {
6900
            /* Meter accepts packet. */
6901
0
            dp_packet_batch_refill(packets_, packet, j);
6902
0
        }
6903
0
    }
6904
6905
0
    for (int m = 0; m < meter->n_bands; m++) {
6906
0
        if (!band_packets[m]) {
6907
0
            continue;
6908
0
        }
6909
0
        band = &meter->bands[m];
6910
0
        atomic_add_relaxed(&band->packet_count, band_packets[m], &old);
6911
0
        atomic_add_relaxed(&band->byte_count,   band_bytes[m],   &old);
6912
0
        COVERAGE_ADD(datapath_drop_meter, band_packets[m]);
6913
0
    }
6914
0
}
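
To make the unit bookkeeping above concrete, a small standalone calculation with made-up band parameters (illustrative only): a kbps band keeps its bucket in bits, so 'delta_t' milliseconds of idle time refill 'delta_t * rate' bits and one packet drains 'bytes * 8' bits, while a packets-per-second band keeps its bucket in 1/1000 packets and each packet drains 1000 units.

/* Illustrative sketch: refill/drain arithmetic for the two meter band kinds. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t delta_t_ms = 5;                              /* Idle time. */

    /* kbps band: rate in kbit/s, bucket in bits. */
    uint64_t rate_kbps = 1000;
    uint64_t refill_bits = delta_t_ms * rate_kbps;        /* 5000 bits. */
    uint64_t drain_bits = 1500 * 8;                       /* One 1500-byte packet. */

    /* pktps band: rate in packets/s, bucket in 1/1000 packets. */
    uint64_t rate_pktps = 100;
    uint64_t refill_units = delta_t_ms * rate_pktps;      /* 500 units. */
    uint64_t drain_units = 1 * 1000;                      /* One packet. */

    printf("kbps band:  +%"PRIu64" bits,  -%"PRIu64" bits per packet\n",
           refill_bits, drain_bits);
    printf("pktps band: +%"PRIu64" units, -%"PRIu64" units per packet\n",
           refill_units, drain_units);
    return 0;
}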
6915
6916
/* Meter set/get/del processing is still single-threaded. */
6917
static int
6918
dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
6919
                      struct ofputil_meter_config *config)
6920
0
{
6921
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
6922
0
    uint32_t mid = meter_id.uint32;
6923
0
    struct dp_meter *meter;
6924
0
    int i;
6925
6926
0
    if (mid >= MAX_METERS) {
6927
0
        return EFBIG; /* Meter_id out of range. */
6928
0
    }
6929
6930
0
    if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) {
6931
0
        return EBADF; /* Unsupported flags set */
6932
0
    }
6933
6934
0
    if (config->n_bands > MAX_BANDS) {
6935
0
        return EINVAL;
6936
0
    }
6937
6938
0
    for (i = 0; i < config->n_bands; ++i) {
6939
0
        switch (config->bands[i].type) {
6940
0
        case OFPMBT13_DROP:
6941
0
            break;
6942
0
        default:
6943
0
            return ENODEV; /* Unsupported band type */
6944
0
        }
6945
0
    }
6946
6947
    /* Allocate meter */
6948
0
    meter = xzalloc(sizeof *meter
6949
0
                    + config->n_bands * sizeof(struct dp_meter_band));
6950
6951
0
    meter->flags = config->flags;
6952
0
    meter->n_bands = config->n_bands;
6953
0
    meter->max_delta_t = 0;
6954
0
    meter->id = mid;
6955
0
    atomic_init(&meter->used, time_msec());
6956
6957
    /* set up bands */
6958
0
    for (i = 0; i < config->n_bands; ++i) {
6959
0
        uint32_t band_max_delta_t;
6960
0
        uint64_t bucket_size;
6961
6962
        /* Set burst size to a workable value if none specified. */
6963
0
        if (config->bands[i].burst_size == 0) {
6964
0
            config->bands[i].burst_size = config->bands[i].rate;
6965
0
        }
6966
6967
0
        meter->bands[i].rate = config->bands[i].rate;
6968
0
        meter->bands[i].burst_size = config->bands[i].burst_size;
6969
        /* Start with a full bucket. */
6970
0
        bucket_size = meter->bands[i].burst_size * 1000ULL;
6971
0
        atomic_init(&meter->bands[i].bucket, bucket_size);
6972
6973
        /* Figure out max delta_t that is enough to fill any bucket. */
6974
0
        band_max_delta_t = bucket_size / meter->bands[i].rate;
6975
0
        if (band_max_delta_t > meter->max_delta_t) {
6976
0
            meter->max_delta_t = band_max_delta_t;
6977
0
        }
6978
0
    }
6979
6980
0
    ovs_mutex_lock(&dp->meters_lock);
6981
6982
0
    dp_meter_detach_free(&dp->meters, mid); /* Free existing meter, if any. */
6983
0
    dp_meter_attach(&dp->meters, meter);
6984
6985
0
    ovs_mutex_unlock(&dp->meters_lock);
6986
6987
0
    return 0;
6988
0
}
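
Following the bucket setup above with made-up numbers (illustrative only): a drop band with rate 1000 kbps and burst_size 2000 kbits starts with a 2,000,000-bit bucket, and refilling it from empty takes 2,000,000 / 1000 = 2000 ms, which becomes that band's contribution to 'max_delta_t'.

/* Illustrative sketch: bucket sizing for one hypothetical drop band. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t rate = 1000;                              /* kbit/s. */
    uint64_t burst_size = 2000;                        /* kbits. */
    uint64_t bucket_size = burst_size * 1000;          /* Full bucket, in bits. */
    uint64_t band_max_delta_t = bucket_size / rate;    /* 2000 ms to refill. */

    printf("bucket %"PRIu64" bits, max_delta_t %"PRIu64" ms\n",
           bucket_size, band_max_delta_t);
    return 0;
}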
6989
6990
static int
6991
dpif_netdev_meter_get(const struct dpif *dpif,
6992
                      ofproto_meter_id meter_id_,
6993
                      struct ofputil_meter_stats *stats, uint16_t n_bands)
6994
0
{
6995
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
6996
0
    uint32_t meter_id = meter_id_.uint32;
6997
0
    struct dp_meter *meter;
6998
6999
0
    if (meter_id >= MAX_METERS) {
7000
0
        return EFBIG;
7001
0
    }
7002
7003
0
    meter = dp_meter_lookup(&dp->meters, meter_id);
7004
0
    if (!meter) {
7005
0
        return ENOENT;
7006
0
    }
7007
7008
0
    if (stats) {
7009
0
        int i = 0;
7010
7011
0
        atomic_read_relaxed(&meter->packet_count, &stats->packet_in_count);
7012
0
        atomic_read_relaxed(&meter->byte_count, &stats->byte_in_count);
7013
7014
0
        for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
7015
0
            atomic_read_relaxed(&meter->bands[i].packet_count,
7016
0
                                &stats->bands[i].packet_count);
7017
0
            atomic_read_relaxed(&meter->bands[i].byte_count,
7018
0
                                &stats->bands[i].byte_count);
7019
0
        }
7020
0
        stats->n_bands = i;
7021
0
    }
7022
7023
0
    return 0;
7024
0
}
7025
7026
static int
7027
dpif_netdev_meter_del(struct dpif *dpif,
7028
                      ofproto_meter_id meter_id_,
7029
                      struct ofputil_meter_stats *stats, uint16_t n_bands)
7030
0
{
7031
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
7032
0
    int error;
7033
7034
0
    error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
7035
0
    if (!error) {
7036
0
        uint32_t meter_id = meter_id_.uint32;
7037
7038
0
        ovs_mutex_lock(&dp->meters_lock);
7039
0
        dp_meter_detach_free(&dp->meters, meter_id);
7040
0
        ovs_mutex_unlock(&dp->meters_lock);
7041
0
    }
7042
0
    return error;
7043
0
}
7044
7045

7046
static void
7047
dpif_netdev_disable_upcall(struct dpif *dpif)
7048
    OVS_NO_THREAD_SAFETY_ANALYSIS
7049
0
{
7050
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
7051
0
    dp_netdev_disable_upcall(dp);
7052
0
}
7053
7054
static void
7055
dp_netdev_enable_upcall(struct dp_netdev *dp)
7056
    OVS_RELEASES(dp->upcall_rwlock)
7057
0
{
7058
0
    fat_rwlock_unlock(&dp->upcall_rwlock);
7059
0
}
7060
7061
static void
7062
dpif_netdev_enable_upcall(struct dpif *dpif)
7063
    OVS_NO_THREAD_SAFETY_ANALYSIS
7064
0
{
7065
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
7066
0
    dp_netdev_enable_upcall(dp);
7067
0
}
7068
7069
static void
7070
dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
7071
0
{
7072
0
    atomic_store_relaxed(&pmd->wait_for_reload, false);
7073
0
    atomic_store_relaxed(&pmd->reload_tx_qid, false);
7074
0
    pmd->last_reload_seq = seq_read(pmd->reload_seq);
7075
0
    atomic_store_explicit(&pmd->reload, false, memory_order_release);
7076
0
}
7077
7078
/* Finds and refs the dp_netdev_pmd_thread on core 'core_id'.  Returns
7079
 * the pointer if it succeeds, otherwise NULL (it can return NULL even if
7080
 * 'core_id' is NON_PMD_CORE_ID).
7081
 *
7082
 * The caller must unref the returned reference.  */
7083
static struct dp_netdev_pmd_thread *
7084
dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
7085
0
{
7086
0
    struct dp_netdev_pmd_thread *pmd;
7087
7088
0
    CMAP_FOR_EACH_WITH_HASH (pmd, node, hash_int(core_id, 0),
7089
0
                             &dp->poll_threads) {
7090
0
        if (pmd->core_id == core_id) {
7091
0
            return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
7092
0
        }
7093
0
    }
7094
7095
0
    return NULL;
7096
0
}
7097
7098
/* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
7099
static void
7100
dp_netdev_set_nonpmd(struct dp_netdev *dp)
7101
    OVS_REQ_WRLOCK(dp->port_rwlock)
7102
0
{
7103
0
    struct dp_netdev_pmd_thread *non_pmd;
7104
7105
0
    non_pmd = xzalloc(sizeof *non_pmd);
7106
0
    dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
7107
0
}
7108
7109
/* Caller must have valid pointer to 'pmd'. */
7110
static bool
7111
dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
7112
0
{
7113
0
    return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
7114
0
}
7115
7116
static void
7117
dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
7118
0
{
7119
0
    if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
7120
0
        ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
7121
0
    }
7122
0
}
7123
7124
/* Given cmap position 'pos', tries to ref the next node.  If try_ref()
7125
 * fails, keeps checking for next node until reaching the end of cmap.
7126
 *
7127
 * The caller must unref the returned reference. */
7128
static struct dp_netdev_pmd_thread *
7129
dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
7130
0
{
7131
0
    struct dp_netdev_pmd_thread *next;
7132
7133
0
    do {
7134
0
        struct cmap_node *node;
7135
7136
0
        node = cmap_next_position(&dp->poll_threads, pos);
7137
0
        next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
7138
0
            : NULL;
7139
0
    } while (next && !dp_netdev_pmd_try_ref(next));
7140
7141
0
    return next;
7142
0
}
7143
7144
/* Configures the 'pmd' based on the input argument. */
7145
static void
7146
dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
7147
                        unsigned core_id, int numa_id)
7148
    OVS_NO_THREAD_SAFETY_ANALYSIS
7149
0
{
7150
0
    pmd->dp = dp;
7151
0
    pmd->core_id = core_id;
7152
0
    pmd->numa_id = numa_id;
7153
0
    pmd->need_reload = false;
7154
0
    pmd->n_output_batches = 0;
7155
7156
0
    ovs_refcount_init(&pmd->ref_cnt);
7157
0
    atomic_init(&pmd->exit, false);
7158
0
    pmd->reload_seq = seq_create();
7159
0
    pmd->last_reload_seq = seq_read(pmd->reload_seq);
7160
0
    atomic_init(&pmd->reload, false);
7161
0
    ovs_mutex_init(&pmd->flow_mutex);
7162
0
    ovs_mutex_init(&pmd->port_mutex);
7163
0
    ovs_mutex_init(&pmd->bond_mutex);
7164
0
    cmap_init(&pmd->flow_table);
7165
0
    cmap_init(&pmd->classifiers);
7166
0
    cmap_init(&pmd->simple_match_table);
7167
0
    ccmap_init(&pmd->n_flows);
7168
0
    ccmap_init(&pmd->n_simple_flows);
7169
0
    pmd->ctx.last_rxq = NULL;
7170
0
    pmd_thread_ctx_time_update(pmd);
7171
0
    pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
7172
0
    pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
7173
0
    pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN;
7174
0
    pmd->busy_cycles_intrvl = xzalloc(PMD_INTERVAL_MAX *
7175
0
                                      sizeof *pmd->busy_cycles_intrvl);
7176
0
    hmap_init(&pmd->poll_list);
7177
0
    hmap_init(&pmd->tx_ports);
7178
0
    hmap_init(&pmd->tnl_port_cache);
7179
0
    hmap_init(&pmd->send_port_cache);
7180
0
    cmap_init(&pmd->tx_bonds);
7181
7182
0
    pmd_init_max_sleep(dp, pmd);
7183
7184
    /* Initialize DPIF function pointer to the default configured version. */
7185
0
    atomic_init(&pmd->netdev_input_func, dp_netdev_impl_get_default());
7186
7187
    /* Init default miniflow_extract function */
7188
0
    atomic_init(&pmd->miniflow_extract_opt, dp_mfex_impl_get_default());
7189
7190
    /* init the 'flow_cache' since there is no
7191
     * actual thread created for NON_PMD_CORE_ID. */
7192
0
    if (core_id == NON_PMD_CORE_ID) {
7193
0
        dfc_cache_init(&pmd->flow_cache);
7194
0
        pmd_alloc_static_tx_qid(pmd);
7195
0
    }
7196
0
    pmd_perf_stats_init(&pmd->perf_stats);
7197
0
    cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
7198
0
                hash_int(core_id, 0));
7199
0
}
7200
7201
static void
7202
dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
7203
    OVS_NO_THREAD_SAFETY_ANALYSIS
7204
0
{
7205
0
    struct dpcls *cls;
7206
7207
0
    dp_netdev_pmd_flow_flush(pmd);
7208
0
    hmap_destroy(&pmd->send_port_cache);
7209
0
    hmap_destroy(&pmd->tnl_port_cache);
7210
0
    hmap_destroy(&pmd->tx_ports);
7211
0
    cmap_destroy(&pmd->tx_bonds);
7212
0
    hmap_destroy(&pmd->poll_list);
7213
0
    free(pmd->busy_cycles_intrvl);
7214
    /* All flows (including their dpcls_rules) have been deleted already */
7215
0
    CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
7216
0
        dpcls_destroy(cls);
7217
0
        ovsrcu_postpone(free, cls);
7218
0
    }
7219
0
    cmap_destroy(&pmd->classifiers);
7220
0
    cmap_destroy(&pmd->flow_table);
7221
0
    cmap_destroy(&pmd->simple_match_table);
7222
0
    ccmap_destroy(&pmd->n_flows);
7223
0
    ccmap_destroy(&pmd->n_simple_flows);
7224
0
    ovs_mutex_destroy(&pmd->flow_mutex);
7225
0
    seq_destroy(pmd->reload_seq);
7226
0
    ovs_mutex_destroy(&pmd->port_mutex);
7227
0
    ovs_mutex_destroy(&pmd->bond_mutex);
7228
0
    free(pmd->netdev_input_func_userdata);
7229
0
    free(pmd);
7230
0
}
7231
7232
/* Stops the pmd thread, removes it from the 'dp->poll_threads',
7233
 * and unrefs the struct. */
7234
static void
7235
dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
7236
0
{
7237
    /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
7238
     * but extra cleanup is necessary */
7239
0
    if (pmd->core_id == NON_PMD_CORE_ID) {
7240
0
        ovs_mutex_lock(&dp->non_pmd_mutex);
7241
0
        dfc_cache_uninit(&pmd->flow_cache);
7242
0
        pmd_free_cached_ports(pmd);
7243
0
        pmd_free_static_tx_qid(pmd);
7244
0
        ovs_mutex_unlock(&dp->non_pmd_mutex);
7245
0
    } else {
7246
0
        atomic_store_relaxed(&pmd->exit, true);
7247
0
        dp_netdev_reload_pmd__(pmd);
7248
0
        xpthread_join(pmd->thread, NULL);
7249
0
    }
7250
7251
0
    dp_netdev_pmd_clear_ports(pmd);
7252
7253
    /* Purges the 'pmd''s flows after stopping the thread, but before
7254
     * destroying the flows, so that the flow stats can be collected. */
7255
0
    if (dp->dp_purge_cb) {
7256
0
        dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
7257
0
    }
7258
0
    cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
7259
0
    dp_netdev_pmd_unref(pmd);
7260
0
}
7261
7262
/* Destroys all pmd threads. If 'non_pmd' is true it also destroys the non pmd
7263
 * thread. */
7264
static void
7265
dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
7266
0
{
7267
0
    struct dp_netdev_pmd_thread *pmd;
7268
0
    struct dp_netdev_pmd_thread **pmd_list;
7269
0
    size_t k = 0, n_pmds;
7270
7271
0
    n_pmds = cmap_count(&dp->poll_threads);
7272
0
    pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
7273
7274
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
7275
0
        if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
7276
0
            continue;
7277
0
        }
7278
        /* We cannot call dp_netdev_del_pmd(), since it alters
7279
         * 'dp->poll_threads' (while we're iterating it) and it
7280
         * might quiesce. */
7281
0
        ovs_assert(k < n_pmds);
7282
0
        pmd_list[k++] = pmd;
7283
0
    }
7284
7285
0
    for (size_t i = 0; i < k; i++) {
7286
0
        dp_netdev_del_pmd(dp, pmd_list[i]);
7287
0
    }
7288
0
    free(pmd_list);
7289
0
}
7290
7291
/* Deletes all rx queues from pmd->poll_list and all the ports from
7292
 * pmd->tx_ports. */
7293
static void
7294
dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
7295
0
{
7296
0
    struct rxq_poll *poll;
7297
0
    struct tx_port *port;
7298
0
    struct tx_bond *tx;
7299
7300
0
    ovs_mutex_lock(&pmd->port_mutex);
7301
0
    HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
7302
0
        free(poll);
7303
0
    }
7304
0
    HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
7305
0
        free(port->txq_pkts);
7306
0
        free(port);
7307
0
    }
7308
0
    ovs_mutex_unlock(&pmd->port_mutex);
7309
7310
0
    ovs_mutex_lock(&pmd->bond_mutex);
7311
0
    CMAP_FOR_EACH (tx, node, &pmd->tx_bonds) {
7312
0
        cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
7313
0
        ovsrcu_postpone(free, tx);
7314
0
    }
7315
0
    ovs_mutex_unlock(&pmd->bond_mutex);
7316
0
}
7317
7318
/* Adds rx queue to poll_list of PMD thread, if it's not there already. */
7319
static void
7320
dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
7321
                         struct dp_netdev_rxq *rxq)
7322
    OVS_REQUIRES(pmd->port_mutex)
7323
0
{
7324
0
    int qid = netdev_rxq_get_queue_id(rxq->rx);
7325
0
    uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
7326
0
    struct rxq_poll *poll;
7327
7328
0
    HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
7329
0
        if (poll->rxq == rxq) {
7330
            /* 'rxq' is already polled by this thread. Do nothing. */
7331
0
            return;
7332
0
        }
7333
0
    }
7334
7335
0
    poll = xmalloc(sizeof *poll);
7336
0
    poll->rxq = rxq;
7337
0
    hmap_insert(&pmd->poll_list, &poll->node, hash);
7338
7339
0
    pmd->need_reload = true;
7340
0
}
7341
7342
/* Delete 'poll' from poll_list of PMD thread. */
7343
static void
7344
dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
7345
                           struct rxq_poll *poll)
7346
    OVS_REQUIRES(pmd->port_mutex)
7347
0
{
7348
0
    hmap_remove(&pmd->poll_list, &poll->node);
7349
0
    free(poll);
7350
7351
0
    pmd->need_reload = true;
7352
0
}
7353
7354
/* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
7355
 * changes to take effect. */
7356
static void
7357
dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
7358
                             struct dp_netdev_port *port)
7359
    OVS_REQUIRES(pmd->port_mutex)
7360
0
{
7361
0
    struct tx_port *tx;
7362
7363
0
    tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
7364
0
    if (tx) {
7365
        /* 'port' is already in this thread's tx cache. Do nothing. */
7366
0
        return;
7367
0
    }
7368
7369
0
    tx = xzalloc(sizeof *tx);
7370
7371
0
    tx->port = port;
7372
0
    tx->qid = -1;
7373
0
    tx->flush_time = 0LL;
7374
0
    dp_packet_batch_init(&tx->output_pkts);
7375
7376
0
    if (tx->port->txq_mode == TXQ_MODE_XPS_HASH) {
7377
0
        int i, n_txq = netdev_n_txq(tx->port->netdev);
7378
7379
0
        tx->txq_pkts = xzalloc(n_txq * sizeof *tx->txq_pkts);
7380
0
        for (i = 0; i < n_txq; i++) {
7381
0
            dp_packet_batch_init(&tx->txq_pkts[i]);
7382
0
        }
7383
0
    }
7384
7385
0
    hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
7386
0
    pmd->need_reload = true;
7387
0
}
7388
7389
/* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
7390
 * changes to take effect. */
7391
static void
7392
dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
7393
                               struct tx_port *tx)
7394
    OVS_REQUIRES(pmd->port_mutex)
7395
0
{
7396
0
    hmap_remove(&pmd->tx_ports, &tx->node);
7397
0
    free(tx->txq_pkts);
7398
0
    free(tx);
7399
0
    pmd->need_reload = true;
7400
0
}
7401
7402
/* Add bond to the tx bond cmap of 'pmd'. */
7403
static void
7404
dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
7405
                             struct tx_bond *bond, bool update)
7406
    OVS_EXCLUDED(pmd->bond_mutex)
7407
0
{
7408
0
    struct tx_bond *tx;
7409
7410
0
    ovs_mutex_lock(&pmd->bond_mutex);
7411
0
    tx = tx_bond_lookup(&pmd->tx_bonds, bond->bond_id);
7412
7413
0
    if (tx && !update) {
7414
        /* It's not an update and the entry already exists.  Do nothing. */
7415
0
        goto unlock;
7416
0
    }
7417
7418
0
    if (tx) {
7419
0
        struct tx_bond *new_tx = xmemdup(bond, sizeof *bond);
7420
7421
        /* Copy the stats for each bucket. */
7422
0
        for (int i = 0; i < BOND_BUCKETS; i++) {
7423
0
            uint64_t n_packets, n_bytes;
7424
7425
0
            atomic_read_relaxed(&tx->member_buckets[i].n_packets, &n_packets);
7426
0
            atomic_read_relaxed(&tx->member_buckets[i].n_bytes, &n_bytes);
7427
0
            atomic_init(&new_tx->member_buckets[i].n_packets, n_packets);
7428
0
            atomic_init(&new_tx->member_buckets[i].n_bytes, n_bytes);
7429
0
        }
7430
0
        cmap_replace(&pmd->tx_bonds, &tx->node, &new_tx->node,
7431
0
                     hash_bond_id(bond->bond_id));
7432
0
        ovsrcu_postpone(free, tx);
7433
0
    } else {
7434
0
        tx = xmemdup(bond, sizeof *bond);
7435
0
        cmap_insert(&pmd->tx_bonds, &tx->node, hash_bond_id(bond->bond_id));
7436
0
    }
7437
0
unlock:
7438
0
    ovs_mutex_unlock(&pmd->bond_mutex);
7439
0
}
7440
7441
/* Delete bond from the tx bond cmap of 'pmd'. */
7442
static void
7443
dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
7444
                               uint32_t bond_id)
7445
    OVS_EXCLUDED(pmd->bond_mutex)
7446
0
{
7447
0
    struct tx_bond *tx;
7448
7449
0
    ovs_mutex_lock(&pmd->bond_mutex);
7450
0
    tx = tx_bond_lookup(&pmd->tx_bonds, bond_id);
7451
0
    if (tx) {
7452
0
        cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
7453
0
        ovsrcu_postpone(free, tx);
7454
0
    }
7455
0
    ovs_mutex_unlock(&pmd->bond_mutex);
7456
0
}
7457

7458
static char *
7459
dpif_netdev_get_datapath_version(void)
7460
0
{
7461
0
     return xstrdup("<built-in>");
7462
0
}
7463
7464
static void
7465
dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
7466
                    uint16_t tcp_flags, long long now)
7467
0
{
7468
0
    uint16_t flags;
7469
7470
0
    atomic_store_relaxed(&netdev_flow->stats.used, now);
7471
0
    non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
7472
0
    non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
7473
0
    atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
7474
0
    flags |= tcp_flags;
7475
0
    atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
7476
0
}
7477
7478
static int
7479
dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
7480
                 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
7481
                 enum dpif_upcall_type type, const struct nlattr *userdata,
7482
                 struct ofpbuf *actions, struct ofpbuf *put_actions)
7483
0
{
7484
0
    struct dp_netdev *dp = pmd->dp;
7485
7486
0
    if (OVS_UNLIKELY(!dp->upcall_cb)) {
7487
0
        return ENODEV;
7488
0
    }
7489
7490
0
    if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
7491
0
        struct ds ds = DS_EMPTY_INITIALIZER;
7492
0
        char *packet_str;
7493
0
        struct ofpbuf key;
7494
0
        struct odp_flow_key_parms odp_parms = {
7495
0
            .flow = flow,
7496
0
            .mask = wc ? &wc->masks : NULL,
7497
0
            .support = dp_netdev_support,
7498
0
        };
7499
7500
0
        ofpbuf_init(&key, 0);
7501
0
        odp_flow_key_from_flow(&odp_parms, &key);
7502
0
        packet_str = ofp_dp_packet_to_string(packet_);
7503
7504
0
        odp_flow_key_format(key.data, key.size, &ds);
7505
7506
0
        VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
7507
0
                 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);
7508
7509
0
        ofpbuf_uninit(&key);
7510
0
        free(packet_str);
7511
7512
0
        ds_destroy(&ds);
7513
0
    }
7514
7515
0
    if (type != DPIF_UC_MISS) {
7516
0
        dp_packet_ol_send_prepare(packet_, 0);
7517
0
    }
7518
7519
0
    return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
7520
0
                         actions, wc, put_actions, dp->upcall_aux);
7521
0
}
7522
7523
static inline uint32_t
7524
dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
7525
                                const struct miniflow *mf)
7526
0
{
7527
0
    uint32_t hash, recirc_depth;
7528
7529
0
    if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
7530
0
        hash = dp_packet_get_rss_hash(packet);
7531
0
    } else {
7532
0
        hash = miniflow_hash_5tuple(mf, 0);
7533
0
        dp_packet_set_rss_hash(packet, hash);
7534
0
    }
7535
7536
    /* The RSS hash must account for the recirculation depth to avoid
7537
     * collisions in the exact match cache. */
7538
0
    recirc_depth = *recirc_depth_get_unsafe();
7539
0
    if (OVS_UNLIKELY(recirc_depth)) {
7540
0
        hash = hash_finish(hash, recirc_depth);
7541
0
    }
7542
0
    return hash;
7543
0
}
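The comment in the function above notes that the RSS hash has to be combined with the recirculation depth so that the same packet, seen again after recirculation, does not collide with its earlier entry in the exact match cache. The following is a minimal standalone sketch of that idea; it is not part of dpif-netdev.c, and mix32() is only a stand-in for OVS's hash_finish().

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for hash_finish(): fold a second word into an existing hash. */
static uint32_t
mix32(uint32_t hash, uint32_t basis)
{
    hash ^= basis;
    hash *= UINT32_C(0x85ebca6b);
    return hash ^ (hash >> 16);
}

int
main(void)
{
    uint32_t rss = 0x1234abcd;   /* Hash taken from the NIC or the 5-tuple. */
    uint32_t depth;

    /* Depth 0 keeps the raw hash; deeper recirculations get distinct ones. */
    for (depth = 0; depth <= 2; depth++) {
        uint32_t emc_hash = depth ? mix32(rss, depth) : rss;
        printf("recirc depth %"PRIu32" -> EMC hash %08"PRIx32"\n",
               depth, emc_hash);
    }
    return 0;
}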
7544
7545
struct packet_batch_per_flow {
7546
    unsigned int byte_count;
7547
    uint16_t tcp_flags;
7548
    struct dp_netdev_flow *flow;
7549
7550
    struct dp_packet_batch array;
7551
};
7552
7553
static inline void
7554
packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
7555
                             struct dp_packet *packet,
7556
                             uint16_t tcp_flags)
7557
0
{
7558
0
    batch->byte_count += dp_packet_size(packet);
7559
0
    batch->tcp_flags |= tcp_flags;
7560
0
    dp_packet_batch_add(&batch->array, packet);
7561
0
}
7562
7563
static inline void
7564
packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
7565
                           struct dp_netdev_flow *flow)
7566
0
{
7567
0
    flow->batch = batch;
7568
7569
0
    batch->flow = flow;
7570
0
    dp_packet_batch_init(&batch->array);
7571
0
    batch->byte_count = 0;
7572
0
    batch->tcp_flags = 0;
7573
0
}
7574
7575
static inline void
7576
packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
7577
                              struct dp_netdev_pmd_thread *pmd)
7578
0
{
7579
0
    struct dp_netdev_actions *actions;
7580
0
    struct dp_netdev_flow *flow = batch->flow;
7581
7582
0
    dp_netdev_flow_used(flow, dp_packet_batch_size(&batch->array),
7583
0
                        batch->byte_count,
7584
0
                        batch->tcp_flags, pmd->ctx.now / 1000);
7585
7586
0
    actions = dp_netdev_flow_get_actions(flow);
7587
7588
0
    dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
7589
0
                              actions->actions, actions->size);
7590
0
}
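As a side note on the per-flow batching used above: packets of one rx burst that classify to the same flow are appended to a single batch, so the flow's statistics are updated once and its actions run once over the whole group. The toy sketch below uses integers instead of packets and a linear scan instead of the flow->batch pointer kept by the real code; none of these names exist in OVS.

#include <stddef.h>
#include <stdio.h>

#define MAX_BURST 32

struct toy_flow_batch {
    int flow_id;                 /* Flow all packets below belong to. */
    int pkts[MAX_BURST];
    size_t n;
    unsigned int byte_count;
};

/* Find (or create) the batch for a flow within the current burst. */
static struct toy_flow_batch *
batch_for_flow(struct toy_flow_batch *batches, size_t *n_batches, int flow_id)
{
    for (size_t i = 0; i < *n_batches; i++) {
        if (batches[i].flow_id == flow_id) {
            return &batches[i];
        }
    }
    struct toy_flow_batch *b = &batches[(*n_batches)++];
    b->flow_id = flow_id;
    b->n = 0;
    b->byte_count = 0;
    return b;
}

int
main(void)
{
    int pkt_flow[] = { 7, 7, 3, 7, 3 };          /* Classification results. */
    int pkt_len[]  = { 60, 60, 1500, 60, 1500 };
    struct toy_flow_batch batches[MAX_BURST];
    size_t n_batches = 0;

    for (size_t i = 0; i < 5; i++) {
        struct toy_flow_batch *b =
            batch_for_flow(batches, &n_batches, pkt_flow[i]);
        b->pkts[b->n++] = (int) i;
        b->byte_count += pkt_len[i];
    }
    for (size_t i = 0; i < n_batches; i++) {
        /* One stats update and one "execute actions" per flow per burst. */
        printf("flow %d: %zu packets, %u bytes\n", batches[i].flow_id,
               batches[i].n, batches[i].byte_count);
    }
    return 0;
}

The real datapath avoids the scan by caching a pointer to the batch inside the flow itself (flow->batch), which is why those pointers have to be reset between bursts, as a later comment in dp_netdev_input__() explains.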
7591
7592
void
7593
dp_netdev_batch_execute(struct dp_netdev_pmd_thread *pmd,
7594
                        struct dp_packet_batch *packets,
7595
                        struct dpcls_rule *rule,
7596
                        uint32_t bytes,
7597
                        uint16_t tcp_flags)
7598
0
{
7599
    /* Gets action* from the rule. */
7600
0
    struct dp_netdev_flow *flow = dp_netdev_flow_cast(rule);
7601
0
    struct dp_netdev_actions *actions = dp_netdev_flow_get_actions(flow);
7602
7603
0
    dp_netdev_flow_used(flow, dp_packet_batch_size(packets), bytes,
7604
0
                        tcp_flags, pmd->ctx.now / 1000);
7605
0
    const uint32_t steal = 1;
7606
0
    dp_netdev_execute_actions(pmd, packets, steal, &flow->flow,
7607
0
                              actions->actions, actions->size);
7608
0
}
7609
7610
static inline void
7611
dp_netdev_queue_batches(struct dp_packet *pkt,
7612
                        struct dp_netdev_flow *flow, uint16_t tcp_flags,
7613
                        struct packet_batch_per_flow *batches,
7614
                        size_t *n_batches)
7615
0
{
7616
0
    struct packet_batch_per_flow *batch = flow->batch;
7617
7618
0
    if (OVS_UNLIKELY(!batch)) {
7619
0
        batch = &batches[(*n_batches)++];
7620
0
        packet_batch_per_flow_init(batch, flow);
7621
0
    }
7622
7623
0
    packet_batch_per_flow_update(batch, pkt, tcp_flags);
7624
0
}
7625
7626
static inline void
7627
packet_enqueue_to_flow_map(struct dp_packet *packet,
7628
                           struct dp_netdev_flow *flow,
7629
                           uint16_t tcp_flags,
7630
                           struct dp_packet_flow_map *flow_map,
7631
                           size_t index)
7632
0
{
7633
0
    struct dp_packet_flow_map *map = &flow_map[index];
7634
0
    map->flow = flow;
7635
0
    map->packet = packet;
7636
0
    map->tcp_flags = tcp_flags;
7637
0
}
7638
7639
/* SMC lookup function for a batch of packets.
7640
 * By batching SMC lookups, we can use prefetching
7641
 * to hide memory access latency.
7642
 */
7643
static inline void
7644
smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
7645
            struct netdev_flow_key *keys,
7646
            struct netdev_flow_key **missed_keys,
7647
            struct dp_packet_batch *packets_,
7648
            const int cnt,
7649
            struct dp_packet_flow_map *flow_map,
7650
            uint8_t *index_map)
7651
0
{
7652
0
    int i;
7653
0
    struct dp_packet *packet;
7654
0
    size_t n_smc_hit = 0, n_missed = 0;
7655
0
    struct dfc_cache *cache = &pmd->flow_cache;
7656
0
    struct smc_cache *smc_cache = &cache->smc_cache;
7657
0
    const struct cmap_node *flow_node;
7658
0
    int recv_idx;
7659
0
    uint16_t tcp_flags;
7660
7661
    /* Prefetch buckets for all packets */
7662
0
    for (i = 0; i < cnt; i++) {
7663
0
        OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]);
7664
0
    }
7665
7666
0
    DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
7667
0
        struct dp_netdev_flow *flow = NULL;
7668
0
        flow_node = smc_entry_get(pmd, keys[i].hash);
7669
0
        bool hit = false;
7670
        /* Get the original order of this packet in the received batch. */
7671
0
        recv_idx = index_map[i];
7672
7673
0
        if (OVS_LIKELY(flow_node != NULL)) {
7674
0
            CMAP_NODE_FOR_EACH (flow, node, flow_node) {
7675
                /* Since we don't have per-port megaflow to check the port
7676
                 * number, we need to verify that the input ports match. */
7677
0
                if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) &&
7678
0
                flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
7679
0
                    tcp_flags = miniflow_get_tcp_flags(&keys[i].mf);
7680
7681
                    /* SMC hit and emc miss, we insert into EMC */
7682
0
                    keys[i].len =
7683
0
                        netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
7684
0
                    emc_probabilistic_insert(pmd, &keys[i], flow);
7685
                    /* Add these packets into the flow map in the same order
7686
                     * as received.
7687
                     */
7688
0
                    packet_enqueue_to_flow_map(packet, flow, tcp_flags,
7689
0
                                               flow_map, recv_idx);
7690
0
                    n_smc_hit++;
7691
0
                    hit = true;
7692
0
                    break;
7693
0
                }
7694
0
            }
7695
0
            if (hit) {
7696
0
                continue;
7697
0
            }
7698
0
        }
7699
7700
        /* SMC missed. Group missed packets together at
7701
         * the beginning of the 'packets' array. */
7702
0
        dp_packet_batch_refill(packets_, packet, i);
7703
7704
        /* Preserve the order of packet for flow batching. */
7705
0
        index_map[n_missed] = recv_idx;
7706
7707
        /* Put missed keys into the pointer array returned to the caller. */
7708
0
        missed_keys[n_missed++] = &keys[i];
7709
0
    }
7710
7711
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
7712
0
}
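smc_lookup_batch() above first issues a prefetch for every bucket it is about to touch and only then walks the packets, so the cache misses overlap instead of being paid one at a time. Below is a rough, self-contained sketch of that two-pass pattern; the bucket layout and names are made up for illustration, and only __builtin_prefetch() (GCC/Clang) is a real API.

#include <stddef.h>
#include <stdint.h>

#define N_BUCKETS (1u << 14)
#define BUCKET_MASK (N_BUCKETS - 1)
#define ENTRIES_PER_BUCKET 4

struct sig_bucket {
    uint16_t sig[ENTRIES_PER_BUCKET];       /* Truncated hash signatures. */
    uint16_t flow_idx[ENTRIES_PER_BUCKET];  /* Index into a flow table.   */
};

static struct sig_bucket buckets[N_BUCKETS];

/* Returns the number of hits; results[i] holds the matching flow index,
 * or -1 on miss.  (A real cache would also track entry validity.) */
size_t
lookup_batch(const uint32_t *hashes, int *results, size_t cnt)
{
    size_t i, n_hit = 0;

    /* Pass 1: prefetch every bucket the batch will read. */
    for (i = 0; i < cnt; i++) {
        __builtin_prefetch(&buckets[hashes[i] & BUCKET_MASK]);
    }

    /* Pass 2: do the real lookups; early buckets are likely cached now. */
    for (i = 0; i < cnt; i++) {
        const struct sig_bucket *b = &buckets[hashes[i] & BUCKET_MASK];
        uint16_t sig = hashes[i] >> 16;

        results[i] = -1;
        for (int j = 0; j < ENTRIES_PER_BUCKET; j++) {
            if (b->sig[j] == sig) {
                results[i] = b->flow_idx[j];
                n_hit++;
                break;
            }
        }
    }
    return n_hit;
}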
7713
7714
struct dp_netdev_flow *
7715
smc_lookup_single(struct dp_netdev_pmd_thread *pmd,
7716
                  struct dp_packet *packet,
7717
                  struct netdev_flow_key *key)
7718
0
{
7719
0
    const struct cmap_node *flow_node = smc_entry_get(pmd, key->hash);
7720
7721
0
    if (OVS_LIKELY(flow_node != NULL)) {
7722
0
        struct dp_netdev_flow *flow = NULL;
7723
7724
0
        CMAP_NODE_FOR_EACH (flow, node, flow_node) {
7725
            /* Since we don't have per-port megaflow to check the port
7726
             * number, we need to verify that the input ports match. */
7727
0
            if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, key) &&
7728
0
                flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
7729
7730
0
                return (void *) flow;
7731
0
            }
7732
0
        }
7733
0
    }
7734
7735
0
    return NULL;
7736
0
}
7737
7738
inline int
7739
dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
7740
                  struct dp_packet *packet,
7741
                  struct dp_netdev_flow **flow)
7742
0
{
7743
0
    struct dp_netdev_rxq *rxq = pmd->ctx.last_rxq;
7744
0
    bool post_process_api_supported;
7745
0
    void *flow_reference = NULL;
7746
0
    int err;
7747
7748
0
    atomic_read_relaxed(&rxq->port->netdev->hw_info.post_process_api_supported,
7749
0
                        &post_process_api_supported);
7750
7751
0
    if (!post_process_api_supported) {
7752
0
        *flow = NULL;
7753
0
        return 0;
7754
0
    }
7755
7756
0
    err = dpif_offload_netdev_hw_post_process(rxq->port->netdev, pmd->core_id,
7757
0
                                              packet, &flow_reference);
7758
0
    if (err && err != EOPNOTSUPP) {
7759
0
        if (err != ECANCELED) {
7760
0
            COVERAGE_INC(datapath_drop_hw_post_process);
7761
0
        } else {
7762
0
            COVERAGE_INC(datapath_drop_hw_post_process_consumed);
7763
0
        }
7764
0
        return -1;
7765
0
    }
7766
7767
0
    *flow = flow_reference;
7768
0
    return 0;
7769
0
}
7770
7771
/* Enqueues an already classified packet into per-flow batches or the flow
7772
 * map, depending on whether batching is enabled. */
7773
static inline void
7774
dfc_processing_enqueue_classified_packet(struct dp_packet *packet,
7775
                                         struct dp_netdev_flow *flow,
7776
                                         uint16_t tcp_flags,
7777
                                         bool batch_enable,
7778
                                         struct packet_batch_per_flow *batches,
7779
                                         size_t *n_batches,
7780
                                         struct dp_packet_flow_map *flow_map,
7781
                                         size_t *map_cnt)
7782
7783
0
{
7784
0
    if (OVS_LIKELY(batch_enable)) {
7785
0
        dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
7786
0
                                n_batches);
7787
0
    } else {
7788
        /* Flow batching should be performed only after fast-path
7789
         * processing is also completed for packets with an EMC miss,
7790
         * or else it will result in reordering of packets within the
7791
         * same datapath flow. */
7792
0
        packet_enqueue_to_flow_map(packet, flow, tcp_flags,
7793
0
                                   flow_map, (*map_cnt)++);
7794
0
    }
7795
7796
0
}
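The choice made in the helper above matters for ordering: as soon as one packet of the burst misses the EMC, direct batching is switched off and every later packet instead records its flow at its receive position in the flow map, so the burst can be re-batched in the original order once the fast path has resolved the misses. A toy model of that behaviour, with integers standing in for packets and flows (all names hypothetical), is sketched below.

#include <stdbool.h>
#include <stdio.h>

#define BURST 8

int
main(void)
{
    /* Flow found by the EMC per packet, in receive order (-1 = miss). */
    int emc_flow[BURST] = { 5, 5, -1, 5, -1, 9, 9, 5 };
    int map_flow[BURST];     /* Flow map slots, in receive order. */
    int map_cnt = 0;
    bool batch_enable = true;

    for (int i = 0; i < BURST; i++) {
        if (emc_flow[i] >= 0 && batch_enable) {
            /* Hit before any miss: safe to batch per flow right away. */
            printf("pkt %d batched directly for flow %d\n", i, emc_flow[i]);
            continue;
        }
        /* Miss, or hit after a miss: reserve a slot at the receive
         * position; misses are filled in by the fast path later. */
        map_flow[map_cnt++] = emc_flow[i];
        batch_enable = false;
    }

    /* Pretend the fast path classified both misses as flow 5. */
    for (int i = 0; i < map_cnt; i++) {
        if (map_flow[i] < 0) {
            map_flow[i] = 5;
        }
        printf("map slot %d -> flow %d (batched after fast path)\n",
               i, map_flow[i]);
    }
    return 0;
}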
7797
7798
/* Try to process all ('cnt') the 'packets' using only the datapath flow cache
7799
 * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
7800
 * miniflow is copied into 'keys' and the packet pointer is moved to the
7801
 * beginning of the 'packets' array. The pointers of missed keys are put in the
7802
 * missed_keys pointer array for future processing.
7803
 *
7804
 * The function returns the number of packets that need to be processed in the
7805
 * 'packets' array (they have been moved to the beginning of the vector).
7806
 *
7807
 * For performance reasons a caller may choose not to initialize the metadata
7808
 * in 'packets_'.  If 'md_is_valid' is false, the metadata in 'packets'
7809
 * is not valid and must be initialized by this function using 'port_no'.
7810
 * If 'md_is_valid' is true, the metadata is already valid and 'port_no'
7811
 * will be ignored.
7812
 */
7813
static inline size_t
7814
dfc_processing(struct dp_netdev_pmd_thread *pmd,
7815
               struct dp_packet_batch *packets_,
7816
               struct netdev_flow_key *keys,
7817
               struct netdev_flow_key **missed_keys,
7818
               struct packet_batch_per_flow batches[], size_t *n_batches,
7819
               struct dp_packet_flow_map *flow_map,
7820
               size_t *n_flows, uint8_t *index_map,
7821
               bool md_is_valid, odp_port_t port_no)
7822
0
{
7823
0
    const bool offload_enabled = dpif_offload_enabled();
7824
0
    const uint32_t recirc_depth = *recirc_depth_get();
7825
0
    const size_t cnt = dp_packet_batch_size(packets_);
7826
0
    size_t n_missed = 0, n_emc_hit = 0, n_phwol_hit = 0;
7827
0
    size_t n_mfex_opt_hit = 0, n_simple_hit = 0;
7828
0
    struct dfc_cache *cache = &pmd->flow_cache;
7829
0
    struct netdev_flow_key *key = &keys[0];
7830
0
    struct dp_packet *packet;
7831
0
    size_t map_cnt = 0;
7832
0
    bool batch_enable = true;
7833
7834
0
    const bool simple_match_enabled =
7835
0
        !md_is_valid && dp_netdev_simple_match_enabled(pmd, port_no);
7836
    /* 'simple_match_table' is a full flow table.  If the flow is not there,
7837
     * an upcall is required and no match can be found in the caches. */
7838
0
    const bool smc_enable_db = !simple_match_enabled && pmd->ctx.smc_enable_db;
7839
0
    const uint32_t cur_min = simple_match_enabled
7840
0
                             ? 0 : pmd->ctx.emc_insert_min;
7841
7842
0
    pmd_perf_update_counter(&pmd->perf_stats,
7843
0
                            md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
7844
0
                            cnt);
7845
0
    int i;
7846
0
    DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
7847
0
        struct dp_netdev_flow *flow = NULL;
7848
0
        uint16_t tcp_flags;
7849
7850
0
        if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
7851
0
            dp_packet_delete(packet);
7852
0
            COVERAGE_INC(datapath_drop_rx_invalid_packet);
7853
0
            continue;
7854
0
        }
7855
7856
0
        if (i != cnt - 1) {
7857
0
            struct dp_packet **packets = packets_->packets;
7858
            /* Prefetch next packet data and metadata. */
7859
0
            OVS_PREFETCH(dp_packet_data(packets[i+1]));
7860
0
            pkt_metadata_prefetch_init(&packets[i+1]->md);
7861
0
        }
7862
7863
0
        if (!md_is_valid) {
7864
0
            pkt_metadata_init(&packet->md, port_no);
7865
0
        }
7866
7867
0
        if (offload_enabled && recirc_depth == 0) {
7868
0
            if (OVS_UNLIKELY(dp_netdev_hw_flow(pmd, packet, &flow))) {
7869
                /* Packet restoration failed and it was dropped; do not
7870
                 * continue processing.
7871
                 */
7872
0
                continue;
7873
0
            }
7874
0
            if (OVS_LIKELY(flow)) {
7875
0
                tcp_flags = parse_tcp_flags(packet, NULL, NULL, NULL);
7876
0
                n_phwol_hit++;
7877
0
                dfc_processing_enqueue_classified_packet(
7878
0
                        packet, flow, tcp_flags, batch_enable,
7879
0
                        batches, n_batches, flow_map, &map_cnt);
7880
0
                continue;
7881
0
            }
7882
0
        }
7883
7884
0
        if (!flow && simple_match_enabled) {
7885
0
            ovs_be16 dl_type = 0, vlan_tci = 0;
7886
0
            uint8_t nw_frag = 0;
7887
7888
0
            tcp_flags = parse_tcp_flags(packet, &dl_type, &nw_frag, &vlan_tci);
7889
0
            flow = dp_netdev_simple_match_lookup(pmd, port_no, dl_type,
7890
0
                                                 nw_frag, vlan_tci);
7891
0
            if (OVS_LIKELY(flow)) {
7892
0
                n_simple_hit++;
7893
0
                dfc_processing_enqueue_classified_packet(
7894
0
                        packet, flow, tcp_flags, batch_enable,
7895
0
                        batches, n_batches, flow_map, &map_cnt);
7896
0
                continue;
7897
0
            }
7898
0
        }
7899
7900
0
        miniflow_extract(packet, &key->mf);
7901
0
        key->len = 0; /* Not computed yet. */
7902
0
        key->hash =
7903
0
                (md_is_valid == false)
7904
0
                ? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf)
7905
0
                : dpif_netdev_packet_get_rss_hash(packet, &key->mf);
7906
7907
        /* If EMC is disabled, skip emc_lookup. */
7908
0
        flow = (cur_min != 0) ? emc_lookup(&cache->emc_cache, key) : NULL;
7909
0
        if (OVS_LIKELY(flow)) {
7910
0
            tcp_flags = miniflow_get_tcp_flags(&key->mf);
7911
0
            n_emc_hit++;
7912
0
            dfc_processing_enqueue_classified_packet(
7913
0
                    packet, flow, tcp_flags, batch_enable,
7914
0
                    batches, n_batches, flow_map, &map_cnt);
7915
0
        } else {
7916
            /* Exact match cache missed. Group missed packets together at
7917
             * the beginning of the 'packets' array. */
7918
0
            dp_packet_batch_refill(packets_, packet, i);
7919
7920
            /* Preserve the order of packet for flow batching. */
7921
0
            index_map[n_missed] = map_cnt;
7922
0
            flow_map[map_cnt++].flow = NULL;
7923
7924
            /* 'key[n_missed]' contains the key of the current packet and it
7925
             * will be passed to SMC lookup. The next key should be extracted
7926
             * to 'keys[n_missed + 1]'.
7927
             * We also maintain a pointer array to keys that missed both SMC
7928
             * and EMC; it is returned to the caller for future processing. */
7929
0
            missed_keys[n_missed] = key;
7930
0
            key = &keys[++n_missed];
7931
7932
            /* Skip batching for subsequent packets to avoid reordering. */
7933
0
            batch_enable = false;
7934
0
        }
7935
0
    }
7936
    /* Count of packets which are not flow batched. */
7937
0
    *n_flows = map_cnt;
7938
7939
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_PHWOL_HIT, n_phwol_hit);
7940
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MFEX_OPT_HIT,
7941
0
                            n_mfex_opt_hit);
7942
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SIMPLE_HIT,
7943
0
                            n_simple_hit);
7944
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);
7945
7946
0
    if (!smc_enable_db) {
7947
0
        return dp_packet_batch_size(packets_);
7948
0
    }
7949
7950
    /* Packets that missed EMC will do a batch lookup in SMC, if enabled. */
7951
0
    smc_lookup_batch(pmd, keys, missed_keys, packets_,
7952
0
                     n_missed, flow_map, index_map);
7953
7954
0
    return dp_packet_batch_size(packets_);
7955
0
}
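dfc_processing() keeps the contract described in the comment above the function by "refilling" the packet array: every cache miss is written back to the front of the same array in receive order, and the new size is what fast_path_processing() later sees. A minimal, OVS-free sketch of that refill step (names are illustrative only):

#include <stdbool.h>
#include <stddef.h>

/* 'hit[i]' says whether packet i was classified by a cache.  Packets that
 * missed are compacted to the front of 'pkts', preserving their relative
 * order, and the number of remaining packets is returned. */
size_t
refill_misses(int *pkts, const bool *hit, size_t cnt)
{
    size_t n_missed = 0;

    for (size_t i = 0; i < cnt; i++) {
        if (!hit[i]) {
            pkts[n_missed++] = pkts[i];
        }
    }
    return n_missed;    /* Only these go on to the dpcls/upcall path. */
}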
7956
7957
static inline int
7958
handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
7959
                     struct dp_packet *packet,
7960
                     const struct netdev_flow_key *key,
7961
                     struct ofpbuf *actions, struct ofpbuf *put_actions)
7962
0
{
7963
0
    struct ofpbuf *add_actions;
7964
0
    struct dp_packet_batch b;
7965
0
    struct match match;
7966
0
    ovs_u128 ufid;
7967
0
    int error;
7968
0
    uint64_t cycles = cycles_counter_update(&pmd->perf_stats);
7969
0
    odp_port_t orig_in_port = packet->md.orig_in_port;
7970
7971
0
    match.tun_md.valid = false;
7972
0
    miniflow_expand(&key->mf, &match.flow);
7973
0
    memset(&match.wc, 0, sizeof match.wc);
7974
7975
0
    ofpbuf_clear(actions);
7976
0
    ofpbuf_clear(put_actions);
7977
7978
0
    odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
7979
0
    error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
7980
0
                             &ufid, DPIF_UC_MISS, NULL, actions,
7981
0
                             put_actions);
7982
0
    if (OVS_UNLIKELY(error && error != ENOSPC)) {
7983
0
        dp_packet_delete(packet);
7984
0
        COVERAGE_INC(datapath_drop_upcall_error);
7985
0
        return error;
7986
0
    }
7987
7988
    /* The Netlink encoding of datapath flow keys cannot express
7989
     * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
7990
     * tag is interpreted as exact match on the fact that there is no
7991
     * VLAN.  Unless we refactor a lot of code that translates between
7992
     * Netlink and struct flow representations, we have to do the same
7993
     * here.  This must be in sync with 'match' in dpif_netdev_flow_put(). */
7994
0
    if (!match.wc.masks.vlans[0].tci) {
7995
0
        match.wc.masks.vlans[0].tci = htons(VLAN_VID_MASK | VLAN_CFI);
7996
0
    }
7997
7998
    /* We can't allow the packet batching in the next loop to execute
7999
     * the actions.  Otherwise, if there are any slow path actions,
8000
     * we'll send the packet up twice. */
8001
0
    dp_packet_batch_init_packet(&b, packet);
8002
0
    dp_netdev_execute_actions(pmd, &b, true, &match.flow,
8003
0
                              actions->data, actions->size);
8004
8005
0
    add_actions = put_actions->size ? put_actions : actions;
8006
0
    if (OVS_LIKELY(error != ENOSPC)) {
8007
0
        struct dp_netdev_flow *netdev_flow;
8008
8009
        /* XXX: There's a race window where a flow covering this packet
8010
         * could have already been installed since we last did the flow
8011
         * lookup before upcall.  This could be solved by moving the
8012
         * mutex lock outside the loop, but that's an awful long time
8013
         * to be locking revalidators out of making flow modifications. */
8014
0
        ovs_mutex_lock(&pmd->flow_mutex);
8015
0
        netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
8016
0
        if (OVS_LIKELY(!netdev_flow)) {
8017
0
            netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
8018
0
                                             add_actions->data,
8019
0
                                             add_actions->size, orig_in_port);
8020
0
        }
8021
0
        ovs_mutex_unlock(&pmd->flow_mutex);
8022
0
        uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid);
8023
0
        smc_insert(pmd, key, hash);
8024
0
        emc_probabilistic_insert(pmd, key, netdev_flow);
8025
0
    }
8026
0
    if (pmd_perf_metrics_enabled(pmd)) {
8027
        /* Update upcall stats. */
8028
0
        cycles = cycles_counter_update(&pmd->perf_stats) - cycles;
8029
0
        struct pmd_perf_stats *s = &pmd->perf_stats;
8030
0
        s->current.upcalls++;
8031
0
        s->current.upcall_cycles += cycles;
8032
0
        histogram_add_sample(&s->cycles_per_upcall, cycles);
8033
0
    }
8034
0
    return error;
8035
0
}
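handle_packet_upcall() only inserts the new flow after re-checking for it under pmd->flow_mutex, since another upcall for the same flow may have completed between the lockless lookup and taking the lock (the XXX comment above describes the remaining race window). Below is a generic sketch of that check-then-recheck pattern, using plain pthreads and a toy table rather than the dpif structures; every name in it is hypothetical.

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

#define TABLE_SIZE 1024

static int table[TABLE_SIZE];              /* 0 = empty, else a flow id. */
static pthread_mutex_t table_mutex = PTHREAD_MUTEX_INITIALIZER;

static int *
lookup(int flow_id)
{
    for (size_t i = 0; i < TABLE_SIZE; i++) {
        if (table[i] == flow_id) {
            return &table[i];
        }
    }
    return NULL;
}

/* Returns true if this call actually inserted the flow. */
bool
install_flow_once(int flow_id)
{
    bool inserted = false;

    pthread_mutex_lock(&table_mutex);
    /* Re-check under the lock: a concurrent upcall may have won the race
     * since an earlier, lockless lookup missed. */
    if (!lookup(flow_id)) {
        for (size_t i = 0; i < TABLE_SIZE; i++) {
            if (table[i] == 0) {
                table[i] = flow_id;
                inserted = true;
                break;
            }
        }
    }
    pthread_mutex_unlock(&table_mutex);
    return inserted;
}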
8036
8037
static inline void
8038
fast_path_processing(struct dp_netdev_pmd_thread *pmd,
8039
                     struct dp_packet_batch *packets_,
8040
                     struct netdev_flow_key **keys,
8041
                     struct dp_packet_flow_map *flow_map,
8042
                     uint8_t *index_map,
8043
                     odp_port_t in_port)
8044
0
{
8045
0
    const size_t cnt = dp_packet_batch_size(packets_);
8046
0
#if !defined(__CHECKER__) && !defined(_WIN32)
8047
0
    const size_t PKT_ARRAY_SIZE = cnt;
8048
#else
8049
    /* Sparse or MSVC doesn't like variable length array. */
8050
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
8051
#endif
8052
0
    struct dp_packet *packet;
8053
0
    struct dpcls *cls;
8054
0
    struct dpcls_rule *rules[PKT_ARRAY_SIZE];
8055
0
    struct dp_netdev *dp = pmd->dp;
8056
0
    int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
8057
0
    int lookup_cnt = 0, add_lookup_cnt;
8058
0
    bool any_miss;
8059
8060
0
    for (size_t i = 0; i < cnt; i++) {
8061
        /* Key length is needed in all cases, hash computed on demand. */
8062
0
        keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf));
8063
0
    }
8064
    /* Get the classifier for the in_port */
8065
0
    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
8066
0
    if (OVS_LIKELY(cls)) {
8067
0
        any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys,
8068
0
                                rules, cnt, &lookup_cnt);
8069
0
    } else {
8070
0
        any_miss = true;
8071
0
        memset(rules, 0, sizeof(rules));
8072
0
    }
8073
0
    if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
8074
0
        uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
8075
0
        struct ofpbuf actions, put_actions;
8076
8077
0
        ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
8078
0
        ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);
8079
8080
0
        DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8081
0
            struct dp_netdev_flow *netdev_flow;
8082
8083
0
            if (OVS_LIKELY(rules[i])) {
8084
0
                continue;
8085
0
            }
8086
8087
            /* It's possible that an earlier slow path execution installed
8088
             * a rule covering this flow.  In this case, it's a lot cheaper
8089
             * to catch it here than execute a miss. */
8090
0
            netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i],
8091
0
                                                    &add_lookup_cnt);
8092
0
            if (netdev_flow) {
8093
0
                lookup_cnt += add_lookup_cnt;
8094
0
                rules[i] = &netdev_flow->cr;
8095
0
                continue;
8096
0
            }
8097
8098
0
            int error = handle_packet_upcall(pmd, packet, keys[i],
8099
0
                                             &actions, &put_actions);
8100
8101
0
            if (OVS_UNLIKELY(error)) {
8102
0
                upcall_fail_cnt++;
8103
0
            } else {
8104
0
                upcall_ok_cnt++;
8105
0
            }
8106
0
        }
8107
8108
0
        ofpbuf_uninit(&actions);
8109
0
        ofpbuf_uninit(&put_actions);
8110
0
        fat_rwlock_unlock(&dp->upcall_rwlock);
8111
0
    } else if (OVS_UNLIKELY(any_miss)) {
8112
0
        DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8113
0
            if (OVS_UNLIKELY(!rules[i])) {
8114
0
                dp_packet_delete(packet);
8115
0
                COVERAGE_INC(datapath_drop_lock_error);
8116
0
                upcall_fail_cnt++;
8117
0
            }
8118
0
        }
8119
0
    }
8120
8121
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8122
0
        struct dp_netdev_flow *flow;
8123
        /* Get the original order of this packet in the received batch. */
8124
0
        int recv_idx = index_map[i];
8125
0
        uint16_t tcp_flags;
8126
8127
0
        if (OVS_UNLIKELY(!rules[i])) {
8128
0
            continue;
8129
0
        }
8130
8131
0
        flow = dp_netdev_flow_cast(rules[i]);
8132
0
        uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
8133
0
        smc_insert(pmd, keys[i], hash);
8134
8135
0
        emc_probabilistic_insert(pmd, keys[i], flow);
8136
        /* Add these packets into the flow map in the same order
8137
         * as received.
8138
         */
8139
0
        tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf);
8140
0
        packet_enqueue_to_flow_map(packet, flow, tcp_flags,
8141
0
                                   flow_map, recv_idx);
8142
0
    }
8143
8144
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
8145
0
                            cnt - upcall_ok_cnt - upcall_fail_cnt);
8146
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
8147
0
                            lookup_cnt);
8148
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
8149
0
                            upcall_ok_cnt);
8150
0
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
8151
0
                            upcall_fail_cnt);
8152
0
}
8153
8154
/* Packets enter the datapath from a port (or from recirculation) here.
8155
 *
8156
 * When 'md_is_valid' is true the metadata in 'packets' are already valid.
8157
 * When false the metadata in 'packets' need to be initialized. */
8158
static void
8159
dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
8160
                  struct dp_packet_batch *packets,
8161
                  bool md_is_valid, odp_port_t port_no)
8162
0
{
8163
0
#if !defined(__CHECKER__) && !defined(_WIN32)
8164
0
    const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);
8165
#else
8166
    /* Sparse or MSVC doesn't like variable length array. */
8167
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
8168
#endif
8169
0
    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
8170
0
        struct netdev_flow_key keys[PKT_ARRAY_SIZE];
8171
0
    struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE];
8172
0
    struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
8173
0
    size_t n_batches;
8174
0
    struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE];
8175
0
    uint8_t index_map[PKT_ARRAY_SIZE];
8176
0
    size_t n_flows, i;
8177
8178
0
    odp_port_t in_port;
8179
8180
0
    n_batches = 0;
8181
0
    dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches,
8182
0
                   flow_map, &n_flows, index_map, md_is_valid, port_no);
8183
8184
0
    if (!dp_packet_batch_is_empty(packets)) {
8185
        /* Get ingress port from first packet's metadata. */
8186
0
        in_port = packets->packets[0]->md.in_port.odp_port;
8187
0
        fast_path_processing(pmd, packets, missed_keys,
8188
0
                             flow_map, index_map, in_port);
8189
0
    }
8190
8191
    /* Batch the rest of the packets, which are in the flow map. */
8192
0
    for (i = 0; i < n_flows; i++) {
8193
0
        struct dp_packet_flow_map *map = &flow_map[i];
8194
8195
0
        if (OVS_UNLIKELY(!map->flow)) {
8196
0
            continue;
8197
0
        }
8198
0
        dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags,
8199
0
                                batches, &n_batches);
8200
0
    }
8201
8202
    /* All the flow batches need to be reset before any call to
8203
     * packet_batch_per_flow_execute() as it could potentially trigger
8204
     * recirculation. When a packet matching flow 'j' happens to be
8205
     * recirculated, the nested call to dp_netdev_input__() could potentially
8206
     * classify the packet as matching another flow - say 'k'. It could happen
8207
     * that in the previous call to dp_netdev_input__() that same flow 'k' had
8208
     * already its own batches[k] still waiting to be served.  So if its
8209
     * 'batch' member is not reset, the recirculated packet would be wrongly
8210
     * appended to batches[k] of the 1st call to dp_netdev_input__(). */
8211
0
    for (i = 0; i < n_batches; i++) {
8212
0
        batches[i].flow->batch = NULL;
8213
0
    }
8214
8215
0
    for (i = 0; i < n_batches; i++) {
8216
0
        packet_batch_per_flow_execute(&batches[i], pmd);
8217
0
    }
8218
0
}
8219
8220
int32_t
8221
dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
8222
                struct dp_packet_batch *packets,
8223
                odp_port_t port_no)
8224
0
{
8225
0
    dp_netdev_input__(pmd, packets, false, port_no);
8226
0
    return 0;
8227
0
}
8228
8229
static void
8230
dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
8231
                      struct dp_packet_batch *packets)
8232
0
{
8233
0
    dp_netdev_input__(pmd, packets, true, 0);
8234
0
}
8235
8236
struct dp_netdev_execute_aux {
8237
    struct dp_netdev_pmd_thread *pmd;
8238
    const struct flow *flow;
8239
};
8240
8241
static void
8242
dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
8243
                                 void *aux)
8244
0
{
8245
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8246
0
    dp->dp_purge_aux = aux;
8247
0
    dp->dp_purge_cb = cb;
8248
0
}
8249
8250
static void
8251
dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
8252
                               void *aux)
8253
0
{
8254
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8255
0
    dp->upcall_aux = aux;
8256
0
    dp->upcall_cb = cb;
8257
0
}
8258
8259
static void
8260
dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
8261
                               bool purge)
8262
0
{
8263
0
    struct tx_port *tx;
8264
0
    struct dp_netdev_port *port;
8265
0
    long long interval;
8266
8267
0
    HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
8268
0
        if (tx->port->txq_mode != TXQ_MODE_XPS) {
8269
0
            continue;
8270
0
        }
8271
0
        interval = pmd->ctx.now - tx->last_used;
8272
0
        if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
8273
0
            port = tx->port;
8274
0
            ovs_mutex_lock(&port->txq_used_mutex);
8275
0
            port->txq_used[tx->qid]--;
8276
0
            ovs_mutex_unlock(&port->txq_used_mutex);
8277
0
            tx->qid = -1;
8278
0
        }
8279
0
    }
8280
0
}
8281
8282
static int
8283
dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
8284
                           struct tx_port *tx)
8285
0
{
8286
0
    struct dp_netdev_port *port;
8287
0
    long long interval;
8288
0
    int i, min_cnt, min_qid;
8289
8290
0
    interval = pmd->ctx.now - tx->last_used;
8291
0
    tx->last_used = pmd->ctx.now;
8292
8293
0
    if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
8294
0
        return tx->qid;
8295
0
    }
8296
8297
0
    port = tx->port;
8298
8299
0
    ovs_mutex_lock(&port->txq_used_mutex);
8300
0
    if (tx->qid >= 0) {
8301
0
        port->txq_used[tx->qid]--;
8302
0
        tx->qid = -1;
8303
0
    }
8304
8305
0
    min_cnt = -1;
8306
0
    min_qid = 0;
8307
0
    for (i = 0; i < netdev_n_txq(port->netdev); i++) {
8308
0
        if (port->txq_used[i] < min_cnt || min_cnt == -1) {
8309
0
            min_cnt = port->txq_used[i];
8310
0
            min_qid = i;
8311
0
        }
8312
0
    }
8313
8314
0
    port->txq_used[min_qid]++;
8315
0
    tx->qid = min_qid;
8316
8317
0
    ovs_mutex_unlock(&port->txq_used_mutex);
8318
8319
0
    dpif_netdev_xps_revalidate_pmd(pmd, false);
8320
8321
0
    VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
8322
0
             pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
8323
0
    return min_qid;
8324
0
}
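dpif_netdev_xps_get_tx_qid() above keeps a use counter per tx queue and, when a thread's cached qid has timed out, hands out the queue with the smallest counter. The selection itself reduces to the few lines below; this is only a sketch of that min-scan, with the locking and timeout handling of the real function omitted.

/* Pick the least-used tx queue and account for the new user.
 * Mirrors the min-scan in dpif_netdev_xps_get_tx_qid(), nothing more. */
int
pick_least_used_txq(int *txq_used, int n_txq)
{
    int min_qid = 0;

    for (int i = 1; i < n_txq; i++) {
        if (txq_used[i] < txq_used[min_qid]) {
            min_qid = i;
        }
    }
    txq_used[min_qid]++;
    return min_qid;
}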
8325
8326
static struct tx_port *
8327
pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
8328
                          odp_port_t port_no)
8329
0
{
8330
0
    return tx_port_lookup(&pmd->tnl_port_cache, port_no);
8331
0
}
8332
8333
static struct tx_port *
8334
pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
8335
                           odp_port_t port_no)
8336
0
{
8337
0
    return tx_port_lookup(&pmd->send_port_cache, port_no);
8338
0
}
8339
8340
static int
8341
push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
8342
                const struct nlattr *attr,
8343
                struct dp_packet_batch *batch)
8344
0
{
8345
0
    struct tx_port *tun_port;
8346
0
    const struct ovs_action_push_tnl *data;
8347
0
    int err;
8348
8349
0
    data = nl_attr_get(attr);
8350
8351
0
    tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port);
8352
0
    if (!tun_port) {
8353
0
        err = -EINVAL;
8354
0
        goto error;
8355
0
    }
8356
0
    err = netdev_push_header(tun_port->port->netdev, batch, data);
8357
0
    if (!err) {
8358
0
        return 0;
8359
0
    }
8360
0
error:
8361
0
    dp_packet_delete_batch(batch, true);
8362
0
    return err;
8363
0
}
8364
8365
static void
8366
dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
8367
                            struct dp_packet *packet, bool should_steal,
8368
                            struct flow *flow, ovs_u128 *ufid,
8369
                            struct ofpbuf *actions,
8370
                            const struct nlattr *userdata)
8371
0
{
8372
0
    struct dp_packet_batch b;
8373
0
    int error;
8374
8375
0
    ofpbuf_clear(actions);
8376
8377
0
    error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
8378
0
                             DPIF_UC_ACTION, userdata, actions,
8379
0
                             NULL);
8380
0
    if (!error || error == ENOSPC) {
8381
0
        dp_packet_batch_init_packet(&b, packet);
8382
0
        dp_netdev_execute_actions(pmd, &b, should_steal, flow,
8383
0
                                  actions->data, actions->size);
8384
0
    } else if (should_steal) {
8385
0
        dp_packet_delete(packet);
8386
0
        COVERAGE_INC(datapath_drop_userspace_action_error);
8387
0
    }
8388
0
}
8389
8390
static bool
8391
dp_execute_output_action(struct dp_netdev_pmd_thread *pmd,
8392
                         struct dp_packet_batch *packets_,
8393
                         bool should_steal, odp_port_t port_no)
8394
0
{
8395
0
    struct tx_port *p = pmd_send_port_cache_lookup(pmd, port_no);
8396
0
    struct dp_packet_batch out;
8397
8398
0
    if (!OVS_LIKELY(p)) {
8399
0
        COVERAGE_ADD(datapath_drop_invalid_port,
8400
0
                     dp_packet_batch_size(packets_));
8401
0
        dp_packet_delete_batch(packets_, should_steal);
8402
0
        return false;
8403
0
    }
8404
0
    if (!should_steal) {
8405
0
        dp_packet_batch_clone(&out, packets_);
8406
0
        dp_packet_batch_reset_cutlen(packets_);
8407
0
        packets_ = &out;
8408
0
    }
8409
0
    dp_packet_batch_apply_cutlen(packets_);
8410
0
    if (dp_packet_batch_size(&p->output_pkts)
8411
0
        + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
8412
        /* Flush here to avoid overflow. */
8413
0
        dp_netdev_pmd_flush_output_on_port(pmd, p);
8414
0
    }
8415
0
    if (dp_packet_batch_is_empty(&p->output_pkts)) {
8416
0
        pmd->n_output_batches++;
8417
0
    }
8418
8419
0
    struct dp_packet *packet;
8420
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8421
0
        p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
8422
0
            pmd->ctx.last_rxq;
8423
0
        dp_packet_batch_add(&p->output_pkts, packet);
8424
0
    }
8425
0
    return true;
8426
0
}
8427
8428
static void
8429
dp_execute_lb_output_action(struct dp_netdev_pmd_thread *pmd,
8430
                            struct dp_packet_batch *packets_,
8431
                            bool should_steal, uint32_t bond)
8432
0
{
8433
0
    struct tx_bond *p_bond = tx_bond_lookup(&pmd->tx_bonds, bond);
8434
0
    struct dp_packet_batch out;
8435
0
    struct dp_packet *packet;
8436
8437
0
    if (!p_bond) {
8438
0
        COVERAGE_ADD(datapath_drop_invalid_bond,
8439
0
                     dp_packet_batch_size(packets_));
8440
0
        dp_packet_delete_batch(packets_, should_steal);
8441
0
        return;
8442
0
    }
8443
0
    if (!should_steal) {
8444
0
        dp_packet_batch_clone(&out, packets_);
8445
0
        dp_packet_batch_reset_cutlen(packets_);
8446
0
        packets_ = &out;
8447
0
    }
8448
0
    dp_packet_batch_apply_cutlen(packets_);
8449
8450
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8451
        /*
8452
         * Look up the bond-hash table using the hash to get the member.
8453
         */
8454
0
        uint32_t hash = dp_packet_get_rss_hash(packet);
8455
0
        struct member_entry *s_entry
8456
0
            = &p_bond->member_buckets[hash & BOND_MASK];
8457
0
        odp_port_t bond_member = s_entry->member_id;
8458
0
        uint32_t size = dp_packet_size(packet);
8459
0
        struct dp_packet_batch output_pkt;
8460
8461
0
        dp_packet_batch_init_packet(&output_pkt, packet);
8462
0
        if (OVS_LIKELY(dp_execute_output_action(pmd, &output_pkt, true,
8463
0
                                                bond_member))) {
8464
            /* Update member stats. */
8465
0
            non_atomic_ullong_add(&s_entry->n_packets, 1);
8466
0
            non_atomic_ullong_add(&s_entry->n_bytes, size);
8467
0
        }
8468
0
    }
8469
0
}
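dp_execute_lb_output_action() picks the bond member purely from the packet's RSS hash: the low bits index a fixed table of buckets, and each bucket holds a member port plus byte/packet counters. A compact sketch of that bucket selection follows; BOND_BUCKETS_SKETCH and the struct are illustrative stand-ins, not the OVS definitions.

#include <stdint.h>

#define BOND_BUCKETS_SKETCH 256
#define BOND_MASK_SKETCH (BOND_BUCKETS_SKETCH - 1)

struct bond_bucket_sketch {
    uint32_t member_port;       /* Port the bucket is pinned to. */
    uint64_t n_packets;
    uint64_t n_bytes;
};

/* Hash-select a bucket, update its stats, and return the member port. */
uint32_t
bond_pick_member(struct bond_bucket_sketch *buckets,
                 uint32_t rss_hash, uint32_t pkt_len)
{
    struct bond_bucket_sketch *b = &buckets[rss_hash & BOND_MASK_SKETCH];

    b->n_packets += 1;
    b->n_bytes += pkt_len;
    return b->member_port;
}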
8470
8471
static void
8472
dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
8473
              const struct nlattr *a, bool should_steal)
8474
    OVS_NO_THREAD_SAFETY_ANALYSIS
8475
0
{
8476
0
    struct dp_netdev_execute_aux *aux = aux_;
8477
0
    uint32_t *depth = recirc_depth_get();
8478
0
    struct dp_netdev_pmd_thread *pmd = aux->pmd;
8479
0
    struct dp_netdev *dp = pmd->dp;
8480
0
    int type = nl_attr_type(a);
8481
0
    struct tx_port *p;
8482
0
    uint32_t packet_count, packets_dropped;
8483
8484
0
    switch ((enum ovs_action_attr)type) {
8485
0
    case OVS_ACTION_ATTR_OUTPUT:
8486
0
        dp_execute_output_action(pmd, packets_, should_steal,
8487
0
                                 nl_attr_get_odp_port(a));
8488
0
        return;
8489
8490
0
    case OVS_ACTION_ATTR_LB_OUTPUT:
8491
0
        dp_execute_lb_output_action(pmd, packets_, should_steal,
8492
0
                                    nl_attr_get_u32(a));
8493
0
        return;
8494
8495
0
    case OVS_ACTION_ATTR_TUNNEL_PUSH:
8496
0
        if (should_steal) {
8497
            /* We're requested to push a tunnel header, but we also need to
8498
             * take ownership of these packets. Thus, we can avoid performing
8499
             * the action, because the caller will not use the result anyway.
8500
             * Just break to free the batch. */
8501
0
            break;
8502
0
        }
8503
0
        dp_packet_batch_apply_cutlen(packets_);
8504
0
        packet_count = dp_packet_batch_size(packets_);
8505
0
        if (push_tnl_action(pmd, a, packets_)) {
8506
0
            COVERAGE_ADD(datapath_drop_tunnel_push_error,
8507
0
                         packet_count);
8508
0
        }
8509
0
        return;
8510
8511
0
    case OVS_ACTION_ATTR_TUNNEL_POP:
8512
0
        if (*depth < MAX_RECIRC_DEPTH) {
8513
0
            struct dp_packet_batch *orig_packets_ = packets_;
8514
0
            odp_port_t portno = nl_attr_get_odp_port(a);
8515
8516
0
            p = pmd_tnl_port_cache_lookup(pmd, portno);
8517
0
            if (p) {
8518
0
                struct dp_packet_batch tnl_pkt;
8519
8520
0
                if (!should_steal) {
8521
0
                    dp_packet_batch_clone(&tnl_pkt, packets_);
8522
0
                    packets_ = &tnl_pkt;
8523
0
                    dp_packet_batch_reset_cutlen(orig_packets_);
8524
0
                }
8525
8526
0
                dp_packet_batch_apply_cutlen(packets_);
8527
8528
0
                packet_count = dp_packet_batch_size(packets_);
8529
0
                netdev_pop_header(p->port->netdev, packets_);
8530
0
                packets_dropped =
8531
0
                   packet_count - dp_packet_batch_size(packets_);
8532
0
                if (packets_dropped) {
8533
0
                    COVERAGE_ADD(datapath_drop_tunnel_pop_error,
8534
0
                                 packets_dropped);
8535
0
                }
8536
0
                if (dp_packet_batch_is_empty(packets_)) {
8537
0
                    return;
8538
0
                }
8539
8540
0
                struct dp_packet *packet;
8541
0
                DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8542
0
                    packet->md.in_port.odp_port = portno;
8543
0
                }
8544
8545
0
                (*depth)++;
8546
0
                dp_netdev_recirculate(pmd, packets_);
8547
0
                (*depth)--;
8548
0
                return;
8549
0
            }
8550
0
            COVERAGE_ADD(datapath_drop_invalid_tnl_port,
8551
0
                         dp_packet_batch_size(packets_));
8552
0
        } else {
8553
0
            COVERAGE_ADD(datapath_drop_recirc_error,
8554
0
                         dp_packet_batch_size(packets_));
8555
0
        }
8556
0
        break;
8557
8558
0
    case OVS_ACTION_ATTR_USERSPACE:
8559
0
        if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
8560
0
            struct dp_packet_batch *orig_packets_ = packets_;
8561
0
            const struct nlattr *userdata;
8562
0
            struct dp_packet_batch usr_pkt;
8563
0
            struct ofpbuf actions;
8564
0
            struct flow flow;
8565
0
            ovs_u128 ufid;
8566
0
            bool clone = false;
8567
8568
0
            userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
8569
0
            ofpbuf_init(&actions, 0);
8570
8571
0
            if (packets_->trunc) {
8572
0
                if (!should_steal) {
8573
0
                    dp_packet_batch_clone(&usr_pkt, packets_);
8574
0
                    packets_ = &usr_pkt;
8575
0
                    clone = true;
8576
0
                    dp_packet_batch_reset_cutlen(orig_packets_);
8577
0
                }
8578
8579
0
                dp_packet_batch_apply_cutlen(packets_);
8580
0
            }
8581
8582
0
            struct dp_packet *packet;
8583
0
            DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8584
0
                flow_extract(packet, &flow);
8585
0
                odp_flow_key_hash(&flow, sizeof flow, &ufid);
8586
0
                dp_execute_userspace_action(pmd, packet, should_steal, &flow,
8587
0
                                            &ufid, &actions, userdata);
8588
0
            }
8589
8590
0
            if (clone) {
8591
0
                dp_packet_delete_batch(packets_, true);
8592
0
            }
8593
8594
0
            ofpbuf_uninit(&actions);
8595
0
            fat_rwlock_unlock(&dp->upcall_rwlock);
8596
8597
0
            return;
8598
0
        }
8599
0
        COVERAGE_ADD(datapath_drop_lock_error,
8600
0
                     dp_packet_batch_size(packets_));
8601
0
        break;
8602
8603
0
    case OVS_ACTION_ATTR_RECIRC:
8604
0
        if (*depth < MAX_RECIRC_DEPTH) {
8605
0
            struct dp_packet_batch recirc_pkts;
8606
8607
0
            if (!should_steal) {
8608
0
               dp_packet_batch_clone(&recirc_pkts, packets_);
8609
0
               packets_ = &recirc_pkts;
8610
0
            }
8611
8612
0
            struct dp_packet *packet;
8613
0
            DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
8614
0
                packet->md.recirc_id = nl_attr_get_u32(a);
8615
0
            }
8616
8617
0
            (*depth)++;
8618
0
            dp_netdev_recirculate(pmd, packets_);
8619
0
            (*depth)--;
8620
8621
0
            return;
8622
0
        }
8623
8624
0
        COVERAGE_ADD(datapath_drop_recirc_error,
8625
0
                     dp_packet_batch_size(packets_));
8626
0
        VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
8627
0
        break;
8628
8629
0
    case OVS_ACTION_ATTR_CT: {
8630
0
        const struct nlattr *b;
8631
0
        bool force = false;
8632
0
        bool commit = false;
8633
0
        unsigned int left;
8634
0
        uint16_t zone = 0;
8635
0
        uint32_t tp_id = 0;
8636
0
        const char *helper = NULL;
8637
0
        const uint32_t *setmark = NULL;
8638
0
        const struct ovs_key_ct_labels *setlabel = NULL;
8639
0
        struct nat_action_info_t nat_action_info;
8640
0
        struct nat_action_info_t *nat_action_info_ref = NULL;
8641
0
        bool nat_config = false;
8642
8643
0
        NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
8644
0
                                 nl_attr_get_size(a)) {
8645
0
            enum ovs_ct_attr sub_type = nl_attr_type(b);
8646
8647
0
            switch(sub_type) {
8648
0
            case OVS_CT_ATTR_FORCE_COMMIT:
8649
0
                force = true;
8650
                /* fall through. */
8651
0
            case OVS_CT_ATTR_COMMIT:
8652
0
                commit = true;
8653
0
                break;
8654
0
            case OVS_CT_ATTR_ZONE:
8655
0
                zone = nl_attr_get_u16(b);
8656
0
                break;
8657
0
            case OVS_CT_ATTR_HELPER:
8658
0
                helper = nl_attr_get_string(b);
8659
0
                break;
8660
0
            case OVS_CT_ATTR_MARK:
8661
0
                setmark = nl_attr_get(b);
8662
0
                break;
8663
0
            case OVS_CT_ATTR_LABELS:
8664
0
                setlabel = nl_attr_get(b);
8665
0
                break;
8666
0
            case OVS_CT_ATTR_EVENTMASK:
8667
                /* Silently ignored, as the userspace datapath does not
8668
                 * generate netlink events. */
8669
0
                break;
8670
0
            case OVS_CT_ATTR_TIMEOUT:
8671
0
                if (!str_to_uint(nl_attr_get_string(b), 10, &tp_id)) {
8672
0
                    VLOG_WARN("Invalid Timeout Policy ID: %s.",
8673
0
                              nl_attr_get_string(b));
8674
0
                    tp_id = DEFAULT_TP_ID;
8675
0
                }
8676
0
                break;
8677
0
            case OVS_CT_ATTR_NAT: {
8678
0
                const struct nlattr *b_nest;
8679
0
                unsigned int left_nest;
8680
0
                bool ip_min_specified = false;
8681
0
                bool proto_num_min_specified = false;
8682
0
                bool ip_max_specified = false;
8683
0
                bool proto_num_max_specified = false;
8684
0
                memset(&nat_action_info, 0, sizeof nat_action_info);
8685
0
                nat_action_info_ref = &nat_action_info;
8686
8687
0
                NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
8688
0
                    enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);
8689
8690
0
                    switch (sub_type_nest) {
8691
0
                    case OVS_NAT_ATTR_SRC:
8692
0
                    case OVS_NAT_ATTR_DST:
8693
0
                        nat_config = true;
8694
0
                        nat_action_info.nat_action |=
8695
0
                            ((sub_type_nest == OVS_NAT_ATTR_SRC)
8696
0
                                ? NAT_ACTION_SRC : NAT_ACTION_DST);
8697
0
                        break;
8698
0
                    case OVS_NAT_ATTR_IP_MIN:
8699
0
                        memcpy(&nat_action_info.min_addr,
8700
0
                               nl_attr_get(b_nest),
8701
0
                               nl_attr_get_size(b_nest));
8702
0
                        ip_min_specified = true;
8703
0
                        break;
8704
0
                    case OVS_NAT_ATTR_IP_MAX:
8705
0
                        memcpy(&nat_action_info.max_addr,
8706
0
                               nl_attr_get(b_nest),
8707
0
                               nl_attr_get_size(b_nest));
8708
0
                        ip_max_specified = true;
8709
0
                        break;
8710
0
                    case OVS_NAT_ATTR_PROTO_MIN:
8711
0
                        nat_action_info.min_port =
8712
0
                            nl_attr_get_u16(b_nest);
8713
0
                        proto_num_min_specified = true;
8714
0
                        break;
8715
0
                    case OVS_NAT_ATTR_PROTO_MAX:
8716
0
                        nat_action_info.max_port =
8717
0
                            nl_attr_get_u16(b_nest);
8718
0
                        proto_num_max_specified = true;
8719
0
                        break;
8720
0
                    case OVS_NAT_ATTR_PROTO_RANDOM:
8721
0
                        nat_action_info.nat_flags |= NAT_RANGE_RANDOM;
8722
0
                        break;
8723
0
                    case OVS_NAT_ATTR_PERSISTENT:
8724
0
                        nat_action_info.nat_flags |= NAT_PERSISTENT;
8725
0
                        break;
8726
0
                    case OVS_NAT_ATTR_PROTO_HASH:
8727
0
                        break;
8728
0
                    case OVS_NAT_ATTR_UNSPEC:
8729
0
                    case __OVS_NAT_ATTR_MAX:
8730
0
                        OVS_NOT_REACHED();
8731
0
                    }
8732
0
                }
8733
8734
0
                if (ip_min_specified && !ip_max_specified) {
8735
0
                    nat_action_info.max_addr = nat_action_info.min_addr;
8736
0
                }
8737
0
                if (proto_num_min_specified && !proto_num_max_specified) {
8738
0
                    nat_action_info.max_port = nat_action_info.min_port;
8739
0
                }
8740
0
                if (proto_num_min_specified || proto_num_max_specified) {
8741
0
                    if (nat_action_info.nat_action & NAT_ACTION_SRC) {
8742
0
                        nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
8743
0
                    } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
8744
0
                        nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
8745
0
                    }
8746
0
                }
8747
0
                break;
8748
0
            }
8749
0
            case OVS_CT_ATTR_UNSPEC:
8750
0
            case __OVS_CT_ATTR_MAX:
8751
0
                OVS_NOT_REACHED();
8752
0
            }
8753
0
        }
8754
8755
        /* We won't be able to function properly in this case, hence
8756
         * complain loudly. */
8757
0
        if (nat_config && !commit) {
8758
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
8759
0
            VLOG_WARN_RL(&rl, "NAT specified without commit.");
8760
0
        }
8761
8762
0
        conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force,
8763
0
                          commit, zone, setmark, setlabel, helper,
8764
0
                          nat_action_info_ref, pmd->ctx.now / 1000, tp_id);
8765
0
        break;
8766
0
    }
8767
8768
0
    case OVS_ACTION_ATTR_METER:
8769
0
        dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a),
8770
0
                            pmd->ctx.now / 1000);
8771
0
        break;
8772
8773
0
    case OVS_ACTION_ATTR_PUSH_VLAN:
8774
0
    case OVS_ACTION_ATTR_POP_VLAN:
8775
0
    case OVS_ACTION_ATTR_PUSH_MPLS:
8776
0
    case OVS_ACTION_ATTR_POP_MPLS:
8777
0
    case OVS_ACTION_ATTR_SET:
8778
0
    case OVS_ACTION_ATTR_SET_MASKED:
8779
0
    case OVS_ACTION_ATTR_SAMPLE:
8780
0
    case OVS_ACTION_ATTR_HASH:
8781
0
    case OVS_ACTION_ATTR_UNSPEC:
8782
0
    case OVS_ACTION_ATTR_TRUNC:
8783
0
    case OVS_ACTION_ATTR_PUSH_ETH:
8784
0
    case OVS_ACTION_ATTR_POP_ETH:
8785
0
    case OVS_ACTION_ATTR_CLONE:
8786
0
    case OVS_ACTION_ATTR_PUSH_NSH:
8787
0
    case OVS_ACTION_ATTR_POP_NSH:
8788
0
    case OVS_ACTION_ATTR_CT_CLEAR:
8789
0
    case OVS_ACTION_ATTR_CHECK_PKT_LEN:
8790
0
    case OVS_ACTION_ATTR_DROP:
8791
0
    case OVS_ACTION_ATTR_ADD_MPLS:
8792
0
    case OVS_ACTION_ATTR_DEC_TTL:
8793
0
    case OVS_ACTION_ATTR_PSAMPLE:
8794
0
    case __OVS_ACTION_ATTR_MAX:
8795
0
        OVS_NOT_REACHED();
8796
0
    }
8797
8798
0
    dp_packet_delete_batch(packets_, should_steal);
8799
0
}
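
The OVS_CT_ATTR_NAT handling above normalizes a partially specified range once the nested attributes have been parsed: an address or port minimum with no maximum collapses to a single-value range, and any port range adds the source- or destination-port action bit matching the NAT direction. Below is a minimal standalone sketch of just that defaulting step, with simplified flag names standing in for the NAT_ACTION_* constants and IPv4-only addresses for brevity.

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the NAT_ACTION_* flags used above. */
enum {
    ACT_SRC      = 1 << 0,
    ACT_DST      = 1 << 1,
    ACT_SRC_PORT = 1 << 2,
    ACT_DST_PORT = 1 << 3,
};

struct nat_range {
    unsigned int action;
    uint32_t min_addr, max_addr;    /* IPv4 only, for brevity. */
    uint16_t min_port, max_port;
};

/* Mirrors the defaulting done after the nested-attribute loop: a lone
 * minimum collapses to a single-value range, and any port range tags the
 * action with the matching SRC/DST port bit. */
static void
finish_nat_range(struct nat_range *r, bool ip_min, bool ip_max,
                 bool port_min, bool port_max)
{
    if (ip_min && !ip_max) {
        r->max_addr = r->min_addr;
    }
    if (port_min && !port_max) {
        r->max_port = r->min_port;
    }
    if (port_min || port_max) {
        if (r->action & ACT_SRC) {
            r->action |= ACT_SRC_PORT;
        } else if (r->action & ACT_DST) {
            r->action |= ACT_DST_PORT;
        }
    }
}

int
main(void)
{
    /* SNAT to 10.0.0.1 with only a minimum port given. */
    struct nat_range r = {
        .action = ACT_SRC, .min_addr = 0x0a000001, .min_port = 1024,
    };

    finish_nat_range(&r, true, false, true, false);
    printf("addr 0x%" PRIx32 "-0x%" PRIx32 ", port %u-%u, action 0x%x\n",
           r.min_addr, r.max_addr,
           (unsigned) r.min_port, (unsigned) r.max_port, r.action);
    return 0;
}
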
8800
8801
static void
8802
dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
8803
                          struct dp_packet_batch *packets,
8804
                          bool should_steal, const struct flow *flow,
8805
                          const struct nlattr *actions, size_t actions_len)
8806
0
{
8807
0
    struct dp_netdev_execute_aux aux = { pmd, flow };
8808
8809
0
    odp_execute_actions(&aux, packets, should_steal, actions,
8810
0
                        actions_len, dp_execute_cb);
8811
0
}
8812
8813
struct dp_netdev_ct_dump {
8814
    struct ct_dpif_dump_state up;
8815
    struct conntrack_dump dump;
8816
    struct conntrack *ct;
8817
    struct dp_netdev *dp;
8818
};
8819
8820
static int
8821
dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
8822
                          const uint16_t *pzone, int *ptot_bkts)
8823
0
{
8824
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8825
0
    struct dp_netdev_ct_dump *dump;
8826
8827
0
    dump = xzalloc(sizeof *dump);
8828
0
    dump->dp = dp;
8829
0
    dump->ct = dp->conntrack;
8830
8831
0
    conntrack_dump_start(dp->conntrack, &dump->dump, pzone, ptot_bkts);
8832
8833
0
    *dump_ = &dump->up;
8834
8835
0
    return 0;
8836
0
}
8837
8838
static int
8839
dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
8840
                         struct ct_dpif_dump_state *dump_,
8841
                         struct ct_dpif_entry *entry)
8842
0
{
8843
0
    struct dp_netdev_ct_dump *dump;
8844
8845
0
    INIT_CONTAINER(dump, dump_, up);
8846
8847
0
    return conntrack_dump_next(&dump->dump, entry);
8848
0
}
8849
8850
static int
8851
dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
8852
                         struct ct_dpif_dump_state *dump_)
8853
0
{
8854
0
    struct dp_netdev_ct_dump *dump;
8855
0
    int err;
8856
8857
0
    INIT_CONTAINER(dump, dump_, up);
8858
8859
0
    err = conntrack_dump_done(&dump->dump);
8860
8861
0
    free(dump);
8862
8863
0
    return err;
8864
0
}
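
The conntrack dump above follows the usual three-call dpif pattern: start allocates a provider-private state that embeds the generic ct_dpif_dump_state as its 'up' member, next recovers the wrapper from that member (INIT_CONTAINER) and fetches one entry, and done tears it down. Below is a minimal standalone sketch of the same embed-and-recover pattern, with hypothetical types and the container recovery written out with offsetof instead of OVS's INIT_CONTAINER macro.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/* Generic state the caller sees, like struct ct_dpif_dump_state. */
struct dump_state {
    int unused;
};

/* Provider-private wrapper, like struct dp_netdev_ct_dump. */
struct my_dump {
    struct dump_state up;       /* Generic part handed back to the caller. */
    int next_entry;
    int n_entries;
};

/* Recover the wrapper from its embedded member, like INIT_CONTAINER. */
static struct my_dump *
my_dump_cast(struct dump_state *up)
{
    return (struct my_dump *) ((char *) up - offsetof(struct my_dump, up));
}

static int
dump_start(struct dump_state **state_)
{
    struct my_dump *dump = calloc(1, sizeof *dump);

    if (!dump) {
        return 1;
    }
    dump->n_entries = 3;
    *state_ = &dump->up;        /* Only the generic part escapes. */
    return 0;
}

static int
dump_next(struct dump_state *state_, int *entry)
{
    struct my_dump *dump = my_dump_cast(state_);

    if (dump->next_entry >= dump->n_entries) {
        return 1;               /* No more entries. */
    }
    *entry = dump->next_entry++;
    return 0;
}

static void
dump_done(struct dump_state *state_)
{
    free(my_dump_cast(state_));
}

int
main(void)
{
    struct dump_state *state;
    int entry;

    if (dump_start(&state)) {
        return 1;
    }
    while (!dump_next(state, &entry)) {
        printf("entry %d\n", entry);
    }
    dump_done(state);
    return 0;
}
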
8865
8866
static int
8867
dpif_netdev_ct_exp_dump_start(struct dpif *dpif,
8868
                              struct ct_dpif_dump_state **dump_,
8869
                              const uint16_t *pzone)
8870
0
{
8871
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8872
0
    struct dp_netdev_ct_dump *dump;
8873
8874
0
    dump = xzalloc(sizeof *dump);
8875
0
    dump->dp = dp;
8876
0
    dump->ct = dp->conntrack;
8877
8878
0
    conntrack_exp_dump_start(dp->conntrack, &dump->dump, pzone);
8879
8880
0
    *dump_ = &dump->up;
8881
8882
0
    return 0;
8883
0
}
8884
8885
static int
8886
dpif_netdev_ct_exp_dump_next(struct dpif *dpif OVS_UNUSED,
8887
                             struct ct_dpif_dump_state *dump_,
8888
                             struct ct_dpif_exp *entry)
8889
0
{
8890
0
    struct dp_netdev_ct_dump *dump;
8891
8892
0
    INIT_CONTAINER(dump, dump_, up);
8893
8894
0
    return conntrack_exp_dump_next(&dump->dump, entry);
8895
0
}
8896
8897
static int
8898
dpif_netdev_ct_exp_dump_done(struct dpif *dpif OVS_UNUSED,
8899
                             struct ct_dpif_dump_state *dump_)
8900
0
{
8901
0
    struct dp_netdev_ct_dump *dump;
8902
0
    int err;
8903
8904
0
    INIT_CONTAINER(dump, dump_, up);
8905
8906
0
    err = conntrack_exp_dump_done(&dump->dump);
8907
8908
0
    free(dump);
8909
8910
0
    return err;
8911
0
}
8912
8913
static int
8914
dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
8915
                     const struct ct_dpif_tuple *tuple)
8916
0
{
8917
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8918
8919
0
    if (tuple) {
8920
0
        return conntrack_flush_tuple(dp->conntrack, tuple, zone ? *zone : 0);
8921
0
    }
8922
0
    return conntrack_flush(dp->conntrack, zone);
8923
0
}
8924
8925
static int
8926
dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns)
8927
0
{
8928
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8929
8930
0
    return conntrack_set_maxconns(dp->conntrack, maxconns);
8931
0
}
8932
8933
static int
8934
dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns)
8935
0
{
8936
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8937
8938
0
    return conntrack_get_maxconns(dp->conntrack, maxconns);
8939
0
}
8940
8941
static int
8942
dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
8943
0
{
8944
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8945
8946
0
    return conntrack_get_nconns(dp->conntrack, nconns);
8947
0
}
8948
8949
static int
8950
dpif_netdev_ct_set_tcp_seq_chk(struct dpif *dpif, bool enabled)
8951
0
{
8952
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8953
8954
0
    return conntrack_set_tcp_seq_chk(dp->conntrack, enabled);
8955
0
}
8956
8957
static int
8958
dpif_netdev_ct_get_tcp_seq_chk(struct dpif *dpif, bool *enabled)
8959
0
{
8960
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8961
0
    *enabled = conntrack_get_tcp_seq_chk(dp->conntrack);
8962
0
    return 0;
8963
0
}
8964
8965
static int
8966
dpif_netdev_ct_set_sweep_interval(struct dpif *dpif, uint32_t ms)
8967
0
{
8968
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8969
0
    return conntrack_set_sweep_interval(dp->conntrack, ms);
8970
0
}
8971
8972
static int
8973
dpif_netdev_ct_get_sweep_interval(struct dpif *dpif, uint32_t *ms)
8974
0
{
8975
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8976
0
    *ms = conntrack_get_sweep_interval(dp->conntrack);
8977
0
    return 0;
8978
0
}
8979
8980
static int
8981
dpif_netdev_ct_set_limits(struct dpif *dpif,
8982
                           const struct ovs_list *zone_limits)
8983
0
{
8984
0
    int err = 0;
8985
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8986
8987
0
    struct ct_dpif_zone_limit *zone_limit;
8988
0
    LIST_FOR_EACH (zone_limit, node, zone_limits) {
8989
0
        err = zone_limit_update(dp->conntrack, zone_limit->zone,
8990
0
                                zone_limit->limit);
8991
0
        if (err != 0) {
8992
0
            break;
8993
0
        }
8994
0
    }
8995
0
    return err;
8996
0
}
8997
8998
static int
8999
dpif_netdev_ct_get_limits(struct dpif *dpif,
9000
                           const struct ovs_list *zone_limits_request,
9001
                           struct ovs_list *zone_limits_reply)
9002
0
{
9003
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9004
0
    struct conntrack_zone_info czl;
9005
9006
0
    if (!ovs_list_is_empty(zone_limits_request)) {
9007
0
        struct ct_dpif_zone_limit *zone_limit;
9008
0
        LIST_FOR_EACH (zone_limit, node, zone_limits_request) {
9009
0
            czl = zone_limit_get(dp->conntrack, zone_limit->zone);
9010
0
            if (czl.zone == zone_limit->zone || czl.zone == DEFAULT_ZONE) {
9011
0
                ct_dpif_push_zone_limit(zone_limits_reply, zone_limit->zone,
9012
0
                                        czl.limit,
9013
0
                                        czl.count);
9014
0
            } else {
9015
0
                return EINVAL;
9016
0
            }
9017
0
        }
9018
0
    } else {
9019
0
        czl = zone_limit_get(dp->conntrack, DEFAULT_ZONE);
9020
0
        if (czl.zone == DEFAULT_ZONE) {
9021
0
            ct_dpif_push_zone_limit(zone_limits_reply, DEFAULT_ZONE,
9022
0
                                    czl.limit, 0);
9023
0
        }
9024
9025
0
        for (int z = MIN_ZONE; z <= MAX_ZONE; z++) {
9026
0
            czl = zone_limit_get(dp->conntrack, z);
9027
0
            if (czl.zone == z) {
9028
0
                ct_dpif_push_zone_limit(zone_limits_reply, z, czl.limit,
9029
0
                                        czl.count);
9030
0
            }
9031
0
        }
9032
0
    }
9033
9034
0
    return 0;
9035
0
}
9036
9037
static int
9038
dpif_netdev_ct_del_limits(struct dpif *dpif,
9039
                           const struct ovs_list *zone_limits)
9040
0
{
9041
0
    int err = 0;
9042
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9043
0
    struct ct_dpif_zone_limit *zone_limit;
9044
0
    LIST_FOR_EACH (zone_limit, node, zone_limits) {
9045
0
        err = zone_limit_delete(dp->conntrack, zone_limit->zone);
9046
0
        if (err != 0) {
9047
0
            break;
9048
0
        }
9049
0
    }
9050
9051
0
    return err;
9052
0
}
9053
9054
static int
9055
dpif_netdev_ct_get_features(struct dpif *dpif OVS_UNUSED,
9056
                            enum ct_features *features)
9057
0
{
9058
0
    if (features != NULL) {
9059
0
        *features = CONNTRACK_F_ZERO_SNAT;
9060
0
    }
9061
0
    return 0;
9062
0
}
9063
9064
static int
9065
dpif_netdev_ct_set_timeout_policy(struct dpif *dpif,
9066
                                  const struct ct_dpif_timeout_policy *dpif_tp)
9067
0
{
9068
0
    struct timeout_policy tp;
9069
0
    struct dp_netdev *dp;
9070
9071
0
    dp = get_dp_netdev(dpif);
9072
0
    memcpy(&tp.policy, dpif_tp, sizeof tp.policy);
9073
0
    return timeout_policy_update(dp->conntrack, &tp);
9074
0
}
9075
9076
static int
9077
dpif_netdev_ct_get_timeout_policy(struct dpif *dpif, uint32_t tp_id,
9078
                                  struct ct_dpif_timeout_policy *dpif_tp)
9079
0
{
9080
0
    struct timeout_policy *tp;
9081
0
    struct dp_netdev *dp;
9082
0
    int err = 0;
9083
9084
0
    dp = get_dp_netdev(dpif);
9085
0
    tp = timeout_policy_get(dp->conntrack, tp_id);
9086
0
    if (!tp) {
9087
0
        return ENOENT;
9088
0
    }
9089
0
    memcpy(dpif_tp, &tp->policy, sizeof tp->policy);
9090
0
    return err;
9091
0
}
9092
9093
static int
9094
dpif_netdev_ct_del_timeout_policy(struct dpif *dpif,
9095
                                  uint32_t tp_id)
9096
0
{
9097
0
    struct dp_netdev *dp;
9098
0
    int err = 0;
9099
9100
0
    dp = get_dp_netdev(dpif);
9101
0
    err = timeout_policy_delete(dp->conntrack, tp_id);
9102
0
    return err;
9103
0
}
9104
9105
static int
9106
dpif_netdev_ct_get_timeout_policy_name(struct dpif *dpif OVS_UNUSED,
9107
                                       uint32_t tp_id,
9108
                                       uint16_t dl_type OVS_UNUSED,
9109
                                       uint8_t nw_proto OVS_UNUSED,
9110
                                       char **tp_name, bool *is_generic)
9111
0
{
9112
0
    struct ds ds = DS_EMPTY_INITIALIZER;
9113
9114
0
    ds_put_format(&ds, "%"PRIu32, tp_id);
9115
0
    *tp_name = ds_steal_cstr(&ds);
9116
0
    *is_generic = true;
9117
0
    return 0;
9118
0
}
9119
9120
static int
9121
dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable)
9122
0
{
9123
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9124
0
    return ipf_set_enabled(conntrack_ipf_ctx(dp->conntrack), v6, enable);
9125
0
}
9126
9127
static int
9128
dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag)
9129
0
{
9130
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9131
0
    return ipf_set_min_frag(conntrack_ipf_ctx(dp->conntrack), v6, min_frag);
9132
0
}
9133
9134
static int
9135
dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags)
9136
0
{
9137
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9138
0
    return ipf_set_max_nfrags(conntrack_ipf_ctx(dp->conntrack), max_frags);
9139
0
}
9140
9141
/* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to
9142
 * diverge. */
9143
static int
9144
dpif_netdev_ipf_get_status(struct dpif *dpif,
9145
                           struct dpif_ipf_status *dpif_ipf_status)
9146
0
{
9147
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9148
0
    ipf_get_status(conntrack_ipf_ctx(dp->conntrack),
9149
0
                   (struct ipf_status *) dpif_ipf_status);
9150
0
    return 0;
9151
0
}
9152
9153
static int
9154
dpif_netdev_ipf_dump_start(struct dpif *dpif OVS_UNUSED,
9155
                           struct ipf_dump_ctx **ipf_dump_ctx)
9156
0
{
9157
0
    return ipf_dump_start(ipf_dump_ctx);
9158
0
}
9159
9160
static int
9161
dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump)
9162
0
{
9163
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9164
0
    return ipf_dump_next(conntrack_ipf_ctx(dp->conntrack), ipf_dump_ctx,
9165
0
                         dump);
9166
0
}
9167
9168
static int
9169
dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx)
9170
0
{
9171
0
    return ipf_dump_done(ipf_dump_ctx);
9172
9173
0
}
9174
9175
static int
9176
dpif_netdev_bond_add(struct dpif *dpif, uint32_t bond_id,
9177
                     odp_port_t *member_map)
9178
0
{
9179
0
    struct tx_bond *new_tx = xzalloc(sizeof *new_tx);
9180
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9181
0
    struct dp_netdev_pmd_thread *pmd;
9182
9183
    /* Prepare new bond mapping. */
9184
0
    new_tx->bond_id = bond_id;
9185
0
    for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
9186
0
        new_tx->member_buckets[bucket].member_id = member_map[bucket];
9187
0
    }
9188
9189
0
    ovs_mutex_lock(&dp->bond_mutex);
9190
    /* Check if the bond already exists. */
9191
0
    struct tx_bond *old_tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
9192
0
    if (old_tx) {
9193
0
        cmap_replace(&dp->tx_bonds, &old_tx->node, &new_tx->node,
9194
0
                     hash_bond_id(bond_id));
9195
0
        ovsrcu_postpone(free, old_tx);
9196
0
    } else {
9197
0
        cmap_insert(&dp->tx_bonds, &new_tx->node, hash_bond_id(bond_id));
9198
0
    }
9199
0
    ovs_mutex_unlock(&dp->bond_mutex);
9200
9201
    /* Update all PMDs with new bond mapping. */
9202
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
9203
0
        dp_netdev_add_bond_tx_to_pmd(pmd, new_tx, true);
9204
0
    }
9205
0
    return 0;
9206
0
}
9207
9208
static int
9209
dpif_netdev_bond_del(struct dpif *dpif, uint32_t bond_id)
9210
0
{
9211
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9212
0
    struct dp_netdev_pmd_thread *pmd;
9213
0
    struct tx_bond *tx;
9214
9215
0
    ovs_mutex_lock(&dp->bond_mutex);
9216
    /* Check if the bond exists. */
9217
0
    tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
9218
0
    if (tx) {
9219
0
        cmap_remove(&dp->tx_bonds, &tx->node, hash_bond_id(bond_id));
9220
0
        ovsrcu_postpone(free, tx);
9221
0
    } else {
9222
        /* Bond is not present. */
9223
0
        ovs_mutex_unlock(&dp->bond_mutex);
9224
0
        return ENOENT;
9225
0
    }
9226
0
    ovs_mutex_unlock(&dp->bond_mutex);
9227
9228
    /* Remove the bond mapping from all PMDs. */
9229
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
9230
0
        dp_netdev_del_bond_tx_from_pmd(pmd, bond_id);
9231
0
    }
9232
0
    return 0;
9233
0
}
9234
9235
static int
9236
dpif_netdev_bond_stats_get(struct dpif *dpif, uint32_t bond_id,
9237
                           uint64_t *n_bytes)
9238
0
{
9239
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9240
0
    struct dp_netdev_pmd_thread *pmd;
9241
9242
0
    if (!tx_bond_lookup(&dp->tx_bonds, bond_id)) {
9243
0
        return ENOENT;
9244
0
    }
9245
9246
    /* Search for the bond in all PMDs. */
9247
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
9248
0
        struct tx_bond *pmd_bond_entry
9249
0
            = tx_bond_lookup(&pmd->tx_bonds, bond_id);
9250
9251
0
        if (!pmd_bond_entry) {
9252
0
            continue;
9253
0
        }
9254
9255
        /* Read bond stats. */
9256
0
        for (int i = 0; i < BOND_BUCKETS; i++) {
9257
0
            uint64_t pmd_n_bytes;
9258
9259
0
            atomic_read_relaxed(&pmd_bond_entry->member_buckets[i].n_bytes,
9260
0
                                &pmd_n_bytes);
9261
0
            n_bytes[i] += pmd_n_bytes;
9262
0
        }
9263
0
    }
9264
0
    return 0;
9265
0
}
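
dpif_netdev_bond_stats_get() above accumulates, per hash bucket, the byte counters that each PMD thread keeps for its private copy of the bond. Below is a minimal standalone sketch of that aggregation, with plain arrays standing in for the cmap iteration, the relaxed atomics, and BOND_BUCKETS.

#include <stdint.h>
#include <stdio.h>

#define N_BUCKETS 4      /* Stand-in for BOND_BUCKETS. */
#define N_PMDS    2

int
main(void)
{
    /* Per-PMD, per-bucket byte counters (relaxed atomics in OVS). */
    uint64_t per_pmd[N_PMDS][N_BUCKETS] = {
        { 100, 0, 250, 50 },
        {  25, 75,  0, 10 },
    };
    uint64_t n_bytes[N_BUCKETS] = { 0 };

    /* Mirror the CMAP_FOR_EACH (pmd, ...) loop: add each PMD's counters
     * into the caller-provided per-bucket totals. */
    for (int pmd = 0; pmd < N_PMDS; pmd++) {
        for (int i = 0; i < N_BUCKETS; i++) {
            n_bytes[i] += per_pmd[pmd][i];
        }
    }
    for (int i = 0; i < N_BUCKETS; i++) {
        printf("bucket %d: %llu bytes\n", i, (unsigned long long) n_bytes[i]);
    }
    return 0;
}
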
9266
9267
const struct dpif_class dpif_netdev_class = {
9268
    "netdev",
9269
    true,                       /* cleanup_required */
9270
    dpif_netdev_init,
9271
    dpif_netdev_enumerate,
9272
    dpif_netdev_port_open_type,
9273
    dpif_netdev_open,
9274
    dpif_netdev_close,
9275
    dpif_netdev_destroy,
9276
    dpif_netdev_run,
9277
    dpif_netdev_wait,
9278
    dpif_netdev_get_stats,
9279
    NULL,                      /* set_features */
9280
    NULL,                      /* get_features */
9281
    dpif_netdev_port_add,
9282
    dpif_netdev_port_del,
9283
    dpif_netdev_port_set_config,
9284
    dpif_netdev_port_query_by_number,
9285
    dpif_netdev_port_query_by_name,
9286
    NULL,                       /* port_get_pid */
9287
    dpif_netdev_port_dump_start,
9288
    dpif_netdev_port_dump_next,
9289
    dpif_netdev_port_dump_done,
9290
    dpif_netdev_port_poll,
9291
    dpif_netdev_port_poll_wait,
9292
    dpif_netdev_flow_flush,
9293
    dpif_netdev_flow_dump_create,
9294
    dpif_netdev_flow_dump_destroy,
9295
    dpif_netdev_flow_dump_thread_create,
9296
    dpif_netdev_flow_dump_thread_destroy,
9297
    dpif_netdev_flow_dump_next,
9298
    dpif_netdev_operate,
9299
    NULL,                       /* recv_set */
9300
    NULL,                       /* handlers_set */
9301
    dpif_netdev_number_handlers_required,
9302
    dpif_netdev_set_config,
9303
    dpif_netdev_queue_to_priority,
9304
    NULL,                       /* recv */
9305
    NULL,                       /* recv_wait */
9306
    NULL,                       /* recv_purge */
9307
    dpif_netdev_register_dp_purge_cb,
9308
    dpif_netdev_register_upcall_cb,
9309
    dpif_netdev_enable_upcall,
9310
    dpif_netdev_disable_upcall,
9311
    dpif_netdev_get_datapath_version,
9312
    dpif_netdev_ct_dump_start,
9313
    dpif_netdev_ct_dump_next,
9314
    dpif_netdev_ct_dump_done,
9315
    dpif_netdev_ct_exp_dump_start,
9316
    dpif_netdev_ct_exp_dump_next,
9317
    dpif_netdev_ct_exp_dump_done,
9318
    dpif_netdev_ct_flush,
9319
    dpif_netdev_ct_set_maxconns,
9320
    dpif_netdev_ct_get_maxconns,
9321
    dpif_netdev_ct_get_nconns,
9322
    dpif_netdev_ct_set_tcp_seq_chk,
9323
    dpif_netdev_ct_get_tcp_seq_chk,
9324
    dpif_netdev_ct_set_sweep_interval,
9325
    dpif_netdev_ct_get_sweep_interval,
9326
    dpif_netdev_ct_set_limits,
9327
    dpif_netdev_ct_get_limits,
9328
    dpif_netdev_ct_del_limits,
9329
    dpif_netdev_ct_set_timeout_policy,
9330
    dpif_netdev_ct_get_timeout_policy,
9331
    dpif_netdev_ct_del_timeout_policy,
9332
    NULL,                       /* ct_timeout_policy_dump_start */
9333
    NULL,                       /* ct_timeout_policy_dump_next */
9334
    NULL,                       /* ct_timeout_policy_dump_done */
9335
    dpif_netdev_ct_get_timeout_policy_name,
9336
    dpif_netdev_ct_get_features,
9337
    dpif_netdev_ipf_set_enabled,
9338
    dpif_netdev_ipf_set_min_frag,
9339
    dpif_netdev_ipf_set_max_nfrags,
9340
    dpif_netdev_ipf_get_status,
9341
    dpif_netdev_ipf_dump_start,
9342
    dpif_netdev_ipf_dump_next,
9343
    dpif_netdev_ipf_dump_done,
9344
    dpif_netdev_meter_get_features,
9345
    dpif_netdev_meter_set,
9346
    dpif_netdev_meter_get,
9347
    dpif_netdev_meter_del,
9348
    dpif_netdev_bond_add,
9349
    dpif_netdev_bond_del,
9350
    dpif_netdev_bond_stats_get,
9351
    NULL,                       /* cache_get_supported_levels */
9352
    NULL,                       /* cache_get_name */
9353
    NULL,                       /* cache_get_size */
9354
    NULL,                       /* cache_set_size */
9355
};
9356
9357
static void
9358
dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
9359
                              const char *argv[], void *aux OVS_UNUSED)
9360
0
{
9361
0
    struct dp_netdev_port *port;
9362
0
    struct dp_netdev *dp;
9363
0
    odp_port_t port_no;
9364
9365
0
    ovs_mutex_lock(&dp_netdev_mutex);
9366
0
    dp = shash_find_data(&dp_netdevs, argv[1]);
9367
0
    if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
9368
0
        ovs_mutex_unlock(&dp_netdev_mutex);
9369
0
        unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
9370
0
        return;
9371
0
    }
9372
0
    ovs_refcount_ref(&dp->ref_cnt);
9373
0
    ovs_mutex_unlock(&dp_netdev_mutex);
9374
9375
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
9376
0
    if (get_port_by_name(dp, argv[2], &port)) {
9377
0
        unixctl_command_reply_error(conn, "unknown port");
9378
0
        goto exit;
9379
0
    }
9380
9381
0
    port_no = u32_to_odp(atoi(argv[3]));
9382
0
    if (!port_no || port_no == ODPP_NONE) {
9383
0
        unixctl_command_reply_error(conn, "bad port number");
9384
0
        goto exit;
9385
0
    }
9386
0
    if (dp_netdev_lookup_port(dp, port_no)) {
9387
0
        unixctl_command_reply_error(conn, "port number already in use");
9388
0
        goto exit;
9389
0
    }
9390
9391
    /* Remove port. */
9392
0
    hmap_remove(&dp->ports, &port->node);
9393
0
    reconfigure_datapath(dp);
9394
9395
    /* Reinsert with new port number. */
9396
0
    port->port_no = port_no;
9397
0
    hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
9398
0
    reconfigure_datapath(dp);
9399
9400
0
    seq_change(dp->port_seq);
9401
0
    unixctl_command_reply(conn, NULL);
9402
9403
0
exit:
9404
0
    ovs_rwlock_unlock(&dp->port_rwlock);
9405
0
    dp_netdev_unref(dp);
9406
0
}
9407
9408
static void
9409
dpif_dummy_register__(const char *type)
9410
0
{
9411
0
    struct dpif_class *class;
9412
9413
0
    class = xmalloc(sizeof *class);
9414
0
    *class = dpif_netdev_class;
9415
0
    class->type = xstrdup(type);
9416
0
    dp_register_provider(class);
9417
0
}
9418
9419
static void
9420
dpif_dummy_override(const char *type)
9421
0
{
9422
0
    int error;
9423
9424
    /*
9425
     * Ignore EAFNOSUPPORT to allow --enable-dummy=system with
9426
     * a userland-only build.  It's useful for the testsuite.
9427
     */
9428
0
    error = dp_unregister_provider(type);
9429
0
    if (error == 0 || error == EAFNOSUPPORT) {
9430
0
        dpif_dummy_register__(type);
9431
0
    }
9432
0
}
9433
9434
void
9435
dpif_dummy_register(enum dummy_level level)
9436
0
{
9437
0
    if (level == DUMMY_OVERRIDE_ALL) {
9438
0
        struct sset types;
9439
0
        const char *type;
9440
9441
0
        sset_init(&types);
9442
0
        dp_enumerate_types(&types);
9443
0
        SSET_FOR_EACH (type, &types) {
9444
0
            dpif_dummy_override(type);
9445
0
        }
9446
0
        sset_destroy(&types);
9447
0
    } else if (level == DUMMY_OVERRIDE_SYSTEM) {
9448
0
        dpif_dummy_override("system");
9449
0
    }
9450
9451
0
    dpif_dummy_register__("dummy");
9452
9453
0
    unixctl_command_register("dpif-dummy/change-port-number",
9454
0
                             "dp port new-number",
9455
0
                             3, 3, dpif_dummy_change_port_number, NULL);
9456
0
}
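
The command registered above takes exactly three arguments (datapath name, port name, new port number). With dummy datapaths enabled it would typically be driven through the unixctl front end, e.g. something like "ovs-appctl dpif-dummy/change-port-number <dp> <port> <new-number>"; the exact invocation depends on how the daemon is targeted.
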
9457

9458
/* Datapath Classifier. */
9459
9460
static void
9461
dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable)
9462
0
{
9463
0
    cmap_destroy(&subtable->rules);
9464
0
    ovsrcu_postpone(free, subtable->mf_masks);
9465
0
    ovsrcu_postpone(free, subtable);
9466
0
}
9467
9468
/* Initializes 'cls' as a classifier that initially contains no classification
9469
 * rules. */
9470
static void
9471
dpcls_init(struct dpcls *cls)
9472
0
{
9473
0
    cmap_init(&cls->subtables_map);
9474
0
    pvector_init(&cls->subtables);
9475
0
}
9476
9477
static void
9478
dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
9479
0
{
9480
0
    VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
9481
0
    pvector_remove(&cls->subtables, subtable);
9482
0
    cmap_remove(&cls->subtables_map, &subtable->cmap_node,
9483
0
                subtable->mask.hash);
9484
0
    dpcls_info_dec_usage(subtable->lookup_func_info);
9485
0
    ovsrcu_postpone(dpcls_subtable_destroy_cb, subtable);
9486
0
}
9487
9488
/* Destroys 'cls'.  Rules within 'cls', if any, are not freed; this is the
9489
 * caller's responsibility.
9490
 * May only be called after all the readers have been terminated. */
9491
static void
9492
dpcls_destroy(struct dpcls *cls)
9493
0
{
9494
0
    if (cls) {
9495
0
        struct dpcls_subtable *subtable;
9496
9497
0
        CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
9498
0
            ovs_assert(cmap_count(&subtable->rules) == 0);
9499
0
            dpcls_destroy_subtable(cls, subtable);
9500
0
        }
9501
0
        cmap_destroy(&cls->subtables_map);
9502
0
        pvector_destroy(&cls->subtables);
9503
0
    }
9504
0
}
9505
9506
static struct dpcls_subtable *
9507
dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
9508
0
{
9509
0
    struct dpcls_subtable *subtable;
9510
9511
    /* Need to add one. */
9512
0
    subtable = xmalloc(sizeof *subtable
9513
0
                       - sizeof subtable->mask.mf + mask->len);
9514
0
    cmap_init(&subtable->rules);
9515
0
    subtable->hit_cnt = 0;
9516
0
    netdev_flow_key_clone(&subtable->mask, mask);
9517
9518
    /* The count of bits in the mask defines the space required for masks.
9519
     * Then call gen_masks() to create the appropriate masks, avoiding the cost
9520
     * of doing runtime calculations. */
9521
0
    uint32_t unit0 = count_1bits(mask->mf.map.bits[0]);
9522
0
    uint32_t unit1 = count_1bits(mask->mf.map.bits[1]);
9523
0
    subtable->mf_bits_set_unit0 = unit0;
9524
0
    subtable->mf_bits_set_unit1 = unit1;
9525
0
    subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1));
9526
0
    dpcls_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);
9527
9528
    /* Get the preferred subtable search function for this (u0,u1) subtable.
9529
     * The function is guaranteed to always return a valid implementation, and
9530
     * possibly an ISA-optimized and/or specialized implementation. Initialize
9531
     * the subtable search function atomically to avoid garbage data being read
9532
     * by the PMD thread.
9533
     */
9534
0
    atomic_init(&subtable->lookup_func,
9535
0
                dpcls_subtable_get_best_impl(unit0, unit1,
9536
0
                                             &subtable->lookup_func_info));
9537
0
    dpcls_info_inc_usage(subtable->lookup_func_info);
9538
9539
0
    cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
9540
    /* Add the new subtable at the end of the pvector (with no hits yet) */
9541
0
    pvector_insert(&cls->subtables, subtable, 0);
9542
0
    VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
9543
0
             cmap_count(&cls->subtables_map), subtable, cls->in_port);
9544
0
    pvector_publish(&cls->subtables);
9545
9546
0
    return subtable;
9547
0
}
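
dpcls_create_subtable() above trims its allocation to the miniflow data actually present in the mask: the full struct size, minus the placeholder mask.mf member, plus mask->len bytes of real data. Below is a minimal standalone sketch of the same trailing-member truncation trick, with hypothetical types in place of the OVS structs.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-ins for miniflow / netdev_flow_key / dpcls_subtable. */
struct mini {
    size_t n_values;
    unsigned long long values[8];   /* Worst-case placeholder. */
};

struct key {
    unsigned int hash;
    unsigned int len;               /* Bytes of 'mf' actually in use. */
    struct mini mf;                 /* Must be the last member. */
};

struct table {
    int hit_cnt;
    struct key mask;                /* Must be the last member. */
};

int
main(void)
{
    struct key src = {
        .hash = 0x1234,
        .len = sizeof(size_t) + 2 * sizeof(unsigned long long),
        .mf = { .n_values = 2, .values = { 0xff, 0x0f } },
    };
    /* Allocate the struct minus the worst-case placeholder, plus only the
     * bytes this key really uses, as dpcls_create_subtable() does. */
    size_t alloc = sizeof(struct table) - sizeof src.mf + src.len;
    struct table *t = malloc(alloc);

    if (!t) {
        return 1;
    }
    t->hit_cnt = 0;
    /* Copy just the used prefix of the key, like netdev_flow_key_clone(). */
    memcpy(&t->mask, &src, offsetof(struct key, mf) + src.len);

    printf("allocated %zu bytes instead of %zu\n", alloc, sizeof *t);
    free(t);
    return 0;
}
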
9548
9549
static inline struct dpcls_subtable *
9550
dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
9551
0
{
9552
0
    struct dpcls_subtable *subtable;
9553
9554
0
    CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
9555
0
                             &cls->subtables_map) {
9556
0
        if (netdev_flow_key_equal(&subtable->mask, mask)) {
9557
0
            return subtable;
9558
0
        }
9559
0
    }
9560
0
    return dpcls_create_subtable(cls, mask);
9561
0
}
9562
9563
/* Checks for the best available implementation for each subtable lookup
9564
 * function, and assigns it as the lookup function pointer for each subtable.
9565
 * Returns the number of subtables that have changed lookup implementation.
9566
 * This function requires holding a flow_mutex when called. This is to make
9567
 * sure modifications done by this function are not overwritten. This could
9568
 * happen if dpcls_sort_subtable_vector() is called at the same time as this
9569
 * function.
9570
 */
9571
static uint32_t
9572
dpcls_subtable_lookup_reprobe(struct dpcls *cls)
9573
0
{
9574
0
    struct pvector *pvec = &cls->subtables;
9575
0
    uint32_t subtables_changed = 0;
9576
0
    struct dpcls_subtable *subtable = NULL;
9577
9578
0
    PVECTOR_FOR_EACH (subtable, pvec) {
9579
0
        uint32_t u0_bits = subtable->mf_bits_set_unit0;
9580
0
        uint32_t u1_bits = subtable->mf_bits_set_unit1;
9581
0
        void *old_func = subtable->lookup_func;
9582
0
        struct dpcls_subtable_lookup_info_t *old_info;
9583
0
        old_info = subtable->lookup_func_info;
9584
        /* Set the subtable lookup function atomically to avoid garbage data
9585
         * being read by the PMD thread. */
9586
0
        atomic_store_relaxed(&subtable->lookup_func,
9587
0
                dpcls_subtable_get_best_impl(u0_bits, u1_bits,
9588
0
                                             &subtable->lookup_func_info));
9589
0
        if (old_func != subtable->lookup_func) {
9590
0
            subtables_changed += 1;
9591
0
        }
9592
9593
0
        if (old_info != subtable->lookup_func_info) {
9594
            /* In theory, functions can be shared between implementations, so
9595
             * do an explicit check on the function info structures. */
9596
0
            dpcls_info_dec_usage(old_info);
9597
0
            dpcls_info_inc_usage(subtable->lookup_func_info);
9598
0
        }
9599
0
    }
9600
9601
0
    return subtables_changed;
9602
0
}
9603
9604
/* Periodically sort the dpcls subtable vectors according to hit counts. */
9605
static void
9606
dpcls_sort_subtable_vector(struct dpcls *cls)
9607
0
{
9608
0
    struct pvector *pvec = &cls->subtables;
9609
0
    struct dpcls_subtable *subtable;
9610
9611
0
    PVECTOR_FOR_EACH (subtable, pvec) {
9612
0
        pvector_change_priority(pvec, subtable, subtable->hit_cnt);
9613
0
        subtable->hit_cnt = 0;
9614
0
    }
9615
0
    pvector_publish(pvec);
9616
0
}
9617
9618
static inline void
9619
dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
9620
                           struct polled_queue *poll_list, int poll_cnt)
9621
0
{
9622
0
    struct dpcls *cls;
9623
0
    uint64_t tot_idle = 0, tot_proc = 0, tot_sleep = 0;
9624
0
    unsigned int pmd_load = 0;
9625
9626
0
    if (pmd->ctx.now > pmd->next_cycle_store) {
9627
0
        uint64_t curr_tsc;
9628
0
        uint8_t rebalance_load_trigger;
9629
0
        struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb;
9630
0
        unsigned int idx;
9631
9632
0
        if (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
9633
0
                pmd->prev_stats[PMD_CYCLES_ITER_IDLE] &&
9634
0
            pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
9635
0
                pmd->prev_stats[PMD_CYCLES_ITER_BUSY]) {
9636
0
            tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
9637
0
                       pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
9638
0
            tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
9639
0
                       pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
9640
0
            tot_sleep = pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP] -
9641
0
                        pmd->prev_stats[PMD_CYCLES_SLEEP];
9642
9643
0
            if (pmd_alb->is_enabled && !pmd->isolated) {
9644
0
                if (tot_proc) {
9645
0
                    pmd_load = ((tot_proc * 100) /
9646
0
                                    (tot_idle + tot_proc + tot_sleep));
9647
0
                }
9648
9649
0
                atomic_read_relaxed(&pmd_alb->rebalance_load_thresh,
9650
0
                                    &rebalance_load_trigger);
9651
0
                if (pmd_load >= rebalance_load_trigger) {
9652
0
                    atomic_count_inc(&pmd->pmd_overloaded);
9653
0
                } else {
9654
0
                    atomic_count_set(&pmd->pmd_overloaded, 0);
9655
0
                }
9656
0
            }
9657
0
        }
9658
9659
0
        pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
9660
0
                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
9661
0
        pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
9662
0
                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
9663
0
        pmd->prev_stats[PMD_CYCLES_SLEEP] =
9664
0
                        pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP];
9665
9666
        /* Get the cycles that were used to process each queue and store them. */
9667
0
        for (unsigned i = 0; i < poll_cnt; i++) {
9668
0
            uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
9669
0
                                                        RXQ_CYCLES_PROC_CURR);
9670
0
            dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
9671
0
            dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
9672
0
                                     0);
9673
0
        }
9674
0
        curr_tsc = cycles_counter_update(&pmd->perf_stats);
9675
0
        if (pmd->intrvl_tsc_prev) {
9676
            /* There is a prev timestamp, store a new intrvl cycle count. */
9677
0
            atomic_store_relaxed(&pmd->intrvl_cycles,
9678
0
                                 curr_tsc - pmd->intrvl_tsc_prev);
9679
0
        }
9680
0
        idx = atomic_count_inc(&pmd->intrvl_idx) % PMD_INTERVAL_MAX;
9681
0
        atomic_store_relaxed(&pmd->busy_cycles_intrvl[idx], tot_proc);
9682
0
        pmd->intrvl_tsc_prev = curr_tsc;
9683
        /* Start new measuring interval */
9684
0
        pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN;
9685
0
    }
9686
9687
0
    if (pmd->ctx.now > pmd->next_optimization) {
9688
        /* Try to obtain the flow lock to block out revalidator threads.
9689
         * If not possible, just try next time. */
9690
0
        if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
9691
            /* Optimize each classifier */
9692
0
            CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
9693
0
                dpcls_sort_subtable_vector(cls);
9694
0
            }
9695
0
            ovs_mutex_unlock(&pmd->flow_mutex);
9696
            /* Start new measuring interval */
9697
0
            pmd->next_optimization = pmd->ctx.now
9698
0
                                     + DPCLS_OPTIMIZATION_INTERVAL;
9699
0
        }
9700
0
    }
9701
0
}
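
The auto load balance check above boils the interval's cycle counters down to one busy percentage, pmd_load = tot_proc * 100 / (tot_idle + tot_proc + tot_sleep), and compares it with the configured trigger. Below is a tiny standalone sketch of that arithmetic; the counter values and the threshold are purely illustrative.

#include <stdint.h>
#include <stdio.h>

/* The busy percentage a PMD reports for one interval, mirroring the
 * pmd_load computation in dp_netdev_pmd_try_optimize(). */
static unsigned int
pmd_busy_percent(uint64_t tot_idle, uint64_t tot_proc, uint64_t tot_sleep)
{
    if (!tot_proc) {
        return 0;
    }
    return (tot_proc * 100) / (tot_idle + tot_proc + tot_sleep);
}

int
main(void)
{
    /* Illustrative cycle counts for one measurement interval. */
    uint64_t idle = 2000000, busy = 7500000, slept = 500000;
    unsigned int rebalance_load_trigger = 95;   /* Example threshold only. */
    unsigned int load = pmd_busy_percent(idle, busy, slept);

    printf("pmd load %u%% -> %s\n", load,
           load >= rebalance_load_trigger ? "count as overloaded" : "ok");
    return 0;
}
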
9702
9703
/* Returns the sum of a specified number of newest to
9704
 * oldest interval values. 'cur_idx' is where the next
9705
 * write will go, so wrap-around needs to be handled.
9706
 */
9707
static uint64_t
9708
get_interval_values(atomic_ullong *source, atomic_count *cur_idx,
9709
0
                    int num_to_read) {
9710
0
    unsigned int i;
9711
0
    uint64_t total = 0;
9712
9713
0
    i = atomic_count_get(cur_idx) % PMD_INTERVAL_MAX;
9714
0
    for (int read = 0; read < num_to_read; read++) {
9715
0
        uint64_t interval_value;
9716
9717
0
        i = i ? i - 1 : PMD_INTERVAL_MAX - 1;
9718
0
        atomic_read_relaxed(&source[i], &interval_value);
9719
0
        total += interval_value;
9720
0
    }
9721
0
    return total;
9722
0
}
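
get_interval_values() walks a fixed-size ring of per-interval counters backwards from the most recent entry, wrapping at index 0. Below is a standalone sketch with plain arrays instead of the atomic types; a local constant stands in for PMD_INTERVAL_MAX.

#include <stdint.h>
#include <stdio.h>

#define N_INTERVALS 6   /* Stand-in for PMD_INTERVAL_MAX. */

/* Sums the 'num_to_read' newest ring entries, where 'cur_idx' is the slot
 * the *next* write would use, so the newest value sits just before it and
 * the walk has to wrap at index 0, as in get_interval_values(). */
static uint64_t
sum_newest(const uint64_t ring[N_INTERVALS], unsigned int cur_idx,
           int num_to_read)
{
    unsigned int i = cur_idx % N_INTERVALS;
    uint64_t total = 0;

    for (int read = 0; read < num_to_read; read++) {
        i = i ? i - 1 : N_INTERVALS - 1;
        total += ring[i];
    }
    return total;
}

int
main(void)
{
    /* With cur_idx == 2 the newest value is ring[1], then ring[0], then
     * ring[5], and so on backwards; ring[2] is the oldest. */
    uint64_t ring[N_INTERVALS] = { 40, 50, 10, 20, 30, 35 };

    printf("sum of 3 newest: %llu\n",   /* 50 + 40 + 35 = 125. */
           (unsigned long long) sum_newest(ring, 2, 3));
    return 0;
}
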
9723
9724
/* Insert 'rule' into 'cls'. */
9725
static void
9726
dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
9727
             const struct netdev_flow_key *mask)
9728
0
{
9729
0
    struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);
9730
9731
    /* Refer to subtable's mask, also for later removal. */
9732
0
    rule->mask = &subtable->mask;
9733
0
    cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
9734
0
}
9735
9736
/* Removes 'rule' from 'cls', also destructing the 'rule'. */
9737
static void
9738
dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
9739
0
{
9740
0
    struct dpcls_subtable *subtable;
9741
9742
0
    ovs_assert(rule->mask);
9743
9744
    /* Get subtable from reference in rule->mask. */
9745
0
    INIT_CONTAINER(subtable, rule->mask, mask);
9746
0
    if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
9747
0
        == 0) {
9748
        /* Delete empty subtable. */
9749
0
        dpcls_destroy_subtable(cls, subtable);
9750
0
        pvector_publish(&cls->subtables);
9751
0
    }
9752
0
}
9753
9754
/* Inner loop for mask generation of a unit, see dpcls_flow_key_gen_masks. */
9755
static inline void
9756
dpcls_flow_key_gen_mask_unit(uint64_t iter, const uint64_t count,
9757
                             uint64_t *mf_masks)
9758
0
{
9759
0
    int i;
9760
0
    for (i = 0; i < count; i++) {
9761
0
        uint64_t lowest_bit = (iter & -iter);
9762
0
        iter &= ~lowest_bit;
9763
0
        mf_masks[i] = (lowest_bit - 1);
9764
0
    }
9765
    /* Checks that count has covered all bits in the iter bitmap. */
9766
0
    ovs_assert(iter == 0);
9767
0
}
9768
9769
/* Generate a mask for each block in the miniflow, based on the bits set. This
9770
 * allows easily masking packets with the generated array here, without
9771
 * calculations. This replaces runtime-calculating the masks.
9772
 * @param tbl The table to generate the mf_masks for
9773
 * @param mf_masks Pointer to a u64 array of at least *mf_bits* in size
9774
 * @param mf_bits_u0 Number of bits set in unit0 of the miniflow
9775
 * @param mf_bits_u1 Number of bits set in unit1 of the miniflow
9776
 */
9777
void
9778
dpcls_flow_key_gen_masks(const struct netdev_flow_key *tbl,
9779
                         uint64_t *mf_masks,
9780
                         const uint32_t mf_bits_u0,
9781
                         const uint32_t mf_bits_u1)
9782
0
{
9783
0
    uint64_t iter_u0 = tbl->mf.map.bits[0];
9784
0
    uint64_t iter_u1 = tbl->mf.map.bits[1];
9785
9786
0
    dpcls_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]);
9787
0
    dpcls_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]);
9788
0
}
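
dpcls_flow_key_gen_mask_unit() above peels the set bits off the unit bitmap from lowest to highest and, for each one, stores (lowest_bit - 1): a mask of every lower bit position, which is what lets the lookup mask packets "without calculations" as the comment above puts it. Below is a standalone worked example on a plain uint64_t bitmap.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Same inner loop as dpcls_flow_key_gen_mask_unit(): for each set bit in
 * 'iter' (lowest first), emit a mask of all lower bit positions. */
static void
gen_mask_unit(uint64_t iter, int count, uint64_t *mf_masks)
{
    for (int i = 0; i < count; i++) {
        uint64_t lowest_bit = iter & -iter;   /* Isolate lowest set bit. */
        iter &= ~lowest_bit;
        mf_masks[i] = lowest_bit - 1;
    }
}

int
main(void)
{
    uint64_t map = 0x16;                      /* Bits 1, 2 and 4 set. */
    uint64_t masks[3];

    gen_mask_unit(map, 3, masks);
    for (int i = 0; i < 3; i++) {
        /* Expect 0x1, 0x3 and 0xf: each mask covers the bit positions
         * below one set bit, precomputed so the lookup does not have to
         * derive them per packet. */
        printf("mask[%d] = 0x%" PRIx64 "\n", i, masks[i]);
    }
    return 0;
}
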
9789
9790
/* Returns true if 'target' satisfies 'key' in 'mask', that is, if for each
9792
 * 1-bit in 'mask' the corresponding values in 'key' and 'target' are the same. */
9792
inline bool
9793
dpcls_rule_matches_key(const struct dpcls_rule *rule,
9794
                       const struct netdev_flow_key *target)
9795
0
{
9796
0
    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
9797
0
    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
9798
0
    uint64_t value;
9799
9800
0
    NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
9801
0
        if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
9802
0
            return false;
9803
0
        }
9804
0
    }
9805
0
    return true;
9806
0
}
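
dpcls_rule_matches_key() above checks, block by block, that target & mask equals the rule's stored key value, which implies the rule's key is kept in pre-masked form. Below is a minimal standalone sketch of that comparison over plain arrays, with the miniflow iteration macro replaced by a simple loop.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors the core test in dpcls_rule_matches_key(): a block matches
 * iff (target & mask) == key, with 'key' already masked. */
static bool
rule_matches(const uint64_t *key, const uint64_t *mask,
             const uint64_t *target, size_t n_blocks)
{
    for (size_t i = 0; i < n_blocks; i++) {
        if ((target[i] & mask[i]) != key[i]) {
            return false;
        }
    }
    return true;
}

int
main(void)
{
    /* One block of "header" bits: match on the low byte only. */
    uint64_t mask[]     = { 0x00000000000000ff };
    uint64_t key[]      = { 0x0000000000000011 };   /* Pre-masked value. */
    uint64_t pkt_hit[]  = { 0xdeadbeef00000011 };
    uint64_t pkt_miss[] = { 0xdeadbeef00000012 };

    printf("hit:  %d\n", rule_matches(key, mask, pkt_hit, 1));
    printf("miss: %d\n", rule_matches(key, mask, pkt_miss, 1));
    return 0;
}
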
9807
9808
/* For each miniflow in 'keys' performs a classifier lookup writing the result
9809
 * into the corresponding slot in 'rules'.  If a particular entry in 'keys' is
9810
 * NULL it is skipped.
9811
 *
9812
 * This function is optimized for use in the userspace datapath and therefore
9813
 * does not implement a lot of features available in the standard
9814
 * classifier_lookup() function.  Specifically, it does not implement
9815
 * priorities, instead returning any rule which matches the flow.
9816
 *
9817
 * Returns true if all miniflows found a corresponding rule. */
9818
bool
9819
dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
9820
             struct dpcls_rule **rules, const size_t cnt,
9821
             int *num_lookups_p)
9822
0
{
9823
    /* The received 'cnt' miniflows are the search-keys that will be processed
9824
     * to find a matching entry in the available subtables.
9825
     * The number of bits in map_type is equal to NETDEV_MAX_BURST. */
9826
0
#define MAP_BITS (sizeof(uint32_t) * CHAR_BIT)
9827
0
    BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
9828
9829
0
    struct dpcls_subtable *subtable;
9830
0
    uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */
9831
9832
0
    if (cnt != MAP_BITS) {
9833
0
        keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
9834
0
    }
9835
0
    memset(rules, 0, cnt * sizeof *rules);
9836
9837
0
    int lookups_match = 0, subtable_pos = 1;
9838
0
    uint32_t found_map;
9839
9840
    /* The Datapath classifier - aka dpcls - is composed of subtables.
9841
     * Subtables are dynamically created as needed when new rules are inserted.
9842
     * Each subtable collects rules with matches on a specific subset of packet
9843
     * fields as defined by the subtable's mask.  We proceed to process every
9844
     * search-key against each subtable, but when a match is found for a
9845
     * search-key, the search for that key can stop because the rules are
9846
     * non-overlapping. */
9847
0
    PVECTOR_FOR_EACH (subtable, &cls->subtables) {
9848
        /* Call the subtable specific lookup function. */
9849
0
        found_map = subtable->lookup_func(subtable, keys_map, keys, rules);
9850
9851
        /* Count the number of subtables searched for this packet match. This
9852
         * estimates the "spread" of subtables looked at per matched packet. */
9853
0
        uint32_t pkts_matched = count_1bits(found_map);
9854
0
        lookups_match += pkts_matched * subtable_pos;
9855
9856
        /* Clear the found rules, and return early if all packets are found. */
9857
0
        keys_map &= ~found_map;
9858
0
        if (!keys_map) {
9859
0
            if (num_lookups_p) {
9860
0
                *num_lookups_p = lookups_match;
9861
0
            }
9862
0
            return true;
9863
0
        }
9864
0
        subtable_pos++;
9865
0
    }
9866
9867
0
    if (num_lookups_p) {
9868
0
        *num_lookups_p = lookups_match;
9869
0
    }
9870
    return false;
9871
0
}
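
dpcls_lookup() above tracks the not-yet-matched keys of a batch in a single uint32_t bitmap: every bit starts set, bits beyond 'cnt' are shifted out, and each subtable's found_map clears the keys it matched while lookups_match weights hits by subtable position. Below is a standalone sketch of that bookkeeping; the per-subtable found_map values are invented for illustration, and the GCC/Clang __builtin_popcount builtin stands in for OVS's count_1bits().

#include <inttypes.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>

#define MAP_BITS (sizeof(uint32_t) * CHAR_BIT)

int
main(void)
{
    size_t cnt = 5;                       /* Keys in this batch. */
    uint32_t keys_map = UINT32_MAX;       /* Set all bits. */

    if (cnt != MAP_BITS) {
        keys_map >>= MAP_BITS - cnt;      /* Clear the unused high bits. */
    }
    printf("initial keys_map: 0x%" PRIx32 "\n", keys_map);     /* 0x1f */

    /* Pretend the first subtable matched keys 0 and 3, the second key 2. */
    uint32_t found_per_subtable[] = { 0x09, 0x04 };
    int subtable_pos = 1, lookups_match = 0;

    for (size_t s = 0; s < 2 && keys_map; s++) {
        uint32_t found_map = found_per_subtable[s] & keys_map;

        /* Weight each hit by how deep in the subtable list it was found,
         * as the lookups_match accounting above does. */
        lookups_match += __builtin_popcount(found_map) * subtable_pos;
        keys_map &= ~found_map;           /* Those keys are done. */
        subtable_pos++;
    }
    printf("left unmatched: 0x%" PRIx32 ", lookups_match: %d\n",
           keys_map, lookups_match);      /* Keys 1 and 4 -> 0x12, 4. */
    return 0;
}
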