Coverage Report

Created: 2026-02-26 06:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/openvswitch/lib/dpif-netdev.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2009-2014, 2016-2018 Nicira, Inc.
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at:
7
 *
8
 *     http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16
17
#include <config.h>
18
#include "dpif-netdev.h"
19
#include "dpif-netdev-private.h"
20
#include "dpif-netdev-private-dfc.h"
21
#include "dpif-offload.h"
22
23
#include <ctype.h>
24
#include <errno.h>
25
#include <fcntl.h>
26
#include <inttypes.h>
27
#include <net/if.h>
28
#include <sys/types.h>
29
#include <netinet/in.h>
30
#include <stdint.h>
31
#include <stdlib.h>
32
#include <string.h>
33
#include <sys/ioctl.h>
34
#include <sys/socket.h>
35
#include <sys/stat.h>
36
#include <unistd.h>
37
38
#include "bitmap.h"
39
#include "ccmap.h"
40
#include "cmap.h"
41
#include "conntrack.h"
42
#include "conntrack-tp.h"
43
#include "coverage.h"
44
#include "ct-dpif.h"
45
#include "csum.h"
46
#include "dp-packet.h"
47
#include "dpif.h"
48
#include "dpif-netdev-lookup.h"
49
#include "dpif-netdev-perf.h"
50
#include "dpif-netdev-private-extract.h"
51
#include "dpif-provider.h"
52
#include "dummy.h"
53
#include "fat-rwlock.h"
54
#include "flow.h"
55
#include "hmapx.h"
56
#include "id-fpool.h"
57
#include "id-pool.h"
58
#include "ipf.h"
59
#include "mov-avg.h"
60
#include "mpsc-queue.h"
61
#include "netdev.h"
62
#include "netdev-provider.h"
63
#include "netdev-vport.h"
64
#include "netlink.h"
65
#include "odp-execute.h"
66
#include "odp-util.h"
67
#include "openvswitch/dynamic-string.h"
68
#include "openvswitch/list.h"
69
#include "openvswitch/match.h"
70
#include "openvswitch/ofp-parse.h"
71
#include "openvswitch/ofp-print.h"
72
#include "openvswitch/ofpbuf.h"
73
#include "openvswitch/shash.h"
74
#include "openvswitch/vlog.h"
75
#include "ovs-numa.h"
76
#include "ovs-rcu.h"
77
#include "packets.h"
78
#include "openvswitch/poll-loop.h"
79
#include "pvector.h"
80
#include "random.h"
81
#include "seq.h"
82
#include "smap.h"
83
#include "sset.h"
84
#include "timeval.h"
85
#include "tnl-neigh-cache.h"
86
#include "tnl-ports.h"
87
#include "unixctl.h"
88
#include "util.h"
89
#include "uuid.h"
90
91
VLOG_DEFINE_THIS_MODULE(dpif_netdev);
92
93
/* Auto Load Balancing Defaults */
94
0
#define ALB_IMPROVEMENT_THRESHOLD    25
95
0
#define ALB_LOAD_THRESHOLD           95
96
0
#define ALB_REBALANCE_INTERVAL       1     /* 1 Min */
97
0
#define MAX_ALB_REBALANCE_INTERVAL   20000 /* 20000 Min */
98
0
#define MIN_TO_MSEC                  60000
99
100
#define FLOW_DUMP_MAX_BATCH 50
101
/* Use per thread recirc_depth to prevent recirculation loop. */
102
0
#define MAX_RECIRC_DEPTH 8
103
DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
104
105
/* Use instant packet send by default. */
106
0
#define DEFAULT_TX_FLUSH_INTERVAL 0
107
108
/* Configuration parameters. */
109
enum { MAX_METERS = 1 << 18 };  /* Maximum number of meters. */
110
enum { MAX_BANDS = 8 };         /* Maximum number of bands / meter. */
111
112
COVERAGE_DEFINE(datapath_drop_meter);
113
COVERAGE_DEFINE(datapath_drop_upcall_error);
114
COVERAGE_DEFINE(datapath_drop_lock_error);
115
COVERAGE_DEFINE(datapath_drop_userspace_action_error);
116
COVERAGE_DEFINE(datapath_drop_tunnel_push_error);
117
COVERAGE_DEFINE(datapath_drop_tunnel_pop_error);
118
COVERAGE_DEFINE(datapath_drop_recirc_error);
119
COVERAGE_DEFINE(datapath_drop_invalid_port);
120
COVERAGE_DEFINE(datapath_drop_invalid_bond);
121
COVERAGE_DEFINE(datapath_drop_invalid_tnl_port);
122
COVERAGE_DEFINE(datapath_drop_rx_invalid_packet);
123
COVERAGE_DEFINE(datapath_drop_hw_post_process);
124
COVERAGE_DEFINE(datapath_drop_hw_post_process_consumed);
125
126
/* Protects against changes to 'dp_netdevs'. */
127
struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
128
129
/* Contains all 'struct dp_netdev's. */
130
static struct shash dp_netdevs OVS_GUARDED_BY(dp_netdev_mutex)
131
    = SHASH_INITIALIZER(&dp_netdevs);
132
133
static struct vlog_rate_limit upcall_rl = VLOG_RATE_LIMIT_INIT(600, 600);
134
135
0
#define DP_NETDEV_CS_SUPPORTED_MASK (CS_NEW | CS_ESTABLISHED | CS_RELATED \
136
0
                                     | CS_INVALID | CS_REPLY_DIR | CS_TRACKED \
137
0
                                     | CS_SRC_NAT | CS_DST_NAT)
138
0
#define DP_NETDEV_CS_UNSUPPORTED_MASK (~(uint32_t)DP_NETDEV_CS_SUPPORTED_MASK)
139
140
static struct odp_support dp_netdev_support = {
141
    .max_vlan_headers = SIZE_MAX,
142
    .max_mpls_depth = SIZE_MAX,
143
    .recirc = true,
144
    .ct_state = true,
145
    .ct_zone = true,
146
    .ct_mark = true,
147
    .ct_label = true,
148
    .ct_state_nat = true,
149
    .ct_orig_tuple = true,
150
    .ct_orig_tuple6 = true,
151
};
152
153

154
/* Simple non-wildcarding single-priority classifier. */
155
156
/* Time in microseconds between successive optimizations of the dpcls
157
 * subtable vector */
158
0
#define DPCLS_OPTIMIZATION_INTERVAL 1000000LL
159
160
/* Time in microseconds of the interval in which rxq processing cycles used
161
 * in rxq to pmd assignments is measured and stored. */
162
0
#define PMD_INTERVAL_LEN 5000000LL
163
/* For converting PMD_INTERVAL_LEN to secs. */
164
0
#define INTERVAL_USEC_TO_SEC 1000000LL
165
166
/* Number of intervals for which cycles are stored
167
 * and used during rxq to pmd assignment. */
168
0
#define PMD_INTERVAL_MAX 12
169
170
/* Time in microseconds to try RCU quiescing. */
171
0
#define PMD_RCU_QUIESCE_INTERVAL 10000LL
172
173
/* Timer resolution for PMD threads in nanoseconds. */
174
0
#define PMD_TIMER_RES_NS 1000
175
176
/* Number of pkts Rx on an interface that will stop pmd thread sleeping. */
177
0
#define PMD_SLEEP_THRESH (NETDEV_MAX_BURST / 2)
178
/* Time in uS to increment a pmd thread sleep time. */
179
0
#define PMD_SLEEP_INC_US 1
180
181
struct pmd_sleep {
182
    unsigned core_id;
183
    uint64_t max_sleep;
184
};
185
186
struct dpcls {
187
    struct cmap_node node;      /* Within dp_netdev_pmd_thread.classifiers */
188
    odp_port_t in_port;
189
    struct cmap subtables_map;
190
    struct pvector subtables;
191
};
192
193
/* Data structure to keep packet order till fastpath processing. */
194
struct dp_packet_flow_map {
195
    struct dp_packet *packet;
196
    struct dp_netdev_flow *flow;
197
    uint16_t tcp_flags;
198
};
199
200
static void dpcls_init(struct dpcls *);
201
static void dpcls_destroy(struct dpcls *);
202
static void dpcls_sort_subtable_vector(struct dpcls *);
203
static uint32_t dpcls_subtable_lookup_reprobe(struct dpcls *cls);
204
static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
205
                         const struct netdev_flow_key *mask);
206
static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
207
208
/* Set of supported meter flags */
209
#define DP_SUPPORTED_METER_FLAGS_MASK \
210
0
    (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)
211
212
/* Set of supported meter band types */
213
#define DP_SUPPORTED_METER_BAND_TYPES           \
214
0
    ( 1 << OFPMBT13_DROP )
215
216
struct dp_meter_band {
217
    uint32_t rate;
218
    uint32_t burst_size;
219
    atomic_uint64_t bucket;          /* In 1/1000 packets for PKTPS,
220
                                      * or in bits for KBPS. */
221
    atomic_uint64_t packet_count;
222
    atomic_uint64_t byte_count;
223
};
224
225
struct dp_meter {
226
    struct cmap_node node;
227
    uint32_t id;
228
    uint16_t flags;
229
    uint16_t n_bands;
230
    uint32_t max_delta_t;
231
    atomic_uint64_t used;  /* Time of a last use in milliseconds. */
232
    atomic_uint64_t packet_count;
233
    atomic_uint64_t byte_count;
234
    struct dp_meter_band bands[];
235
};
236
237
struct pmd_auto_lb {
238
    bool do_dry_run;
239
    bool recheck_config;
240
    bool is_enabled;            /* Current status of Auto load balancing. */
241
    uint64_t rebalance_intvl;
242
    uint64_t rebalance_poll_timer;
243
    uint8_t rebalance_improve_thresh;
244
    atomic_uint8_t rebalance_load_thresh;
245
};
246
247
enum sched_assignment_type {
248
    SCHED_ROUNDROBIN,
249
    SCHED_CYCLES, /* Default.*/
250
    SCHED_GROUP
251
};
252
253
/* Datapath based on the network device interface from netdev.h.
254
 *
255
 *
256
 * Thread-safety
257
 * =============
258
 *
259
 * Some members, marked 'const', are immutable.  Accessing other members
260
 * requires synchronization, as noted in more detail below.
261
 *
262
 * Acquisition order is, from outermost to innermost:
263
 *
264
 *    dp_netdev_mutex (global)
265
 *    port_rwlock
266
 *    bond_mutex
267
 *    non_pmd_mutex
268
 */
269
struct dp_netdev {
270
    const struct dpif_class *const class;
271
    const char *const name;
272
    const char *const full_name;
273
    struct ovs_refcount ref_cnt;
274
    atomic_flag destroyed;
275
276
    /* Ports.
277
     *
278
     * Any lookup into 'ports' or any access to the dp_netdev_ports found
279
     * through 'ports' requires taking 'port_rwlock'. */
280
    struct ovs_rwlock port_rwlock;
281
    struct hmap ports;
282
    struct seq *port_seq;       /* Incremented whenever a port changes. */
283
284
    /* The time that a packet can wait in output batch for sending. */
285
    atomic_uint32_t tx_flush_interval;
286
287
    /* Meters. */
288
    struct ovs_mutex meters_lock;
289
    struct cmap meters;
290
291
    /* Probability of EMC insertions is a factor of 'emc_insert_min'.*/
292
    atomic_uint32_t emc_insert_min;
293
    /* Enable collection of PMD performance metrics. */
294
    atomic_bool pmd_perf_metrics;
295
    /* Default max load based sleep request. */
296
    uint64_t pmd_max_sleep_default;
297
    /* Enable the SMC cache from ovsdb config */
298
    atomic_bool smc_enable_db;
299
300
    /* Protects access to ofproto-dpif-upcall interface during revalidator
301
     * thread synchronization. */
302
    struct fat_rwlock upcall_rwlock;
303
    upcall_callback *upcall_cb;  /* Callback function for executing upcalls. */
304
    void *upcall_aux;
305
306
    /* Callback function for notifying the purging of dp flows (during
307
     * reseting pmd deletion). */
308
    dp_purge_callback *dp_purge_cb;
309
    void *dp_purge_aux;
310
311
    /* Stores all 'struct dp_netdev_pmd_thread's. */
312
    struct cmap poll_threads;
313
    /* id pool for per thread static_tx_qid. */
314
    struct id_pool *tx_qid_pool;
315
    struct ovs_mutex tx_qid_pool_mutex;
316
    /* Rxq to pmd assignment type. */
317
    enum sched_assignment_type pmd_rxq_assign_type;
318
    bool pmd_iso;
319
320
    /* Protects the access of the 'struct dp_netdev_pmd_thread'
321
     * instance for non-pmd thread. */
322
    struct ovs_mutex non_pmd_mutex;
323
324
    /* Each pmd thread will store its pointer to
325
     * 'struct dp_netdev_pmd_thread' in 'per_pmd_key'. */
326
    ovsthread_key_t per_pmd_key;
327
328
    struct seq *reconfigure_seq;
329
    uint64_t last_reconfigure_seq;
330
    struct ovsthread_once once_set_config;
331
332
    /* Cpu mask for pin of pmd threads. */
333
    char *pmd_cmask;
334
335
    /* PMD max load based sleep request user string. */
336
    char *max_sleep_list;
337
338
    uint64_t last_tnl_conf_seq;
339
340
    struct conntrack *conntrack;
341
    struct pmd_auto_lb pmd_alb;
342
343
    /* Bonds. */
344
    struct ovs_mutex bond_mutex; /* Protects updates of 'tx_bonds'. */
345
    struct cmap tx_bonds; /* Contains 'struct tx_bond'. */
346
};
347
348
static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
349
                                                    odp_port_t)
350
    OVS_REQ_RDLOCK(dp->port_rwlock);
351
352
enum rxq_cycles_counter_type {
353
    RXQ_CYCLES_PROC_CURR,       /* Cycles spent successfully polling and
354
                                   processing packets during the current
355
                                   interval. */
356
    RXQ_CYCLES_PROC_HIST,       /* Total cycles of all intervals that are used
357
                                   during rxq to pmd assignment. */
358
    RXQ_N_CYCLES
359
};
360
361
0
#define XPS_TIMEOUT 500000LL    /* In microseconds. */
362
363
/* Contained by struct dp_netdev_port's 'rxqs' member.  */
364
struct dp_netdev_rxq {
365
    struct dp_netdev_port *port;
366
    struct netdev_rxq *rx;
367
    unsigned core_id;                  /* Core to which this queue should be
368
                                          pinned. OVS_CORE_UNSPEC if the
369
                                          queue doesn't need to be pinned to a
370
                                          particular core. */
371
    atomic_count intrvl_idx;           /* Write index for 'cycles_intrvl'. */
372
    struct dp_netdev_pmd_thread *pmd;  /* pmd thread that polls this queue. */
373
    bool is_vhost;                     /* Is rxq of a vhost port. */
374
375
    /* Counters of cycles spent successfully polling and processing pkts. */
376
    atomic_ullong cycles[RXQ_N_CYCLES];
377
    /* We store PMD_INTERVAL_MAX intervals of data for an rxq and then
378
       sum them to yield the cycles used for an rxq. */
379
    atomic_ullong cycles_intrvl[PMD_INTERVAL_MAX];
380
};
381
382
enum txq_req_mode {
383
    TXQ_REQ_MODE_THREAD,
384
    TXQ_REQ_MODE_HASH,
385
};
386
387
enum txq_mode {
388
    TXQ_MODE_STATIC,
389
    TXQ_MODE_XPS,
390
    TXQ_MODE_XPS_HASH,
391
};
392
393
/* A port in a netdev-based datapath. */
394
struct dp_netdev_port {
395
    odp_port_t port_no;
396
    enum txq_mode txq_mode;     /* static, XPS, XPS_HASH. */
397
    bool need_reconfigure;      /* True if we should reconfigure netdev. */
398
    struct netdev *netdev;
399
    struct hmap_node node;      /* Node in dp_netdev's 'ports'. */
400
    struct netdev_saved_flags *sf;
401
    struct dp_netdev_rxq *rxqs;
402
    unsigned n_rxq;             /* Number of elements in 'rxqs' */
403
    unsigned *txq_used;         /* Number of threads that use each tx queue. */
404
    struct ovs_mutex txq_used_mutex;
405
    bool emc_enabled;           /* If true EMC will be used. */
406
    char *type;                 /* Port type as requested by user. */
407
    char *rxq_affinity_list;    /* Requested affinity of rx queues. */
408
    enum txq_req_mode txq_requested_mode;
409
};
410
411
static bool dp_netdev_flow_ref(struct dp_netdev_flow *);
412
static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t,
413
                                         struct flow *, bool);
414
415
struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *,
416
                                                   size_t);
417
struct dp_netdev_actions *dp_netdev_flow_get_actions(
418
    const struct dp_netdev_flow *);
419
static void dp_netdev_actions_free(struct dp_netdev_actions *);
420
421
struct polled_queue {
422
    struct dp_netdev_rxq *rxq;
423
    odp_port_t port_no;
424
    bool emc_enabled;
425
    bool rxq_enabled;
426
    uint64_t change_seq;
427
};
428
429
/* Contained by struct dp_netdev_pmd_thread's 'poll_list' member. */
430
struct rxq_poll {
431
    struct dp_netdev_rxq *rxq;
432
    struct hmap_node node;
433
};
434
435
/* Contained by struct dp_netdev_pmd_thread's 'send_port_cache',
436
 * 'tnl_port_cache' or 'tx_ports'. */
437
struct tx_port {
438
    struct dp_netdev_port *port;
439
    int qid;
440
    long long last_used;
441
    struct hmap_node node;
442
    long long flush_time;
443
    struct dp_packet_batch output_pkts;
444
    struct dp_packet_batch *txq_pkts; /* Only for hash mode. */
445
    struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
446
};
447
448
/* Contained by struct tx_bond 'member_buckets'. */
449
struct member_entry {
450
    odp_port_t member_id;
451
    atomic_ullong n_packets;
452
    atomic_ullong n_bytes;
453
};
454
455
/* Contained by struct dp_netdev_pmd_thread's 'tx_bonds'. */
456
struct tx_bond {
457
    struct cmap_node node;
458
    uint32_t bond_id;
459
    struct member_entry member_buckets[BOND_BUCKETS];
460
};
461
462
/* Interface to netdev-based datapath. */
463
struct dpif_netdev {
464
    struct dpif dpif;
465
    struct dp_netdev *dp;
466
    uint64_t last_port_seq;
467
};
468
469
static int get_port_by_number(struct dp_netdev *dp, odp_port_t port_no,
470
                              struct dp_netdev_port **portp)
471
    OVS_REQ_RDLOCK(dp->port_rwlock);
472
static int get_port_by_name(struct dp_netdev *dp, const char *devname,
473
                            struct dp_netdev_port **portp)
474
    OVS_REQ_RDLOCK(dp->port_rwlock);
475
static void dp_netdev_free(struct dp_netdev *)
476
    OVS_REQUIRES(dp_netdev_mutex);
477
static int do_add_port(struct dp_netdev *dp, const char *devname,
478
                       const char *type, odp_port_t port_no)
479
    OVS_REQ_WRLOCK(dp->port_rwlock);
480
static void do_del_port(struct dp_netdev *dp, struct dp_netdev_port *)
481
    OVS_REQ_WRLOCK(dp->port_rwlock);
482
static int dpif_netdev_open(const struct dpif_class *, const char *name,
483
                            bool create, struct dpif **);
484
static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
485
                                      struct dp_packet_batch *,
486
                                      bool should_steal,
487
                                      const struct flow *flow,
488
                                      const struct nlattr *actions,
489
                                      size_t actions_len);
490
static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
491
                                  struct dp_packet_batch *);
492
493
static void dp_netdev_disable_upcall(struct dp_netdev *);
494
static void dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd);
495
static void dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd,
496
                                    struct dp_netdev *dp, unsigned core_id,
497
                                    int numa_id);
498
static void dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd);
499
static void dp_netdev_set_nonpmd(struct dp_netdev *dp)
500
    OVS_REQ_WRLOCK(dp->port_rwlock);
501
502
static void *pmd_thread_main(void *);
503
static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
504
                                                      unsigned core_id);
505
static struct dp_netdev_pmd_thread *
506
dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
507
static void dp_netdev_del_pmd(struct dp_netdev *dp,
508
                              struct dp_netdev_pmd_thread *pmd);
509
static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
510
static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
511
static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
512
                                         struct dp_netdev_port *port)
513
    OVS_REQUIRES(pmd->port_mutex);
514
static void dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
515
                                           struct tx_port *tx)
516
    OVS_REQUIRES(pmd->port_mutex);
517
static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
518
                                     struct dp_netdev_rxq *rxq)
519
    OVS_REQUIRES(pmd->port_mutex);
520
static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
521
                                       struct rxq_poll *poll)
522
    OVS_REQUIRES(pmd->port_mutex);
523
static int
524
dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
525
                                   bool force);
526
static void dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
527
                                         struct tx_bond *bond, bool update)
528
    OVS_EXCLUDED(pmd->bond_mutex);
529
static void dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
530
                                           uint32_t bond_id)
531
    OVS_EXCLUDED(pmd->bond_mutex);
532
533
static void reconfigure_datapath(struct dp_netdev *dp)
534
    OVS_REQ_RDLOCK(dp->port_rwlock);
535
static bool dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd);
536
static void dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd);
537
static void dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd);
538
static void pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
539
    OVS_REQUIRES(pmd->port_mutex);
540
static inline void
541
dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
542
                           struct polled_queue *poll_list, int poll_cnt);
543
static void
544
dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
545
                         enum rxq_cycles_counter_type type,
546
                         unsigned long long cycles);
547
static uint64_t
548
dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
549
                         enum rxq_cycles_counter_type type);
550
static void
551
dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
552
                           unsigned long long cycles);
553
static uint64_t
554
dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
555
static uint64_t
556
get_interval_values(atomic_ullong *source, atomic_count *cur_idx,
557
                    int num_to_read);
558
static void
559
dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
560
                               bool purge);
561
static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
562
                                      struct tx_port *tx);
563
inline struct dpcls *
564
dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
565
                           odp_port_t in_port);
566
567
static void dp_netdev_request_reconfigure(struct dp_netdev *dp);
568
static inline bool
569
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);
570
571
static void dp_netdev_simple_match_insert(struct dp_netdev_pmd_thread *pmd,
572
                                          struct dp_netdev_flow *flow)
573
    OVS_REQUIRES(pmd->flow_mutex);
574
static void dp_netdev_simple_match_remove(struct dp_netdev_pmd_thread *pmd,
575
                                          struct dp_netdev_flow *flow)
576
    OVS_REQUIRES(pmd->flow_mutex);
577
578
static bool dp_netdev_flow_is_simple_match(const struct match *);
579
580
/* Updates the time in PMD threads context and should be called in three cases:
581
 *
582
 *     1. PMD structure initialization:
583
 *         - dp_netdev_configure_pmd()
584
 *
585
 *     2. Before processing of the new packet batch:
586
 *         - dpif_netdev_execute()
587
 *         - dp_netdev_process_rxq_port()
588
 *
589
 *     3. At least once per polling iteration in main polling threads if no
590
 *        packets received on current iteration:
591
 *         - dpif_netdev_run()
592
 *         - pmd_thread_main()
593
 *
594
 * 'pmd->ctx.now' should be used without update in all other cases if possible.
595
 */
596
static inline void
597
pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
598
0
{
599
0
    pmd->ctx.now = time_usec();
600
0
}
601
602
/* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
603
bool
604
dpif_is_netdev(const struct dpif *dpif)
605
0
{
606
0
    return dpif->dpif_class->open == dpif_netdev_open;
607
0
}
608
609
static struct dpif_netdev *
610
dpif_netdev_cast(const struct dpif *dpif)
611
0
{
612
0
    ovs_assert(dpif_is_netdev(dpif));
613
0
    return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
614
0
}
615
616
static struct dp_netdev *
617
get_dp_netdev(const struct dpif *dpif)
618
0
{
619
0
    return dpif_netdev_cast(dpif)->dp;
620
0
}
621

622
enum pmd_info_type {
623
    PMD_INFO_SHOW_STATS,  /* Show how cpu cycles are spent. */
624
    PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */
625
    PMD_INFO_SHOW_RXQ,    /* Show poll lists of pmd threads. */
626
    PMD_INFO_PERF_SHOW,   /* Show pmd performance details. */
627
    PMD_INFO_SLEEP_SHOW,  /* Show max sleep configuration details. */
628
};
629
630
static void
631
format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
632
0
{
633
0
    ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
634
0
                        ? "main thread" : "pmd thread");
635
0
    if (pmd->numa_id != OVS_NUMA_UNSPEC) {
636
0
        ds_put_format(reply, " numa_id %d", pmd->numa_id);
637
0
    }
638
0
    if (pmd->core_id != OVS_CORE_UNSPEC && pmd->core_id != NON_PMD_CORE_ID) {
639
0
        ds_put_format(reply, " core_id %u", pmd->core_id);
640
0
    }
641
0
    ds_put_cstr(reply, ":\n");
642
0
}
643
644
static void
645
pmd_info_show_stats(struct ds *reply,
646
                    struct dp_netdev_pmd_thread *pmd)
647
0
{
648
0
    uint64_t stats[PMD_N_STATS];
649
0
    uint64_t total_cycles, total_packets;
650
0
    double passes_per_pkt = 0;
651
0
    double lookups_per_hit = 0;
652
0
    double packets_per_batch = 0;
653
654
0
    pmd_perf_read_counters(&pmd->perf_stats, stats);
655
0
    total_cycles = stats[PMD_CYCLES_ITER_IDLE]
656
0
                         + stats[PMD_CYCLES_ITER_BUSY];
657
0
    total_packets = stats[PMD_STAT_RECV];
658
659
0
    format_pmd_thread(reply, pmd);
660
661
0
    if (total_packets > 0) {
662
0
        passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC])
663
0
                            / (double) total_packets;
664
0
    }
665
0
    if (stats[PMD_STAT_MASKED_HIT] > 0) {
666
0
        lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP]
667
0
                            / (double) stats[PMD_STAT_MASKED_HIT];
668
0
    }
669
0
    if (stats[PMD_STAT_SENT_BATCHES] > 0) {
670
0
        packets_per_batch = stats[PMD_STAT_SENT_PKTS]
671
0
                            / (double) stats[PMD_STAT_SENT_BATCHES];
672
0
    }
673
674
0
    ds_put_format(reply,
675
0
                  "  packets received: %"PRIu64"\n"
676
0
                  "  packet recirculations: %"PRIu64"\n"
677
0
                  "  avg. datapath passes per packet: %.02f\n"
678
0
                  "  phwol hits: %"PRIu64"\n"
679
0
                  "  mfex opt hits: %"PRIu64"\n"
680
0
                  "  simple match hits: %"PRIu64"\n"
681
0
                  "  emc hits: %"PRIu64"\n"
682
0
                  "  smc hits: %"PRIu64"\n"
683
0
                  "  megaflow hits: %"PRIu64"\n"
684
0
                  "  avg. subtable lookups per megaflow hit: %.02f\n"
685
0
                  "  miss with success upcall: %"PRIu64"\n"
686
0
                  "  miss with failed upcall: %"PRIu64"\n"
687
0
                  "  avg. packets per output batch: %.02f\n",
688
0
                  total_packets, stats[PMD_STAT_RECIRC],
689
0
                  passes_per_pkt, stats[PMD_STAT_PHWOL_HIT],
690
0
                  stats[PMD_STAT_MFEX_OPT_HIT],
691
0
                  stats[PMD_STAT_SIMPLE_HIT],
692
0
                  stats[PMD_STAT_EXACT_HIT],
693
0
                  stats[PMD_STAT_SMC_HIT],
694
0
                  stats[PMD_STAT_MASKED_HIT],
695
0
                  lookups_per_hit, stats[PMD_STAT_MISS], stats[PMD_STAT_LOST],
696
0
                  packets_per_batch);
697
698
0
    if (total_cycles == 0) {
699
0
        return;
700
0
    }
701
702
0
    ds_put_format(reply,
703
0
                  "  idle cycles: %"PRIu64" (%.02f%%)\n"
704
0
                  "  processing cycles: %"PRIu64" (%.02f%%)\n",
705
0
                  stats[PMD_CYCLES_ITER_IDLE],
706
0
                  stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100,
707
0
                  stats[PMD_CYCLES_ITER_BUSY],
708
0
                  stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100);
709
710
0
    if (total_packets == 0) {
711
0
        return;
712
0
    }
713
714
0
    ds_put_format(reply,
715
0
                  "  avg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n",
716
0
                  total_cycles / (double) total_packets,
717
0
                  total_cycles, total_packets);
718
719
0
    ds_put_format(reply,
720
0
                  "  avg processing cycles per packet: "
721
0
                  "%.02f (%"PRIu64"/%"PRIu64")\n",
722
0
                  stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets,
723
0
                  stats[PMD_CYCLES_ITER_BUSY], total_packets);
724
0
}
725
726
static void
727
pmd_info_show_perf(struct ds *reply,
728
                   struct dp_netdev_pmd_thread *pmd,
729
                   struct pmd_perf_params *par)
730
0
{
731
0
    if (pmd->core_id != NON_PMD_CORE_ID) {
732
0
        char *time_str =
733
0
                xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true);
734
0
        long long now = time_msec();
735
0
        double duration = (now - pmd->perf_stats.start_ms) / 1000.0;
736
737
0
        ds_put_cstr(reply, "\n");
738
0
        ds_put_format(reply, "Time: %s\n", time_str);
739
0
        ds_put_format(reply, "Measurement duration: %.3f s\n", duration);
740
0
        ds_put_cstr(reply, "\n");
741
0
        format_pmd_thread(reply, pmd);
742
0
        ds_put_cstr(reply, "\n");
743
0
        pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration);
744
0
        if (pmd_perf_metrics_enabled(pmd)) {
745
            /* Prevent parallel clearing of perf metrics. */
746
0
            ovs_mutex_lock(&pmd->perf_stats.clear_mutex);
747
0
            if (par->histograms) {
748
0
                ds_put_cstr(reply, "\n");
749
0
                pmd_perf_format_histograms(reply, &pmd->perf_stats);
750
0
            }
751
0
            if (par->iter_hist_len > 0) {
752
0
                ds_put_cstr(reply, "\n");
753
0
                pmd_perf_format_iteration_history(reply, &pmd->perf_stats,
754
0
                        par->iter_hist_len);
755
0
            }
756
0
            if (par->ms_hist_len > 0) {
757
0
                ds_put_cstr(reply, "\n");
758
0
                pmd_perf_format_ms_history(reply, &pmd->perf_stats,
759
0
                        par->ms_hist_len);
760
0
            }
761
0
            ovs_mutex_unlock(&pmd->perf_stats.clear_mutex);
762
0
        }
763
0
        free(time_str);
764
0
    }
765
0
}
766
767
static int
768
compare_poll_list(const void *a_, const void *b_)
769
0
{
770
0
    const struct rxq_poll *a = a_;
771
0
    const struct rxq_poll *b = b_;
772
773
0
    const char *namea = netdev_rxq_get_name(a->rxq->rx);
774
0
    const char *nameb = netdev_rxq_get_name(b->rxq->rx);
775
776
0
    int cmp = strcmp(namea, nameb);
777
0
    if (!cmp) {
778
0
        return netdev_rxq_get_queue_id(a->rxq->rx)
779
0
               - netdev_rxq_get_queue_id(b->rxq->rx);
780
0
    } else {
781
0
        return cmp;
782
0
    }
783
0
}
784
785
static void
786
sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list,
787
                 size_t *n)
788
    OVS_REQUIRES(pmd->port_mutex)
789
0
{
790
0
    struct rxq_poll *ret, *poll;
791
0
    size_t i;
792
793
0
    *n = hmap_count(&pmd->poll_list);
794
0
    if (!*n) {
795
0
        ret = NULL;
796
0
    } else {
797
0
        ret = xcalloc(*n, sizeof *ret);
798
0
        i = 0;
799
0
        HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
800
0
            ret[i] = *poll;
801
0
            i++;
802
0
        }
803
0
        ovs_assert(i == *n);
804
0
        qsort(ret, *n, sizeof *ret, compare_poll_list);
805
0
    }
806
807
0
    *list = ret;
808
0
}
809
810
/* Appends to 'reply' the rxq-to-pmd assignment report for 'pmd': one line
 * per polled rx queue with its share of the pmd's cycles over roughly the
 * last 'secs' seconds, plus an "overhead" line for busy cycles not
 * attributed to any rxq.  Non-PMD threads poll no rxqs and are skipped. */
static void
pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd,
                  int secs)
{
    if (pmd->core_id != NON_PMD_CORE_ID) {
        struct rxq_poll *list;
        size_t n_rxq;
        uint64_t total_pmd_cycles = 0;
        uint64_t busy_pmd_cycles = 0;
        uint64_t total_rxq_proc_cycles = 0;
        unsigned int intervals;

        ds_put_format(reply,
                      "pmd thread numa_id %d core_id %u:\n  isolated : %s\n",
                      pmd->numa_id, pmd->core_id, (pmd->isolated)
                                                  ? "true" : "false");

        /* port_mutex protects pmd->poll_list while we snapshot it. */
        ovs_mutex_lock(&pmd->port_mutex);
        sorted_poll_list(pmd, &list, &n_rxq);

        /* Get the total pmd cycles for an interval. */
        atomic_read_relaxed(&pmd->intrvl_cycles, &total_pmd_cycles);
        /* Calculate how many intervals are to be used. */
        intervals = DIV_ROUND_UP(secs,
                                 PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC);
        /* Estimate the cycles to cover all intervals. */
        total_pmd_cycles *= intervals;
        busy_pmd_cycles = get_interval_values(pmd->busy_cycles_intrvl,
                                              &pmd->intrvl_idx,
                                              intervals);
        /* The counters are read without synchronization, so the busy count
         * can momentarily exceed the estimated total; clamp it. */
        if (busy_pmd_cycles > total_pmd_cycles) {
            busy_pmd_cycles = total_pmd_cycles;
        }

        for (int i = 0; i < n_rxq; i++) {
            struct dp_netdev_rxq *rxq = list[i].rxq;
            const char *name = netdev_rxq_get_name(rxq->rx);
            uint64_t rxq_proc_cycles = 0;

            rxq_proc_cycles = get_interval_values(rxq->cycles_intrvl,
                                                  &rxq->intrvl_idx,
                                                  intervals);
            total_rxq_proc_cycles += rxq_proc_cycles;
            ds_put_format(reply, "  port: %-16s  queue-id: %2d", name,
                          netdev_rxq_get_queue_id(list[i].rxq->rx));
            ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx)
                                        ? "(enabled) " : "(disabled)");
            ds_put_format(reply, "  pmd usage: ");
            if (total_pmd_cycles) {
                ds_put_format(reply, "%2.0f %%",
                              (double) (rxq_proc_cycles * 100) /
                              total_pmd_cycles);
            } else {
                /* No full measurement interval has elapsed yet. */
                ds_put_format(reply, "%s", "NOT AVAIL");
            }
            ds_put_cstr(reply, "\n");
        }

        if (n_rxq > 0) {
            ds_put_cstr(reply, "  overhead: ");
            if (total_pmd_cycles) {
                uint64_t overhead_cycles = 0;

                /* Busy cycles not attributed to any rxq count as overhead. */
                if (total_rxq_proc_cycles < busy_pmd_cycles) {
                    overhead_cycles = busy_pmd_cycles - total_rxq_proc_cycles;
                }

                ds_put_format(reply, "%2.0f %%",
                              (double) (overhead_cycles * 100) /
                              total_pmd_cycles);
            } else {
                ds_put_cstr(reply, "NOT AVAIL");
            }
            ds_put_cstr(reply, "\n");
        }

        ovs_mutex_unlock(&pmd->port_mutex);
        free(list);
    }
}
890
891
static int
892
compare_poll_thread_list(const void *a_, const void *b_)
893
0
{
894
0
    const struct dp_netdev_pmd_thread *a, *b;
895
896
0
    a = *(struct dp_netdev_pmd_thread **)a_;
897
0
    b = *(struct dp_netdev_pmd_thread **)b_;
898
899
0
    if (a->core_id < b->core_id) {
900
0
        return -1;
901
0
    }
902
0
    if (a->core_id > b->core_id) {
903
0
        return 1;
904
0
    }
905
0
    return 0;
906
0
}
907
908
/* Create a sorted list of pmd's from the dp->poll_threads cmap. We can use
909
 * this list, as long as we do not go to quiescent state. */
910
static void
911
sorted_poll_thread_list(struct dp_netdev *dp,
912
                        struct dp_netdev_pmd_thread ***list,
913
                        size_t *n)
914
0
{
915
0
    struct dp_netdev_pmd_thread *pmd;
916
0
    struct dp_netdev_pmd_thread **pmd_list;
917
0
    size_t k = 0, n_pmds;
918
919
0
    n_pmds = cmap_count(&dp->poll_threads);
920
0
    pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
921
922
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
923
0
        if (k >= n_pmds) {
924
0
            break;
925
0
        }
926
0
        pmd_list[k++] = pmd;
927
0
    }
928
929
0
    qsort(pmd_list, k, sizeof *pmd_list, compare_poll_thread_list);
930
931
0
    *list = pmd_list;
932
0
    *n = k;
933
0
}
934
935
static void
936
dpif_netdev_subtable_lookup_get(struct unixctl_conn *conn, int argc OVS_UNUSED,
937
                                const char *argv[] OVS_UNUSED,
938
                                void *aux OVS_UNUSED)
939
0
{
940
0
    struct ds reply = DS_EMPTY_INITIALIZER;
941
942
0
    dpcls_impl_print_stats(&reply);
943
0
    unixctl_command_reply(conn, ds_cstr(&reply));
944
0
    ds_destroy(&reply);
945
0
}
946
947
static void
948
dpif_netdev_subtable_lookup_set(struct unixctl_conn *conn, int argc OVS_UNUSED,
949
                                const char *argv[], void *aux OVS_UNUSED)
950
0
{
951
    /* This function requires 2 parameters (argv[1] and argv[2]) to execute.
952
     *   argv[1] is subtable name
953
     *   argv[2] is priority
954
     */
955
0
    const char *func_name = argv[1];
956
957
0
    errno = 0;
958
0
    char *err_char;
959
0
    uint32_t new_prio = strtoul(argv[2], &err_char, 10);
960
0
    uint32_t lookup_dpcls_changed = 0;
961
0
    uint32_t lookup_subtable_changed = 0;
962
0
    struct shash_node *node;
963
0
    if (errno != 0 || new_prio > UINT8_MAX) {
964
0
        unixctl_command_reply_error(conn,
965
0
            "error converting priority, use integer in range 0-255\n");
966
0
        return;
967
0
    }
968
969
0
    int32_t err = dpcls_subtable_set_prio(func_name, new_prio);
970
0
    if (err) {
971
0
        unixctl_command_reply_error(conn,
972
0
            "error, subtable lookup function not found\n");
973
0
        return;
974
0
    }
975
976
0
    ovs_mutex_lock(&dp_netdev_mutex);
977
0
    SHASH_FOR_EACH (node, &dp_netdevs) {
978
0
        struct dp_netdev *dp = node->data;
979
980
        /* Get PMD threads list, required to get DPCLS instances. */
981
0
        size_t n;
982
0
        struct dp_netdev_pmd_thread **pmd_list;
983
0
        sorted_poll_thread_list(dp, &pmd_list, &n);
984
985
        /* take port mutex as HMAP iters over them. */
986
0
        ovs_rwlock_rdlock(&dp->port_rwlock);
987
988
0
        for (size_t i = 0; i < n; i++) {
989
0
            struct dp_netdev_pmd_thread *pmd = pmd_list[i];
990
0
            if (pmd->core_id == NON_PMD_CORE_ID) {
991
0
                continue;
992
0
            }
993
994
0
            struct dp_netdev_port *port = NULL;
995
0
            HMAP_FOR_EACH (port, node, &dp->ports) {
996
0
                odp_port_t in_port = port->port_no;
997
0
                struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
998
0
                if (!cls) {
999
0
                    continue;
1000
0
                }
1001
0
                ovs_mutex_lock(&pmd->flow_mutex);
1002
0
                uint32_t subtbl_changes = dpcls_subtable_lookup_reprobe(cls);
1003
0
                ovs_mutex_unlock(&pmd->flow_mutex);
1004
0
                if (subtbl_changes) {
1005
0
                    lookup_dpcls_changed++;
1006
0
                    lookup_subtable_changed += subtbl_changes;
1007
0
                }
1008
0
            }
1009
0
        }
1010
1011
        /* release port mutex before netdev mutex. */
1012
0
        ovs_rwlock_unlock(&dp->port_rwlock);
1013
0
        free(pmd_list);
1014
0
    }
1015
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1016
1017
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1018
0
    ds_put_format(&reply,
1019
0
        "Lookup priority change affected %d dpcls ports and %d subtables.\n",
1020
0
        lookup_dpcls_changed, lookup_subtable_changed);
1021
0
    const char *reply_str = ds_cstr(&reply);
1022
0
    unixctl_command_reply(conn, reply_str);
1023
0
    VLOG_INFO("%s", reply_str);
1024
0
    ds_destroy(&reply);
1025
0
}
1026
1027
static void
1028
dpif_netdev_impl_get(struct unixctl_conn *conn, int argc OVS_UNUSED,
1029
                     const char *argv[] OVS_UNUSED, void *aux OVS_UNUSED)
1030
0
{
1031
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1032
0
    struct shash_node *node;
1033
1034
0
    ovs_mutex_lock(&dp_netdev_mutex);
1035
0
    SHASH_FOR_EACH (node, &dp_netdevs) {
1036
0
        struct dp_netdev_pmd_thread **pmd_list;
1037
0
        struct dp_netdev *dp = node->data;
1038
0
        size_t n;
1039
1040
        /* Get PMD threads list, required to get the DPIF impl used by each PMD
1041
         * thread. */
1042
0
        sorted_poll_thread_list(dp, &pmd_list, &n);
1043
0
        dp_netdev_impl_get(&reply, pmd_list, n);
1044
0
        free(pmd_list);
1045
0
    }
1046
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1047
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1048
0
    ds_destroy(&reply);
1049
0
}
1050
1051
static void
1052
dpif_netdev_impl_set(struct unixctl_conn *conn, int argc OVS_UNUSED,
1053
                     const char *argv[], void *aux OVS_UNUSED)
1054
0
{
1055
    /* This function requires just one parameter, the DPIF name. */
1056
0
    const char *dpif_name = argv[1];
1057
0
    struct shash_node *node;
1058
1059
0
    static const char *error_description[2] = {
1060
0
        "Unknown DPIF implementation",
1061
0
        "CPU doesn't support the required instruction for",
1062
0
    };
1063
1064
0
    ovs_mutex_lock(&dp_netdev_mutex);
1065
0
    int32_t err = dp_netdev_impl_set_default_by_name(dpif_name);
1066
1067
0
    if (err) {
1068
0
        struct ds reply = DS_EMPTY_INITIALIZER;
1069
0
        ds_put_format(&reply, "DPIF implementation not available: %s %s.\n",
1070
0
                      error_description[ (err == -ENOTSUP) ], dpif_name);
1071
0
        const char *reply_str = ds_cstr(&reply);
1072
0
        unixctl_command_reply_error(conn, reply_str);
1073
0
        VLOG_ERR("%s", reply_str);
1074
0
        ds_destroy(&reply);
1075
0
        ovs_mutex_unlock(&dp_netdev_mutex);
1076
0
        return;
1077
0
    }
1078
1079
0
    SHASH_FOR_EACH (node, &dp_netdevs) {
1080
0
        struct dp_netdev *dp = node->data;
1081
1082
        /* Get PMD threads list, required to get DPCLS instances. */
1083
0
        size_t n;
1084
0
        struct dp_netdev_pmd_thread **pmd_list;
1085
0
        sorted_poll_thread_list(dp, &pmd_list, &n);
1086
1087
0
        for (size_t i = 0; i < n; i++) {
1088
0
            struct dp_netdev_pmd_thread *pmd = pmd_list[i];
1089
0
            if (pmd->core_id == NON_PMD_CORE_ID) {
1090
0
                continue;
1091
0
            }
1092
1093
            /* Initialize DPIF function pointer to the newly configured
1094
             * default. */
1095
0
            atomic_store_relaxed(&pmd->netdev_input_func,
1096
0
                                 dp_netdev_impl_get_default());
1097
0
        };
1098
1099
0
        free(pmd_list);
1100
0
    }
1101
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1102
1103
    /* Reply with success to command. */
1104
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1105
0
    ds_put_format(&reply, "DPIF implementation set to %s.\n", dpif_name);
1106
0
    const char *reply_str = ds_cstr(&reply);
1107
0
    unixctl_command_reply(conn, reply_str);
1108
0
    VLOG_INFO("%s", reply_str);
1109
0
    ds_destroy(&reply);
1110
0
}
1111
1112
static void
1113
dpif_miniflow_extract_impl_get(struct unixctl_conn *conn, int argc OVS_UNUSED,
1114
                               const char *argv[] OVS_UNUSED,
1115
                               void *aux OVS_UNUSED)
1116
0
{
1117
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1118
0
    struct shash_node *node;
1119
1120
0
    ovs_mutex_lock(&dp_netdev_mutex);
1121
0
    SHASH_FOR_EACH (node, &dp_netdevs) {
1122
0
        struct dp_netdev_pmd_thread **pmd_list;
1123
0
        struct dp_netdev *dp = node->data;
1124
0
        size_t n;
1125
1126
        /* Get PMD threads list, required to get the DPIF impl used by each PMD
1127
         * thread. */
1128
0
        sorted_poll_thread_list(dp, &pmd_list, &n);
1129
0
        dp_mfex_impl_get(&reply, pmd_list, n);
1130
0
        free(pmd_list);
1131
0
    }
1132
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1133
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1134
0
    ds_destroy(&reply);
1135
0
}
1136
1137
/* unixctl handler for "dpif-netdev/miniflow-parser-set".
 * Usage: [-pmd core] name [study_pkt_cnt].
 * Parses all options first, then applies the requested MFEX implementation:
 * updates the process-wide default only when no "-pmd" filter was given,
 * and stores the function pointer on each matching PMD thread. */
static void
dpif_miniflow_extract_impl_set(struct unixctl_conn *conn, int argc,
                               const char *argv[], void *aux OVS_UNUSED)
{
    /* This command takes some optional and mandatory arguments. The function
     * here first parses all of the options, saving results in local variables.
     * Then the parsed values are acted on.
     */
    unsigned int pmd_thread_to_change = NON_PMD_CORE_ID;
    unsigned int study_count = MFEX_MAX_PKT_COUNT;
    struct ds reply = DS_EMPTY_INITIALIZER;
    bool pmd_thread_update_done = false;
    bool mfex_name_is_study = false;
    const char *mfex_name = NULL;
    const char *reply_str = NULL;
    struct shash_node *node;
    int err;

    while (argc > 1) {
        /* Optional argument "-pmd" limits the commands actions to just this
         * PMD thread.
         */
        if ((!strcmp(argv[1], "-pmd") && !mfex_name)) {
            if (argc < 3) {
                ds_put_format(&reply,
                              "Error: -pmd option requires a thread id"
                              " argument.\n");
                goto error;
            }

            /* Ensure argument can be parsed to an integer. */
            if (!str_to_uint(argv[2], 10, &pmd_thread_to_change) ||
                (pmd_thread_to_change == NON_PMD_CORE_ID)) {
                ds_put_format(&reply,
                              "Error: miniflow extract parser not changed,"
                              " PMD thread passed is not valid: '%s'."
                              " Pass a valid pmd thread ID.\n",
                              argv[2]);
                goto error;
            }

            argc -= 2;
            argv += 2;

        } else if (!mfex_name) {
            /* Name of MFEX impl requested by user. */
            mfex_name = argv[1];
            mfex_name_is_study = strcmp("study", mfex_name) == 0;
            argc -= 1;
            argv += 1;

        /* If name is study and more args exist, parse study_count value. */
        } else if (mfex_name && mfex_name_is_study) {
            if (!str_to_uint(argv[1], 10, &study_count) ||
                (study_count == 0)) {
                ds_put_format(&reply,
                              "Error: invalid study_pkt_cnt value: %s.\n",
                              argv[1]);
                goto error;
            }

            argc -= 1;
            argv += 1;
        } else {
            ds_put_format(&reply, "Error: unknown argument %s.\n", argv[1]);
            goto error;
        }
    }

    /* Ensure user passed an MFEX name. */
    if (!mfex_name) {
        ds_put_format(&reply, "Error: no miniflow extract name provided."
                      " Output of miniflow-parser-get shows implementation"
                      " list.\n");
        goto error;
    }

    /* If the MFEX name is "study", set the study packet count. */
    if (mfex_name_is_study) {
        err = mfex_set_study_pkt_cnt(study_count, mfex_name);
        if (err) {
            ds_put_format(&reply, "Error: failed to set study count %d for"
                          " miniflow extract implementation %s.\n",
                          study_count, mfex_name);
            goto error;
        }
    }

    /* Set the default MFEX impl only if the command was applied to all PMD
     * threads. If a PMD thread was selected, do NOT update the default.
     */
    if (pmd_thread_to_change == NON_PMD_CORE_ID) {
        err = dp_mfex_impl_set_default_by_name(mfex_name);
        if (err == -ENODEV) {
            ds_put_format(&reply,
                          "Error: miniflow extract not available due to CPU"
                          " ISA requirements: %s",
                          mfex_name);
            goto error;
        } else if (err) {
            ds_put_format(&reply,
                          "Error: unknown miniflow extract implementation %s.",
                          mfex_name);
            goto error;
        }
    }

    /* Get the desired MFEX function pointer and error check its usage. */
    miniflow_extract_func mfex_func = NULL;
    err = dp_mfex_impl_get_by_name(mfex_name, &mfex_func);
    if (err) {
        if (err == -ENODEV) {
            ds_put_format(&reply,
                          "Error: miniflow extract not available due to CPU"
                          " ISA requirements: %s", mfex_name);
        } else {
            ds_put_format(&reply,
                          "Error: unknown miniflow extract implementation %s.",
                          mfex_name);
        }
        goto error;
    }

    /* Apply the MFEX pointer to each pmd thread in each netdev, filtering
     * by the users "-pmd" argument if required.
     */
    ovs_mutex_lock(&dp_netdev_mutex);

    SHASH_FOR_EACH (node, &dp_netdevs) {
        struct dp_netdev_pmd_thread **pmd_list;
        struct dp_netdev *dp = node->data;
        size_t n;

        sorted_poll_thread_list(dp, &pmd_list, &n);

        for (size_t i = 0; i < n; i++) {
            struct dp_netdev_pmd_thread *pmd = pmd_list[i];
            if (pmd->core_id == NON_PMD_CORE_ID) {
                continue;
            }

            /* If -pmd specified, skip all other pmd threads. */
            if ((pmd_thread_to_change != NON_PMD_CORE_ID) &&
                (pmd->core_id != pmd_thread_to_change)) {
                continue;
            }

            pmd_thread_update_done = true;
            atomic_store_relaxed(&pmd->miniflow_extract_opt, mfex_func);
        };

        free(pmd_list);
    }

    ovs_mutex_unlock(&dp_netdev_mutex);

    /* If PMD thread was specified, but it wasn't found, return error. */
    if (pmd_thread_to_change != NON_PMD_CORE_ID && !pmd_thread_update_done) {
        ds_put_format(&reply,
                      "Error: miniflow extract parser not changed, "
                      "PMD thread %d not in use, pass a valid pmd"
                      " thread ID.\n", pmd_thread_to_change);
        goto error;
    }

    /* Reply with success to command. */
    ds_put_format(&reply, "Miniflow extract implementation set to %s",
                  mfex_name);
    if (pmd_thread_to_change != NON_PMD_CORE_ID) {
        ds_put_format(&reply, ", on pmd thread %d", pmd_thread_to_change);
    }
    if (mfex_name_is_study) {
        ds_put_format(&reply, ", studying %d packets", study_count);
    }
    ds_put_format(&reply, ".\n");

    reply_str = ds_cstr(&reply);
    VLOG_INFO("%s", reply_str);
    unixctl_command_reply(conn, reply_str);
    ds_destroy(&reply);
    return;

error:
    reply_str = ds_cstr(&reply);
    VLOG_ERR("%s", reply_str);
    unixctl_command_reply_error(conn, reply_str);
    ds_destroy(&reply);
}
1325
1326
static void
1327
dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc,
1328
                          const char *argv[], void *aux OVS_UNUSED)
1329
0
{
1330
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1331
0
    struct dp_netdev *dp = NULL;
1332
1333
0
    ovs_mutex_lock(&dp_netdev_mutex);
1334
1335
0
    if (argc == 2) {
1336
0
        dp = shash_find_data(&dp_netdevs, argv[1]);
1337
0
    } else if (shash_count(&dp_netdevs) == 1) {
1338
        /* There's only one datapath */
1339
0
        dp = shash_first(&dp_netdevs)->data;
1340
0
    }
1341
1342
0
    if (!dp) {
1343
0
        ovs_mutex_unlock(&dp_netdev_mutex);
1344
0
        unixctl_command_reply_error(conn,
1345
0
                                    "please specify an existing datapath");
1346
0
        return;
1347
0
    }
1348
1349
0
    dp_netdev_request_reconfigure(dp);
1350
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1351
0
    ds_put_cstr(&reply, "pmd rxq rebalance requested.\n");
1352
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1353
0
    ds_destroy(&reply);
1354
0
}
1355
1356
static void
1357
pmd_info_show_sleep(struct ds *reply, unsigned core_id, int numa_id,
1358
                    uint64_t pmd_max_sleep)
1359
0
{
1360
0
    if (core_id == NON_PMD_CORE_ID) {
1361
0
        return;
1362
0
    }
1363
0
    ds_put_format(reply,
1364
0
                  "pmd thread numa_id %d core_id %d:\n"
1365
0
                  "  max sleep: %4"PRIu64" us\n",
1366
0
                  numa_id, core_id, pmd_max_sleep);
1367
0
}
1368
1369
/* Shared unixctl handler for the pmd-stats-show / pmd-stats-clear /
 * pmd-rxq-show / pmd-perf-show / pmd-sleep-show commands.  'aux' points to
 * the pmd_info_type selecting the action (for PMD_INFO_PERF_SHOW it is a
 * whole struct pmd_perf_params whose first member is the type).
 * Accepts optional "-pmd core" and (rxq mode only) "-secs secs" switches
 * plus an optional datapath name. */
static void
dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
                     void *aux)
{
    struct ds reply = DS_EMPTY_INITIALIZER;
    struct dp_netdev_pmd_thread **pmd_list;
    struct dp_netdev *dp = NULL;
    enum pmd_info_type type = *(enum pmd_info_type *) aux;
    unsigned int core_id;
    bool filter_on_pmd = false;
    size_t n;
    unsigned int secs = 0;
    /* Longest look-back the stored measurement intervals can cover. */
    unsigned long long max_secs = (PMD_INTERVAL_LEN * PMD_INTERVAL_MAX)
                                      / INTERVAL_USEC_TO_SEC;
    bool show_header = true;
    uint64_t max_sleep;

    ovs_mutex_lock(&dp_netdev_mutex);

    while (argc > 1) {
        if (!strcmp(argv[1], "-pmd") && argc > 2) {
            /* 'core_id' is only read when filter_on_pmd is set, i.e. after
             * a successful parse here. */
            if (str_to_uint(argv[2], 10, &core_id)) {
                filter_on_pmd = true;
            }
            argc -= 2;
            argv += 2;
        } else if (type == PMD_INFO_SHOW_RXQ &&
                       !strcmp(argv[1], "-secs") &&
                       argc > 2) {
            if (!str_to_uint(argv[2], 10, &secs)) {
                secs = max_secs;
            }
            argc -= 2;
            argv += 2;
        } else {
            /* Any other token is treated as a datapath name. */
            dp = shash_find_data(&dp_netdevs, argv[1]);
            argc -= 1;
            argv += 1;
        }
    }

    if (!dp) {
        if (shash_count(&dp_netdevs) == 1) {
            /* There's only one datapath */
            dp = shash_first(&dp_netdevs)->data;
        } else {
            ovs_mutex_unlock(&dp_netdev_mutex);
            unixctl_command_reply_error(conn,
                                        "please specify an existing datapath");
            return;
        }
    }

    sorted_poll_thread_list(dp, &pmd_list, &n);
    for (size_t i = 0; i < n; i++) {
        struct dp_netdev_pmd_thread *pmd = pmd_list[i];
        if (!pmd) {
            break;
        }
        if (filter_on_pmd && pmd->core_id != core_id) {
            continue;
        }
        if (type == PMD_INFO_SHOW_RXQ) {
            if (show_header) {
                /* Clamp the requested window to what is stored, rounding
                 * up to a whole number of measurement intervals. */
                if (!secs || secs > max_secs) {
                    secs = max_secs;
                } else {
                    secs = ROUND_UP(secs,
                                    PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC);
                }
                ds_put_format(&reply, "Displaying last %u seconds "
                              "pmd usage %%\n", secs);
                show_header = false;
            }
            pmd_info_show_rxq(&reply, pmd, secs);
        } else if (type == PMD_INFO_CLEAR_STATS) {
            pmd_perf_stats_clear(&pmd->perf_stats);
        } else if (type == PMD_INFO_SHOW_STATS) {
            pmd_info_show_stats(&reply, pmd);
        } else if (type == PMD_INFO_PERF_SHOW) {
            pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux);
        } else if (type == PMD_INFO_SLEEP_SHOW) {
            if (show_header) {
                ds_put_format(&reply, "Default max sleep: %4"PRIu64" us\n",
                              dp->pmd_max_sleep_default);
                show_header = false;
            }
            atomic_read_relaxed(&pmd->max_sleep, &max_sleep);
            pmd_info_show_sleep(&reply, pmd->core_id, pmd->numa_id,
                                max_sleep);
        }
    }
    free(pmd_list);

    ovs_mutex_unlock(&dp_netdev_mutex);

    unixctl_command_reply(conn, ds_cstr(&reply));
    ds_destroy(&reply);
}
1468
1469
static void
1470
pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
1471
                          const char *argv[],
1472
                          void *aux OVS_UNUSED)
1473
0
{
1474
0
    struct pmd_perf_params par;
1475
0
    long int it_hist = 0, ms_hist = 0;
1476
0
    par.histograms = true;
1477
1478
0
    while (argc > 1) {
1479
0
        if (!strcmp(argv[1], "-nh")) {
1480
0
            par.histograms = false;
1481
0
            argc -= 1;
1482
0
            argv += 1;
1483
0
        } else if (!strcmp(argv[1], "-it") && argc > 2) {
1484
0
            it_hist = strtol(argv[2], NULL, 10);
1485
0
            if (it_hist < 0) {
1486
0
                it_hist = 0;
1487
0
            } else if (it_hist > HISTORY_LEN) {
1488
0
                it_hist = HISTORY_LEN;
1489
0
            }
1490
0
            argc -= 2;
1491
0
            argv += 2;
1492
0
        } else if (!strcmp(argv[1], "-ms") && argc > 2) {
1493
0
            ms_hist = strtol(argv[2], NULL, 10);
1494
0
            if (ms_hist < 0) {
1495
0
                ms_hist = 0;
1496
0
            } else if (ms_hist > HISTORY_LEN) {
1497
0
                ms_hist = HISTORY_LEN;
1498
0
            }
1499
0
            argc -= 2;
1500
0
            argv += 2;
1501
0
        } else {
1502
0
            break;
1503
0
        }
1504
0
    }
1505
0
    par.iter_hist_len = it_hist;
1506
0
    par.ms_hist_len = ms_hist;
1507
0
    par.command_type = PMD_INFO_PERF_SHOW;
1508
0
    dpif_netdev_pmd_info(conn, argc, argv, &par);
1509
0
}
1510
1511
static void
1512
dpif_netdev_bond_show(struct unixctl_conn *conn, int argc,
1513
                      const char *argv[], void *aux OVS_UNUSED)
1514
0
{
1515
0
    struct ds reply = DS_EMPTY_INITIALIZER;
1516
0
    struct dp_netdev *dp = NULL;
1517
1518
0
    ovs_mutex_lock(&dp_netdev_mutex);
1519
0
    if (argc == 2) {
1520
0
        dp = shash_find_data(&dp_netdevs, argv[1]);
1521
0
    } else if (shash_count(&dp_netdevs) == 1) {
1522
        /* There's only one datapath. */
1523
0
        dp = shash_first(&dp_netdevs)->data;
1524
0
    }
1525
0
    if (!dp) {
1526
0
        ovs_mutex_unlock(&dp_netdev_mutex);
1527
0
        unixctl_command_reply_error(conn,
1528
0
                                    "please specify an existing datapath");
1529
0
        return;
1530
0
    }
1531
1532
0
    if (cmap_count(&dp->tx_bonds) > 0) {
1533
0
        struct tx_bond *dp_bond_entry;
1534
1535
0
        ds_put_cstr(&reply, "Bonds:\n");
1536
0
        CMAP_FOR_EACH (dp_bond_entry, node, &dp->tx_bonds) {
1537
0
            ds_put_format(&reply, "  bond-id %"PRIu32":\n",
1538
0
                          dp_bond_entry->bond_id);
1539
0
            for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
1540
0
                uint32_t member_id = odp_to_u32(
1541
0
                    dp_bond_entry->member_buckets[bucket].member_id);
1542
0
                ds_put_format(&reply,
1543
0
                              "    bucket %d - member %"PRIu32"\n",
1544
0
                              bucket, member_id);
1545
0
            }
1546
0
        }
1547
0
    }
1548
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1549
0
    unixctl_command_reply(conn, ds_cstr(&reply));
1550
0
    ds_destroy(&reply);
1551
0
}
1552
1553

1554
/* Registers all dpif-netdev unixctl (appctl) commands.  Called once during
 * datapath-interface initialization; always returns 0. */
static int
dpif_netdev_init(void)
{
    /* Each aux value selects the action taken by the shared
     * dpif_netdev_pmd_info() handler. */
    static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS,
                              clear_aux = PMD_INFO_CLEAR_STATS,
                              poll_aux = PMD_INFO_SHOW_RXQ,
                              sleep_aux = PMD_INFO_SLEEP_SHOW;

    unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]",
                             0, 3, dpif_netdev_pmd_info,
                             (void *)&show_aux);
    unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
                             0, 3, dpif_netdev_pmd_info,
                             (void *)&clear_aux);
    unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] "
                             "[-secs secs] [dp]",
                             0, 5, dpif_netdev_pmd_info,
                             (void *)&poll_aux);
    unixctl_command_register("dpif-netdev/pmd-sleep-show", "[dp]",
                             0, 1, dpif_netdev_pmd_info,
                             (void *)&sleep_aux);
    unixctl_command_register("dpif-netdev/pmd-perf-show",
                             "[-nh] [-it iter-history-len]"
                             " [-ms ms-history-len]"
                             " [-pmd core] [dp]",
                             0, 8, pmd_perf_show_cmd,
                             NULL);
    unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
                             0, 1, dpif_netdev_pmd_rebalance,
                             NULL);
    unixctl_command_register("dpif-netdev/pmd-perf-log-set",
                             "on|off [-b before] [-a after] [-e|-ne] "
                             "[-us usec] [-q qlen]",
                             0, 10, pmd_perf_log_set_cmd,
                             NULL);
    unixctl_command_register("dpif-netdev/bond-show", "[dp]",
                             0, 1, dpif_netdev_bond_show,
                             NULL);
    unixctl_command_register("dpif-netdev/subtable-lookup-prio-set",
                             "[lookup_func] [prio]",
                             2, 2, dpif_netdev_subtable_lookup_set,
                             NULL);
    /* NOTE(review): the same handler is registered under two names below;
     * "subtable-lookup-prio-get" (NULL usage string) looks like a hidden
     * legacy alias of "subtable-lookup-info-get" -- confirm before
     * removing either. */
    unixctl_command_register("dpif-netdev/subtable-lookup-info-get", "",
                             0, 0, dpif_netdev_subtable_lookup_get,
                             NULL);
    unixctl_command_register("dpif-netdev/subtable-lookup-prio-get", NULL,
                             0, 0, dpif_netdev_subtable_lookup_get,
                             NULL);
    unixctl_command_register("dpif-netdev/dpif-impl-set",
                             "dpif_implementation_name",
                             1, 1, dpif_netdev_impl_set,
                             NULL);
    unixctl_command_register("dpif-netdev/dpif-impl-get", "",
                             0, 0, dpif_netdev_impl_get,
                             NULL);
    unixctl_command_register("dpif-netdev/miniflow-parser-set",
                             "[-pmd core] miniflow_implementation_name"
                             " [study_pkt_cnt]",
                             1, 5, dpif_miniflow_extract_impl_set,
                             NULL);
    unixctl_command_register("dpif-netdev/miniflow-parser-get", "",
                             0, 0, dpif_miniflow_extract_impl_get,
                             NULL);
    return 0;
}
1619
1620
static int
1621
dpif_netdev_enumerate(struct sset *all_dps,
1622
                      const struct dpif_class *dpif_class)
1623
0
{
1624
0
    struct shash_node *node;
1625
1626
0
    ovs_mutex_lock(&dp_netdev_mutex);
1627
0
    SHASH_FOR_EACH(node, &dp_netdevs) {
1628
0
        struct dp_netdev *dp = node->data;
1629
0
        if (dpif_class != dp->class) {
1630
            /* 'dp_netdevs' contains both "netdev" and "dummy" dpifs.
1631
             * If the class doesn't match, skip this dpif. */
1632
0
             continue;
1633
0
        }
1634
0
        sset_add(all_dps, node->name);
1635
0
    }
1636
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1637
1638
0
    return 0;
1639
0
}
1640
1641
/* Returns true if 'class' is not the real "netdev" dpif class, i.e. it is
 * one of the dummy classes used for testing. */
static bool
dpif_netdev_class_is_dummy(const struct dpif_class *class)
{
    return class != &dpif_netdev_class;
}
1646
1647
/* Maps the generic "internal" port type to the concrete netdev type used by
 * this datapath class: "tap" for the real netdev class, "dummy-internal" for
 * dummy classes.  All other types are passed through unchanged. */
static const char *
dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
{
    if (strcmp(type, "internal")) {
        return type;
    }
    return dpif_netdev_class_is_dummy(class) ? "dummy-internal" : "tap";
}
1654
1655
/* Wraps 'dp' in a freshly allocated dpif handle, taking a new reference to
 * 'dp'.  The NetFlow engine id/type are derived from a hash of the datapath
 * name.  Returns the embedded 'struct dpif'; ownership passes to the
 * caller, who releases it through dpif_netdev_close(). */
static struct dpif *
create_dpif_netdev(struct dp_netdev *dp)
{
    uint16_t netflow_id = hash_string(dp->name, 0);
    struct dpif_netdev *dpif;

    ovs_refcount_ref(&dp->ref_cnt);

    dpif = xmalloc(sizeof *dpif);
    dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
    dpif->dp = dp;
    /* Snapshot the port sequence so later port changes can be detected. */
    dpif->last_port_seq = seq_read(dp->port_seq);

    return &dpif->dpif;
}
1670
1671
/* Choose an unused, non-zero port number and return it on success.
1672
 * Return ODPP_NONE on failure. */
1673
static odp_port_t
1674
choose_port(struct dp_netdev *dp, const char *name)
1675
    OVS_REQ_RDLOCK(dp->port_rwlock)
1676
0
{
1677
0
    uint32_t port_no;
1678
1679
0
    if (dp->class != &dpif_netdev_class) {
1680
0
        const char *p;
1681
0
        int start_no = 0;
1682
1683
        /* If the port name begins with "br", start the number search at
1684
         * 100 to make writing tests easier. */
1685
0
        if (!strncmp(name, "br", 2)) {
1686
0
            start_no = 100;
1687
0
        }
1688
1689
        /* If the port name contains a number, try to assign that port number.
1690
         * This can make writing unit tests easier because port numbers are
1691
         * predictable. */
1692
0
        for (p = name; *p != '\0'; p++) {
1693
0
            if (isdigit((unsigned char) *p)) {
1694
0
                port_no = start_no + strtol(p, NULL, 10);
1695
0
                if (port_no > 0 && port_no != odp_to_u32(ODPP_NONE)
1696
0
                    && !dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1697
0
                    return u32_to_odp(port_no);
1698
0
                }
1699
0
                break;
1700
0
            }
1701
0
        }
1702
0
    }
1703
1704
0
    for (port_no = 1; port_no <= UINT16_MAX; port_no++) {
1705
0
        if (!dp_netdev_lookup_port(dp, u32_to_odp(port_no))) {
1706
0
            return u32_to_odp(port_no);
1707
0
        }
1708
0
    }
1709
1710
0
    return ODPP_NONE;
1711
0
}
1712
1713
/* Hash for indexing meters by id.
 *
 * In the ofproto-dpif layer, the id-pool allocates meter ids sequentially
 * (1, 2, ..., N), which already gives a good hash distribution, so the id
 * is returned as its own hash for performance. */
static uint32_t
dp_meter_hash(uint32_t meter_id)
{
    return meter_id;
}
1722
1723
static void
1724
dp_netdev_meter_destroy(struct dp_netdev *dp)
1725
0
{
1726
0
    struct dp_meter *m;
1727
1728
0
    ovs_mutex_lock(&dp->meters_lock);
1729
0
    CMAP_FOR_EACH (m, node, &dp->meters) {
1730
0
        cmap_remove(&dp->meters, &m->node, dp_meter_hash(m->id));
1731
0
        ovsrcu_postpone(free, m);
1732
0
    }
1733
1734
0
    cmap_destroy(&dp->meters);
1735
0
    ovs_mutex_unlock(&dp->meters_lock);
1736
0
    ovs_mutex_destroy(&dp->meters_lock);
1737
0
}
1738
1739
static struct dp_meter *
1740
dp_meter_lookup(struct cmap *meters, uint32_t meter_id)
1741
0
{
1742
0
    uint32_t hash = dp_meter_hash(meter_id);
1743
0
    struct dp_meter *m;
1744
1745
0
    CMAP_FOR_EACH_WITH_HASH (m, node, hash, meters) {
1746
0
        if (m->id == meter_id) {
1747
0
            return m;
1748
0
        }
1749
0
    }
1750
1751
0
    return NULL;
1752
0
}
1753
1754
static void
1755
dp_meter_detach_free(struct cmap *meters, uint32_t meter_id)
1756
0
{
1757
0
    struct dp_meter *m = dp_meter_lookup(meters, meter_id);
1758
1759
0
    if (m) {
1760
0
        cmap_remove(meters, &m->node, dp_meter_hash(meter_id));
1761
0
        ovsrcu_postpone(free, m);
1762
0
    }
1763
0
}
1764
1765
/* Inserts 'meter' into 'meters', hashed by its id.  'meter' must not
 * already be in the map. */
static void
dp_meter_attach(struct cmap *meters, struct dp_meter *meter)
{
    cmap_insert(meters, &meter->node, dp_meter_hash(meter->id));
}
1770
1771
/* Creates a new dp_netdev named 'name' of class 'class', registers it in
 * the global 'dp_netdevs' shash, and stores it in '*dpp' on success.
 * Returns 0 or a positive errno value; on failure the partially built
 * datapath is torn down with dp_netdev_free().  Caller must hold
 * 'dp_netdev_mutex'. */
static int
create_dp_netdev(const char *name, const struct dpif_class *class,
                 struct dp_netdev **dpp)
    OVS_REQUIRES(dp_netdev_mutex)
{
    static struct ovsthread_once tsc_freq_check = OVSTHREAD_ONCE_INITIALIZER;
    struct dp_netdev *dp;
    int error;

    /* Avoid estimating TSC frequency for dummy datapath to not slow down
     * unit tests. */
    if (!dpif_netdev_class_is_dummy(class)
        && ovsthread_once_start(&tsc_freq_check)) {
        pmd_perf_estimate_tsc_frequency();
        ovsthread_once_done(&tsc_freq_check);
    }

    dp = xzalloc(sizeof *dp);
    shash_add(&dp_netdevs, name, dp);

    /* 'class', 'name' and 'full_name' are const in the struct; CONST_CAST
     * is used to set them exactly once, here at construction. */
    *CONST_CAST(const struct dpif_class **, &dp->class) = class;
    *CONST_CAST(const char **, &dp->name) = xstrdup(name);
    *CONST_CAST(const char **, &dp->full_name) = xasprintf("%s@%s",
                                                           class->type, name);
    ovs_refcount_init(&dp->ref_cnt);
    atomic_flag_clear(&dp->destroyed);

    ovs_rwlock_init(&dp->port_rwlock);
    hmap_init(&dp->ports);
    dp->port_seq = seq_create();
    ovs_mutex_init(&dp->bond_mutex);
    cmap_init(&dp->tx_bonds);

    fat_rwlock_init(&dp->upcall_rwlock);

    dp->reconfigure_seq = seq_create();
    dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);
    dp->once_set_config = (struct ovsthread_once) OVSTHREAD_ONCE_INITIALIZER;

    /* Init meter resources. */
    cmap_init(&dp->meters);
    ovs_mutex_init(&dp->meters_lock);

    /* Disable upcalls by default. */
    dp_netdev_disable_upcall(dp);
    dp->upcall_aux = NULL;
    dp->upcall_cb = NULL;

    dp->conntrack = conntrack_init();

    dpif_miniflow_extract_init();

    atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
    atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);

    cmap_init(&dp->poll_threads);
    dp->pmd_rxq_assign_type = SCHED_CYCLES;

    ovs_mutex_init(&dp->tx_qid_pool_mutex);
    /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
    dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);

    ovs_mutex_init_recursive(&dp->non_pmd_mutex);
    ovsthread_key_create(&dp->per_pmd_key, NULL);

    ovs_rwlock_wrlock(&dp->port_rwlock);
    /* non-PMD will be created before all other threads and will
     * allocate static_tx_qid = 0. */
    dp_netdev_set_nonpmd(dp);

    /* Add the datapath-local port under the local port number. */
    error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
                                                             "internal"),
                        ODPP_LOCAL);
    ovs_rwlock_unlock(&dp->port_rwlock);
    if (error) {
        dp_netdev_free(dp);
        return error;
    }

    dp->max_sleep_list = NULL;

    dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
    *dpp = dp;
    return 0;
}
1856
1857
/* Requests a datapath reconfiguration by bumping 'dp''s reconfiguration
 * sequence number; dp_netdev_is_reconf_required() then returns true until
 * the reconfiguration is performed. */
static void
dp_netdev_request_reconfigure(struct dp_netdev *dp)
{
    seq_change(dp->reconfigure_seq);
}
1862
1863
/* Returns true if a reconfiguration has been requested since 'dp' last
 * recorded one (see dp_netdev_request_reconfigure()). */
static bool
dp_netdev_is_reconf_required(struct dp_netdev *dp)
{
    return seq_read(dp->reconfigure_seq) != dp->last_reconfigure_seq;
}
1868
1869
static int
1870
dpif_netdev_open(const struct dpif_class *class, const char *name,
1871
                 bool create, struct dpif **dpifp)
1872
0
{
1873
0
    struct dp_netdev *dp;
1874
0
    int error;
1875
1876
0
    ovs_mutex_lock(&dp_netdev_mutex);
1877
0
    dp = shash_find_data(&dp_netdevs, name);
1878
0
    if (!dp) {
1879
0
        error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
1880
0
    } else {
1881
0
        error = (dp->class != class ? EINVAL
1882
0
                 : create ? EEXIST
1883
0
                 : 0);
1884
0
    }
1885
0
    if (!error) {
1886
0
        *dpifp = create_dpif_netdev(dp);
1887
0
    }
1888
0
    ovs_mutex_unlock(&dp_netdev_mutex);
1889
1890
0
    return error;
1891
0
}
1892
1893
/* Destroys 'dp''s upcall rwlock.  The lock must already be taken (upcalls
 * disabled); it is released first so it can be destroyed safely. */
static void
dp_netdev_destroy_upcall_lock(struct dp_netdev *dp)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    /* Check that upcalls are disabled, i.e. that the rwlock is taken: a
     * nonzero result here means the try-lock failed, proving the lock is
     * currently held. */
    ovs_assert(fat_rwlock_tryrdlock(&dp->upcall_rwlock));

    /* Before freeing a lock we should release it */
    fat_rwlock_unlock(&dp->upcall_rwlock);
    fat_rwlock_destroy(&dp->upcall_rwlock);
}
1904
1905
/* Hash used to index 'tx_bonds' entries by bond id. */
static uint32_t
hash_bond_id(uint32_t bond_id)
{
    return hash_int(bond_id, 0);
}
1910
1911
/* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
 * through the 'dp_netdevs' shash while freeing 'dp'.
 *
 * Tears down 'dp' completely: ports, tx bonds, PMD threads, conntrack,
 * meters, locks and strings.  The teardown order follows the dependencies
 * between the pieces (e.g. ports and PMDs go before the structures they
 * use), so do not reorder casually. */
static void
dp_netdev_free(struct dp_netdev *dp)
    OVS_REQUIRES(dp_netdev_mutex)
{
    struct dp_netdev_port *port;
    struct tx_bond *bond;

    /* Unpublish 'dp' so no new references can be taken. */
    shash_find_and_delete(&dp_netdevs, dp->name);

    ovs_rwlock_wrlock(&dp->port_rwlock);
    HMAP_FOR_EACH_SAFE (port, node, &dp->ports) {
        do_del_port(dp, port);
    }
    ovs_rwlock_unlock(&dp->port_rwlock);

    ovs_mutex_lock(&dp->bond_mutex);
    CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
        cmap_remove(&dp->tx_bonds, &bond->node, hash_bond_id(bond->bond_id));
        /* RCU-defer the free: PMD threads may still be reading. */
        ovsrcu_postpone(free, bond);
    }
    ovs_mutex_unlock(&dp->bond_mutex);

    dp_netdev_destroy_all_pmds(dp, true);
    cmap_destroy(&dp->poll_threads);

    ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
    id_pool_destroy(dp->tx_qid_pool);

    ovs_mutex_destroy(&dp->non_pmd_mutex);
    ovsthread_key_delete(dp->per_pmd_key);

    conntrack_destroy(dp->conntrack);

    seq_destroy(dp->reconfigure_seq);
    ovsthread_once_destroy(&dp->once_set_config);

    seq_destroy(dp->port_seq);
    hmap_destroy(&dp->ports);
    ovs_rwlock_destroy(&dp->port_rwlock);

    cmap_destroy(&dp->tx_bonds);
    ovs_mutex_destroy(&dp->bond_mutex);

    /* Upcalls must be disabled at this point */
    dp_netdev_destroy_upcall_lock(dp);

    dp_netdev_meter_destroy(dp);

    free(dp->max_sleep_list);
    free(dp->pmd_cmask);
    free(CONST_CAST(char *, dp->name));
    free(CONST_CAST(char *, dp->full_name));
    free(dp);
}
1968
1969
static void
1970
dp_netdev_unref(struct dp_netdev *dp)
1971
0
{
1972
0
    if (dp) {
1973
        /* Take dp_netdev_mutex so that, if dp->ref_cnt falls to zero, we can't
1974
         * get a new reference to 'dp' through the 'dp_netdevs' shash. */
1975
0
        ovs_mutex_lock(&dp_netdev_mutex);
1976
0
        if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
1977
0
            dp_netdev_free(dp);
1978
0
        }
1979
0
        ovs_mutex_unlock(&dp_netdev_mutex);
1980
0
    }
1981
0
}
1982
1983
/* Releases the reference that 'dpif' holds on its datapath and frees the
 * handle itself. */
static void
dpif_netdev_close(struct dpif *dpif)
{
    dp_netdev_unref(get_dp_netdev(dpif));
    free(dpif);
}
1991
1992
/* Marks 'dpif''s datapath as destroyed.  The atomic test-and-set on
 * 'destroyed' guarantees that only the first caller drops the reference.
 * Always returns 0. */
static int
dpif_netdev_destroy(struct dpif *dpif)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);

    if (!atomic_flag_test_and_set(&dp->destroyed)) {
        if (ovs_refcount_unref_relaxed(&dp->ref_cnt) == 1) {
            /* Can't happen: 'dpif' still owns a reference to 'dp'. */
            OVS_NOT_REACHED();
        }
    }

    return 0;
}
2006
2007
/* Add 'n' to the atomic variable 'var' non-atomically and using relaxed
2008
 * load/store semantics.  While the increment is not atomic, the load and
2009
 * store operations are, making it impossible to read inconsistent values.
2010
 *
2011
 * This is used to update thread local stats counters. */
2012
static void
2013
non_atomic_ullong_add(atomic_ullong *var, unsigned long long n)
2014
0
{
2015
0
    unsigned long long tmp;
2016
2017
0
    atomic_read_relaxed(var, &tmp);
2018
0
    tmp += n;
2019
0
    atomic_store_relaxed(var, tmp);
2020
0
}
2021
2022
static int
2023
dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
2024
0
{
2025
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2026
0
    struct dp_netdev_pmd_thread *pmd;
2027
0
    uint64_t pmd_stats[PMD_N_STATS];
2028
2029
0
    stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
2030
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2031
0
        stats->n_flows += cmap_count(&pmd->flow_table);
2032
0
        pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
2033
0
        stats->n_hit += pmd_stats[PMD_STAT_PHWOL_HIT];
2034
0
        stats->n_hit += pmd_stats[PMD_STAT_SIMPLE_HIT];
2035
0
        stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
2036
0
        stats->n_hit += pmd_stats[PMD_STAT_SMC_HIT];
2037
0
        stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
2038
0
        stats->n_missed += pmd_stats[PMD_STAT_MISS];
2039
0
        stats->n_lost += pmd_stats[PMD_STAT_LOST];
2040
0
    }
2041
0
    stats->n_masks = UINT32_MAX;
2042
0
    stats->n_mask_hit = UINT64_MAX;
2043
0
    stats->n_cache_hit = UINT64_MAX;
2044
2045
0
    return 0;
2046
0
}
2047
2048
static void
2049
dp_netdev_reload_pmd__(struct dp_netdev_pmd_thread *pmd)
2050
0
{
2051
0
    if (pmd->core_id == NON_PMD_CORE_ID) {
2052
0
        ovs_mutex_lock(&pmd->dp->non_pmd_mutex);
2053
0
        ovs_mutex_lock(&pmd->port_mutex);
2054
0
        pmd_load_cached_ports(pmd);
2055
0
        ovs_mutex_unlock(&pmd->port_mutex);
2056
0
        ovs_mutex_unlock(&pmd->dp->non_pmd_mutex);
2057
0
        return;
2058
0
    }
2059
2060
0
    seq_change(pmd->reload_seq);
2061
0
    atomic_store_explicit(&pmd->reload, true, memory_order_release);
2062
0
}
2063
2064
/* Hash used to index ports and per-port classifiers by port number. */
static uint32_t
hash_port_no(odp_port_t port_no)
{
    return hash_int(odp_to_u32(port_no), 0);
}
2069
2070
static int
2071
port_create(const char *devname, const char *type,
2072
            odp_port_t port_no, struct dp_netdev_port **portp)
2073
0
{
2074
0
    struct dp_netdev_port *port;
2075
0
    enum netdev_flags flags;
2076
0
    struct netdev *netdev;
2077
0
    int error;
2078
2079
0
    *portp = NULL;
2080
2081
    /* Open and validate network device. */
2082
0
    error = netdev_open(devname, type, &netdev);
2083
0
    if (error) {
2084
0
        return error;
2085
0
    }
2086
    /* XXX reject non-Ethernet devices */
2087
2088
0
    netdev_get_flags(netdev, &flags);
2089
0
    if (flags & NETDEV_LOOPBACK) {
2090
0
        VLOG_ERR("%s: cannot add a loopback device", devname);
2091
0
        error = EINVAL;
2092
0
        goto out;
2093
0
    }
2094
2095
0
    port = xzalloc(sizeof *port);
2096
0
    port->port_no = port_no;
2097
0
    port->netdev = netdev;
2098
0
    port->type = xstrdup(type);
2099
0
    port->sf = NULL;
2100
0
    port->emc_enabled = true;
2101
0
    port->need_reconfigure = true;
2102
0
    ovs_mutex_init(&port->txq_used_mutex);
2103
2104
0
    *portp = port;
2105
2106
0
    return 0;
2107
2108
0
out:
2109
0
    netdev_close(netdev);
2110
0
    return error;
2111
0
}
2112
2113
/* Adds a port named 'devname' of 'type' with number 'port_no' to 'dp' and
 * reconfigures the datapath to start using it.  Returns 0 or a positive
 * errno (EEXIST if the device is already in 'dp').  Caller must hold the
 * port write-lock. */
static int
do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
            odp_port_t port_no)
    OVS_REQ_WRLOCK(dp->port_rwlock)
{
    struct netdev_saved_flags *sf;
    struct dp_netdev_port *port;
    int error;

    /* Reject devices already in 'dp'. */
    if (!get_port_by_name(dp, devname, &port)) {
        return EEXIST;
    }

    error = port_create(devname, type, port_no, &port);
    if (error) {
        return error;
    }

    hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
    seq_change(dp->port_seq);

    reconfigure_datapath(dp);

    /* Check that port was successfully configured.  NOTE(review): if it was
     * not, the port is no longer in the map; presumably the reconfiguration
     * path already disposed of it — confirm in reconfigure_datapath(). */
    if (!dp_netdev_lookup_port(dp, port_no)) {
        return EINVAL;
    }

    /* Updating device flags triggers an if_notifier, which triggers a bridge
     * reconfiguration and another attempt to add this port, leading to an
     * infinite loop if the device is configured incorrectly and cannot be
     * added.  Setting the promisc mode after a successful reconfiguration,
     * since we already know that the device is somehow properly configured. */
    error = netdev_turn_flags_on(port->netdev, NETDEV_PROMISC, &sf);
    if (error) {
        VLOG_ERR("%s: cannot set promisc flag", devname);
        do_del_port(dp, port);
        return error;
    }
    port->sf = sf;

    return 0;
}
2157
2158
static int
2159
dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
2160
                     odp_port_t *port_nop)
2161
0
{
2162
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2163
0
    char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
2164
0
    const char *dpif_port;
2165
0
    odp_port_t port_no;
2166
0
    int error;
2167
2168
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
2169
0
    dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
2170
0
    if (*port_nop != ODPP_NONE) {
2171
0
        port_no = *port_nop;
2172
0
        error = dp_netdev_lookup_port(dp, *port_nop) ? EBUSY : 0;
2173
0
    } else {
2174
0
        port_no = choose_port(dp, dpif_port);
2175
0
        error = port_no == ODPP_NONE ? EFBIG : 0;
2176
0
    }
2177
0
    if (!error) {
2178
0
        *port_nop = port_no;
2179
0
        error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
2180
0
    }
2181
0
    ovs_rwlock_unlock(&dp->port_rwlock);
2182
2183
0
    return error;
2184
0
}
2185
2186
static int
2187
dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
2188
0
{
2189
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2190
0
    int error;
2191
2192
0
    ovs_rwlock_wrlock(&dp->port_rwlock);
2193
0
    if (port_no == ODPP_LOCAL) {
2194
0
        error = EINVAL;
2195
0
    } else {
2196
0
        struct dp_netdev_port *port;
2197
2198
0
        error = get_port_by_number(dp, port_no, &port);
2199
0
        if (!error) {
2200
0
            do_del_port(dp, port);
2201
0
        }
2202
0
    }
2203
0
    ovs_rwlock_unlock(&dp->port_rwlock);
2204
2205
0
    return error;
2206
0
}
2207
2208
/* Returns true unless 'port_no' is the ODPP_NONE sentinel. */
static bool
is_valid_port_number(odp_port_t port_no)
{
    return port_no != ODPP_NONE;
}
2213
2214
static struct dp_netdev_port *
2215
dp_netdev_lookup_port(const struct dp_netdev *dp, odp_port_t port_no)
2216
    OVS_REQ_RDLOCK(dp->port_rwlock)
2217
0
{
2218
0
    struct dp_netdev_port *port;
2219
2220
0
    HMAP_FOR_EACH_WITH_HASH (port, node, hash_port_no(port_no), &dp->ports) {
2221
0
        if (port->port_no == port_no) {
2222
0
            return port;
2223
0
        }
2224
0
    }
2225
0
    return NULL;
2226
0
}
2227
2228
static int
2229
get_port_by_number(struct dp_netdev *dp,
2230
                   odp_port_t port_no, struct dp_netdev_port **portp)
2231
    OVS_REQ_RDLOCK(dp->port_rwlock)
2232
0
{
2233
0
    if (!is_valid_port_number(port_no)) {
2234
0
        *portp = NULL;
2235
0
        return EINVAL;
2236
0
    } else {
2237
0
        *portp = dp_netdev_lookup_port(dp, port_no);
2238
0
        return *portp ? 0 : ENODEV;
2239
0
    }
2240
0
}
2241
2242
static void
2243
port_destroy(struct dp_netdev_port *port)
2244
0
{
2245
0
    if (!port) {
2246
0
        return;
2247
0
    }
2248
2249
0
    netdev_close(port->netdev);
2250
0
    netdev_restore_flags(port->sf);
2251
2252
0
    for (unsigned i = 0; i < port->n_rxq; i++) {
2253
0
        netdev_rxq_close(port->rxqs[i].rx);
2254
0
    }
2255
0
    ovs_mutex_destroy(&port->txq_used_mutex);
2256
0
    free(port->rxq_affinity_list);
2257
0
    free(port->txq_used);
2258
0
    free(port->rxqs);
2259
0
    free(port->type);
2260
0
    free(port);
2261
0
}
2262
2263
static int
2264
get_port_by_name(struct dp_netdev *dp,
2265
                 const char *devname, struct dp_netdev_port **portp)
2266
    OVS_REQ_RDLOCK(dp->port_rwlock)
2267
0
{
2268
0
    struct dp_netdev_port *port;
2269
2270
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
2271
0
        if (!strcmp(netdev_get_name(port->netdev), devname)) {
2272
0
            *portp = port;
2273
0
            return 0;
2274
0
        }
2275
0
    }
2276
2277
    /* Callers of dpif_netdev_port_query_by_name() expect ENODEV for a non
2278
     * existing port. */
2279
0
    return ENODEV;
2280
0
}
2281
2282
/* Returns 'true' if there is a port with pmd netdev. */
2283
static bool
2284
has_pmd_port(struct dp_netdev *dp)
2285
    OVS_REQ_RDLOCK(dp->port_rwlock)
2286
0
{
2287
0
    struct dp_netdev_port *port;
2288
2289
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
2290
0
        if (netdev_is_pmd(port->netdev)) {
2291
0
            return true;
2292
0
        }
2293
0
    }
2294
2295
0
    return false;
2296
0
}
2297
2298
/* Removes 'port' from 'dp' and destroys it.  Caller must hold the port
 * write-lock. */
static void
do_del_port(struct dp_netdev *dp, struct dp_netdev_port *port)
    OVS_REQ_WRLOCK(dp->port_rwlock)
{
    hmap_remove(&dp->ports, &port->node);
    seq_change(dp->port_seq);

    /* Reconfigure before destroying — NOTE(review): presumably so datapath
     * threads drop their references to the port first; confirm in
     * reconfigure_datapath(). */
    reconfigure_datapath(dp);
    port_destroy(port);
}
2308
2309
/* Fills 'dpif_port' with 'port''s name, type and number.  The strings are
 * freshly xstrdup()ed; ownership passes to the caller. */
static void
answer_port_query(const struct dp_netdev_port *port,
                  struct dpif_port *dpif_port)
{
    dpif_port->name = xstrdup(netdev_get_name(port->netdev));
    dpif_port->type = xstrdup(port->type);
    dpif_port->port_no = port->port_no;
}
2317
2318
/* Looks up port 'port_no' in 'dpif''s datapath and, when found and
 * 'dpif_port' is nonnull, fills it with malloc'ed copies of the port's
 * details.  Returns 0, EINVAL or ENODEV (see get_port_by_number()). */
static int
dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
                                 struct dpif_port *dpif_port)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_port *port;
    int error;

    ovs_rwlock_rdlock(&dp->port_rwlock);
    error = get_port_by_number(dp, port_no, &port);
    if (!error && dpif_port) {
        answer_port_query(port, dpif_port);
    }
    ovs_rwlock_unlock(&dp->port_rwlock);

    return error;
}
2335
2336
/* Looks up the port named 'devname' in 'dpif''s datapath and, when found
 * and 'dpif_port' is nonnull, fills it with malloc'ed copies of the port's
 * details.  Returns 0 or ENODEV. */
static int
dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
                               struct dpif_port *dpif_port)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_port *port;
    int error;

    ovs_rwlock_rdlock(&dp->port_rwlock);
    error = get_port_by_name(dp, devname, &port);
    if (!error && dpif_port) {
        answer_port_query(port, dpif_port);
    }
    ovs_rwlock_unlock(&dp->port_rwlock);

    return error;
}
2353
2354
/* Frees 'flow', its actions and its extra-info string.  Reached via
 * ovsrcu_postpone() from dp_netdev_flow_unref(), i.e. after RCU readers
 * have quiesced. */
static void
dp_netdev_flow_free(struct dp_netdev_flow *flow)
{
    dp_netdev_actions_free(dp_netdev_flow_get_actions(flow));
    free(flow->dp_extra_info);
    free(flow);
}
2361
2362
/* Drops one reference to 'flow'; the last reference schedules an
 * RCU-deferred free. */
void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
{
    if (ovs_refcount_unref_relaxed(&flow->ref_cnt) == 1) {
        ovsrcu_postpone(dp_netdev_flow_free, flow);
    }
}
2368
2369
inline struct dpcls *
2370
dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
2371
                           odp_port_t in_port)
2372
0
{
2373
0
    struct dpcls *cls;
2374
0
    uint32_t hash = hash_port_no(in_port);
2375
0
    CMAP_FOR_EACH_WITH_HASH (cls, node, hash, &pmd->classifiers) {
2376
0
        if (cls->in_port == in_port) {
2377
            /* Port classifier exists already */
2378
0
            return cls;
2379
0
        }
2380
0
    }
2381
0
    return NULL;
2382
0
}
2383
2384
static inline struct dpcls *
2385
dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd,
2386
                         odp_port_t in_port)
2387
    OVS_REQUIRES(pmd->flow_mutex)
2388
0
{
2389
0
    struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
2390
2391
0
    if (!cls) {
2392
0
        uint32_t hash = hash_port_no(in_port);
2393
2394
        /* Create new classifier for in_port */
2395
0
        cls = xmalloc(sizeof(*cls));
2396
0
        dpcls_init(cls);
2397
0
        cls->in_port = in_port;
2398
0
        cmap_insert(&pmd->classifiers, &cls->node, hash);
2399
0
        VLOG_DBG("Creating dpcls %p for in_port %d", cls, in_port);
2400
0
    }
2401
0
    return cls;
2402
0
}
2403
2404
/* Logs, at DBG level, a flow add ('old_actions' == NULL) or modification
 * (non-NULL) of 'flow': its ufids, key, mask, old and new actions, plus a
 * second line with the match actually installed in the classifier.
 * Rate-limited by 'upcall_rl'; returns early when the message would be
 * dropped, before doing any formatting work. */
static void
log_netdev_flow_change(const struct dp_netdev_flow *flow,
                       const struct match *match,
                       const struct dp_netdev_actions *old_actions,
                       const struct nlattr *actions,
                       size_t actions_len)
{
    struct ds ds = DS_EMPTY_INITIALIZER;
    struct ofpbuf key_buf, mask_buf;
    struct odp_flow_key_parms odp_parms = {
        .flow = &match->flow,
        .mask = &match->wc.masks,
        .support = dp_netdev_support,
    };

    /* Skip all formatting if the rate limiter would drop the message. */
    if (OVS_LIKELY(VLOG_DROP_DBG((&upcall_rl)))) {
        return;
    }

    ofpbuf_init(&key_buf, 0);
    ofpbuf_init(&mask_buf, 0);

    /* Serialize key first; the mask serialization references the key. */
    odp_flow_key_from_flow(&odp_parms, &key_buf);
    odp_parms.key_buf = &key_buf;
    odp_flow_key_from_mask(&odp_parms, &mask_buf);

    if (old_actions) {
        ds_put_cstr(&ds, "flow_mod: ");
    } else {
        ds_put_cstr(&ds, "flow_add: ");
    }
    odp_format_ufid(&flow->ufid, &ds);
    ds_put_cstr(&ds, " mega_");
    odp_format_ufid(&flow->mega_ufid, &ds);
    ds_put_cstr(&ds, " ");
    odp_flow_format(key_buf.data, key_buf.size,
                    mask_buf.data, mask_buf.size,
                    NULL, &ds, false, true);
    if (old_actions) {
        ds_put_cstr(&ds, ", old_actions:");
        format_odp_actions(&ds, old_actions->actions, old_actions->size,
                           NULL);
    }
    ds_put_cstr(&ds, ", actions:");
    format_odp_actions(&ds, actions, actions_len, NULL);

    VLOG_DBG("%s", ds_cstr(&ds));

    ofpbuf_uninit(&key_buf);
    ofpbuf_uninit(&mask_buf);

    /* Add a printout of the actual match installed. */
    struct match m;
    ds_clear(&ds);
    ds_put_cstr(&ds, "flow match: ");
    miniflow_expand(&flow->cr.flow.mf, &m.flow);
    miniflow_expand(&flow->cr.mask->mf, &m.wc.masks);
    /* 'tun_md' is not filled by miniflow_expand(); zero it so
     * match_format() does not read indeterminate bytes. */
    memset(&m.tun_md, 0, sizeof m.tun_md);
    match_format(&m, NULL, &ds, OFP_DEFAULT_PRIORITY);

    VLOG_DBG("%s", ds_cstr(&ds));

    ds_destroy(&ds);
}
2468
2469
/* Offloaded flows can be handled asynchronously, so we do not always know
 * whether a specific flow is offloaded or not.  It might still be pending;
 * in fact, multiple modifications can be pending, and the actual offload
 * state depends on the completion of each modification.
 *
 * To correctly determine whether a flow is offloaded when it is being
 * destroyed (and therefore requires cleanup), we must ensure that all
 * operations have completed.  To achieve this, we track the number of
 * outstanding offloaded flow modifications. */

/* Registers one in-flight offload operation on 'flow'.  Returns false if
 * the flow is being cleaned up (depth is negative) and no more operations
 * may be queued; true once the depth has been incremented. */
static bool
offload_queue_inc(struct dp_netdev_flow *flow)
{
    int current;

    while (true) {
        atomic_read(&flow->offload_queue_depth, &current);
        if (current < 0) {
            /* We are cleaning up, so no longer enqueue operations. */
            return false;
        }

        /* Here we try to atomically increase the value.  If we do not succeed,
         * someone else has modified it, and we need to check again for a
         * current negative value. */
        if (atomic_compare_exchange_strong(&flow->offload_queue_depth,
                                           &current, current + 1)) {
            return true;
        }
    }
}
2499
2500
static bool
2501
offload_queue_dec(struct dp_netdev_flow *flow)
2502
0
{
2503
0
    int old;
2504
2505
0
    atomic_sub(&flow->offload_queue_depth, 1, &old);
2506
0
    ovs_assert(old >= 1);
2507
2508
0
    if (old == 1) {
2509
        /* Note that this only indicates that the queue might be empty. */
2510
0
        return true;
2511
0
    }
2512
0
    return false;
2513
0
}
2514
2515
static bool
2516
offload_queue_complete(struct dp_netdev_flow *flow)
2517
0
{
2518
    /* This function returns false if the queue is still in use.
2519
     * If the queue is empty, it will attempt to atomically mark it as
2520
     * 'not in use' by making the queue depth negative.  This prevents
2521
     * other flow operations from being added.  If successful, it returns
2522
     * true. */
2523
0
     int expected_val = 0;
2524
2525
0
    return atomic_compare_exchange_strong(&flow->offload_queue_depth,
2526
0
                                          &expected_val, -1);
2527
0
}
2528
2529
static void
2530
offload_flow_reference_unreference_cb(unsigned pmd_id OVS_UNUSED,
2531
                                      void *flow_reference_)
2532
0
{
2533
0
    struct dp_netdev_flow *flow_reference = flow_reference_;
2534
2535
0
    if (flow_reference) {
2536
0
        flow_reference->offloaded = false;
2537
0
        dp_netdev_flow_unref(flow_reference);
2538
0
    }
2539
0
}
2540
2541
/* Completion of an offloaded flow delete for 'flow_reference'.  Does
 * nothing while the operation is still pending (EINPROGRESS); otherwise
 * drops the reference(s) held for the offload: one for the offload record
 * itself on success, plus the one taken in offload_flow_del(). */
static void
offload_flow_del_resume(struct dp_netdev_flow *flow_reference,
                        int error)
{
    if (error == EINPROGRESS) {
        return;
    }

    if (error) {
        odp_port_t in_port = flow_reference->flow.in_port.odp_port;

        VLOG_DBG(
            "Failed removing offload flow ufid " UUID_FMT " from port %d: %d",
            UUID_ARGS((struct uuid *)&flow_reference->mega_ufid), in_port,
            error);
    } else {
        /* Release because we successfully removed the reference. */
        dp_netdev_flow_unref(flow_reference);
    }

    /* Release as we took a reference in offload_flow_del(). */
    dp_netdev_flow_unref(flow_reference);
}
2564
2565
/* Callback adapter used as the completion callback of an offload flow
 * delete request; forwards the flow reference and result code to
 * offload_flow_del_resume(), which handles reference release. */
static void
offload_flow_del_resume_cb(void *aux OVS_UNUSED,
                           struct dpif_flow_stats *stats OVS_UNUSED,
                           unsigned pmd_id OVS_UNUSED,
                           void *flow_reference,
                           void *previous_flow_reference OVS_UNUSED, int error)
{
    offload_flow_del_resume(flow_reference, error);
}
2574
2575
/* Schedules removal of 'flow' from the offload provider, if offloading is
 * enabled and the flow is actually offloaded.  Called only while the flow
 * is being destructed ('flow->dead' must already be set). */
static void
offload_flow_del(struct dp_netdev *dp, unsigned pmd_id,
                 struct dp_netdev_flow *flow)
{
    odp_port_t in_port = flow->flow.in_port.odp_port;
    struct dpif_offload_flow_del del = {
        .in_port = in_port,
        .pmd_id = pmd_id,
        .ufid = CONST_CAST(ovs_u128 *, &flow->mega_ufid),
        .flow_reference = flow,
        .stats = NULL,
        .cb_data = { .callback = offload_flow_del_resume_cb },
    };
    int error;

    if (!dpif_offload_enabled()) {
        return;
    }

    /* This offload flow delete is only called when the actual flow is
     * destructed.  However, we can only trust the state of flow->offloaded
     * if no more flow_put operations are pending.  Below, we check whether
     * the queue can be marked as complete, and then determine if we need
     * to schedule a removal.  If not, the delete will be rescheduled later
     * in the last offload_flow_put_resume_cb() callback. */
    ovs_assert(flow->dead);
    if (!offload_queue_complete(flow) || !flow->offloaded) {
        return;
    }

    flow->offloaded = false;
    /* Extra reference keeps the flow alive until offload_flow_del_resume()
     * runs (possibly asynchronously); released there. */
    dp_netdev_flow_ref(flow);

    /* It's the responsibility of the offload provider to remove the
     * actual rule from hardware only if none of the other PMD threads
     * have the rule installed in hardware. */
    error = dpif_offload_datapath_flow_del(dp->full_name, &del);
    offload_flow_del_resume(flow, error);
}
2614
2615
/* Removes 'flow' from 'pmd': classifier, simple-match table, flow table and
 * per-port flow count, then marks it dead, requests offload removal, and
 * drops the flow-table reference.  Caller must hold pmd->flow_mutex. */
static void
dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
                          struct dp_netdev_flow *flow)
    OVS_REQUIRES(pmd->flow_mutex)
{
    struct cmap_node *node = CONST_CAST(struct cmap_node *, &flow->node);
    struct dpcls *cls;
    odp_port_t in_port = flow->flow.in_port.odp_port;

    /* Each ingress port has its own classifier; it must exist since the
     * flow was installed through it. */
    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
    ovs_assert(cls != NULL);
    dpcls_remove(cls, &flow->cr);
    dp_netdev_simple_match_remove(pmd, flow);
    cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
    ccmap_dec(&pmd->n_flows, odp_to_u32(in_port));
    /* 'dead' must be set before offload_flow_del(), which asserts it. */
    flow->dead = true;
    offload_flow_del(pmd->dp, pmd->core_id, flow);

    dp_netdev_flow_unref(flow);
}
2635
2636
static void
2637
dp_netdev_pmd_flow_flush(struct dp_netdev_pmd_thread *pmd)
2638
0
{
2639
0
    struct dp_netdev_flow *netdev_flow;
2640
2641
0
    ovs_mutex_lock(&pmd->flow_mutex);
2642
0
    CMAP_FOR_EACH (netdev_flow, node, &pmd->flow_table) {
2643
0
        dp_netdev_pmd_remove_flow(pmd, netdev_flow);
2644
0
    }
2645
0
    ovs_mutex_unlock(&pmd->flow_mutex);
2646
0
}
2647
2648
static int
2649
dpif_netdev_flow_flush(struct dpif *dpif)
2650
0
{
2651
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2652
0
    struct dp_netdev_pmd_thread *pmd;
2653
2654
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
2655
0
        dp_netdev_pmd_flow_flush(pmd);
2656
0
    }
2657
2658
0
    return 0;
2659
0
}
2660
2661
/* Iteration state for a port dump; see dpif_netdev_port_dump_*(). */
struct dp_netdev_port_state {
    struct hmap_position position;  /* Resume point in dp's ports hmap. */
    char *name;                     /* Copy of the last port name returned;
                                     * owned by this state, freed on done. */
};
2665
2666
static int
2667
dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
2668
0
{
2669
0
    *statep = xzalloc(sizeof(struct dp_netdev_port_state));
2670
0
    return 0;
2671
0
}
2672
2673
static int
2674
dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
2675
                           struct dpif_port *dpif_port)
2676
0
{
2677
0
    struct dp_netdev_port_state *state = state_;
2678
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
2679
0
    struct hmap_node *node;
2680
0
    int retval;
2681
2682
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
2683
0
    node = hmap_at_position(&dp->ports, &state->position);
2684
0
    if (node) {
2685
0
        struct dp_netdev_port *port;
2686
2687
0
        port = CONTAINER_OF(node, struct dp_netdev_port, node);
2688
2689
0
        free(state->name);
2690
0
        state->name = xstrdup(netdev_get_name(port->netdev));
2691
0
        dpif_port->name = state->name;
2692
0
        dpif_port->type = port->type;
2693
0
        dpif_port->port_no = port->port_no;
2694
2695
0
        retval = 0;
2696
0
    } else {
2697
0
        retval = EOF;
2698
0
    }
2699
0
    ovs_rwlock_unlock(&dp->port_rwlock);
2700
2701
0
    return retval;
2702
0
}
2703
2704
static int
2705
dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
2706
0
{
2707
0
    struct dp_netdev_port_state *state = state_;
2708
0
    free(state->name);
2709
0
    free(state);
2710
0
    return 0;
2711
0
}
2712
2713
static int
2714
dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
2715
0
{
2716
0
    struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
2717
0
    uint64_t new_port_seq;
2718
0
    int error;
2719
2720
0
    new_port_seq = seq_read(dpif->dp->port_seq);
2721
0
    if (dpif->last_port_seq != new_port_seq) {
2722
0
        dpif->last_port_seq = new_port_seq;
2723
0
        error = ENOBUFS;
2724
0
    } else {
2725
0
        error = EAGAIN;
2726
0
    }
2727
2728
0
    return error;
2729
0
}
2730
2731
/* Arranges for the caller's poll loop to wake when the datapath's port
 * sequence number advances past the last value this dpif observed. */
static void
dpif_netdev_port_poll_wait(const struct dpif *dpif_)
{
    struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);

    seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
}
2738
2739
static struct dp_netdev_flow *
2740
dp_netdev_flow_cast(const struct dpcls_rule *cr)
2741
0
{
2742
0
    return cr ? CONTAINER_OF(cr, struct dp_netdev_flow, cr) : NULL;
2743
0
}
2744
2745
/* Attempts to take a reference on 'flow' via its RCU-friendly refcount.
 * Returns true on success, false if the reference could not be taken. */
static bool dp_netdev_flow_ref(struct dp_netdev_flow *flow)
{
    return ovs_refcount_try_ref_rcu(&flow->ref_cnt);
}
2749
2750
/* netdev_flow_key utilities.
2751
 *
2752
 * netdev_flow_key is basically a miniflow.  We use these functions
2753
 * (netdev_flow_key_clone, netdev_flow_key_equal, ...) instead of the miniflow
2754
 * functions (miniflow_clone_inline, miniflow_equal, ...), because:
2755
 *
2756
 * - Since we are dealing exclusively with miniflows created by
2757
 *   miniflow_extract(), if the map is different the miniflow is different.
2758
 *   Therefore we can be faster by comparing the map and the miniflow in a
2759
 *   single memcmp().
2760
 * - These functions can be inlined by the compiler. */
2761
2762
static inline bool
netdev_flow_key_equal(const struct netdev_flow_key *a,
                      const struct netdev_flow_key *b)
{
    /* Compare hashes first, then map + values in a single memcmp.
     * 'b->len' may not be set yet, so 'a->len' bounds the comparison. */
    return a->hash == b->hash && !memcmp(&a->mf, &b->mf, a->len);
}
2769
2770
static inline void
2771
netdev_flow_key_clone(struct netdev_flow_key *dst,
2772
                      const struct netdev_flow_key *src)
2773
0
{
2774
0
    memcpy(dst, src,
2775
0
           offsetof(struct netdev_flow_key, mf) + src->len);
2776
0
}
2777
2778
/* Initialize a netdev_flow_key 'mask' from 'match'. */
2779
static inline void
netdev_flow_mask_init(struct netdev_flow_key *mask,
                      const struct match *match)
{
    uint64_t *dst = miniflow_values(&mask->mf);
    struct flowmap fmap;
    uint32_t hash = 0;
    size_t idx;

    /* Only check masks that make sense for the flow. */
    flow_wc_map(&match->flow, &fmap);
    flowmap_init(&mask->mf.map);

    /* Copy each non-zero mask value into the miniflow, recording its
     * presence in the map and folding the value into the hash. */
    FLOWMAP_FOR_EACH_INDEX(idx, fmap) {
        uint64_t mask_u64 = flow_u64_value(&match->wc.masks, idx);

        if (mask_u64) {
            flowmap_set(&mask->mf.map, idx, 1);
            *dst++ = mask_u64;
            hash = hash_add64(hash, mask_u64);
        }
    }

    /* Fold the occupancy maps themselves into the hash, so that masks
     * with the same values under different maps hash differently. */
    map_t map;

    FLOWMAP_FOR_EACH_MAP (map, mask->mf.map) {
        hash = hash_add64(hash, map);
    }

    /* 'n' is the number of u64 values actually stored above. */
    size_t n = dst - miniflow_get_values(&mask->mf);

    mask->hash = hash_finish(hash, n * 8);
    mask->len = netdev_flow_key_size(n);
}
2813
2814
/* Initializes 'dst' as a copy of 'flow' masked with 'mask'. */
2815
static inline void
netdev_flow_key_init_masked(struct netdev_flow_key *dst,
                            const struct flow *flow,
                            const struct netdev_flow_key *mask)
{
    uint64_t *dst_u64 = miniflow_values(&dst->mf);
    const uint64_t *mask_u64 = miniflow_get_values(&mask->mf);
    uint32_t hash = 0;
    uint64_t value;

    dst->len = mask->len;
    dst->mf = mask->mf;   /* Copy maps. */

    /* For each flow value present in the mask's map, store the masked
     * value and fold it into the hash.  Note the deliberate split of the
     * 'dst_u64' increment across the two statements. */
    FLOW_FOR_EACH_IN_MAPS(value, flow, mask->mf.map) {
        *dst_u64 = value & *mask_u64++;
        hash = hash_add64(hash, *dst_u64++);
    }
    /* Finish with the number of bytes written. */
    dst->hash = hash_finish(hash,
                            (dst_u64 - miniflow_get_values(&dst->mf)) * 8);
}
2835
2836
/* Initializes 'key' as a copy of 'flow'. */
2837
static inline void
netdev_flow_key_init(struct netdev_flow_key *key,
                     const struct flow *flow)
{
    uint32_t hash = 0;
    uint64_t value;

    /* Build the miniflow map from 'flow', then fill in its values. */
    miniflow_map_init(&key->mf, flow);
    miniflow_init(&key->mf, flow);

    size_t n = miniflow_n_values(&key->mf);

    /* Hash every value present in the map. */
    FLOW_FOR_EACH_IN_MAPS (value, flow, key->mf.map) {
        hash = hash_add64(hash, value);
    }

    key->hash = hash_finish(hash, n * 8);
    key->len = netdev_flow_key_size(n);
}
2856
2857
static inline void
2858
emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow,
2859
                 const struct netdev_flow_key *key)
2860
0
{
2861
0
    if (ce->flow != flow) {
2862
0
        if (ce->flow) {
2863
0
            dp_netdev_flow_unref(ce->flow);
2864
0
        }
2865
2866
0
        if (dp_netdev_flow_ref(flow)) {
2867
0
            ce->flow = flow;
2868
0
        } else {
2869
0
            ce->flow = NULL;
2870
0
        }
2871
0
    }
2872
0
    if (key) {
2873
0
        netdev_flow_key_clone(&ce->key, key);
2874
0
    }
2875
0
}
2876
2877
static inline void
2878
emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
2879
           struct dp_netdev_flow *flow)
2880
0
{
2881
0
    struct emc_entry *to_be_replaced = NULL;
2882
0
    struct emc_entry *current_entry;
2883
2884
0
    EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
2885
0
        if (netdev_flow_key_equal(&current_entry->key, key)) {
2886
            /* We found the entry with the 'mf' miniflow */
2887
0
            emc_change_entry(current_entry, flow, NULL);
2888
0
            return;
2889
0
        }
2890
2891
        /* Replacement policy: put the flow in an empty (not alive) entry, or
2892
         * in the first entry where it can be */
2893
0
        if (!to_be_replaced
2894
0
            || (emc_entry_alive(to_be_replaced)
2895
0
                && !emc_entry_alive(current_entry))
2896
0
            || current_entry->key.hash < to_be_replaced->key.hash) {
2897
0
            to_be_replaced = current_entry;
2898
0
        }
2899
0
    }
2900
    /* We didn't find the miniflow in the cache.
2901
     * The 'to_be_replaced' entry is where the new flow will be stored */
2902
2903
0
    emc_change_entry(to_be_replaced, flow, key);
2904
0
}
2905
2906
static inline void
2907
emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
2908
                         const struct netdev_flow_key *key,
2909
                         struct dp_netdev_flow *flow)
2910
0
{
2911
    /* Insert an entry into the EMC based on probability value 'min'. By
2912
     * default the value is UINT32_MAX / 100 which yields an insertion
2913
     * probability of 1/100 ie. 1% */
2914
2915
0
    uint32_t min = pmd->ctx.emc_insert_min;
2916
2917
0
    if (min && random_uint32() <= min) {
2918
0
        emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
2919
0
    }
2920
0
}
2921
2922
static inline const struct cmap_node *
2923
smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash)
2924
0
{
2925
0
    struct smc_cache *cache = &(pmd->flow_cache).smc_cache;
2926
0
    struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK];
2927
0
    uint16_t sig = hash >> 16;
2928
0
    uint16_t index = UINT16_MAX;
2929
2930
0
    for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2931
0
        if (bucket->sig[i] == sig) {
2932
0
            index = bucket->flow_idx[i];
2933
0
            break;
2934
0
        }
2935
0
    }
2936
0
    if (index != UINT16_MAX) {
2937
0
        return cmap_find_by_index(&pmd->flow_table, index);
2938
0
    }
2939
0
    return NULL;
2940
0
}
2941
2942
/* Insert the flow_table index into SMC. Insertion may fail when 1) SMC is
2943
 * turned off, 2) the flow_table index is larger than uint16_t can handle.
2944
 * If there is already an SMC entry having same signature, the index will be
2945
 * updated. If there is no existing entry, but an empty entry is available,
2946
 * the empty entry will be taken. If no empty entry or existing same signature,
2947
 * a random entry from the hashed bucket will be picked. */
2948
static inline void
2949
smc_insert(struct dp_netdev_pmd_thread *pmd,
2950
           const struct netdev_flow_key *key,
2951
           uint32_t hash)
2952
0
{
2953
0
    struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache;
2954
0
    struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK];
2955
0
    uint16_t index;
2956
0
    uint32_t cmap_index;
2957
0
    int i;
2958
2959
0
    if (!pmd->ctx.smc_enable_db) {
2960
0
        return;
2961
0
    }
2962
2963
0
    cmap_index = cmap_find_index(&pmd->flow_table, hash);
2964
0
    index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index;
2965
2966
    /* If the index is larger than SMC can handle (uint16_t), we don't
2967
     * insert */
2968
0
    if (index == UINT16_MAX) {
2969
0
        return;
2970
0
    }
2971
2972
    /* If an entry with same signature already exists, update the index */
2973
0
    uint16_t sig = key->hash >> 16;
2974
0
    for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2975
0
        if (bucket->sig[i] == sig) {
2976
0
            bucket->flow_idx[i] = index;
2977
0
            return;
2978
0
        }
2979
0
    }
2980
    /* If there is an empty entry, occupy it. */
2981
0
    for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
2982
0
        if (bucket->flow_idx[i] == UINT16_MAX) {
2983
0
            bucket->sig[i] = sig;
2984
0
            bucket->flow_idx[i] = index;
2985
0
            return;
2986
0
        }
2987
0
    }
2988
    /* Otherwise, pick a random entry. */
2989
0
    i = random_uint32() % SMC_ENTRY_PER_BUCKET;
2990
0
    bucket->sig[i] = sig;
2991
0
    bucket->flow_idx[i] = index;
2992
0
}
2993
2994
inline void
2995
emc_probabilistic_insert_batch(struct dp_netdev_pmd_thread *pmd,
2996
                               const struct netdev_flow_key *keys,
2997
                               struct dpcls_rule **rules,
2998
                               uint32_t emc_insert_mask)
2999
0
{
3000
0
    while (emc_insert_mask) {
3001
0
        uint32_t i = raw_ctz(emc_insert_mask);
3002
0
        emc_insert_mask &= emc_insert_mask - 1;
3003
        /* Get the require parameters for EMC/SMC from the rule */
3004
0
        struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
3005
        /* Insert the key into EMC/SMC. */
3006
0
        emc_probabilistic_insert(pmd, &keys[i], flow);
3007
0
    }
3008
0
}
3009
3010
inline void
3011
smc_insert_batch(struct dp_netdev_pmd_thread *pmd,
3012
                 const struct netdev_flow_key *keys,
3013
                 struct dpcls_rule **rules,
3014
                 uint32_t smc_insert_mask)
3015
0
{
3016
0
    while (smc_insert_mask) {
3017
0
        uint32_t i = raw_ctz(smc_insert_mask);
3018
0
        smc_insert_mask &= smc_insert_mask - 1;
3019
        /* Get the require parameters for EMC/SMC from the rule */
3020
0
        struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
3021
0
        uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
3022
        /* Insert the key into EMC/SMC. */
3023
0
        smc_insert(pmd, &keys[i], hash);
3024
0
    }
3025
0
}
3026
3027
static struct dp_netdev_flow *
3028
dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
3029
                          const struct netdev_flow_key *key,
3030
                          int *lookup_num_p)
3031
0
{
3032
0
    struct dpcls *cls;
3033
0
    struct dpcls_rule *rule = NULL;
3034
0
    odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
3035
0
                                                     in_port.odp_port));
3036
0
    struct dp_netdev_flow *netdev_flow = NULL;
3037
3038
0
    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
3039
0
    if (OVS_LIKELY(cls)) {
3040
0
        dpcls_lookup(cls, &key, &rule, 1, lookup_num_p);
3041
0
        netdev_flow = dp_netdev_flow_cast(rule);
3042
0
    }
3043
0
    return netdev_flow;
3044
0
}
3045
3046
static struct dp_netdev_flow *
3047
dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd,
3048
                        const ovs_u128 *ufidp, const struct nlattr *key,
3049
                        size_t key_len)
3050
0
{
3051
0
    struct dp_netdev_flow *netdev_flow;
3052
0
    struct flow flow;
3053
0
    ovs_u128 ufid;
3054
3055
    /* If a UFID is not provided, determine one based on the key. */
3056
0
    if (!ufidp && key && key_len
3057
0
        && !dpif_netdev_flow_from_nlattrs(key, key_len, &flow, false)) {
3058
0
        odp_flow_key_hash(&flow, sizeof flow, &ufid);
3059
0
        ufidp = &ufid;
3060
0
    }
3061
3062
0
    if (ufidp) {
3063
0
        CMAP_FOR_EACH_WITH_HASH (netdev_flow, node, dp_netdev_flow_hash(ufidp),
3064
0
                                 &pmd->flow_table) {
3065
0
            if (ovs_u128_equals(netdev_flow->ufid, *ufidp)) {
3066
0
                return netdev_flow;
3067
0
            }
3068
0
        }
3069
0
    }
3070
3071
0
    return NULL;
3072
0
}
3073
3074
/* Fills 'stats' (and 'attrs', if non-NULL) for 'netdev_flow_', combining
 * the software datapath counters with any statistics reported by an
 * offload provider for the flow's mega UFID. */
static void
get_dpif_flow_status(const struct dp_netdev *dp,
                     const struct dp_netdev_flow *netdev_flow_,
                     struct dpif_flow_stats *stats,
                     struct dpif_flow_attrs *attrs)
{
    struct dpif_flow_stats offload_stats;
    struct dpif_flow_attrs offload_attrs;
    struct dp_netdev_flow *netdev_flow;
    unsigned long long n;
    long long used;
    uint16_t flags;

    netdev_flow = CONST_CAST(struct dp_netdev_flow *, netdev_flow_);

    /* Software counters; read with relaxed atomics. */
    atomic_read_relaxed(&netdev_flow->stats.packet_count, &n);
    stats->n_packets = n;
    atomic_read_relaxed(&netdev_flow->stats.byte_count, &n);
    stats->n_bytes = n;
    atomic_read_relaxed(&netdev_flow->stats.used, &used);
    stats->used = used;
    atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
    stats->tcp_flags = flags;

    if (dpif_offload_datapath_flow_stats(dp->full_name,
                                         netdev_flow->flow.in_port.odp_port,
                                         &netdev_flow->mega_ufid,
                                         &offload_stats, &offload_attrs)) {
        /* Merge in offload counters: sum packets/bytes, keep the most
         * recent 'used' time, OR in the observed TCP flags. */
        stats->n_packets += offload_stats.n_packets;
        stats->n_bytes += offload_stats.n_bytes;
        stats->used = MAX(stats->used, offload_stats.used);
        stats->tcp_flags |= offload_stats.tcp_flags;
        if (attrs) {
            attrs->offloaded = offload_attrs.offloaded;
            attrs->dp_layer = offload_attrs.dp_layer;
        }
    } else if (attrs) {
        /* No offload statistics available: flow lives in software only. */
        attrs->offloaded = false;
        attrs->dp_layer = "ovs";
    }
}
3115
3116
/* Converts to the dpif_flow format, using 'key_buf' and 'mask_buf' for
3117
 * storing the netlink-formatted key/mask. 'key_buf' may be the same as
3118
 * 'mask_buf'. Actions will be returned without copying, by relying on RCU to
3119
 * protect them. */
3120
static void
dp_netdev_flow_to_dpif_flow(const struct dp_netdev *dp,
                            const struct dp_netdev_flow *netdev_flow,
                            struct ofpbuf *key_buf, struct ofpbuf *mask_buf,
                            struct dpif_flow *flow, bool terse)
{
    if (terse) {
        /* Terse dumps carry only the UFID/stats filled in below. */
        memset(flow, 0, sizeof *flow);
    } else {
        struct flow_wildcards wc;
        struct dp_netdev_actions *actions;
        size_t offset;
        struct odp_flow_key_parms odp_parms = {
            .flow = &netdev_flow->flow,
            .mask = &wc.masks,
            .support = dp_netdev_support,
        };

        miniflow_expand(&netdev_flow->cr.mask->mf, &wc.masks);
        /* in_port is exact matched, but we have left it out from the mask
         * for optimization reasons.  Add in_port back to the mask. */
        wc.masks.in_port.odp_port = ODPP_NONE;

        /* Key: appended to 'key_buf'; record the offset so 'key_len'
         * covers only what this call added. */
        offset = key_buf->size;
        flow->key = ofpbuf_tail(key_buf);
        odp_flow_key_from_flow(&odp_parms, key_buf);
        flow->key_len = key_buf->size - offset;

        /* Mask: appended to 'mask_buf' (which may be the same buffer). */
        offset = mask_buf->size;
        flow->mask = ofpbuf_tail(mask_buf);
        odp_parms.key_buf = key_buf;
        odp_flow_key_from_mask(&odp_parms, mask_buf);
        flow->mask_len = mask_buf->size - offset;

        /* Actions: returned without copying; RCU protects them. */
        actions = dp_netdev_flow_get_actions(netdev_flow);
        flow->actions = actions->actions;
        flow->actions_len = actions->size;
    }

    flow->ufid = netdev_flow->ufid;
    flow->ufid_present = true;
    flow->pmd_id = netdev_flow->pmd_id;

    get_dpif_flow_status(dp, netdev_flow, &flow->stats, &flow->attrs);
    flow->attrs.dp_extra_info = netdev_flow->dp_extra_info;
}
3169
3170
static int
3171
dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3172
                              const struct nlattr *mask_key,
3173
                              uint32_t mask_key_len, const struct flow *flow,
3174
                              struct flow_wildcards *wc, bool probe)
3175
0
{
3176
0
    enum odp_key_fitness fitness;
3177
3178
0
    fitness = odp_flow_key_to_mask(mask_key, mask_key_len, wc, flow, NULL);
3179
0
    if (fitness) {
3180
0
        if (!probe) {
3181
            /* This should not happen: it indicates that
3182
             * odp_flow_key_from_mask() and odp_flow_key_to_mask()
3183
             * disagree on the acceptable form of a mask.  Log the problem
3184
             * as an error, with enough details to enable debugging. */
3185
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3186
3187
0
            if (!VLOG_DROP_ERR(&rl)) {
3188
0
                struct ds s;
3189
3190
0
                ds_init(&s);
3191
0
                odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
3192
0
                                true, true);
3193
0
                VLOG_ERR("internal error parsing flow mask %s (%s)",
3194
0
                ds_cstr(&s), odp_key_fitness_to_string(fitness));
3195
0
                ds_destroy(&s);
3196
0
            }
3197
0
        }
3198
3199
0
        return EINVAL;
3200
0
    }
3201
3202
0
    return 0;
3203
0
}
3204
3205
static int
3206
dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
3207
                              struct flow *flow, bool probe)
3208
0
{
3209
0
    if (odp_flow_key_to_flow(key, key_len, flow, NULL)) {
3210
0
        if (!probe) {
3211
            /* This should not happen: it indicates that
3212
             * odp_flow_key_from_flow() and odp_flow_key_to_flow() disagree on
3213
             * the acceptable form of a flow.  Log the problem as an error,
3214
             * with enough details to enable debugging. */
3215
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3216
3217
0
            if (!VLOG_DROP_ERR(&rl)) {
3218
0
                struct ds s;
3219
3220
0
                ds_init(&s);
3221
0
                odp_flow_format(key, key_len, NULL, 0, NULL, &s, true, false);
3222
0
                VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
3223
0
                ds_destroy(&s);
3224
0
            }
3225
0
        }
3226
3227
0
        return EINVAL;
3228
0
    }
3229
3230
0
    if (flow->ct_state & DP_NETDEV_CS_UNSUPPORTED_MASK) {
3231
0
        return EINVAL;
3232
0
    }
3233
3234
0
    return 0;
3235
0
}
3236
3237
/* Looks up the flow identified by 'get' (by UFID, or by hashing its
 * netlink key) on one or all PMD threads and fills in 'get->flow'.
 * Returns 0 on success, ENOENT if no searched PMD has the flow, EINVAL if
 * no PMD set could be determined. */
static int
dpif_netdev_flow_get(const struct dpif *dpif, const struct dpif_flow_get *get)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_flow *netdev_flow;
    struct dp_netdev_pmd_thread *pmd;
    struct hmapx to_find = HMAPX_INITIALIZER(&to_find);
    struct hmapx_node *node;
    int error = EINVAL;

    if (get->pmd_id == PMD_ID_NULL) {
        /* No specific PMD requested: search all of them.  Take a reference
         * on each PMD so it cannot go away while we search; drop the
         * reference immediately if the PMD was already in the set. */
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
            if (dp_netdev_pmd_try_ref(pmd) && !hmapx_add(&to_find, pmd)) {
                dp_netdev_pmd_unref(pmd);
            }
        }
    } else {
        pmd = dp_netdev_get_pmd(dp, get->pmd_id);
        if (!pmd) {
            goto out;
        }
        hmapx_add(&to_find, pmd);
    }

    if (!hmapx_count(&to_find)) {
        goto out;
    }

    /* Stop at the first PMD that has the flow. */
    HMAPX_FOR_EACH (node, &to_find) {
        pmd = (struct dp_netdev_pmd_thread *) node->data;
        netdev_flow = dp_netdev_pmd_find_flow(pmd, get->ufid, get->key,
                                              get->key_len);
        if (netdev_flow) {
            dp_netdev_flow_to_dpif_flow(dp, netdev_flow, get->buffer,
                                        get->buffer, get->flow, false);
            error = 0;
            break;
        } else {
            error = ENOENT;
        }
    }

    /* Drop the references taken above, including on PMDs not visited. */
    HMAPX_FOR_EACH (node, &to_find) {
        pmd = (struct dp_netdev_pmd_thread *) node->data;
        dp_netdev_pmd_unref(pmd);
    }
out:
    hmapx_destroy(&to_find);
    return error;
}
3287
3288
static void
3289
dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid)
3290
0
{
3291
0
    struct flow masked_flow;
3292
0
    size_t i;
3293
3294
0
    for (i = 0; i < sizeof(struct flow); i++) {
3295
0
        ((uint8_t *)&masked_flow)[i] = ((uint8_t *)&match->flow)[i] &
3296
0
                                       ((uint8_t *)&match->wc)[i];
3297
0
    }
3298
0
    odp_flow_key_hash(&masked_flow, sizeof masked_flow, mega_ufid);
3299
0
}
3300
3301
/* Packs 'in_port', 'dl_type', 'nw_frag' and the VLAN TCI into a single
 * 64-bit simple-match mark; the layout is described below. */
uint64_t
dp_netdev_simple_match_mark(odp_port_t in_port, ovs_be16 dl_type,
                            uint8_t nw_frag, ovs_be16 vlan_tci)
{
    /* Simple Match Mark:
     *
     * BE:
     * +-----------------+-------------++---------+---+-----------+
     * |     in_port     |   dl_type   || nw_frag |CFI|  VID(12)  |
     * +-----------------+-------------++---------+---+-----------+
     * 0                 32          47 49         51  52     63
     *
     * LE:
     * +-----------------+-------------+------++-------+---+------+
     * |     in_port     |   dl_type   |VID(8)||nw_frag|CFI|VID(4)|
     * +-----------------+-------------+------++-------+---+------+
     * 0                 32          47 48  55  57   59 60  61   63
     *
     *         Big Endian              Little Endian
     * in_port : 32 bits [ 0..31]  in_port : 32 bits [ 0..31]
     * dl_type : 16 bits [32..47]  dl_type : 16 bits [32..47]
     * <empty> :  1 bit  [48..48]  vlan VID:  8 bits [48..55]
     * nw_frag :  2 bits [49..50]  <empty> :  1 bit  [56..56]
     * vlan CFI:  1 bit  [51..51]  nw_frag :  2 bits [57..59]
     * vlan VID: 12 bits [52..63]  vlan CFI:  1 bit  [60..60]
     *                             vlan VID:  4 bits [61..63]
     *
     * Layout is different for LE and BE in order to save a couple of
     * network to host translations.
     * */
    return ((uint64_t) odp_to_u32(in_port) << 32)
           | ((OVS_FORCE uint32_t) dl_type << 16)
#if WORDS_BIGENDIAN
           | (((uint16_t) nw_frag & FLOW_NW_FRAG_MASK) << VLAN_PCP_SHIFT)
#else
           | ((nw_frag & FLOW_NW_FRAG_MASK) << (VLAN_PCP_SHIFT - 8))
#endif
           | (OVS_FORCE uint16_t) (vlan_tci & htons(VLAN_VID_MASK | VLAN_CFI));
}
3340
3341
struct dp_netdev_flow *
3342
dp_netdev_simple_match_lookup(const struct dp_netdev_pmd_thread *pmd,
3343
                              odp_port_t in_port, ovs_be16 dl_type,
3344
                              uint8_t nw_frag, ovs_be16 vlan_tci)
3345
0
{
3346
0
    uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type,
3347
0
                                                nw_frag, vlan_tci);
3348
0
    uint32_t hash = hash_uint64(mark);
3349
0
    struct dp_netdev_flow *flow;
3350
0
    bool found = false;
3351
3352
0
    CMAP_FOR_EACH_WITH_HASH (flow, simple_match_node,
3353
0
                             hash, &pmd->simple_match_table) {
3354
0
        if (flow->simple_match_mark == mark) {
3355
0
            found = true;
3356
0
            break;
3357
0
        }
3358
0
    }
3359
0
    return found ? flow : NULL;
3360
0
}
3361
3362
bool
3363
dp_netdev_simple_match_enabled(const struct dp_netdev_pmd_thread *pmd,
3364
                               odp_port_t in_port)
3365
0
{
3366
0
    return ccmap_find(&pmd->n_flows, odp_to_u32(in_port))
3367
0
           == ccmap_find(&pmd->n_simple_flows, odp_to_u32(in_port));
3368
0
}
3369
3370
static void
3371
dp_netdev_simple_match_insert(struct dp_netdev_pmd_thread *pmd,
3372
                              struct dp_netdev_flow *dp_flow)
3373
    OVS_REQUIRES(pmd->flow_mutex)
3374
0
{
3375
0
    odp_port_t in_port = dp_flow->flow.in_port.odp_port;
3376
0
    ovs_be16 vlan_tci = dp_flow->flow.vlans[0].tci;
3377
0
    ovs_be16 dl_type = dp_flow->flow.dl_type;
3378
0
    uint8_t nw_frag = dp_flow->flow.nw_frag;
3379
3380
0
    if (!dp_netdev_flow_ref(dp_flow)) {
3381
0
        return;
3382
0
    }
3383
3384
    /* Avoid double insertion.  Should not happen in practice. */
3385
0
    dp_netdev_simple_match_remove(pmd, dp_flow);
3386
3387
0
    uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type,
3388
0
                                                nw_frag, vlan_tci);
3389
0
    uint32_t hash = hash_uint64(mark);
3390
3391
0
    dp_flow->simple_match_mark = mark;
3392
0
    cmap_insert(&pmd->simple_match_table,
3393
0
                CONST_CAST(struct cmap_node *, &dp_flow->simple_match_node),
3394
0
                hash);
3395
0
    ccmap_inc(&pmd->n_simple_flows, odp_to_u32(in_port));
3396
3397
0
    VLOG_DBG("Simple match insert: "
3398
0
             "core_id(%d),in_port(%"PRIu32"),mark(0x%016"PRIx64").",
3399
0
             pmd->core_id, in_port, mark);
3400
0
}
3401
3402
/* Removes 'dp_flow' from the PMD's simple-match table, if it is the flow
 * currently installed under its mark, and drops the table's reference.
 * A lookup miss (or a different flow under the same mark) is a no-op. */
static void
dp_netdev_simple_match_remove(struct dp_netdev_pmd_thread *pmd,
                               struct dp_netdev_flow *dp_flow)
    OVS_REQUIRES(pmd->flow_mutex)
{
    odp_port_t in_port = dp_flow->flow.in_port.odp_port;
    ovs_be16 vlan_tci = dp_flow->flow.vlans[0].tci;
    ovs_be16 dl_type = dp_flow->flow.dl_type;
    uint8_t nw_frag = dp_flow->flow.nw_frag;
    struct dp_netdev_flow *flow;
    uint64_t mark = dp_netdev_simple_match_mark(in_port, dl_type,
                                                nw_frag, vlan_tci);
    uint32_t hash = hash_uint64(mark);

    /* Only remove if the table actually holds this exact flow; another
     * flow may legitimately own the same mark slot. */
    flow = dp_netdev_simple_match_lookup(pmd, in_port, dl_type,
                                         nw_frag, vlan_tci);
    if (flow == dp_flow) {
        VLOG_DBG("Simple match remove: "
                 "core_id(%d),in_port(%"PRIu32"),mark(0x%016"PRIx64").",
                 pmd->core_id, in_port, mark);
        cmap_remove(&pmd->simple_match_table,
                    CONST_CAST(struct cmap_node *, &flow->simple_match_node),
                    hash);
        ccmap_dec(&pmd->n_simple_flows, odp_to_u32(in_port));
        /* Releases the reference taken at insertion time. */
        dp_netdev_flow_unref(flow);
    }
}
3429
3430
static bool
3431
dp_netdev_flow_is_simple_match(const struct match *match)
3432
0
{
3433
0
    const struct flow *flow = &match->flow;
3434
0
    const struct flow_wildcards *wc = &match->wc;
3435
3436
0
    if (flow->recirc_id || flow->packet_type != htonl(PT_ETH)) {
3437
0
        return false;
3438
0
    }
3439
3440
    /* Check that flow matches only minimal set of fields that always set.
3441
     * Also checking that VLAN VID+CFI is an exact match, because these
3442
     * are not mandatory and could be masked. */
3443
0
    struct flow_wildcards *minimal = xmalloc(sizeof *minimal);
3444
0
    ovs_be16 vlan_tci_mask = htons(VLAN_VID_MASK | VLAN_CFI);
3445
3446
0
    flow_wildcards_init_catchall(minimal);
3447
    /* 'dpif-netdev' always has following in exact match:
3448
     *   - recirc_id                   <-- recirc_id == 0 checked on input.
3449
     *   - in_port                     <-- Will be checked on input.
3450
     *   - packet_type                 <-- Assuming all packets are PT_ETH.
3451
     *   - dl_type                     <-- Need to match with.
3452
     *   - vlan_tci                    <-- Need to match with.
3453
     *   - and nw_frag for ip packets. <-- Need to match with.
3454
     */
3455
0
    WC_MASK_FIELD(minimal, recirc_id);
3456
0
    WC_MASK_FIELD(minimal, in_port);
3457
0
    WC_MASK_FIELD(minimal, packet_type);
3458
0
    WC_MASK_FIELD(minimal, dl_type);
3459
0
    WC_MASK_FIELD_MASK(minimal, vlans[0].tci, vlan_tci_mask);
3460
0
    WC_MASK_FIELD_MASK(minimal, nw_frag, FLOW_NW_FRAG_MASK);
3461
3462
0
    if (flow_wildcards_has_extra(minimal, wc)
3463
0
        || wc->masks.vlans[0].tci != vlan_tci_mask) {
3464
0
        free(minimal);
3465
0
        return false;
3466
0
    }
3467
0
    free(minimal);
3468
3469
0
    return true;
3470
0
}
3471
3472
/* Completion handler for an offload flow-put: updates the flow's
 * 'offloaded' state, balances the references taken in offload_flow_put(),
 * and retries deletion of a dead flow if this was the last queued
 * offload operation.  'error' == EINPROGRESS means the operation is
 * still pending asynchronously and nothing must be released yet. */
static void
offload_flow_put_resume(struct dp_netdev *dp, struct dp_netdev_flow *flow,
                        struct dp_netdev_flow *previous_flow_reference,
                        unsigned pmd_id, int error)
{
    if (error == EINPROGRESS) {
        return;
    }

    if (!error) {
        flow->offloaded = true;
    } else {
        /* If the flow was already offloaded, the new action set can no
         * longer be offloaded.  In theory, we should disassociate the
         * offload from all PMDs that have this flow marked as offloaded.
         * Unfortunately, there is no mechanism to inform other PMDs, so
         * we cannot explicitly mark such flows.  This situation typically
         * occurs when the revalidator modifies the flow, so it is safe to
         * assume it will update all affected flows and that the offload
         * will subsequently fail. */
        flow->offloaded = false;

        /* On error, the flow reference was not stored by the offload provider,
         * so we should decrease the reference. */
        dp_netdev_flow_unref(flow);
    }

    if (offload_queue_dec(flow) && flow->dead) {
        /* If flows are processed asynchronously, modifications might
         * still be queued up while the flow is being removed.  If this
         * was the last flow in the queue on a dead flow, we try again
         * to see if we need to remove this flow. */
        offload_flow_del(dp, pmd_id, flow);
    }

    if (previous_flow_reference) {
        /* Drop the reference the offload provider held for the flow it
         * is replacing; it may belong to an older incarnation. */
        dp_netdev_flow_unref(previous_flow_reference);
        if (previous_flow_reference != flow) {
            VLOG_DBG("Updated flow reference was from outdated flow");
        }
    }
}
3514
3515
static void
3516
offload_flow_put_resume_cb(void *aux, struct dpif_flow_stats *stats OVS_UNUSED,
3517
                           unsigned pmd_id, void *flow_reference_,
3518
                           void *old_flow_reference_,
3519
                           int error)
3520
0
{
3521
0
    struct dp_netdev *dp = aux;
3522
0
    struct dp_netdev_flow *flow_reference = flow_reference_;
3523
0
    struct dp_netdev_flow *old_flow_reference = old_flow_reference_;
3524
3525
0
    offload_flow_put_resume(dp, flow_reference, old_flow_reference,
3526
0
                            pmd_id, error);
3527
0
}
3528
3529
/* Requests hardware offload of 'flow' with the given match and actions.
 * Takes a flow reference for the offload provider; the reference and the
 * queue-depth counter are balanced in offload_flow_put_resume(), which
 * runs either synchronously below or later via the callback. */
static void
offload_flow_put(struct dp_netdev_pmd_thread *pmd, struct dp_netdev_flow *flow,
                 struct match *match, const struct nlattr *actions,
                 size_t actions_len)
{
    struct dpif_offload_flow_put put = {
        .in_port = match->flow.in_port.odp_port,
        .orig_in_port = flow->orig_in_port,
        .pmd_id = pmd->core_id,
        .ufid = CONST_CAST(ovs_u128 *, &flow->mega_ufid),
        .match = match,
        .actions = actions,
        .actions_len = actions_len,
        .stats = NULL,
        .flow_reference = flow,
        .cb_data = {
            .callback = offload_flow_put_resume_cb,
            .callback_aux = pmd->dp,
        },
    };
    void *previous_flow_reference = NULL;
    int error;

    /* Skip when offload is disabled, the flow is already dead, or the
     * per-flow offload queue could not be entered. */
    if (!dpif_offload_enabled() || flow->dead || !offload_queue_inc(flow)) {
        return;
    }

    /* Reference owned by the offload provider on success; released in
     * offload_flow_put_resume() on failure. */
    dp_netdev_flow_ref(flow);

    error = dpif_offload_datapath_flow_put(pmd->dp->full_name, &put,
                                           &previous_flow_reference);
    offload_flow_put_resume(pmd->dp, put.flow_reference,
                            previous_flow_reference,
                            pmd->core_id, error);
}
3564
3565
/* Creates a new datapath flow on 'pmd' from 'match'/'actions', inserts it
 * into the per-port dpcls, the flow table, and (when eligible) the
 * simple-match table, then requests hardware offload.  Returns the new
 * flow (caller does not own an extra reference beyond the tables'). */
static struct dp_netdev_flow *
dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
                   struct match *match, const ovs_u128 *ufid,
                   const struct nlattr *actions, size_t actions_len,
                   odp_port_t orig_in_port)
    OVS_REQUIRES(pmd->flow_mutex)
{
    struct ds extra_info = DS_EMPTY_INITIALIZER;
    struct dp_netdev_flow *flow;
    struct netdev_flow_key mask;
    struct dpcls *cls;
    size_t unit;

    /* Make sure in_port is exact matched before we read it. */
    ovs_assert(match->wc.masks.in_port.odp_port == ODPP_NONE);
    odp_port_t in_port = match->flow.in_port.odp_port;

    /* As we select the dpcls based on the port number, each netdev flow
     * belonging to the same dpcls will have the same odp_port value.
     * For performance reasons we wildcard odp_port here in the mask.  In the
     * typical case dp_hash is also wildcarded, and the resulting 8-byte
     * chunk {dp_hash, in_port} will be ignored by netdev_flow_mask_init() and
     * will not be part of the subtable mask.
     * This will speed up the hash computation during dpcls_lookup() because
     * there is one less call to hash_add64() in this case. */
    match->wc.masks.in_port.odp_port = 0;
    netdev_flow_mask_init(&mask, match);
    match->wc.masks.in_port.odp_port = ODPP_NONE;

    /* Make sure wc does not have metadata. */
    ovs_assert(!FLOWMAP_HAS_FIELD(&mask.mf.map, metadata)
               && !FLOWMAP_HAS_FIELD(&mask.mf.map, regs));

    /* Do not allocate extra space. */
    flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
    memset(&flow->stats, 0, sizeof flow->stats);
    flow->dead = false;
    flow->offloaded = false;
    atomic_init(&flow->offload_queue_depth, 0);
    flow->batch = NULL;
    flow->orig_in_port = orig_in_port;
    /* These fields are declared const to freeze them after creation;
     * CONST_CAST is the sanctioned way to initialize them here. */
    *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
    *CONST_CAST(struct flow *, &flow->flow) = match->flow;
    *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
    ovs_refcount_init(&flow->ref_cnt);
    ovsrcu_set(&flow->actions, dp_netdev_actions_create(actions, actions_len));

    dp_netdev_get_mega_ufid(match, CONST_CAST(ovs_u128 *, &flow->mega_ufid));
    netdev_flow_key_init_masked(&flow->cr.flow, &match->flow, &mask);

    /* Select dpcls for in_port. Relies on in_port to be exact match. */
    cls = dp_netdev_pmd_find_dpcls(pmd, in_port);
    dpcls_insert(cls, &flow->cr, &mask);

    /* Build a human-readable miniflow-bits summary for flow dumps. */
    ds_put_cstr(&extra_info, "miniflow_bits(");
    FLOWMAP_FOR_EACH_UNIT (unit) {
        if (unit) {
            ds_put_char(&extra_info, ',');
        }
        ds_put_format(&extra_info, "%d",
                      count_1bits(flow->cr.mask->mf.map.bits[unit]));
    }
    ds_put_char(&extra_info, ')');
    flow->dp_extra_info = ds_steal_cstr(&extra_info);
    ds_destroy(&extra_info);

    cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
                dp_netdev_flow_hash(&flow->ufid));
    ccmap_inc(&pmd->n_flows, odp_to_u32(in_port));

    if (dp_netdev_flow_is_simple_match(match)) {
        dp_netdev_simple_match_insert(pmd, flow);
    }

    offload_flow_put(pmd, flow, match, actions, actions_len);
    log_netdev_flow_change(flow, match, NULL, actions, actions_len);

    return flow;
}
3644
3645
/* Applies a single DPIF_FP_CREATE or DPIF_FP_MODIFY flow-put operation on
 * one PMD thread.  'key' is the unmasked lookup key (used only when the
 * caller supplied no ufid).  Fills 'stats' (if nonnull) with the flow's
 * current statistics on modify.  Returns 0 or a positive errno. */
static int
flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
                struct netdev_flow_key *key,
                struct match *match,
                ovs_u128 *ufid,
                const struct dpif_flow_put *put,
                struct dpif_flow_stats *stats)
{
    struct dp_netdev_flow *netdev_flow = NULL;
    int error = 0;

    if (stats) {
        memset(stats, 0, sizeof *stats);
    }

    ovs_mutex_lock(&pmd->flow_mutex);
    if (put->ufid) {
        netdev_flow = dp_netdev_pmd_find_flow(pmd, put->ufid,
                                              put->key, put->key_len);
    } else {
        /* Use key instead of the locally generated ufid
         * to search netdev_flow. */
        netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
    }

    if (put->flags & DPIF_FP_CREATE) {
        if (!netdev_flow) {
            dp_netdev_flow_add(pmd, match, ufid,
                               put->actions, put->actions_len, ODPP_NONE);
        } else {
            error = EEXIST;
        }
        goto exit;
    }

    if (put->flags & DPIF_FP_MODIFY) {
        if (!netdev_flow) {
            error = ENOENT;
        } else {
            if (!put->ufid && !flow_equal(&match->flow, &netdev_flow->flow)) {
                /* Overlapping flow. */
                error = EINVAL;
                goto exit;
            }

            struct dp_netdev_actions *new_actions;
            struct dp_netdev_actions *old_actions;

            new_actions = dp_netdev_actions_create(put->actions,
                                                   put->actions_len);

            /* RCU swap: readers may still be executing the old actions,
             * which is why old_actions is freed via ovsrcu_postpone(). */
            old_actions = dp_netdev_flow_get_actions(netdev_flow);
            ovsrcu_set(&netdev_flow->actions, new_actions);

            offload_flow_put(pmd, netdev_flow, match, put->actions,
                             put->actions_len);
            log_netdev_flow_change(netdev_flow, match, old_actions,
                                   put->actions, put->actions_len);

            if (stats) {
                get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
            }
            if (put->flags & DPIF_FP_ZERO_STATS) {
                /* XXX: The userspace datapath uses thread local statistics
                 * (for flows), which should be updated only by the owning
                 * thread.  Since we cannot write on stats memory here,
                 * we choose not to support this flag.  Please note:
                 * - This feature is currently used only by dpctl commands with
                 *   option --clear.
                 * - Should the need arise, this operation can be implemented
                 *   by keeping a base value (to be update here) for each
                 *   counter, and subtracting it before outputting the stats */
                error = EOPNOTSUPP;
            }
            ovsrcu_postpone(dp_netdev_actions_free, old_actions);
        }
    }

exit:
    ovs_mutex_unlock(&pmd->flow_mutex);
    return error;
}
3727
3728
/* dpif 'flow_put' implementation: parses key/mask from netlink attributes,
 * validates exact-match in_port, and dispatches the put either to one
 * specific PMD or (PMD_ID_NULL) to every PMD, aggregating statistics.
 * Returns 0 or a positive errno. */
static int
dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct netdev_flow_key key;
    struct dp_netdev_pmd_thread *pmd;
    struct match match;
    ovs_u128 ufid;
    int error;
    bool probe = put->flags & DPIF_FP_PROBE;

    if (put->stats) {
        memset(put->stats, 0, sizeof *put->stats);
    }
    error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &match.flow,
                                          probe);
    if (error) {
        return error;
    }
    error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
                                          put->mask, put->mask_len,
                                          &match.flow, &match.wc, probe);
    if (error) {
        return error;
    }

    /* dpcls selection relies on in_port being exact-match; reject
     * anything else up front. */
    if (match.wc.masks.in_port.odp_port != ODPP_NONE) {
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);

        VLOG_ERR_RL(&rl, "failed to put%s flow: in_port is not an exact match",
                    (put->flags & DPIF_FP_CREATE) ? "[create]"
                    : (put->flags & DPIF_FP_MODIFY) ? "[modify]" : "[zero]");
        return EINVAL;
    }

    if (put->ufid) {
        ufid = *put->ufid;
    } else {
        /* Caller supplied no ufid; derive one from the flow key. */
        odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
    }

    /* The Netlink encoding of datapath flow keys cannot express
     * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
     * tag is interpreted as exact match on the fact that there is no
     * VLAN.  Unless we refactor a lot of code that translates between
     * Netlink and struct flow representations, we have to do the same
     * here.  This must be in sync with 'match' in handle_packet_upcall(). */
    if (!match.wc.masks.vlans[0].tci) {
        match.wc.masks.vlans[0].tci = htons(VLAN_VID_MASK | VLAN_CFI);
    }

    /* Must produce a netdev_flow_key for lookup.
     * Use the same method as employed to create the key when adding
     * the flow to the dplcs to make sure they match.
     * We need to put in the unmasked key as flow_put_on_pmd() will first try
     * to see if an entry exists doing a packet type lookup. As masked-out
     * fields are interpreted as zeros, they could falsely match a wider IP
     * address mask. Installation of the flow will use the match variable. */
    netdev_flow_key_init(&key, &match.flow);

    if (put->pmd_id == PMD_ID_NULL) {
        if (cmap_count(&dp->poll_threads) == 0) {
            return EINVAL;
        }
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
            struct dpif_flow_stats pmd_stats;
            int pmd_error;

            pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
                                        &pmd_stats);
            if (pmd_error) {
                /* Last error wins; stats from failing PMDs are dropped. */
                error = pmd_error;
            } else if (put->stats) {
                put->stats->n_packets += pmd_stats.n_packets;
                put->stats->n_bytes += pmd_stats.n_bytes;
                put->stats->used = MAX(put->stats->used, pmd_stats.used);
                put->stats->tcp_flags |= pmd_stats.tcp_flags;
            }
        }
    } else {
        pmd = dp_netdev_get_pmd(dp, put->pmd_id);
        if (!pmd) {
            return EINVAL;
        }
        error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
        dp_netdev_pmd_unref(pmd);
    }

    return error;
}
3818
3819
static int
3820
flow_del_on_pmd(struct dp_netdev_pmd_thread *pmd,
3821
                struct dpif_flow_stats *stats,
3822
                const struct dpif_flow_del *del)
3823
0
{
3824
0
    struct dp_netdev_flow *netdev_flow;
3825
0
    int error = 0;
3826
3827
0
    ovs_mutex_lock(&pmd->flow_mutex);
3828
0
    netdev_flow = dp_netdev_pmd_find_flow(pmd, del->ufid, del->key,
3829
0
                                          del->key_len);
3830
0
    if (netdev_flow) {
3831
0
        if (stats) {
3832
0
            get_dpif_flow_status(pmd->dp, netdev_flow, stats, NULL);
3833
0
        }
3834
0
        dp_netdev_pmd_remove_flow(pmd, netdev_flow);
3835
0
    } else {
3836
0
        error = ENOENT;
3837
0
    }
3838
0
    ovs_mutex_unlock(&pmd->flow_mutex);
3839
3840
0
    return error;
3841
0
}
3842
3843
/* dpif 'flow_del' implementation: deletes the flow on one specific PMD, or
 * (PMD_ID_NULL) on every PMD while aggregating the per-PMD statistics into
 * 'del->stats'.  Returns 0 or a positive errno. */
static int
dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_pmd_thread *pmd;
    int error = 0;

    if (del->stats) {
        memset(del->stats, 0, sizeof *del->stats);
    }

    if (del->pmd_id == PMD_ID_NULL) {
        if (cmap_count(&dp->poll_threads) == 0) {
            return EINVAL;
        }
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
            struct dpif_flow_stats pmd_stats;
            int pmd_error;

            pmd_error = flow_del_on_pmd(pmd, &pmd_stats, del);
            if (pmd_error) {
                /* Last error wins; stats from failing PMDs are dropped. */
                error = pmd_error;
            } else if (del->stats) {
                del->stats->n_packets += pmd_stats.n_packets;
                del->stats->n_bytes += pmd_stats.n_bytes;
                del->stats->used = MAX(del->stats->used, pmd_stats.used);
                del->stats->tcp_flags |= pmd_stats.tcp_flags;
            }
        }
    } else {
        pmd = dp_netdev_get_pmd(dp, del->pmd_id);
        if (!pmd) {
            return EINVAL;
        }
        error = flow_del_on_pmd(pmd, del->stats, del);
        dp_netdev_pmd_unref(pmd);
    }


    return error;
}
3884
3885
/* State shared by all threads participating in one flow dump.
 * Protected by 'mutex' except for the immutable 'up'. */
struct dpif_netdev_flow_dump {
    struct dpif_flow_dump up;            /* Common dpif dump header. */
    struct cmap_position poll_thread_pos; /* Iterator over dp poll_threads. */
    struct cmap_position flow_pos;        /* Iterator within current PMD's
                                           * flow table. */
    struct dp_netdev_pmd_thread *cur_pmd; /* PMD currently being dumped;
                                           * holds a reference. */
    int status;                           /* Nonzero (EOF) once exhausted. */
    struct ovs_mutex mutex;               /* Serializes dump_next() calls. */
};
3893
3894
static struct dpif_netdev_flow_dump *
3895
dpif_netdev_flow_dump_cast(struct dpif_flow_dump *dump)
3896
0
{
3897
0
    return CONTAINER_OF(dump, struct dpif_netdev_flow_dump, up);
3898
0
}
3899
3900
static struct dpif_flow_dump *
3901
dpif_netdev_flow_dump_create(const struct dpif *dpif_, bool terse,
3902
                             struct dpif_flow_dump_types *types)
3903
0
{
3904
0
    struct dpif_netdev_flow_dump *dump;
3905
3906
0
    dump = xzalloc(sizeof *dump);
3907
0
    dpif_flow_dump_init(&dump->up, dpif_, terse, types);
3908
0
    ovs_mutex_init(&dump->mutex);
3909
3910
0
    return &dump->up;
3911
0
}
3912
3913
static int
3914
dpif_netdev_flow_dump_destroy(struct dpif_flow_dump *dump_)
3915
0
{
3916
0
    struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3917
3918
0
    ovs_mutex_destroy(&dump->mutex);
3919
0
    free(dump);
3920
0
    return 0;
3921
0
}
3922
3923
/* Per-thread state for a flow dump: scratch buffers for the odp key and
 * mask of each flow in one dump batch. */
struct dpif_netdev_flow_dump_thread {
    struct dpif_flow_dump_thread up;      /* Common dpif header. */
    struct dpif_netdev_flow_dump *dump;   /* Owning shared dump state. */
    struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];  /* Key buffers. */
    struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH]; /* Mask buffers. */
};
3929
3930
static struct dpif_netdev_flow_dump_thread *
3931
dpif_netdev_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
3932
0
{
3933
0
    return CONTAINER_OF(thread, struct dpif_netdev_flow_dump_thread, up);
3934
0
}
3935
3936
static struct dpif_flow_dump_thread *
3937
dpif_netdev_flow_dump_thread_create(struct dpif_flow_dump *dump_)
3938
0
{
3939
0
    struct dpif_netdev_flow_dump *dump = dpif_netdev_flow_dump_cast(dump_);
3940
0
    struct dpif_netdev_flow_dump_thread *thread;
3941
3942
0
    thread = xmalloc(sizeof *thread);
3943
0
    dpif_flow_dump_thread_init(&thread->up, &dump->up);
3944
0
    thread->dump = dump;
3945
0
    return &thread->up;
3946
0
}
3947
3948
/* Frees per-thread dump state created by
 * dpif_netdev_flow_dump_thread_create(). */
static void
dpif_netdev_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
{
    free(dpif_netdev_flow_dump_thread_cast(thread_));
}
3956
3957
/* dpif 'flow_dump_next' implementation: collects up to 'max_flows' (capped
 * at FLOW_DUMP_MAX_BATCH) flows, walking each PMD's flow table in turn and
 * advancing the shared cursor under 'dump->mutex'.  Conversion to
 * 'dpif_flow' happens outside the lock using the thread's scratch buffers.
 * Returns the number of flows filled into 'flows'. */
static int
dpif_netdev_flow_dump_next(struct dpif_flow_dump_thread *thread_,
                           struct dpif_flow *flows, int max_flows)
{
    struct dpif_netdev_flow_dump_thread *thread
        = dpif_netdev_flow_dump_thread_cast(thread_);
    struct dpif_netdev_flow_dump *dump = thread->dump;
    struct dp_netdev_flow *netdev_flows[FLOW_DUMP_MAX_BATCH];
    struct dpif_netdev *dpif = dpif_netdev_cast(thread->up.dump->dpif);
    struct dp_netdev *dp = get_dp_netdev(&dpif->dpif);
    int n_flows = 0;
    int i;

    ovs_mutex_lock(&dump->mutex);
    if (!dump->status) {
        struct dp_netdev_pmd_thread *pmd = dump->cur_pmd;
        int flow_limit = MIN(max_flows, FLOW_DUMP_MAX_BATCH);

        /* First call to dump_next(), extracts the first pmd thread.
         * If there is no pmd thread, returns immediately. */
        if (!pmd) {
            pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
            if (!pmd) {
                ovs_mutex_unlock(&dump->mutex);
                return n_flows;

            }
        }

        do {
            for (n_flows = 0; n_flows < flow_limit; n_flows++) {
                struct cmap_node *node;

                node = cmap_next_position(&pmd->flow_table, &dump->flow_pos);
                if (!node) {
                    break;
                }
                netdev_flows[n_flows] = CONTAINER_OF(node,
                                                     struct dp_netdev_flow,
                                                     node);
            }
            /* When finishing dumping the current pmd thread, moves to
             * the next. */
            if (n_flows < flow_limit) {
                memset(&dump->flow_pos, 0, sizeof dump->flow_pos);
                dp_netdev_pmd_unref(pmd);
                pmd = dp_netdev_pmd_get_next(dp, &dump->poll_thread_pos);
                if (!pmd) {
                    dump->status = EOF;
                    break;
                }
            }
            /* Keeps the reference to next caller. */
            dump->cur_pmd = pmd;

            /* If the current dump is empty, do not exit the loop, since the
             * remaining pmds could have flows to be dumped.  Just dumps again
             * on the new 'pmd'. */
        } while (!n_flows);
    }
    ovs_mutex_unlock(&dump->mutex);

    /* Convert each collected flow outside the dump lock. */
    for (i = 0; i < n_flows; i++) {
        struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
        struct odputil_keybuf *keybuf = &thread->keybuf[i];
        struct dp_netdev_flow *netdev_flow = netdev_flows[i];
        struct dpif_flow *f = &flows[i];
        struct ofpbuf key, mask;

        ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
        ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
        dp_netdev_flow_to_dpif_flow(dp, netdev_flow, &key, &mask, f,
                                    dump->up.terse);
    }

    return n_flows;
}
4034
4035
/* dpif 'execute' implementation: runs 'execute->actions' on a clone of
 * 'execute->packet' in the context of the calling PMD (or the non-PMD
 * context, taking 'non_pmd_mutex').  On success the possibly-modified
 * packet content is swapped back into the caller's packet.  Returns 0 or
 * a positive errno. */
static int
dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_pmd_thread *pmd;
    struct dp_packet_batch pp;

    /* Reject packets that cannot be a valid Ethernet frame. */
    if (dp_packet_size(execute->packet) < ETH_HEADER_LEN ||
        dp_packet_size(execute->packet) > UINT16_MAX) {
        return EINVAL;
    }

    /* Tries finding the 'pmd'.  If NULL is returned, that means
     * the current thread is a non-pmd thread and should use
     * dp_netdev_get_pmd(dp, NON_PMD_CORE_ID). */
    pmd = ovsthread_getspecific(dp->per_pmd_key);
    if (!pmd) {
        pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
        if (!pmd) {
            return EBUSY;
        }
    }

    if (execute->probe) {
        /* If this is part of a probe, Drop the packet, since executing
         * the action may actually cause spurious packets be sent into
         * the network. */
        if (pmd->core_id == NON_PMD_CORE_ID) {
            dp_netdev_pmd_unref(pmd);
        }
        return 0;
    }

    /* If the current thread is non-pmd thread, acquires
     * the 'non_pmd_mutex'. */
    if (pmd->core_id == NON_PMD_CORE_ID) {
        ovs_mutex_lock(&dp->non_pmd_mutex);
    }

    /* Update current time in PMD context. We don't care about EMC insertion
     * probability, because we are on a slow path. */
    pmd_thread_ctx_time_update(pmd);

    /* The action processing expects the RSS hash to be valid, because
     * it's always initialized at the beginning of datapath processing.
     * In this case, though, 'execute->packet' may not have gone through
     * the datapath at all, it may have been generated by the upper layer
     * (OpenFlow packet-out, BFD frame, ...). */
    if (!dp_packet_rss_valid(execute->packet)) {
        dp_packet_set_rss_hash(execute->packet,
                               flow_hash_5tuple(execute->flow, 0));
    }

    /* Making a copy because the packet might be stolen during the execution
     * and caller might still need it.  */
    struct dp_packet *packet_clone = dp_packet_clone(execute->packet);
    dp_packet_batch_init_packet(&pp, packet_clone);
    dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
                              execute->actions, execute->actions_len);
    dp_netdev_pmd_flush_output_packets(pmd, true);

    if (pmd->core_id == NON_PMD_CORE_ID) {
        ovs_mutex_unlock(&dp->non_pmd_mutex);
        dp_netdev_pmd_unref(pmd);
    }

    if (dp_packet_batch_size(&pp) == 1) {
        /* Packet wasn't dropped during the execution.  Swapping content with
         * the original packet, because the caller might expect actions to
         * modify it.  Uisng the packet from a batch instead of 'packet_clone'
         * because it maybe stolen and replaced by other packet, e.g. by
         * the fragmentation engine. */
        dp_packet_swap(execute->packet, pp.packets[0]);
        dp_packet_delete_batch(&pp, true);
    } else if (dp_packet_batch_size(&pp)) {
        /* FIXME: We have more packets than expected.  Likely, we got IP
         * fragments of the reassembled packet.  Dropping them here as we have
         * no way to get them to the caller.  It might be that all the required
         * actions with them are already executed, but it also might not be a
         * case, e.g. if dpif_netdev_execute() called to execute a single
         * tunnel push. */
        dp_packet_delete_batch(&pp, true);
    }

    return 0;
}
4122
4123
static void
4124
dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops)
4125
0
{
4126
0
    size_t i;
4127
4128
0
    for (i = 0; i < n_ops; i++) {
4129
0
        struct dpif_op *op = ops[i];
4130
4131
0
        switch (op->type) {
4132
0
        case DPIF_OP_FLOW_PUT:
4133
0
            op->error = dpif_netdev_flow_put(dpif, &op->flow_put);
4134
0
            break;
4135
4136
0
        case DPIF_OP_FLOW_DEL:
4137
0
            op->error = dpif_netdev_flow_del(dpif, &op->flow_del);
4138
0
            break;
4139
4140
0
        case DPIF_OP_EXECUTE:
4141
0
            op->error = dpif_netdev_execute(dpif, &op->execute);
4142
0
            break;
4143
4144
0
        case DPIF_OP_FLOW_GET:
4145
0
            op->error = dpif_netdev_flow_get(dpif, &op->flow_get);
4146
0
            break;
4147
0
        }
4148
0
    }
4149
0
}
4150
4151
/* Enable or Disable PMD auto load balancing. */
static void
set_pmd_auto_lb(struct dp_netdev *dp, bool state, bool always_log)
{
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;

    /* Log only on a state transition, unless 'always_log' forces a
     * re-announcement of the current configuration. */
    if (pmd_alb->is_enabled != state || always_log) {
        pmd_alb->is_enabled = state;
        if (pmd_alb->is_enabled) {
            uint8_t rebalance_load_thresh;

            /* Threshold is shared with PMD threads; read atomically. */
            atomic_read_relaxed(&pmd_alb->rebalance_load_thresh,
                                &rebalance_load_thresh);
            VLOG_INFO("PMD auto load balance is enabled, "
                      "interval %"PRIu64" mins, "
                      "pmd load threshold %"PRIu8"%%, "
                      "improvement threshold %"PRIu8"%%.",
                       pmd_alb->rebalance_intvl / MIN_TO_MSEC,
                       rebalance_load_thresh,
                       pmd_alb->rebalance_improve_thresh);
        } else {
            /* Reset the poll timer so a future enable starts fresh. */
            pmd_alb->rebalance_poll_timer = 0;
            VLOG_INFO("PMD auto load balance is disabled.");
        }
    }
}
4177
4178
static int
4179
parse_pmd_sleep_list(const char *max_sleep_list,
4180
                     struct pmd_sleep **pmd_sleeps)
4181
0
{
4182
0
    char *list, *copy, *key, *value;
4183
0
    int num_vals = 0;
4184
4185
0
    if (!max_sleep_list) {
4186
0
        return num_vals;
4187
0
    }
4188
4189
0
    list = copy = xstrdup(max_sleep_list);
4190
4191
0
    while (ofputil_parse_key_value(&list, &key, &value)) {
4192
0
        uint64_t temp, pmd_max_sleep;
4193
0
        char *error = NULL;
4194
0
        unsigned core;
4195
0
        int i;
4196
4197
0
        error = str_to_u64(key, &temp);
4198
0
        if (error) {
4199
0
            free(error);
4200
0
            continue;
4201
0
        }
4202
4203
0
        if (value[0] == '\0') {
4204
            /* No value specified. key is dp default. */
4205
0
            core = UINT_MAX;
4206
0
            pmd_max_sleep = temp;
4207
0
        } else {
4208
0
            error = str_to_u64(value, &pmd_max_sleep);
4209
0
            if (!error && temp < UINT_MAX) {
4210
                /* Key is pmd core id. */
4211
0
                core = (unsigned) temp;
4212
0
            } else {
4213
0
                free(error);
4214
0
                continue;
4215
0
            }
4216
0
        }
4217
4218
        /* Detect duplicate max sleep values. */
4219
0
        for (i = 0; i < num_vals; i++) {
4220
0
            if ((*pmd_sleeps)[i].core_id == core) {
4221
0
                break;
4222
0
            }
4223
0
        }
4224
0
        if (i == num_vals) {
4225
            /* Not duplicate, add a new entry. */
4226
0
            *pmd_sleeps = xrealloc(*pmd_sleeps,
4227
0
                                   (num_vals + 1) * sizeof **pmd_sleeps);
4228
0
            num_vals++;
4229
0
        }
4230
4231
0
        pmd_max_sleep = MIN(PMD_RCU_QUIESCE_INTERVAL, pmd_max_sleep);
4232
4233
0
        (*pmd_sleeps)[i].core_id = core;
4234
0
        (*pmd_sleeps)[i].max_sleep = pmd_max_sleep;
4235
0
    }
4236
4237
0
    free(copy);
4238
0
    return num_vals;
4239
0
}
4240
4241
static void
4242
log_pmd_sleep(unsigned core_id, int numa_id, uint64_t pmd_max_sleep)
4243
0
{
4244
0
    if (core_id == NON_PMD_CORE_ID) {
4245
0
        return;
4246
0
    }
4247
0
    VLOG_INFO("PMD thread on numa_id: %d, core id: %2d, "
4248
0
              "max sleep: %4"PRIu64" us.", numa_id, core_id, pmd_max_sleep);
4249
0
}
4250
4251
static void
4252
pmd_init_max_sleep(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
4253
0
{
4254
0
    uint64_t max_sleep = dp->pmd_max_sleep_default;
4255
0
    struct pmd_sleep *pmd_sleeps = NULL;
4256
0
    int num_vals;
4257
4258
0
    num_vals = parse_pmd_sleep_list(dp->max_sleep_list, &pmd_sleeps);
4259
4260
    /* Check if the user has set a specific value for this pmd. */
4261
0
    for (int i = 0; i < num_vals; i++) {
4262
0
        if (pmd_sleeps[i].core_id == pmd->core_id) {
4263
0
            max_sleep = pmd_sleeps[i].max_sleep;
4264
0
            break;
4265
0
        }
4266
0
    }
4267
0
    atomic_init(&pmd->max_sleep, max_sleep);
4268
0
    log_pmd_sleep(pmd->core_id, pmd->numa_id, max_sleep);
4269
0
    free(pmd_sleeps);
4270
0
}
4271
4272
static bool
4273
assign_sleep_values_to_pmds(struct dp_netdev *dp, int num_vals,
4274
                            struct pmd_sleep *pmd_sleeps)
4275
0
{
4276
0
    struct dp_netdev_pmd_thread *pmd;
4277
0
    bool value_changed = false;
4278
4279
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
4280
0
        uint64_t new_max_sleep, cur_pmd_max_sleep;
4281
4282
0
        if (pmd->core_id == NON_PMD_CORE_ID) {
4283
0
            continue;
4284
0
        }
4285
4286
        /* Default to global value. */
4287
0
        new_max_sleep = dp->pmd_max_sleep_default;
4288
4289
        /* Check for pmd specific value. */
4290
0
        for (int i = 0;  i < num_vals; i++) {
4291
0
            if (pmd->core_id == pmd_sleeps[i].core_id) {
4292
0
                new_max_sleep = pmd_sleeps[i].max_sleep;
4293
0
                break;
4294
0
            }
4295
0
        }
4296
0
        atomic_read_relaxed(&pmd->max_sleep, &cur_pmd_max_sleep);
4297
0
        if (new_max_sleep != cur_pmd_max_sleep) {
4298
0
            atomic_store_relaxed(&pmd->max_sleep, new_max_sleep);
4299
0
            value_changed = true;
4300
0
        }
4301
0
    }
4302
0
    return value_changed;
4303
0
}
4304
4305
static void
4306
log_all_pmd_sleeps(struct dp_netdev *dp)
4307
0
{
4308
0
    struct dp_netdev_pmd_thread **pmd_list = NULL;
4309
0
    struct dp_netdev_pmd_thread *pmd;
4310
0
    size_t n;
4311
4312
0
    VLOG_INFO("Default PMD thread max sleep: %4"PRIu64" us.",
4313
0
              dp->pmd_max_sleep_default);
4314
4315
0
    sorted_poll_thread_list(dp, &pmd_list, &n);
4316
4317
0
    for (size_t i = 0; i < n; i++) {
4318
0
        uint64_t cur_pmd_max_sleep;
4319
4320
0
        pmd = pmd_list[i];
4321
0
        atomic_read_relaxed(&pmd->max_sleep, &cur_pmd_max_sleep);
4322
0
        log_pmd_sleep(pmd->core_id, pmd->numa_id, cur_pmd_max_sleep);
4323
0
    }
4324
0
    free(pmd_list);
4325
0
}
4326
4327
/* Applies the 'pmd-sleep-max' (and deprecated 'pmd-maxsleep') settings from
 * 'config' to the datapath default and to every PMD thread.
 *
 * Returns true if the default or any per-PMD max sleep value changed,
 * false if the configuration was already in effect. */
static bool
set_all_pmd_max_sleeps(struct dp_netdev *dp, const struct smap *config)
{
    const char *max_sleep_list = smap_get(config, "pmd-sleep-max");
    struct pmd_sleep *pmd_sleeps = NULL;
    uint64_t default_max_sleep = 0;
    bool default_changed = false;
    bool pmd_changed = false;
    uint64_t pmd_maxsleep;
    int num_vals = 0;

    /* Check for deprecated 'pmd-maxsleep' value.  It only takes effect
     * when the newer 'pmd-sleep-max' is absent. */
    pmd_maxsleep = smap_get_ullong(config, "pmd-maxsleep", UINT64_MAX);
    if (pmd_maxsleep != UINT64_MAX && !max_sleep_list) {
        VLOG_WARN_ONCE("pmd-maxsleep is deprecated. "
                       "Please use pmd-sleep-max instead.");
        default_max_sleep = pmd_maxsleep;
    }

    /* Check if there is no change in string or value.  The outer test
     * verifies both strings are set or both unset; then either the
     * strings match, or (both unset) the default value is unchanged. */
    if (!!dp->max_sleep_list == !!max_sleep_list) {
        if (max_sleep_list
            ? nullable_string_is_equal(max_sleep_list, dp->max_sleep_list)
            : default_max_sleep == dp->pmd_max_sleep_default) {
            return false;
        }
    }

    /* Free existing string and copy new one (if any). */
    free(dp->max_sleep_list);
    dp->max_sleep_list = nullable_xstrdup(max_sleep_list);

    if (max_sleep_list) {
        num_vals = parse_pmd_sleep_list(max_sleep_list, &pmd_sleeps);

        /* Check if the user has set a global value.  parse_pmd_sleep_list()
         * stores the datapath-wide default with core_id UINT_MAX. */
        for (int i = 0; i < num_vals; i++) {
            if (pmd_sleeps[i].core_id == UINT_MAX) {
                default_max_sleep = pmd_sleeps[i].max_sleep;
                break;
            }
        }
    }

    if (dp->pmd_max_sleep_default != default_max_sleep) {
        dp->pmd_max_sleep_default = default_max_sleep;
        default_changed = true;
    }
    /* Push the new values to every PMD thread. */
    pmd_changed = assign_sleep_values_to_pmds(dp, num_vals, pmd_sleeps);

    free(pmd_sleeps);
    return default_changed || pmd_changed;
}
4380
4381
/* Applies datapath configuration from the database. Some of the changes are
 * actually applied in dpif_netdev_run(). */
static int
dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    const char *cmask = smap_get(other_config, "pmd-cpu-mask");
    const char *pmd_rxq_assign = smap_get_def(other_config, "pmd-rxq-assign",
                                             "cycles");
    unsigned long long insert_prob =
        smap_get_ullong(other_config, "emc-insert-inv-prob",
                        DEFAULT_EM_FLOW_INSERT_INV_PROB);
    uint32_t insert_min, cur_min;
    uint32_t tx_flush_interval, cur_tx_flush_interval;
    uint64_t rebalance_intvl;
    uint8_t cur_rebalance_load;
    uint32_t rebalance_load, rebalance_improve;
    bool log_autolb = false;
    enum sched_assignment_type pmd_rxq_assign_type;

    /* Tx queue flush interval; read/written with relaxed atomics since
     * PMD threads consult it concurrently. */
    tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
                                     DEFAULT_TX_FLUSH_INTERVAL);
    atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
    if (tx_flush_interval != cur_tx_flush_interval) {
        atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
        VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
                  tx_flush_interval);
    }

    /* A PMD cpu mask change requires a datapath reconfiguration. */
    if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
        free(dp->pmd_cmask);
        dp->pmd_cmask = nullable_xstrdup(cmask);
        dp_netdev_request_reconfigure(dp);
    }

    /* EMC insertion probability, stored internally as a minimum hash
     * threshold (UINT32_MAX / inverse probability; 0 disables insertion). */
    atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
    if (insert_prob <= UINT32_MAX) {
        insert_min = insert_prob == 0 ? 0 : UINT32_MAX / insert_prob;
    } else {
        /* Out-of-range input: fall back to the defaults. */
        insert_min = DEFAULT_EM_FLOW_INSERT_MIN;
        insert_prob = DEFAULT_EM_FLOW_INSERT_INV_PROB;
    }

    if (insert_min != cur_min) {
        atomic_store_relaxed(&dp->emc_insert_min, insert_min);
        if (insert_min == 0) {
            VLOG_INFO("EMC insertion probability changed to zero");
        } else {
            VLOG_INFO("EMC insertion probability changed to 1/%llu (~%.2f%%)",
                      insert_prob, (100 / (float)insert_prob));
        }
    }

    /* Detailed PMD performance metrics collection on/off. */
    bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", false);
    bool cur_perf_enabled;
    atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled);
    if (perf_enabled != cur_perf_enabled) {
        atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled);
        if (perf_enabled) {
            VLOG_INFO("PMD performance metrics collection enabled");
        } else {
            VLOG_INFO("PMD performance metrics collection disabled");
        }
    }

    /* Signature match cache (SMC) on/off. */
    bool smc_enable = smap_get_bool(other_config, "smc-enable", false);
    bool cur_smc;
    atomic_read_relaxed(&dp->smc_enable_db, &cur_smc);
    if (smc_enable != cur_smc) {
        atomic_store_relaxed(&dp->smc_enable_db, smc_enable);
        if (smc_enable) {
            VLOG_INFO("SMC cache is enabled");
        } else {
            VLOG_INFO("SMC cache is disabled");
        }
    }

    /* Rxq-to-PMD assignment algorithm. */
    if (!strcmp(pmd_rxq_assign, "roundrobin")) {
        pmd_rxq_assign_type = SCHED_ROUNDROBIN;
    } else if (!strcmp(pmd_rxq_assign, "cycles")) {
        pmd_rxq_assign_type = SCHED_CYCLES;
    } else if (!strcmp(pmd_rxq_assign, "group")) {
        pmd_rxq_assign_type = SCHED_GROUP;
    } else {
        /* Default. */
        VLOG_WARN("Unsupported rx queue to PMD assignment mode in "
                  "pmd-rxq-assign. Defaulting to 'cycles'.");
        pmd_rxq_assign_type = SCHED_CYCLES;
        pmd_rxq_assign = "cycles";
    }
    if (dp->pmd_rxq_assign_type != pmd_rxq_assign_type) {
        dp->pmd_rxq_assign_type = pmd_rxq_assign_type;
        VLOG_INFO("Rxq to PMD assignment mode changed to: \'%s\'.",
                  pmd_rxq_assign);
        dp_netdev_request_reconfigure(dp);
    }

    bool pmd_iso = smap_get_bool(other_config, "pmd-rxq-isolate", true);

    /* Disabling isolation is only supported with group assignment. */
    if (pmd_rxq_assign_type != SCHED_GROUP && pmd_iso == false) {
        /* Invalid combination. */
        VLOG_WARN("pmd-rxq-isolate can only be set false "
                  "when using pmd-rxq-assign=group");
        pmd_iso = true;
    }
    if (dp->pmd_iso != pmd_iso) {
        dp->pmd_iso = pmd_iso;
        if (pmd_iso) {
            VLOG_INFO("pmd-rxq-affinity isolates PMD core");
        } else {
            VLOG_INFO("pmd-rxq-affinity does not isolate PMD core");
        }
        dp_netdev_request_reconfigure(dp);
    }

    /* PMD auto load balance parameters; out-of-range inputs fall back to
     * their compile-time defaults. */
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;

    rebalance_intvl = smap_get_ullong(other_config,
                                      "pmd-auto-lb-rebal-interval",
                                      ALB_REBALANCE_INTERVAL);
    if (rebalance_intvl > MAX_ALB_REBALANCE_INTERVAL) {
        rebalance_intvl = ALB_REBALANCE_INTERVAL;
    }

    /* Input is in min, convert it to msec. */
    rebalance_intvl =
        rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC;

    if (pmd_alb->rebalance_intvl != rebalance_intvl) {
        pmd_alb->rebalance_intvl = rebalance_intvl;
        VLOG_INFO("PMD auto load balance interval set to "
                  "%"PRIu64" mins\n", rebalance_intvl / MIN_TO_MSEC);
        log_autolb = true;
    }

    rebalance_improve = smap_get_uint(other_config,
                                      "pmd-auto-lb-improvement-threshold",
                                      ALB_IMPROVEMENT_THRESHOLD);
    if (rebalance_improve > 100) {
        rebalance_improve = ALB_IMPROVEMENT_THRESHOLD;
    }
    if (rebalance_improve != pmd_alb->rebalance_improve_thresh) {
        pmd_alb->rebalance_improve_thresh = rebalance_improve;
        VLOG_INFO("PMD auto load balance improvement threshold set to "
                  "%"PRIu32"%%", rebalance_improve);
        log_autolb = true;
    }

    rebalance_load = smap_get_uint(other_config, "pmd-auto-lb-load-threshold",
                                   ALB_LOAD_THRESHOLD);
    if (rebalance_load > 100) {
        rebalance_load = ALB_LOAD_THRESHOLD;
    }
    atomic_read_relaxed(&pmd_alb->rebalance_load_thresh, &cur_rebalance_load);
    if (rebalance_load != cur_rebalance_load) {
        atomic_store_relaxed(&pmd_alb->rebalance_load_thresh,
                             rebalance_load);
        VLOG_INFO("PMD auto load balance load threshold set to %"PRIu32"%%",
                  rebalance_load);
        log_autolb = true;
    }

    bool autolb_state = smap_get_bool(other_config, "pmd-auto-lb", false);

    set_pmd_auto_lb(dp, autolb_state, log_autolb);

    bool sleep_changed = set_all_pmd_max_sleeps(dp, other_config);

    /* First configuration pass: log all sleep values and register the
     * offload flow unreference callback.  Later passes only re-log when
     * sleep values actually changed. */
    if (ovsthread_once_start(&dp->once_set_config)) {
        log_all_pmd_sleeps(dp);
        dpif_offload_datapath_register_flow_unreference_cb(
            dpif, offload_flow_reference_unreference_cb);

        ovsthread_once_done(&dp->once_set_config);
    } else if (sleep_changed) {
        log_all_pmd_sleeps(dp);
    }

    return 0;
}
4561
4562
static bool
4563
dpif_netdev_number_handlers_required(struct dpif *dpif_ OVS_UNUSED,
4564
                                     uint32_t *n_handlers)
4565
0
{
4566
0
    *n_handlers = 0;
4567
0
    return true;
4568
0
}
4569
4570
/* Parses affinity list and returns result in 'core_ids'. */
4571
static int
4572
parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq)
4573
0
{
4574
0
    unsigned i;
4575
0
    char *list, *copy, *key, *value;
4576
0
    int error = 0;
4577
4578
0
    for (i = 0; i < n_rxq; i++) {
4579
0
        core_ids[i] = OVS_CORE_UNSPEC;
4580
0
    }
4581
4582
0
    if (!affinity_list) {
4583
0
        return 0;
4584
0
    }
4585
4586
0
    list = copy = xstrdup(affinity_list);
4587
4588
0
    while (ofputil_parse_key_value(&list, &key, &value)) {
4589
0
        int rxq_id, core_id;
4590
4591
0
        if (!str_to_int(key, 0, &rxq_id) || rxq_id < 0
4592
0
            || !str_to_int(value, 0, &core_id) || core_id < 0) {
4593
0
            error = EINVAL;
4594
0
            break;
4595
0
        }
4596
4597
0
        if (rxq_id < n_rxq) {
4598
0
            core_ids[rxq_id] = core_id;
4599
0
        }
4600
0
    }
4601
4602
0
    free(copy);
4603
0
    return error;
4604
0
}
4605
4606
/* Parses 'affinity_list' and applies configuration if it is valid. */
4607
static int
4608
dpif_netdev_port_set_rxq_affinity(struct dp_netdev_port *port,
4609
                                  const char *affinity_list)
4610
0
{
4611
0
    unsigned *core_ids, i;
4612
0
    int error = 0;
4613
4614
0
    core_ids = xmalloc(port->n_rxq * sizeof *core_ids);
4615
0
    if (parse_affinity_list(affinity_list, core_ids, port->n_rxq)) {
4616
0
        error = EINVAL;
4617
0
        goto exit;
4618
0
    }
4619
4620
0
    for (i = 0; i < port->n_rxq; i++) {
4621
0
        port->rxqs[i].core_id = core_ids[i];
4622
0
    }
4623
4624
0
exit:
4625
0
    free(core_ids);
4626
0
    return error;
4627
0
}
4628
4629
/* Returns 'true' if one of the 'port's RX queues exists in 'poll_list'
4630
 * of given PMD thread. */
4631
static bool
4632
dpif_netdev_pmd_polls_port(struct dp_netdev_pmd_thread *pmd,
4633
                           struct dp_netdev_port *port)
4634
    OVS_EXCLUDED(pmd->port_mutex)
4635
0
{
4636
0
    struct rxq_poll *poll;
4637
0
    bool found = false;
4638
4639
0
    ovs_mutex_lock(&pmd->port_mutex);
4640
0
    HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
4641
0
        if (port == poll->rxq->port) {
4642
0
            found = true;
4643
0
            break;
4644
0
        }
4645
0
    }
4646
0
    ovs_mutex_unlock(&pmd->port_mutex);
4647
0
    return found;
4648
0
}
4649
4650
/* Updates port configuration from the database.  The changes are actually
 * applied in dpif_netdev_run().
 *
 * Handles three per-port options: 'emc-enable', 'pmd-rxq-affinity' and
 * 'tx-steering'.  Returns 0 on success or a positive errno value (e.g. on
 * an unknown port number or an invalid affinity list). */
static int
dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no,
                            const struct smap *cfg)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_port *port;
    int error = 0;
    const char *affinity_list = smap_get(cfg, "pmd-rxq-affinity");
    bool emc_enabled = smap_get_bool(cfg, "emc-enable", true);
    const char *tx_steering_mode = smap_get(cfg, "tx-steering");
    enum txq_req_mode txq_mode;

    /* The port is looked up and modified under the write lock. */
    ovs_rwlock_wrlock(&dp->port_rwlock);
    error = get_port_by_number(dp, port_no, &port);
    if (error) {
        goto unlock;
    }

    if (emc_enabled != port->emc_enabled) {
        struct dp_netdev_pmd_thread *pmd;
        struct ds ds = DS_EMPTY_INITIALIZER;
        uint32_t cur_min, insert_prob;

        port->emc_enabled = emc_enabled;
        /* Mark for reload all the threads that polls this port and request
         * for reconfiguration for the actual reloading of threads. */
        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
            if (dpif_netdev_pmd_polls_port(pmd, port)) {
                pmd->need_reload = true;
            }
        }
        dp_netdev_request_reconfigure(dp);

        /* Build and log a single human-readable summary line. */
        ds_put_format(&ds, "%s: EMC has been %s.",
                      netdev_get_name(port->netdev),
                      (emc_enabled) ? "enabled" : "disabled");
        if (emc_enabled) {
            ds_put_cstr(&ds, " Current insertion probability is ");
            atomic_read_relaxed(&dp->emc_insert_min, &cur_min);
            if (!cur_min) {
                ds_put_cstr(&ds, "zero.");
            } else {
                /* emc_insert_min stores UINT32_MAX / inverse probability;
                 * invert it back for display. */
                insert_prob = UINT32_MAX / cur_min;
                ds_put_format(&ds, "1/%"PRIu32" (~%.2f%%).",
                              insert_prob, 100 / (float) insert_prob);
            }
        }
        VLOG_INFO("%s", ds_cstr(&ds));
        ds_destroy(&ds);
    }

    /* Checking for RXq affinity changes. */
    if (netdev_is_pmd(port->netdev)
        && !nullable_string_is_equal(affinity_list, port->rxq_affinity_list)) {

        error = dpif_netdev_port_set_rxq_affinity(port, affinity_list);
        if (error) {
            goto unlock;
        }
        free(port->rxq_affinity_list);
        port->rxq_affinity_list = nullable_xstrdup(affinity_list);

        dp_netdev_request_reconfigure(dp);
    }

    /* Tx packet steering: 'hash' distributes by packet hash, anything
     * else (including unset) steers per-thread. */
    if (nullable_string_is_equal(tx_steering_mode, "hash")) {
        txq_mode = TXQ_REQ_MODE_HASH;
    } else {
        txq_mode = TXQ_REQ_MODE_THREAD;
    }

    if (txq_mode != port->txq_requested_mode) {
        port->txq_requested_mode = txq_mode;
        VLOG_INFO("%s: Tx packet steering mode has been set to '%s'.",
                  netdev_get_name(port->netdev),
                  (txq_mode == TXQ_REQ_MODE_THREAD) ? "thread" : "hash");
        dp_netdev_request_reconfigure(dp);
    }

unlock:
    ovs_rwlock_unlock(&dp->port_rwlock);
    return error;
}
4735
4736
static int
4737
dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
4738
                              uint32_t queue_id, uint32_t *priority)
4739
0
{
4740
0
    *priority = queue_id;
4741
0
    return 0;
4742
0
}
4743
4744

4745
/* Creates and returns a new 'struct dp_netdev_actions', whose actions are
4746
 * a copy of the 'size' bytes of 'actions' input parameters. */
4747
struct dp_netdev_actions *
4748
dp_netdev_actions_create(const struct nlattr *actions, size_t size)
4749
0
{
4750
0
    struct dp_netdev_actions *netdev_actions;
4751
4752
0
    netdev_actions = xmalloc(sizeof *netdev_actions + size);
4753
0
    netdev_actions->size = size;
4754
0
    if (size) {
4755
0
        memcpy(netdev_actions->actions, actions, size);
4756
0
    }
4757
4758
0
    return netdev_actions;
4759
0
}
4760
4761
struct dp_netdev_actions *
4762
dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow)
4763
0
{
4764
0
    return ovsrcu_get(struct dp_netdev_actions *, &flow->actions);
4765
0
}
4766
4767
/* Frees an action set created by dp_netdev_actions_create(). */
static void
dp_netdev_actions_free(struct dp_netdev_actions *actions)
{
    free(actions);
}
4772

4773
static void
4774
dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
4775
                         enum rxq_cycles_counter_type type,
4776
                         unsigned long long cycles)
4777
0
{
4778
0
   atomic_store_relaxed(&rx->cycles[type], cycles);
4779
0
}
4780
4781
static void
4782
dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
4783
                         enum rxq_cycles_counter_type type,
4784
                         unsigned long long cycles)
4785
0
{
4786
0
    non_atomic_ullong_add(&rx->cycles[type], cycles);
4787
0
}
4788
4789
static uint64_t
4790
dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
4791
                         enum rxq_cycles_counter_type type)
4792
0
{
4793
0
    unsigned long long processing_cycles;
4794
0
    atomic_read_relaxed(&rx->cycles[type], &processing_cycles);
4795
0
    return processing_cycles;
4796
0
}
4797
4798
static void
4799
dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
4800
                                unsigned long long cycles)
4801
0
{
4802
0
    unsigned int idx = atomic_count_inc(&rx->intrvl_idx) % PMD_INTERVAL_MAX;
4803
0
    atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
4804
0
}
4805
4806
static uint64_t
4807
dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
4808
0
{
4809
0
    unsigned long long processing_cycles;
4810
0
    atomic_read_relaxed(&rx->cycles_intrvl[idx], &processing_cycles);
4811
0
    return processing_cycles;
4812
0
}
4813
4814
#if ATOMIC_ALWAYS_LOCK_FREE_8B
4815
static inline bool
4816
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd)
4817
0
{
4818
0
    bool pmd_perf_enabled;
4819
0
    atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled);
4820
0
    return pmd_perf_enabled;
4821
0
}
4822
#else
4823
/* If stores and reads of 64-bit integers are not atomic, the full PMD
4824
 * performance metrics are not available as locked access to 64 bit
4825
 * integers would be prohibitively expensive. */
4826
static inline bool
4827
pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd OVS_UNUSED)
4828
{
4829
    return false;
4830
}
4831
#endif
4832
4833
/* Transmits all packets queued in 'p->output_pkts' and returns the number
 * of packets sent.  The caller must have queued at least one packet.
 *
 * Send time is measured and distributed evenly across the transmitted
 * packets' originating rx queues so rxq cycle accounting stays accurate. */
static int
dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
                                   struct tx_port *p)
{
    int i;
    int tx_qid;
    int output_cnt;
    bool concurrent_txqs;
    struct cycle_timer timer;
    uint64_t cycles;
    uint32_t tx_flush_interval;

    /* Time the whole send path. */
    cycle_timer_start(&pmd->perf_stats, &timer);

    output_cnt = dp_packet_batch_size(&p->output_pkts);
    ovs_assert(output_cnt > 0);

    if (p->port->txq_mode == TXQ_MODE_XPS_HASH) {
        int n_txq = netdev_n_txq(p->port->netdev);

        /* Re-batch per txq based on packet hash. */
        struct dp_packet *packet;
        DP_PACKET_BATCH_FOR_EACH (j, packet, &p->output_pkts) {
            uint32_t hash;

            if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
                hash = dp_packet_get_rss_hash(packet);
            } else {
                struct flow flow;

                /* No RSS hash available: compute a 5-tuple hash instead. */
                flow_extract(packet, &flow);
                hash = flow_hash_5tuple(&flow, 0);
            }
            dp_packet_batch_add(&p->txq_pkts[hash % n_txq], packet);
        }

        /* Flush batches of each Tx queues. */
        for (i = 0; i < n_txq; i++) {
            if (dp_packet_batch_is_empty(&p->txq_pkts[i])) {
                continue;
            }
            netdev_send(p->port->netdev, i, &p->txq_pkts[i], true);
            dp_packet_batch_init(&p->txq_pkts[i]);
        }
    } else {
        if (p->port->txq_mode == TXQ_MODE_XPS) {
            /* Dynamic queue id; other threads may use the same queue. */
            tx_qid = dpif_netdev_xps_get_tx_qid(pmd, p);
            concurrent_txqs = true;
        } else {
            /* Static per-thread queue id; no concurrent senders. */
            tx_qid = pmd->static_tx_qid;
            concurrent_txqs = false;
        }
        netdev_send(p->port->netdev, tx_qid, &p->output_pkts, concurrent_txqs);
    }
    dp_packet_batch_init(&p->output_pkts);

    /* Update time of the next flush. */
    atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
    p->flush_time = pmd->ctx.now + tx_flush_interval;

    ovs_assert(pmd->n_output_batches > 0);
    pmd->n_output_batches--;

    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);

    /* Distribute send cycles evenly among transmitted packets and assign to
     * their respective rx queues. */
    cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
    for (i = 0; i < output_cnt; i++) {
        if (p->output_pkts_rxqs[i]) {
            dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
                                     RXQ_CYCLES_PROC_CURR, cycles);
        }
    }

    return output_cnt;
}
4911
4912
static int
4913
dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
4914
                                   bool force)
4915
0
{
4916
0
    struct tx_port *p;
4917
0
    int output_cnt = 0;
4918
4919
0
    if (!pmd->n_output_batches) {
4920
0
        return 0;
4921
0
    }
4922
4923
0
    HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
4924
0
        if (!dp_packet_batch_is_empty(&p->output_pkts)
4925
0
            && (force || pmd->ctx.now >= p->flush_time)) {
4926
0
            output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
4927
0
        }
4928
0
    }
4929
0
    return output_cnt;
4930
0
}
4931
4932
/* Polls 'rxq' once, runs the received batch through the datapath input
 * function, and flushes any resulting output.  Returns the number of
 * packets received (0 on error or when the queue was empty).
 *
 * Processing cycles are charged to 'rxq'; cycles spent on an empty or
 * failed poll are discarded. */
static int
dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
                           struct dp_netdev_rxq *rxq,
                           odp_port_t port_no)
{
    struct pmd_perf_stats *s = &pmd->perf_stats;
    struct dp_packet_batch batch;
    struct cycle_timer timer;
    int error;
    int batch_cnt = 0;
    int rem_qlen = 0, *qlen_p = NULL;
    uint64_t cycles;

    /* Measure duration for polling and processing rx burst. */
    cycle_timer_start(&pmd->perf_stats, &timer);

    pmd->ctx.last_rxq = rxq;
    dp_packet_batch_init(&batch);

    /* Fetch the rx queue length only for vhostuser ports. */
    if (pmd_perf_metrics_enabled(pmd) && rxq->is_vhost) {
        qlen_p = &rem_qlen;
    }

    error = netdev_rxq_recv(rxq->rx, &batch, qlen_p);
    if (!error) {
        /* At least one packet received. */
        *recirc_depth_get() = 0;
        pmd_thread_ctx_time_update(pmd);
        batch_cnt = dp_packet_batch_size(&batch);
        if (pmd_perf_metrics_enabled(pmd)) {
            /* Update batch histogram. */
            s->current.batches++;
            histogram_add_sample(&s->pkts_per_batch, batch_cnt);
            /* Update the maximum vhost rx queue fill level. */
            if (rxq->is_vhost && rem_qlen >= 0) {
                uint32_t qfill = batch_cnt + rem_qlen;
                if (qfill > s->current.max_vhost_qfill) {
                    s->current.max_vhost_qfill = qfill;
                }
            }
        }

        /* Process packet batch.  Fall back to the scalar input path if
         * the optimized input function declines the batch. */
        int ret = pmd->netdev_input_func(pmd, &batch, port_no);
        if (ret) {
            dp_netdev_input(pmd, &batch, port_no);
        }

        /* Assign processing cycles to rx queue. */
        cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
        dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);

        dp_netdev_pmd_flush_output_packets(pmd, false);
    } else {
        /* Discard cycles. */
        cycle_timer_stop(&pmd->perf_stats, &timer);
        if (error != EAGAIN && error != EOPNOTSUPP) {
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);

            VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
                    netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
        }
    }

    pmd->ctx.last_rxq = NULL;

    return batch_cnt;
}
5001
5002
static struct tx_port *
5003
tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
5004
0
{
5005
0
    struct tx_port *tx;
5006
5007
0
    HMAP_FOR_EACH_IN_BUCKET (tx, node, hash_port_no(port_no), hmap) {
5008
0
        if (tx->port->port_no == port_no) {
5009
0
            return tx;
5010
0
        }
5011
0
    }
5012
5013
0
    return NULL;
5014
0
}
5015
5016
static struct tx_bond *
5017
tx_bond_lookup(const struct cmap *tx_bonds, uint32_t bond_id)
5018
0
{
5019
0
    uint32_t hash = hash_bond_id(bond_id);
5020
0
    struct tx_bond *tx;
5021
5022
0
    CMAP_FOR_EACH_WITH_HASH (tx, node, hash, tx_bonds) {
5023
0
        if (tx->bond_id == bond_id) {
5024
0
            return tx;
5025
0
        }
5026
0
    }
5027
0
    return NULL;
5028
0
}
5029
5030
/* Applies any pending configuration changes to 'port''s netdev and
 * re-creates its rx queues to match the (possibly changed) queue counts.
 *
 * All existing rxqs are closed before reconfiguration and reopened after,
 * so on return 'port->n_rxq' reflects only the successfully opened queues.
 * Returns 0 on success, otherwise a positive errno value (in which case the
 * port may be left with fewer open rxqs than the netdev reports). */
static int
port_reconfigure(struct dp_netdev_port *port)
{
    struct netdev *netdev = port->netdev;
    int i, err;

    /* Closes the existing 'rxq's. */
    for (i = 0; i < port->n_rxq; i++) {
        netdev_rxq_close(port->rxqs[i].rx);
        port->rxqs[i].rx = NULL;
    }
    unsigned last_nrxq = port->n_rxq;
    port->n_rxq = 0;

    /* Allows 'netdev' to apply the pending configuration changes. */
    if (netdev_is_reconf_required(netdev) || port->need_reconfigure) {
        err = netdev_reconfigure(netdev);
        /* EOPNOTSUPP means the netdev does not support reconfiguration;
         * carry on and reopen the queues with the old configuration. */
        if (err && (err != EOPNOTSUPP)) {
            VLOG_ERR("Failed to set interface %s new configuration",
                     netdev_get_name(netdev));
            return err;
        }
    }
    /* If the netdev_reconfigure() above succeeds, reopens the 'rxq's. */
    port->rxqs = xrealloc(port->rxqs,
                          sizeof *port->rxqs * netdev_n_rxq(netdev));
    /* Realloc 'used' counters for tx queues. */
    free(port->txq_used);
    port->txq_used = xcalloc(netdev_n_txq(netdev), sizeof *port->txq_used);

    for (i = 0; i < netdev_n_rxq(netdev); i++) {
        /* Only zero entries beyond the previously used range: entries below
         * 'last_nrxq' keep their stats/affinity from before the reconfig. */
        bool new_queue = i >= last_nrxq;
        if (new_queue) {
            memset(&port->rxqs[i], 0, sizeof port->rxqs[i]);
        }

        port->rxqs[i].port = port;
        /* Matches "dpdkvhost", "dpdkvhostuser", "dpdkvhostuserclient", ... */
        port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9);

        err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i);
        if (err) {
            return err;
        }
        /* Only count a queue once it has been opened successfully. */
        port->n_rxq++;
    }

    /* Parse affinity list to apply configuration for new queues. */
    dpif_netdev_port_set_rxq_affinity(port, port->rxq_affinity_list);

    /* If reconfiguration was successful mark it as such, so we can use it */
    port->need_reconfigure = false;

    return 0;
}
5084
5085
/* Top-level container for an out-of-place rxq scheduling run: the set of
 * numa nodes that have PMD threads. */
struct sched_numa_list {
    struct hmap numas;  /* Contains 'struct sched_numa'. */
};

/* Meta data for out-of-place pmd rxq assignments. */
struct sched_pmd {
    /* Back-pointer to the numa node this pmd belongs to. */
    struct sched_numa *numa;
    /* Associated PMD thread. */
    struct dp_netdev_pmd_thread *pmd;
    /* Sum of measured processing cycles of all rxqs assigned to this pmd. */
    uint64_t pmd_proc_cycles;
    /* Array of rxqs assigned to this pmd; grown by sched_pmd_add_rxq() and
     * freed by sched_numa_list_free_entries(). */
    struct dp_netdev_rxq **rxqs;
    /* Number of entries in 'rxqs'. */
    unsigned n_rxq;
    /* True when this pmd is reserved for pinned rxqs only (pmd isolation). */
    bool isolated;
};

struct sched_numa {
    struct hmap_node node;
    int numa_id;
    /* PMDs on numa node. */
    struct sched_pmd *pmds;
    /* Num of PMDs on numa node. */
    unsigned n_pmds;
    /* Num of isolated PMDs on numa node. */
    unsigned n_isolated;
    /* Round-robin walk state used by sched_pmd_next_rr(). */
    int rr_cur_index;
    bool rr_idx_inc;
};
5112
5113
static size_t
5114
sched_numa_list_count(struct sched_numa_list *numa_list)
5115
0
{
5116
0
    return hmap_count(&numa_list->numas);
5117
0
}
5118
5119
static struct sched_numa *
5120
sched_numa_list_next(struct sched_numa_list *numa_list,
5121
                     const struct sched_numa *numa)
5122
0
{
5123
0
    struct hmap_node *node = NULL;
5124
5125
0
    if (numa) {
5126
0
        node = hmap_next(&numa_list->numas, &numa->node);
5127
0
    }
5128
0
    if (!node) {
5129
0
        node = hmap_first(&numa_list->numas);
5130
0
    }
5131
5132
0
    return (node) ? CONTAINER_OF(node, struct sched_numa, node) : NULL;
5133
0
}
5134
5135
static struct sched_numa *
5136
sched_numa_list_lookup(struct sched_numa_list *numa_list, int numa_id)
5137
0
{
5138
0
    struct sched_numa *numa;
5139
5140
0
    HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0),
5141
0
                             &numa_list->numas) {
5142
0
        if (numa->numa_id == numa_id) {
5143
0
            return numa;
5144
0
        }
5145
0
    }
5146
0
    return NULL;
5147
0
}
5148
5149
static int
5150
compare_sched_pmd_list(const void *a_, const void *b_)
5151
0
{
5152
0
    struct sched_pmd *a, *b;
5153
5154
0
    a = (struct sched_pmd *) a_;
5155
0
    b = (struct sched_pmd *) b_;
5156
5157
0
    return compare_poll_thread_list(&a->pmd, &b->pmd);
5158
0
}
5159
5160
static void
5161
sort_numa_list_pmds(struct sched_numa_list *numa_list)
5162
0
{
5163
0
    struct sched_numa *numa;
5164
5165
0
    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
5166
0
        if (numa->n_pmds > 1) {
5167
0
            qsort(numa->pmds, numa->n_pmds, sizeof *numa->pmds,
5168
0
                  compare_sched_pmd_list);
5169
0
        }
5170
0
    }
5171
0
}
5172
5173
/* Populate numas and pmds on those numas.
 *
 * Builds 'numa_list' from scratch: one sched_numa per numa id found among
 * the datapath's PMD threads, each holding a zeroed sched_pmd per thread.
 * The non-PMD thread is skipped.  Caller owns the result and must release
 * it with sched_numa_list_free_entries(). */
static void
sched_numa_list_populate(struct sched_numa_list *numa_list,
                         struct dp_netdev *dp)
{
    struct dp_netdev_pmd_thread *pmd;

    hmap_init(&numa_list->numas);

    /* For each pmd on this datapath. */
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        struct sched_numa *numa;
        struct sched_pmd *sched_pmd;
        /* Skip the main (non-PMD) thread. */
        if (pmd->core_id == NON_PMD_CORE_ID) {
            continue;
        }

        /* Get the numa of the PMD. */
        numa = sched_numa_list_lookup(numa_list, pmd->numa_id);
        /* Create a new numa node for it if not already created. */
        if (!numa) {
            numa = xzalloc(sizeof *numa);
            numa->numa_id = pmd->numa_id;
            hmap_insert(&numa_list->numas, &numa->node,
                        hash_int(pmd->numa_id, 0));
        }

        /* Create a sched_pmd on this numa for the pmd. */
        numa->n_pmds++;
        numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
        sched_pmd = &numa->pmds[numa->n_pmds - 1];
        /* Zero the new entry so rxqs/n_rxq/isolated start clean. */
        memset(sched_pmd, 0, sizeof *sched_pmd);
        sched_pmd->numa = numa;
        sched_pmd->pmd = pmd;
        /* At least one pmd is present so initialize curr_idx and idx_inc. */
        numa->rr_cur_index = 0;
        numa->rr_idx_inc = true;
    }
    /* Deterministic pmd order for repeatable assignment results. */
    sort_numa_list_pmds(numa_list);
}
5213
5214
static void
5215
sched_numa_list_free_entries(struct sched_numa_list *numa_list)
5216
0
{
5217
0
    struct sched_numa *numa;
5218
5219
0
    HMAP_FOR_EACH_POP (numa, node, &numa_list->numas) {
5220
0
        for (unsigned i = 0; i < numa->n_pmds; i++) {
5221
0
            struct sched_pmd *sched_pmd;
5222
5223
0
            sched_pmd = &numa->pmds[i];
5224
0
            sched_pmd->n_rxq = 0;
5225
0
            free(sched_pmd->rxqs);
5226
0
        }
5227
0
        numa->n_pmds = 0;
5228
0
        free(numa->pmds);
5229
0
        free(numa);
5230
0
    }
5231
0
    hmap_destroy(&numa_list->numas);
5232
0
}
5233
5234
static struct sched_pmd *
5235
sched_pmd_find_by_pmd(struct sched_numa_list *numa_list,
5236
                      struct dp_netdev_pmd_thread *pmd)
5237
0
{
5238
0
    struct sched_numa *numa;
5239
5240
0
    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
5241
0
        for (unsigned i = 0; i < numa->n_pmds; i++) {
5242
0
            struct sched_pmd *sched_pmd;
5243
5244
0
            sched_pmd = &numa->pmds[i];
5245
0
            if (pmd == sched_pmd->pmd) {
5246
0
                return sched_pmd;
5247
0
            }
5248
0
        }
5249
0
    }
5250
0
    return NULL;
5251
0
}
5252
5253
static void
5254
sched_pmd_add_rxq(struct sched_pmd *sched_pmd, struct dp_netdev_rxq *rxq,
5255
                  uint64_t cycles)
5256
0
{
5257
    /* As sched_pmd is allocated outside this fn. better to not assume
5258
     * rxqs is initialized to NULL. */
5259
0
    if (sched_pmd->n_rxq == 0) {
5260
0
        sched_pmd->rxqs = xmalloc(sizeof *sched_pmd->rxqs);
5261
0
    } else {
5262
0
        sched_pmd->rxqs = xrealloc(sched_pmd->rxqs, (sched_pmd->n_rxq + 1) *
5263
0
                                                    sizeof *sched_pmd->rxqs);
5264
0
    }
5265
5266
0
    sched_pmd->rxqs[sched_pmd->n_rxq++] = rxq;
5267
0
    sched_pmd->pmd_proc_cycles += cycles;
5268
0
}
5269
5270
/* Records the datapath's *current* rxq-to-pmd assignments into 'numa_list',
 * including each rxq's summed interval cycles, so they can be compared
 * against a freshly scheduled estimate (see pmd_rebalance_dry_run()). */
static void
sched_numa_list_assignments(struct sched_numa_list *numa_list,
                            struct dp_netdev *dp)
    OVS_REQ_RDLOCK(dp->port_rwlock)
{
    struct dp_netdev_port *port;

    /* For each port. */
    HMAP_FOR_EACH (port, node, &dp->ports) {
        if (!netdev_is_pmd(port->netdev)) {
            continue;
        }
        /* For each rxq on the port. */
        for (unsigned qid = 0; qid < port->n_rxq; qid++) {
            struct dp_netdev_rxq *rxq = &port->rxqs[qid];
            struct sched_pmd *sched_pmd;
            uint64_t proc_cycles = 0;

            /* Sum the measured cycles over all stored intervals. */
            for (int i = 0; i < PMD_INTERVAL_MAX; i++) {
                proc_cycles  += dp_netdev_rxq_get_intrvl_cycles(rxq, i);
            }

            /* An rxq with no assigned pmd (not polled) is simply skipped. */
            sched_pmd = sched_pmd_find_by_pmd(numa_list, rxq->pmd);
            if (sched_pmd) {
                /* A pinned rxq with isolation enabled isolates its pmd. */
                if (rxq->core_id != OVS_CORE_UNSPEC && dp->pmd_iso) {
                    sched_pmd->isolated = true;
                }
                sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles);
            }
        }
    }
}
5302
5303
static void
5304
sched_numa_list_put_in_place(struct sched_numa_list *numa_list)
5305
0
{
5306
0
    struct sched_numa *numa;
5307
5308
    /* For each numa. */
5309
0
    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
5310
        /* For each pmd. */
5311
0
        for (int i = 0; i < numa->n_pmds; i++) {
5312
0
            struct sched_pmd *sched_pmd;
5313
5314
0
            sched_pmd = &numa->pmds[i];
5315
0
            sched_pmd->pmd->isolated = sched_pmd->isolated;
5316
            /* For each rxq. */
5317
0
            for (unsigned k = 0; k < sched_pmd->n_rxq; k++) {
5318
                /* Store the new pmd from the out of place sched_numa_list
5319
                 * struct to the dp_netdev_rxq struct */
5320
0
                sched_pmd->rxqs[k]->pmd = sched_pmd->pmd;
5321
0
            }
5322
0
        }
5323
0
    }
5324
0
}
5325
5326
/* Returns 'true' if OVS rxq scheduling algorithm assigned any unpinned rxq to
 * a PMD thread core on a non-local numa node. */
static bool
sched_numa_list_cross_numa_polling(struct sched_numa_list *numa_list)
{
    struct sched_numa *numa;

    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
        for (int i = 0; i < numa->n_pmds; i++) {
            struct sched_pmd *sched_pmd;

            sched_pmd = &numa->pmds[i];
            if (sched_pmd->isolated) {
                /* All rxqs on this PMD thread core are pinned. */
                continue;
            }
            for (unsigned k = 0; k < sched_pmd->n_rxq; k++) {
                struct dp_netdev_rxq *rxq = sched_pmd->rxqs[k];
                /* Check if the rxq is not pinned to a specific PMD thread core
                 * by the user AND the PMD thread core that OVS assigned is
                 * non-local to the rxq port. */
                if (rxq->core_id == OVS_CORE_UNSPEC &&
                    rxq->pmd->numa_id !=
                        netdev_get_numa_id(rxq->port->netdev)) {
                    return true;
                }
            }
        }
    }
    /* Every unpinned rxq is polled by a pmd local to its port's numa. */
    return false;
}
5357
5358
static unsigned
5359
sched_numa_noniso_pmd_count(struct sched_numa *numa)
5360
0
{
5361
0
    if (numa->n_pmds > numa->n_isolated) {
5362
0
        return numa->n_pmds - numa->n_isolated;
5363
0
    }
5364
0
    return 0;
5365
0
}
5366
5367
/* Sort Rx Queues by the processing cycles they are consuming. */
5368
static int
5369
compare_rxq_cycles(const void *a, const void *b)
5370
0
{
5371
0
    struct dp_netdev_rxq *qa;
5372
0
    struct dp_netdev_rxq *qb;
5373
0
    uint64_t cycles_qa, cycles_qb;
5374
5375
0
    qa = *(struct dp_netdev_rxq **) a;
5376
0
    qb = *(struct dp_netdev_rxq **) b;
5377
5378
0
    cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
5379
0
    cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
5380
5381
0
    if (cycles_qa != cycles_qb) {
5382
0
        return (cycles_qa < cycles_qb) ? 1 : -1;
5383
0
    } else {
5384
        /* Cycles are the same so tiebreak on port/queue id.
5385
         * Tiebreaking (as opposed to return 0) ensures consistent
5386
         * sort results across multiple OS's. */
5387
0
        uint32_t port_qa = odp_to_u32(qa->port->port_no);
5388
0
        uint32_t port_qb = odp_to_u32(qb->port->port_no);
5389
0
        if (port_qa != port_qb) {
5390
0
            return port_qa > port_qb ? 1 : -1;
5391
0
        } else {
5392
0
            return netdev_rxq_get_queue_id(qa->rx)
5393
0
                    - netdev_rxq_get_queue_id(qb->rx);
5394
0
        }
5395
0
    }
5396
0
}
5397
5398
static bool
5399
sched_pmd_new_lowest(struct sched_pmd *current_lowest, struct sched_pmd *pmd,
5400
                     bool has_proc)
5401
0
{
5402
0
    uint64_t current_num, pmd_num;
5403
5404
0
    if (current_lowest == NULL) {
5405
0
        return true;
5406
0
    }
5407
5408
0
    if (has_proc) {
5409
0
        current_num = current_lowest->pmd_proc_cycles;
5410
0
        pmd_num = pmd->pmd_proc_cycles;
5411
0
    } else {
5412
0
        current_num = current_lowest->n_rxq;
5413
0
        pmd_num = pmd->n_rxq;
5414
0
    }
5415
5416
0
    if (pmd_num < current_num) {
5417
0
        return true;
5418
0
    }
5419
0
    return false;
5420
0
}
5421
5422
static struct sched_pmd *
5423
sched_pmd_get_lowest(struct sched_numa *numa, bool has_cyc)
5424
0
{
5425
0
    struct sched_pmd *lowest_sched_pmd = NULL;
5426
5427
0
    for (unsigned i = 0; i < numa->n_pmds; i++) {
5428
0
        struct sched_pmd *sched_pmd;
5429
5430
0
        sched_pmd = &numa->pmds[i];
5431
0
        if (sched_pmd->isolated) {
5432
0
            continue;
5433
0
        }
5434
0
        if (sched_pmd_new_lowest(lowest_sched_pmd, sched_pmd, has_cyc)) {
5435
0
            lowest_sched_pmd = sched_pmd;
5436
0
        }
5437
0
    }
5438
0
    return lowest_sched_pmd;
5439
0
}
5440
5441
/*
 * Returns the next pmd from the numa node.
 *
 * If 'updown' is 'true' it will alternate between selecting the next pmd in
 * either an up or down walk, switching between up/down when the first or last
 * core is reached. e.g. 1,2,3,3,2,1,1,2...
 *
 * If 'updown' is 'false' it will select the next pmd wrapping around when
 * last core reached. e.g. 1,2,3,1,2,3,1,2...
 */
static struct sched_pmd *
sched_pmd_next_rr(struct sched_numa *numa, bool updown)
{
    /* The pmd at the entry index is returned; the index/direction state is
     * advanced now for the *following* call.  Note that at an end of the
     * walk with 'updown' only the direction flips, so the same pmd is
     * returned twice in a row (the "3,3" / "1,1" in the examples above). */
    int numa_idx = numa->rr_cur_index;

    if (numa->rr_idx_inc == true) {
        /* Incrementing through list of pmds. */
        if (numa->rr_cur_index == numa->n_pmds - 1) {
            /* Reached the last pmd. */
            if (updown) {
                numa->rr_idx_inc = false;
            } else {
                numa->rr_cur_index = 0;
            }
        } else {
            numa->rr_cur_index++;
        }
    } else {
        /* Decrementing through list of pmds. */
        if (numa->rr_cur_index == 0) {
            /* Reached the first pmd. */
            numa->rr_idx_inc = true;
        } else {
            numa->rr_cur_index--;
        }
    }
    return &numa->pmds[numa_idx];
}
5479
5480
static struct sched_pmd *
5481
sched_pmd_next_noniso_rr(struct sched_numa *numa, bool updown)
5482
0
{
5483
0
    struct sched_pmd *sched_pmd = NULL;
5484
5485
    /* sched_pmd_next_rr() may return duplicate PMDs before all PMDs have been
5486
     * returned depending on updown. Call it more than n_pmds to ensure all
5487
     * PMDs can be searched for the next non-isolated PMD. */
5488
0
    for (unsigned i = 0; i < numa->n_pmds * 2; i++) {
5489
0
        sched_pmd = sched_pmd_next_rr(numa, updown);
5490
0
        if (!sched_pmd->isolated) {
5491
0
            break;
5492
0
        }
5493
0
        sched_pmd = NULL;
5494
0
    }
5495
0
    return sched_pmd;
5496
0
}
5497
5498
static struct sched_pmd *
5499
sched_pmd_next(struct sched_numa *numa, enum sched_assignment_type algo,
5500
               bool has_proc)
5501
0
{
5502
0
    if (algo == SCHED_GROUP) {
5503
0
        return sched_pmd_get_lowest(numa, has_proc);
5504
0
    }
5505
5506
    /* By default RR the PMDs. */
5507
0
    return sched_pmd_next_noniso_rr(numa, algo == SCHED_CYCLES ? true : false);
5508
0
}
5509
5510
static const char *
5511
get_assignment_type_string(enum sched_assignment_type algo)
5512
0
{
5513
0
    switch (algo) {
5514
0
    case SCHED_ROUNDROBIN: return "roundrobin";
5515
0
    case SCHED_CYCLES: return "cycles";
5516
0
    case SCHED_GROUP: return "group";
5517
0
    default: return "Unknown";
5518
0
    }
5519
0
}
5520
5521
0
#define MAX_RXQ_CYC_TEXT 40
5522
0
#define MAX_RXQ_CYC_STRLEN (INT_STRLEN(uint64_t) + MAX_RXQ_CYC_TEXT)
5523
5524
static char *
5525
get_rxq_cyc_log(char *a, enum sched_assignment_type algo, uint64_t cycles)
5526
0
{
5527
0
    int ret = 0;
5528
5529
0
    if (algo != SCHED_ROUNDROBIN) {
5530
0
        ret = snprintf(a, MAX_RXQ_CYC_STRLEN,
5531
0
                       " (measured processing cycles %"PRIu64")", cycles);
5532
0
    }
5533
5534
0
    if (algo == SCHED_ROUNDROBIN || ret <= 0) {
5535
0
        a[0] = '\0';
5536
0
    }
5537
0
    return a;
5538
0
}
5539
5540
/* Assigns every PMD-port rxq of 'dp' to a sched_pmd in 'numa_list' using
 * algorithm 'algo', logging decisions at 'level'.
 *
 * Pinned rxqs (user-set core id) are placed first, directly on their pinned
 * pmd when it exists.  All remaining rxqs are then assigned, preferring
 * pmds local to the port's numa node and falling back to other numas
 * round-robin.  Results are only recorded in 'numa_list'; the caller applies
 * them with sched_numa_list_put_in_place(). */
static void
sched_numa_list_schedule(struct sched_numa_list *numa_list,
                         struct dp_netdev *dp,
                         enum sched_assignment_type algo,
                         enum vlog_level level)
    OVS_REQ_RDLOCK(dp->port_rwlock)
{
    struct dp_netdev_port *port;
    struct dp_netdev_rxq **rxqs = NULL;  /* Unpinned rxqs, assigned below. */
    struct sched_numa *last_cross_numa;
    unsigned n_rxqs = 0;
    bool start_logged = false;
    size_t n_numa;

    /* For each port. */
    HMAP_FOR_EACH (port, node, &dp->ports) {
        if (!netdev_is_pmd(port->netdev)) {
            continue;
        }

        /* For each rxq on the port. */
        for (int qid = 0; qid < port->n_rxq; qid++) {
            struct dp_netdev_rxq *rxq = &port->rxqs[qid];

            if (algo != SCHED_ROUNDROBIN) {
                uint64_t cycle_hist = 0;

                /* Sum the queue intervals and store the cycle history. */
                for (unsigned i = 0; i < PMD_INTERVAL_MAX; i++) {
                    cycle_hist += dp_netdev_rxq_get_intrvl_cycles(rxq, i);
                }
                dp_netdev_rxq_set_cycles(rxq, RXQ_CYCLES_PROC_HIST,
                                         cycle_hist);
            }

            /* Check if this rxq is pinned. */
            if (rxq->core_id != OVS_CORE_UNSPEC) {
                struct sched_pmd *sched_pmd;
                struct dp_netdev_pmd_thread *pmd;
                struct sched_numa *numa;
                bool iso = dp->pmd_iso;
                uint64_t proc_cycles;
                char rxq_cyc_log[MAX_RXQ_CYC_STRLEN];

                /* This rxq should be pinned, pin it now. */
                pmd = dp_netdev_get_pmd(dp, rxq->core_id);
                sched_pmd = sched_pmd_find_by_pmd(numa_list, pmd);
                dp_netdev_pmd_unref(pmd);
                if (!sched_pmd) {
                    /* Cannot find the PMD.  Cannot pin this rxq.  Queue it
                     * for normal assignment with the unpinned rxqs. */
                    VLOG(level == VLL_DBG ? VLL_DBG : VLL_WARN,
                            "Core %2u cannot be pinned with "
                            "port \'%s\' rx queue %d. Use pmd-cpu-mask to "
                            "enable a pmd on core %u. An alternative core "
                            "will be assigned.",
                            rxq->core_id,
                            netdev_rxq_get_name(rxq->rx),
                            netdev_rxq_get_queue_id(rxq->rx),
                            rxq->core_id);
                    rxqs = xrealloc(rxqs, (n_rxqs + 1) * sizeof *rxqs);
                    rxqs[n_rxqs++] = rxq;
                    continue;
                }
                if (iso) {
                    /* Mark PMD as isolated if not done already. */
                    if (sched_pmd->isolated == false) {
                        sched_pmd->isolated = true;
                        numa = sched_pmd->numa;
                        numa->n_isolated++;
                    }
                }
                proc_cycles = dp_netdev_rxq_get_cycles(rxq,
                                                       RXQ_CYCLES_PROC_HIST);
                VLOG(level, "Core %2u on numa node %d is pinned with "
                            "port \'%s\' rx queue %d%s",
                            sched_pmd->pmd->core_id, sched_pmd->pmd->numa_id,
                            netdev_rxq_get_name(rxq->rx),
                            netdev_rxq_get_queue_id(rxq->rx),
                            get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles));
                sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles);
            } else {
                /* Unpinned rxq: collect for the assignment pass below. */
                rxqs = xrealloc(rxqs, (n_rxqs + 1) * sizeof *rxqs);
                rxqs[n_rxqs++] = rxq;
            }
        }
    }

    if (n_rxqs > 1 && algo != SCHED_ROUNDROBIN) {
        /* Sort the queues in order of the processing cycles
         * they consumed during their last pmd interval. */
        qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
    }

    last_cross_numa = NULL;
    n_numa = sched_numa_list_count(numa_list);
    /* Assign the collected unpinned rxqs, busiest first (if sorted). */
    for (unsigned i = 0; i < n_rxqs; i++) {
        struct dp_netdev_rxq *rxq = rxqs[i];
        struct sched_pmd *sched_pmd = NULL;
        struct sched_numa *numa;
        int port_numa_id;
        uint64_t proc_cycles;
        char rxq_cyc_log[MAX_RXQ_CYC_STRLEN];

        if (start_logged == false && level != VLL_DBG) {
            VLOG(level, "Performing pmd to rx queue assignment using %s "
                        "algorithm.", get_assignment_type_string(algo));
            start_logged = true;
        }

        /* Store the cycles for this rxq as we will log these later. */
        proc_cycles = dp_netdev_rxq_get_cycles(rxq, RXQ_CYCLES_PROC_HIST);

        port_numa_id = netdev_get_numa_id(rxq->port->netdev);

        /* Select numa. */
        numa = sched_numa_list_lookup(numa_list, port_numa_id);

        /* Check if numa has no PMDs or no non-isolated PMDs. */
        if (!numa || !sched_numa_noniso_pmd_count(numa)) {
            /* Unable to use this numa to find a PMD. */
            numa = NULL;
            /* Find any numa with available PMDs.  'last_cross_numa' spreads
             * successive cross-numa rxqs over different numa nodes. */
            for (int j = 0; j < n_numa; j++) {
                numa = sched_numa_list_next(numa_list, last_cross_numa);
                last_cross_numa = numa;
                if (sched_numa_noniso_pmd_count(numa)) {
                    break;
                }
                numa = NULL;
            }
        }

        if (numa) {
            /* Select the PMD that should be used for this rxq. */
            sched_pmd = sched_pmd_next(numa, algo,
                                       proc_cycles ? true : false);
        }

        /* Check that a pmd has been selected. */
        if (sched_pmd) {
            int pmd_numa_id;

            pmd_numa_id = sched_pmd->numa->numa_id;
            /* Check if selected pmd numa matches port numa. */
            if (pmd_numa_id != port_numa_id) {
                VLOG(level, "There's no available (non-isolated) pmd thread "
                            "on numa node %d. Port \'%s\' rx queue %d will "
                            "be assigned to a pmd on numa node %d. "
                            "This may lead to reduced performance.",
                            port_numa_id, netdev_rxq_get_name(rxq->rx),
                            netdev_rxq_get_queue_id(rxq->rx), pmd_numa_id);
            }
            VLOG(level, "Core %2u on numa node %d assigned port \'%s\' "
                        "rx queue %d%s.",
                        sched_pmd->pmd->core_id, sched_pmd->pmd->numa_id,
                        netdev_rxq_get_name(rxq->rx),
                        netdev_rxq_get_queue_id(rxq->rx),
                        get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles));
            sched_pmd_add_rxq(sched_pmd, rxq, proc_cycles);
        } else  {
            VLOG(level == VLL_DBG ? level : VLL_WARN,
                 "No non-isolated pmd on any numa available for "
                 "port \'%s\' rx queue %d%s. "
                 "This rx queue will not be polled.",
                 netdev_rxq_get_name(rxq->rx),
                 netdev_rxq_get_queue_id(rxq->rx),
                 get_rxq_cyc_log(rxq_cyc_log, algo, proc_cycles));
        }
    }
    free(rxqs);
}
5711
5712
static void
5713
rxq_scheduling(struct dp_netdev *dp)
5714
    OVS_REQ_RDLOCK(dp->port_rwlock)
5715
0
{
5716
0
    struct sched_numa_list numa_list;
5717
0
    enum sched_assignment_type algo = dp->pmd_rxq_assign_type;
5718
5719
0
    sched_numa_list_populate(&numa_list, dp);
5720
0
    sched_numa_list_schedule(&numa_list, dp, algo, VLL_INFO);
5721
0
    sched_numa_list_put_in_place(&numa_list);
5722
5723
0
    sched_numa_list_free_entries(&numa_list);
5724
0
}
5725
5726
static uint64_t variance(uint64_t a[], int n);
5727
5728
/* Returns the variance of the estimated busy percentage across the
 * non-isolated pmds of 'numa', based on the pmd_proc_cycles recorded in the
 * sched_pmds versus each PMD thread's measured interval cycles. */
static uint64_t
sched_numa_variance(struct sched_numa *numa)
{
    uint64_t *percent_busy = NULL;
    int n_proc = 0;
    uint64_t var;

    /* Sized for all pmds even though isolated ones are skipped below. */
    percent_busy = xmalloc(numa->n_pmds * sizeof *percent_busy);

    for (unsigned i = 0; i < numa->n_pmds; i++) {
        struct sched_pmd *sched_pmd;
        uint64_t total_cycles = 0;

        sched_pmd = &numa->pmds[i];
        /* Exclude isolated PMDs from variance calculations. */
        if (sched_pmd->isolated == true) {
            continue;
        }
        /* Get the total pmd cycles for an interval. */
        atomic_read_relaxed(&sched_pmd->pmd->intrvl_cycles, &total_cycles);

        if (total_cycles) {
            /* Estimate the cycles to cover all intervals. */
            total_cycles *= PMD_INTERVAL_MAX;
            percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100)
                                            / total_cycles;
        } else {
            /* No measured cycles: treat this pmd as 0% busy. */
            percent_busy[n_proc++] = 0;
        }
    }
    var = variance(percent_busy, n_proc);
    free(percent_busy);
    return var;
}
5762
5763
/*
 * This function checks that some basic conditions needed for a rebalance to be
 * effective are met. Such as Rxq scheduling assignment type, more than one
 * PMD, more than 2 Rxqs on a PMD. If there was no reconfiguration change
 * since the last check, it reuses the last result.
 *
 * It is not intended to be an inclusive check of every condition that may make
 * a rebalance ineffective. It is done as a quick check so a full
 * pmd_rebalance_dry_run() can be avoided when it is not needed.
 */
static bool
pmd_rebalance_dry_run_needed(struct dp_netdev *dp)
    OVS_REQ_RDLOCK(dp->port_rwlock)
{
    struct dp_netdev_pmd_thread *pmd;
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
    unsigned int cnt = 0;
    bool multi_rxq = false;

    /* Check if there was no reconfiguration since last check.  In that case
     * the cached 'do_dry_run' result is still valid and is reused. */
    if (!pmd_alb->recheck_config) {
        if (!pmd_alb->do_dry_run) {
            VLOG_DBG("PMD auto load balance nothing to do, "
                     "no configuration changes since last check.");
            return false;
        }
        return true;
    }
    pmd_alb->recheck_config = false;

    /* Check for incompatible assignment type. */
    if (dp->pmd_rxq_assign_type == SCHED_ROUNDROBIN) {
        VLOG_DBG("PMD auto load balance nothing to do, "
                 "pmd-rxq-assign=roundrobin assignment type configured.");
        /* Cache the negative result before returning it. */
        return pmd_alb->do_dry_run = false;
    }

    /* Check that there is at least 2 non-isolated PMDs and
     * one of them is polling more than one rxq. */
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
            continue;
        }

        if (hmap_count(&pmd->poll_list) > 1) {
            multi_rxq = true;
        }
        /* 'cnt' counts previously seen pmds, so this requires a second
         * non-isolated pmd in addition to one with multiple rxqs. */
        if (cnt && multi_rxq) {
            return pmd_alb->do_dry_run = true;
        }
        cnt++;
    }

    VLOG_DBG("PMD auto load balance nothing to do, "
             "not enough non-isolated PMDs or RxQs.");
    return pmd_alb->do_dry_run = false;
}
5820
5821
/* Simulates a full rxq rescheduling and compares the estimated per-numa
 * load variance against the current assignment.  Returns true when at
 * least one numa node would see an improvement at or above the configured
 * 'pmd-auto-lb-improvement-threshold', i.e. a real rebalance is worth
 * requesting. */
static bool
pmd_rebalance_dry_run(struct dp_netdev *dp)
    OVS_REQ_RDLOCK(dp->port_rwlock)
{
    struct sched_numa_list numa_list_cur;
    struct sched_numa_list numa_list_est;
    bool thresh_met = false;

    VLOG_DBG("PMD auto load balance performing dry run.");

    /* Populate current assignments. */
    sched_numa_list_populate(&numa_list_cur, dp);
    sched_numa_list_assignments(&numa_list_cur, dp);

    /* Populate estimated assignments. */
    sched_numa_list_populate(&numa_list_est, dp);
    sched_numa_list_schedule(&numa_list_est, dp,
                             dp->pmd_rxq_assign_type, VLL_DBG);

    /* Check if cross-numa polling, there is only one numa with PMDs. */
    if (!sched_numa_list_cross_numa_polling(&numa_list_est) ||
            sched_numa_list_count(&numa_list_est) == 1) {
        struct sched_numa *numa_cur;

        /* Calculate variances. */
        HMAP_FOR_EACH (numa_cur, node, &numa_list_cur.numas) {
            uint64_t current_var, estimate_var;
            struct sched_numa *numa_est;
            uint64_t improvement = 0;

            numa_est = sched_numa_list_lookup(&numa_list_est,
                                              numa_cur->numa_id);
            if (!numa_est) {
                /* No estimate for this numa node; skip it. */
                continue;
            }
            current_var = sched_numa_variance(numa_cur);
            estimate_var = sched_numa_variance(numa_est);
            /* Improvement in percent; only meaningful when the estimated
             * variance is actually lower than the current one. */
            if (estimate_var < current_var) {
                improvement = ((current_var - estimate_var) * 100)
                              / current_var;
            }
            VLOG_DBG("Numa node %d. Current variance %"PRIu64" Estimated "
                     "variance %"PRIu64". Variance improvement %"PRIu64"%%.",
                     numa_cur->numa_id, current_var,
                     estimate_var, improvement);
            /* A single numa node meeting the threshold is enough to
             * recommend a rebalance. */
            if (improvement >= dp->pmd_alb.rebalance_improve_thresh) {
                thresh_met = true;
            }
        }
        VLOG_DBG("PMD load variance improvement threshold %u%% is %s.",
                 dp->pmd_alb.rebalance_improve_thresh,
                 thresh_met ? "met" : "not met");
    } else {
        /* Cross-numa polling with multiple numa nodes: cycle estimates
         * would not be reliable, so do not recommend a rebalance. */
        VLOG_DBG("PMD auto load balance detected cross-numa polling with "
                 "multiple numa nodes. Unable to accurately estimate.");
    }

    sched_numa_list_free_entries(&numa_list_cur);
    sched_numa_list_free_entries(&numa_list_est);

    return thresh_met;
}
5883
5884
static void
5885
reload_affected_pmds(struct dp_netdev *dp)
5886
0
{
5887
0
    struct dp_netdev_pmd_thread *pmd;
5888
5889
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5890
0
        if (pmd->need_reload) {
5891
0
            dp_netdev_reload_pmd__(pmd);
5892
0
        }
5893
0
    }
5894
5895
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
5896
0
        if (pmd->need_reload) {
5897
0
            if (pmd->core_id != NON_PMD_CORE_ID) {
5898
0
                bool reload;
5899
5900
0
                do {
5901
0
                    atomic_read_explicit(&pmd->reload, &reload,
5902
0
                                         memory_order_acquire);
5903
0
                } while (reload);
5904
0
            }
5905
0
            pmd->need_reload = false;
5906
0
        }
5907
0
    }
5908
0
}
5909
5910
/* Creates and destroys pmd threads so that the set of running threads
 * matches the wanted core list, which is derived from the presence of pmd
 * ports and the user-supplied "pmd-cpu-mask".  May reload surviving
 * threads when 'static_tx_qid's need to be re-packed. */
static void
reconfigure_pmd_threads(struct dp_netdev *dp)
    OVS_REQ_RDLOCK(dp->port_rwlock)
{
    struct dp_netdev_pmd_thread *pmd;
    struct ovs_numa_dump *pmd_cores;
    struct ovs_numa_info_core *core;
    struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
    struct hmapx_node *node;
    bool changed = false;
    bool need_to_adjust_static_tx_qids = false;

    /* The pmd threads should be started only if there's a pmd port in the
     * datapath.  If the user didn't provide any "pmd-cpu-mask", we start
     * NR_PMD_THREADS per numa node. */
    if (!has_pmd_port(dp)) {
        pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
    } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
        pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
    } else {
        pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
    }

    /* We need to adjust 'static_tx_qid's only if we're reducing number of
     * PMD threads. Otherwise, new threads will allocate all the freed ids.
     * The "- 1" accounts for the non-pmd thread in 'poll_threads'. */
    if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
        /* Adjustment is required to keep 'static_tx_qid's sequential and
         * avoid possible issues, for example, imbalanced tx queue usage
         * and unnecessary locking caused by remapping on netdev level. */
        need_to_adjust_static_tx_qids = true;
    }

    /* Check for unwanted pmd threads */
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        if (pmd->core_id == NON_PMD_CORE_ID) {
            continue;
        }
        if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
                                                    pmd->core_id)) {
            hmapx_add(&to_delete, pmd);
        } else if (need_to_adjust_static_tx_qids) {
            /* Thread survives but must reallocate its tx qid on reload. */
            atomic_store_relaxed(&pmd->reload_tx_qid, true);
            pmd->need_reload = true;
        }
    }

    HMAPX_FOR_EACH (node, &to_delete) {
        pmd = (struct dp_netdev_pmd_thread *) node->data;
        VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
                  pmd->numa_id, pmd->core_id);
        dp_netdev_del_pmd(dp, pmd);
    }
    changed = !hmapx_is_empty(&to_delete);
    hmapx_destroy(&to_delete);

    if (need_to_adjust_static_tx_qids) {
        /* 'static_tx_qid's are not sequential now.
         * Reload remaining threads to fix this. */
        reload_affected_pmds(dp);
    }

    /* Check for required new pmd threads */
    FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
        pmd = dp_netdev_get_pmd(dp, core->core_id);
        if (!pmd) {
            /* No thread on this core yet: configure and start one. */
            struct ds name = DS_EMPTY_INITIALIZER;

            pmd = xzalloc(sizeof *pmd);
            dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);

            ds_put_format(&name, "pmd-c%02d/id:", core->core_id);
            pmd->thread = ovs_thread_create(ds_cstr(&name),
                                            pmd_thread_main, pmd);
            ds_destroy(&name);

            VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
                      pmd->numa_id, pmd->core_id);
            changed = true;
        } else {
            /* Thread already exists; drop the reference taken by
             * dp_netdev_get_pmd(). */
            dp_netdev_pmd_unref(pmd);
        }
    }

    if (changed) {
        struct ovs_numa_info_numa *numa;

        /* Log the number of pmd threads per numa node. */
        FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
            VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
                      numa->n_cores, numa->numa_id);
        }
    }

    ovs_numa_dump_destroy(pmd_cores);
}
6005
6006
static void
6007
pmd_remove_stale_ports(struct dp_netdev *dp,
6008
                       struct dp_netdev_pmd_thread *pmd)
6009
    OVS_EXCLUDED(pmd->port_mutex)
6010
    OVS_REQ_RDLOCK(dp->port_rwlock)
6011
0
{
6012
0
    struct rxq_poll *poll;
6013
0
    struct tx_port *tx;
6014
6015
0
    ovs_mutex_lock(&pmd->port_mutex);
6016
0
    HMAP_FOR_EACH_SAFE (poll, node, &pmd->poll_list) {
6017
0
        struct dp_netdev_port *port = poll->rxq->port;
6018
6019
0
        if (port->need_reconfigure
6020
0
            || !hmap_contains(&dp->ports, &port->node)) {
6021
0
            dp_netdev_del_rxq_from_pmd(pmd, poll);
6022
0
        }
6023
0
    }
6024
0
    HMAP_FOR_EACH_SAFE (tx, node, &pmd->tx_ports) {
6025
0
        struct dp_netdev_port *port = tx->port;
6026
6027
0
        if (port->need_reconfigure
6028
0
            || !hmap_contains(&dp->ports, &port->node)) {
6029
0
            dp_netdev_del_port_tx_from_pmd(pmd, tx);
6030
0
        }
6031
0
    }
6032
0
    ovs_mutex_unlock(&pmd->port_mutex);
6033
0
}
6034
6035
/* Must be called each time a port is added/removed or the cmask changes.
 * This creates and destroys pmd threads, reconfigures ports, opens their
 * rxqs and assigns all rxqs/txqs to pmd threads. */
static void
reconfigure_datapath(struct dp_netdev *dp)
    OVS_REQ_RDLOCK(dp->port_rwlock)
{
    struct hmapx busy_threads = HMAPX_INITIALIZER(&busy_threads);
    struct dp_netdev_pmd_thread *pmd;
    struct dp_netdev_port *port;
    int wanted_txqs;

    dp->last_reconfigure_seq = seq_read(dp->reconfigure_seq);

    /* Step 1: Adjust the pmd threads based on the datapath ports, the cores
     * on the system and the user configuration. */
    reconfigure_pmd_threads(dp);

    /* Want one txq per thread; ports with fewer txqs fall back to a
     * dynamic (XPS) txq mode below. */
    wanted_txqs = cmap_count(&dp->poll_threads);

    /* The number of pmd threads might have changed, or a port can be new:
     * adjust the txqs. */
    HMAP_FOR_EACH (port, node, &dp->ports) {
        netdev_set_tx_multiq(port->netdev, wanted_txqs);
    }

    /* Step 2: Remove from the pmd threads ports that have been removed or
     * need reconfiguration. */

    /* Check for all the ports that need reconfiguration.  We cache this in
     * 'port->need_reconfigure', because netdev_is_reconf_required() can
     * change at any time.
     * Also mark for reconfiguration all ports which will likely change their
     * 'txq_mode' parameter.  It's required to stop using them before
     * changing this setting and it's simpler to mark ports here and allow
     * 'pmd_remove_stale_ports' to remove them from threads.  There will be
     * no actual reconfiguration in 'port_reconfigure' because it's
     * unnecessary.  */
    HMAP_FOR_EACH (port, node, &dp->ports) {
        if (netdev_is_reconf_required(port->netdev)
            || ((port->txq_mode == TXQ_MODE_XPS)
                != (netdev_n_txq(port->netdev) < wanted_txqs))
            || ((port->txq_mode == TXQ_MODE_XPS_HASH)
                != (port->txq_requested_mode == TXQ_REQ_MODE_HASH
                    && netdev_n_txq(port->netdev) > 1))) {
            port->need_reconfigure = true;
        }
    }

    /* Remove from the pmd threads all the ports that have been deleted or
     * need reconfiguration. */
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        pmd_remove_stale_ports(dp, pmd);
    }

    /* Reload affected pmd threads.  We must wait for the pmd threads before
     * reconfiguring the ports, because a port cannot be reconfigured while
     * it's being used. */
    reload_affected_pmds(dp);

    /* Step 3: Reconfigure ports. */

    /* We only reconfigure the ports that we determined above, because they're
     * not being used by any pmd thread at the moment.  If a port fails to
     * reconfigure we remove it from the datapath. */
    HMAP_FOR_EACH_SAFE (port, node, &dp->ports) {
        int err;

        if (!port->need_reconfigure) {
            continue;
        }

        err = port_reconfigure(port);
        if (err) {
            /* Reconfiguration failed: drop the port entirely and notify
             * port_seq waiters. */
            hmap_remove(&dp->ports, &port->node);
            seq_change(dp->port_seq);
            port_destroy(port);
        } else {
            /* With a single queue, there is no point in using hash mode. */
            if (port->txq_requested_mode == TXQ_REQ_MODE_HASH &&
                netdev_n_txq(port->netdev) > 1) {
                port->txq_mode = TXQ_MODE_XPS_HASH;
            } else if (netdev_n_txq(port->netdev) < wanted_txqs) {
                port->txq_mode = TXQ_MODE_XPS;
            } else {
                port->txq_mode = TXQ_MODE_STATIC;
            }
        }
    }

    /* Step 4: Compute new rxq scheduling.  We don't touch the pmd threads
     * for now, we just update the 'pmd' pointer in each rxq to point to the
     * wanted thread according to the scheduling policy. */

    /* Reset all the pmd threads to non isolated. */
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        pmd->isolated = false;
    }

    /* Reset all the queues to unassigned */
    HMAP_FOR_EACH (port, node, &dp->ports) {
        for (int i = 0; i < port->n_rxq; i++) {
            port->rxqs[i].pmd = NULL;
        }
    }
    rxq_scheduling(dp);

    /* Step 5: Remove queues not compliant with new scheduling. */

    /* Count all the threads that will have at least one queue to poll. */
    HMAP_FOR_EACH (port, node, &dp->ports) {
        for (int qid = 0; qid < port->n_rxq; qid++) {
            struct dp_netdev_rxq *q = &port->rxqs[qid];

            if (q->pmd) {
                hmapx_add(&busy_threads, q->pmd);
            }
        }
    }

    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        struct rxq_poll *poll;

        ovs_mutex_lock(&pmd->port_mutex);
        HMAP_FOR_EACH_SAFE (poll, node, &pmd->poll_list) {
            if (poll->rxq->pmd != pmd) {
                dp_netdev_del_rxq_from_pmd(pmd, poll);

                /* This pmd might sleep after this step if it has no rxq
                 * remaining. Tell it to busy wait for new assignment if it
                 * has at least one scheduled queue. */
                if (hmap_count(&pmd->poll_list) == 0 &&
                    hmapx_contains(&busy_threads, pmd)) {
                    atomic_store_relaxed(&pmd->wait_for_reload, true);
                }
            }
        }
        ovs_mutex_unlock(&pmd->port_mutex);
    }

    hmapx_destroy(&busy_threads);

    /* Reload affected pmd threads.  We must wait for the pmd threads to remove
     * the old queues before readding them, otherwise a queue can be polled by
     * two threads at the same time. */
    reload_affected_pmds(dp);

    /* Step 6: Add queues from scheduling, if they're not there already. */
    HMAP_FOR_EACH (port, node, &dp->ports) {
        if (!netdev_is_pmd(port->netdev)) {
            continue;
        }

        for (int qid = 0; qid < port->n_rxq; qid++) {
            struct dp_netdev_rxq *q = &port->rxqs[qid];

            if (q->pmd) {
                ovs_mutex_lock(&q->pmd->port_mutex);
                dp_netdev_add_rxq_to_pmd(q->pmd, q);
                ovs_mutex_unlock(&q->pmd->port_mutex);
            }
        }
    }

    /* Add every port and bond to the tx port and bond caches of
     * every pmd thread, if it's not there already and if this pmd
     * has at least one rxq to poll.
     */
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        ovs_mutex_lock(&pmd->port_mutex);
        if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
            struct tx_bond *bond;

            HMAP_FOR_EACH (port, node, &dp->ports) {
                dp_netdev_add_port_tx_to_pmd(pmd, port);
            }

            CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
                dp_netdev_add_bond_tx_to_pmd(pmd, bond, false);
            }
        }
        ovs_mutex_unlock(&pmd->port_mutex);
    }

    /* Reload affected pmd threads. */
    reload_affected_pmds(dp);

    /* PMD ALB will need to recheck if dry run needed. */
    dp->pmd_alb.recheck_config = true;
}
6225
6226
/* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
6227
static bool
6228
ports_require_restart(const struct dp_netdev *dp)
6229
    OVS_REQ_RDLOCK(dp->port_rwlock)
6230
0
{
6231
0
    struct dp_netdev_port *port;
6232
6233
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
6234
0
        if (netdev_is_reconf_required(port->netdev)) {
6235
0
            return true;
6236
0
        }
6237
0
    }
6238
6239
0
    return false;
6240
0
}
6241
6242
/* Calculates the population variance of the values stored in array 'a'.
 * 'n' is the number of elements in the array to be considered for
 * calculating the variance.
 * Usage example: data array 'a' contains the processing load of each pmd
 * and 'n' is the number of PMDs.  It returns the variance in processing
 * load of the PMDs.
 *
 * All arithmetic is unsigned 64-bit integer math, so the mean and the
 * result are truncated.  Returns 0 when 'n' is 0. */
static uint64_t
variance(uint64_t a[], int n)
{
    uint64_t sum = 0;
    uint64_t mean = 0;
    uint64_t sq_diff = 0;

    if (!n) {
        /* Guard against division by zero for an empty data set. */
        return 0;
    }

    /* Compute mean (average of elements). */
    for (int i = 0; i < n; i++) {
        sum += a[i];
    }

    if (sum) {
        mean = sum / n;

        /* Compute sum of squared differences with mean.  The subtraction
         * may wrap for values below the mean, but the square is still
         * correct modulo 2^64. */
        for (int i = 0; i < n; i++) {
            sq_diff += (a[i] - mean) * (a[i] - mean);
        }
    }
    return sq_diff ? (sq_diff / n) : 0;
}
6273
6274
/* Return true if needs to revalidate datapath flows. */
static bool
dpif_netdev_run(struct dpif *dpif)
{
    struct dp_netdev_port *port;
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_pmd_thread *non_pmd;
    uint64_t new_tnl_seq;
    bool need_to_flush = true;
    bool pmd_rebalance = false;
    long long int now = time_msec();
    struct dp_netdev_pmd_thread *pmd;

    ovs_rwlock_rdlock(&dp->port_rwlock);
    /* Poll the rxqs of non-pmd ports in the calling thread (the "non-pmd"
     * pseudo thread context). */
    non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
    if (non_pmd) {
        ovs_mutex_lock(&dp->non_pmd_mutex);

        atomic_read_relaxed(&dp->smc_enable_db, &non_pmd->ctx.smc_enable_db);

        HMAP_FOR_EACH (port, node, &dp->ports) {
            if (!netdev_is_pmd(port->netdev)) {
                int i;

                /* Refresh the EMC insertion probability for this port. */
                if (port->emc_enabled) {
                    atomic_read_relaxed(&dp->emc_insert_min,
                                        &non_pmd->ctx.emc_insert_min);
                } else {
                    non_pmd->ctx.emc_insert_min = 0;
                }

                for (i = 0; i < port->n_rxq; i++) {

                    if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
                        continue;
                    }

                    if (dp_netdev_process_rxq_port(non_pmd,
                                                   &port->rxqs[i],
                                                   port->port_no)) {
                        need_to_flush = false;
                    }
                }
            }
        }
        if (need_to_flush) {
            /* We didn't receive anything in the process loop.
             * Check if we need to send something.
             * There was no time updates on current iteration. */
            pmd_thread_ctx_time_update(non_pmd);
            dp_netdev_pmd_flush_output_packets(non_pmd, false);
        }

        dpif_netdev_xps_revalidate_pmd(non_pmd, false);
        ovs_mutex_unlock(&dp->non_pmd_mutex);

        /* Drop the reference taken by dp_netdev_get_pmd(). */
        dp_netdev_pmd_unref(non_pmd);
    }

    /* PMD auto load balance: every 'rebalance_intvl', if any pmd has been
     * overloaded for a full measurement window, dry-run a reschedule and
     * request a datapath reconfigure when it would pay off. */
    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
    if (pmd_alb->is_enabled) {
        if (!pmd_alb->rebalance_poll_timer) {
            pmd_alb->rebalance_poll_timer = now;
        } else if ((pmd_alb->rebalance_poll_timer +
                   pmd_alb->rebalance_intvl) < now) {
            pmd_alb->rebalance_poll_timer = now;
            CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
                if (atomic_count_get(&pmd->pmd_overloaded) >=
                                    PMD_INTERVAL_MAX) {
                    pmd_rebalance = true;
                    break;
                }
            }

            /* Skip the dry run when a reconfiguration is already pending:
             * it would invalidate the estimate anyway. */
            if (pmd_rebalance &&
                !dp_netdev_is_reconf_required(dp) &&
                !ports_require_restart(dp) &&
                pmd_rebalance_dry_run_needed(dp) &&
                pmd_rebalance_dry_run(dp)) {
                VLOG_INFO("PMD auto load balance dry run. "
                          "Requesting datapath reconfigure.");
                dp_netdev_request_reconfigure(dp);
            }
        }
    }

    if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
        reconfigure_datapath(dp);
    }
    ovs_rwlock_unlock(&dp->port_rwlock);

    tnl_neigh_cache_run();
    tnl_port_map_run();
    new_tnl_seq = seq_read(tnl_conf_seq);

    /* A tunnel configuration change means cached flows may be stale:
     * tell the caller to revalidate. */
    if (dp->last_tnl_conf_seq != new_tnl_seq) {
        dp->last_tnl_conf_seq = new_tnl_seq;
        return true;
    }
    return false;
}
6375
6376
static void
6377
dpif_netdev_wait(struct dpif *dpif)
6378
0
{
6379
0
    struct dp_netdev_port *port;
6380
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
6381
6382
0
    ovs_mutex_lock(&dp_netdev_mutex);
6383
0
    ovs_rwlock_rdlock(&dp->port_rwlock);
6384
0
    HMAP_FOR_EACH (port, node, &dp->ports) {
6385
0
        netdev_wait_reconf_required(port->netdev);
6386
0
        if (!netdev_is_pmd(port->netdev)) {
6387
0
            int i;
6388
6389
0
            for (i = 0; i < port->n_rxq; i++) {
6390
0
                netdev_rxq_wait(port->rxqs[i].rx);
6391
0
            }
6392
0
        }
6393
0
    }
6394
0
    ovs_rwlock_unlock(&dp->port_rwlock);
6395
0
    ovs_mutex_unlock(&dp_netdev_mutex);
6396
0
    seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
6397
0
}
6398
6399
static void
6400
pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
6401
0
{
6402
0
    struct tx_port *tx_port_cached;
6403
6404
    /* Flush all the queued packets. */
6405
0
    dp_netdev_pmd_flush_output_packets(pmd, true);
6406
    /* Free all used tx queue ids. */
6407
0
    dpif_netdev_xps_revalidate_pmd(pmd, true);
6408
6409
0
    HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->tnl_port_cache) {
6410
0
        free(tx_port_cached->txq_pkts);
6411
0
        free(tx_port_cached);
6412
0
    }
6413
0
    HMAP_FOR_EACH_POP (tx_port_cached, node, &pmd->send_port_cache) {
6414
0
        free(tx_port_cached->txq_pkts);
6415
0
        free(tx_port_cached);
6416
0
    }
6417
0
}
6418
6419
/* Copies ports from 'pmd->tx_ports' (shared with the main thread) to
6420
 * thread-local copies. Copy to 'pmd->tnl_port_cache' if it is a tunnel
6421
 * device, otherwise to 'pmd->send_port_cache' if the port has at least
6422
 * one txq. */
6423
static void
6424
pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
6425
    OVS_REQUIRES(pmd->port_mutex)
6426
0
{
6427
0
    struct tx_port *tx_port, *tx_port_cached;
6428
6429
0
    pmd_free_cached_ports(pmd);
6430
0
    hmap_shrink(&pmd->send_port_cache);
6431
0
    hmap_shrink(&pmd->tnl_port_cache);
6432
6433
0
    HMAP_FOR_EACH (tx_port, node, &pmd->tx_ports) {
6434
0
        int n_txq = netdev_n_txq(tx_port->port->netdev);
6435
0
        struct dp_packet_batch *txq_pkts_cached;
6436
6437
0
        if (netdev_has_tunnel_push_pop(tx_port->port->netdev)) {
6438
0
            tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
6439
0
            if (tx_port->txq_pkts) {
6440
0
                txq_pkts_cached = xmemdup(tx_port->txq_pkts,
6441
0
                                          n_txq * sizeof *tx_port->txq_pkts);
6442
0
                tx_port_cached->txq_pkts = txq_pkts_cached;
6443
0
            }
6444
0
            hmap_insert(&pmd->tnl_port_cache, &tx_port_cached->node,
6445
0
                        hash_port_no(tx_port_cached->port->port_no));
6446
0
        }
6447
6448
0
        if (n_txq) {
6449
0
            tx_port_cached = xmemdup(tx_port, sizeof *tx_port_cached);
6450
0
            if (tx_port->txq_pkts) {
6451
0
                txq_pkts_cached = xmemdup(tx_port->txq_pkts,
6452
0
                                          n_txq * sizeof *tx_port->txq_pkts);
6453
0
                tx_port_cached->txq_pkts = txq_pkts_cached;
6454
0
            }
6455
0
            hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
6456
0
                        hash_port_no(tx_port_cached->port->port_no));
6457
0
        }
6458
0
    }
6459
0
}
6460
6461
static void
6462
pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
6463
0
{
6464
0
    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
6465
0
    if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
6466
0
        VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
6467
0
                   ", numa_id %d.", pmd->core_id, pmd->numa_id);
6468
0
    }
6469
0
    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
6470
6471
0
    VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
6472
0
             ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
6473
0
}
6474
6475
static void
6476
pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
6477
0
{
6478
0
    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
6479
0
    id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
6480
0
    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
6481
0
}
6482
6483
static int
6484
pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
6485
                          struct polled_queue **ppoll_list)
6486
0
{
6487
0
    struct polled_queue *poll_list = *ppoll_list;
6488
0
    struct rxq_poll *poll;
6489
0
    int i;
6490
6491
0
    ovs_mutex_lock(&pmd->port_mutex);
6492
0
    poll_list = xrealloc(poll_list, hmap_count(&pmd->poll_list)
6493
0
                                    * sizeof *poll_list);
6494
6495
0
    i = 0;
6496
0
    HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
6497
0
        poll_list[i].rxq = poll->rxq;
6498
0
        poll_list[i].port_no = poll->rxq->port->port_no;
6499
0
        poll_list[i].emc_enabled = poll->rxq->port->emc_enabled;
6500
0
        poll_list[i].rxq_enabled = netdev_rxq_enabled(poll->rxq->rx);
6501
0
        poll_list[i].change_seq =
6502
0
                     netdev_get_change_seq(poll->rxq->port->netdev);
6503
0
        i++;
6504
0
    }
6505
6506
0
    pmd_load_cached_ports(pmd);
6507
6508
0
    ovs_mutex_unlock(&pmd->port_mutex);
6509
6510
0
    *ppoll_list = poll_list;
6511
0
    return i;
6512
0
}
6513
6514
static void *
6515
pmd_thread_main(void *f_)
6516
0
{
6517
0
    struct dp_netdev_pmd_thread *pmd = f_;
6518
0
    struct pmd_perf_stats *s = &pmd->perf_stats;
6519
0
    unsigned int lc = 0;
6520
0
    struct polled_queue *poll_list;
6521
0
    bool wait_for_reload = false;
6522
0
    bool dpdk_attached;
6523
0
    bool reload_tx_qid;
6524
0
    bool exiting;
6525
0
    bool reload;
6526
0
    int poll_cnt;
6527
0
    int i;
6528
0
    int process_packets = 0;
6529
0
    uint64_t sleep_time = 0;
6530
6531
0
    poll_list = NULL;
6532
6533
    /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
6534
0
    ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
6535
0
    ovs_numa_thread_setaffinity_core(pmd->core_id);
6536
0
    dpdk_attached = dpdk_attach_thread(pmd->core_id);
6537
0
    poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
6538
0
    dfc_cache_init(&pmd->flow_cache);
6539
0
    pmd_alloc_static_tx_qid(pmd);
6540
0
    set_timer_resolution(PMD_TIMER_RES_NS);
6541
6542
0
reload:
6543
0
    atomic_count_init(&pmd->pmd_overloaded, 0);
6544
6545
0
    pmd->intrvl_tsc_prev = 0;
6546
0
    atomic_store_relaxed(&pmd->intrvl_cycles, 0);
6547
6548
0
    if (!dpdk_attached) {
6549
0
        dpdk_attached = dpdk_attach_thread(pmd->core_id);
6550
0
    }
6551
6552
    /* List port/core affinity */
6553
0
    for (i = 0; i < poll_cnt; i++) {
6554
0
       VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
6555
0
                pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
6556
0
                netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
6557
       /* Reset the rxq current cycles counter. */
6558
0
       dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
6559
0
       for (int j = 0; j < PMD_INTERVAL_MAX; j++) {
6560
0
           dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, 0);
6561
0
       }
6562
0
    }
6563
6564
0
    if (!poll_cnt) {
6565
0
        if (wait_for_reload) {
6566
            /* Don't sleep, control thread will ask for a reload shortly. */
6567
0
            do {
6568
0
                atomic_read_explicit(&pmd->reload, &reload,
6569
0
                                     memory_order_acquire);
6570
0
            } while (!reload);
6571
0
        } else {
6572
0
            while (seq_read(pmd->reload_seq) == pmd->last_reload_seq) {
6573
0
                seq_wait(pmd->reload_seq, pmd->last_reload_seq);
6574
0
                poll_block();
6575
0
            }
6576
0
        }
6577
0
    }
6578
6579
0
    for (i = 0; i < PMD_INTERVAL_MAX; i++) {
6580
0
        atomic_store_relaxed(&pmd->busy_cycles_intrvl[i], 0);
6581
0
    }
6582
0
    atomic_count_set(&pmd->intrvl_idx, 0);
6583
0
    cycles_counter_update(s);
6584
6585
0
    pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
6586
6587
    /* Protect pmd stats from external clearing while polling. */
6588
0
    ovs_mutex_lock(&pmd->perf_stats.stats_mutex);
6589
0
    for (;;) {
6590
0
        uint64_t rx_packets = 0, tx_packets = 0;
6591
0
        uint64_t time_slept = 0;
6592
0
        uint64_t max_sleep;
6593
6594
0
        pmd_perf_start_iteration(s);
6595
6596
0
        atomic_read_relaxed(&pmd->dp->smc_enable_db, &pmd->ctx.smc_enable_db);
6597
0
        atomic_read_relaxed(&pmd->max_sleep, &max_sleep);
6598
6599
0
        for (i = 0; i < poll_cnt; i++) {
6600
6601
0
            if (!poll_list[i].rxq_enabled) {
6602
0
                continue;
6603
0
            }
6604
6605
0
            if (poll_list[i].emc_enabled) {
6606
0
                atomic_read_relaxed(&pmd->dp->emc_insert_min,
6607
0
                                    &pmd->ctx.emc_insert_min);
6608
0
            } else {
6609
0
                pmd->ctx.emc_insert_min = 0;
6610
0
            }
6611
6612
0
            process_packets =
6613
0
                dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
6614
0
                                           poll_list[i].port_no);
6615
0
            rx_packets += process_packets;
6616
0
            if (process_packets >= PMD_SLEEP_THRESH) {
6617
0
                sleep_time = 0;
6618
0
            }
6619
0
        }
6620
6621
0
        if (!rx_packets) {
6622
            /* We didn't receive anything in the process loop.
6623
             * Check if we need to send something.
6624
             * There was no time updates on current iteration. */
6625
0
            pmd_thread_ctx_time_update(pmd);
6626
0
            tx_packets = dp_netdev_pmd_flush_output_packets(pmd,
6627
0
                                                   max_sleep && sleep_time
6628
0
                                                   ? true : false);
6629
0
        }
6630
6631
0
        if (max_sleep) {
6632
            /* Check if a sleep should happen on this iteration. */
6633
0
            if (sleep_time) {
6634
0
                struct cycle_timer sleep_timer;
6635
6636
0
                cycle_timer_start(&pmd->perf_stats, &sleep_timer);
6637
0
                xnanosleep_no_quiesce(sleep_time * 1000);
6638
0
                time_slept = cycle_timer_stop(&pmd->perf_stats, &sleep_timer);
6639
0
                pmd_thread_ctx_time_update(pmd);
6640
0
            }
6641
0
            if (sleep_time < max_sleep) {
6642
                /* Increase sleep time for next iteration. */
6643
0
                sleep_time += PMD_SLEEP_INC_US;
6644
0
            } else {
6645
0
                sleep_time = max_sleep;
6646
0
            }
6647
0
        } else {
6648
            /* Reset sleep time as max sleep policy may have been changed. */
6649
0
            sleep_time = 0;
6650
0
        }
6651
6652
        /* Do RCU synchronization at fixed interval.  This ensures that
6653
         * synchronization would not be delayed long even at high load of
6654
         * packet processing. */
6655
0
        if (pmd->ctx.now > pmd->next_rcu_quiesce) {
6656
0
            if (!ovsrcu_try_quiesce()) {
6657
0
                pmd->next_rcu_quiesce =
6658
0
                    pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
6659
0
            }
6660
0
        }
6661
6662
0
        if (lc++ > 1024) {
6663
0
            lc = 0;
6664
6665
0
            coverage_try_clear();
6666
0
            dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
6667
0
            if (!ovsrcu_try_quiesce()) {
6668
0
                emc_cache_slow_sweep(&((pmd->flow_cache).emc_cache));
6669
0
                pmd->next_rcu_quiesce =
6670
0
                    pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
6671
0
            }
6672
6673
0
            for (i = 0; i < poll_cnt; i++) {
6674
0
                uint64_t current_seq =
6675
0
                         netdev_get_change_seq(poll_list[i].rxq->port->netdev);
6676
0
                if (poll_list[i].change_seq != current_seq) {
6677
0
                    poll_list[i].change_seq = current_seq;
6678
0
                    poll_list[i].rxq_enabled =
6679
0
                                 netdev_rxq_enabled(poll_list[i].rxq->rx);
6680
0
                }
6681
0
            }
6682
0
        }
6683
6684
0
        atomic_read_explicit(&pmd->reload, &reload, memory_order_acquire);
6685
0
        if (OVS_UNLIKELY(reload)) {
6686
0
            break;
6687
0
        }
6688
6689
0
        pmd_perf_end_iteration(s, rx_packets, tx_packets, time_slept,
6690
0
                               pmd_perf_metrics_enabled(pmd));
6691
0
    }
6692
0
    ovs_mutex_unlock(&pmd->perf_stats.stats_mutex);
6693
6694
0
    poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
6695
0
    atomic_read_relaxed(&pmd->wait_for_reload, &wait_for_reload);
6696
0
    atomic_read_relaxed(&pmd->reload_tx_qid, &reload_tx_qid);
6697
0
    atomic_read_relaxed(&pmd->exit, &exiting);
6698
    /* Signal here to make sure the pmd finishes
6699
     * reloading the updated configuration. */
6700
0
    dp_netdev_pmd_reload_done(pmd);
6701
6702
0
    if (reload_tx_qid) {
6703
0
        pmd_free_static_tx_qid(pmd);
6704
0
        pmd_alloc_static_tx_qid(pmd);
6705
0
    }
6706
6707
0
    if (!exiting) {
6708
0
        goto reload;
6709
0
    }
6710
6711
0
    pmd_free_static_tx_qid(pmd);
6712
0
    dfc_cache_uninit(&pmd->flow_cache);
6713
0
    free(poll_list);
6714
0
    pmd_free_cached_ports(pmd);
6715
0
    if (dpdk_attached) {
6716
0
        dpdk_detach_thread();
6717
0
    }
6718
0
    return NULL;
6719
0
}
6720
6721
/* Blocks upcalls for 'dp' by taking the upcall rwlock for writing; packets
 * needing an upcall are dropped until dp_netdev_enable_upcall() releases
 * the lock. */
static void
dp_netdev_disable_upcall(struct dp_netdev *dp)
    OVS_ACQUIRES(dp->upcall_rwlock)
{
    fat_rwlock_wrlock(&dp->upcall_rwlock);
}
6727
6728

6729
/* Meters */
6730
static void
6731
dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED,
6732
                               struct ofputil_meter_features *features)
6733
0
{
6734
0
    features->max_meters = MAX_METERS;
6735
0
    features->band_types = DP_SUPPORTED_METER_BAND_TYPES;
6736
0
    features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
6737
0
    features->max_bands = MAX_BANDS;
6738
0
    features->max_color = 0;
6739
0
}
6740
6741
/* Tries to atomically add 'n' to 'value' in terms of saturation arithmetic,
6742
 * i.e., if the result will be larger than 'max_value', will store 'max_value'
6743
 * instead. */
6744
static void
6745
atomic_sat_add(atomic_uint64_t *value, uint64_t n, uint64_t max_value)
6746
0
{
6747
0
    uint64_t current, new_value;
6748
6749
0
    atomic_read_relaxed(value, &current);
6750
0
    do {
6751
0
        new_value = current + n;
6752
0
        new_value = MIN(new_value, max_value);
6753
0
    } while (!atomic_compare_exchange_weak_relaxed(value, &current,
6754
0
                                                   new_value));
6755
0
}
6756
6757
/* Tries to atomically subtract 'n' from 'value'.  Does not perform the
6758
 * operation and returns 'false' if the result will be less than 'min_value'.
6759
 * Otherwise, stores the result and returns 'true'. */
6760
static bool
6761
atomic_bound_sub(atomic_uint64_t *value, uint64_t n, uint64_t min_value)
6762
0
{
6763
0
    uint64_t current;
6764
6765
0
    atomic_read_relaxed(value, &current);
6766
0
    do {
6767
0
        if (current < min_value + n) {
6768
0
            return false;
6769
0
        }
6770
0
    } while (!atomic_compare_exchange_weak_relaxed(value, &current,
6771
0
                                                   current - n));
6772
0
    return true;
6773
0
}
6774
6775
/* Applies the meter identified by 'meter_id' to 'packets_'.  Packets
6776
 * that exceed a band are dropped in-place. */
6777
/* Applies the meter identified by 'meter_id' to 'packets_'.  Packets
 * that exceed a band are dropped in-place.
 *
 * Lock-free with respect to other threads hitting the same meter: the
 * 'used' timestamp and every band bucket are updated with relaxed atomic
 * CAS loops, so per-band accounting is approximate under contention. */
static void
dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
                    uint32_t meter_id, long long int now_ms)
{
    const size_t cnt = dp_packet_batch_size(packets_);
    uint32_t exceeded_rate[NETDEV_MAX_BURST];
    uint32_t exceeded_band[NETDEV_MAX_BURST];
    uint64_t bytes, volume, meter_used, old;
    uint64_t band_packets[MAX_BANDS];
    uint64_t band_bytes[MAX_BANDS];
    struct dp_meter_band *band;
    struct dp_packet *packet;
    struct dp_meter *meter;
    bool exceeded = false;

    if (meter_id >= MAX_METERS) {
        return;
    }

    meter = dp_meter_lookup(&dp->meters, meter_id);
    if (!meter) {
        /* No such meter configured: pass the batch through unmetered. */
        return;
    }

    /* Initialize as negative values. */
    memset(exceeded_band, 0xff, cnt * sizeof *exceeded_band);
    /* Initialize as zeroes. */
    memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);

    /* Advance 'meter->used' to 'now_ms' with a CAS loop. */
    atomic_read_relaxed(&meter->used, &meter_used);
    do {
        if (meter_used >= now_ms) {
            /* The '>' condition means that we have several threads hitting the
             * same meter, and the other one already advanced the time. */
            meter_used = now_ms;
            break;
        }
    } while (!atomic_compare_exchange_weak_relaxed(&meter->used,
                                                   &meter_used, now_ms));

    /* Refill all buckets right away, since other threads may use them. */
    if (meter_used < now_ms) {
        /* All packets will hit the meter at the same time. */
        uint64_t delta_t = now_ms - meter_used;

        /* Make sure delta_t will not be too large, so that bucket will not
         * wrap around below. */
        delta_t = MIN(delta_t, meter->max_delta_t);

        for (int m = 0; m < meter->n_bands; m++) {
            band = &meter->bands[m];
            /* Update band's bucket.  We can't just use atomic add here,
             * because we should never add above the max capacity. */
            atomic_sat_add(&band->bucket, delta_t * band->rate,
                           band->burst_size * 1000ULL);
        }
    }

    /* Update meter stats. */
    atomic_add_relaxed(&meter->packet_count, cnt, &old);
    bytes = 0;
    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
        bytes += dp_packet_size(packet);
    }
    atomic_add_relaxed(&meter->byte_count, bytes, &old);

    /* Meters can operate in terms of packets per second or kilobits per
     * second. */
    if (meter->flags & OFPMF13_PKTPS) {
        /* Rate in packets/second, bucket 1/1000 packets.
         * msec * packets/sec = 1/1000 packets. */
        volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */
    } else {
        /* Rate in kbps, bucket in bits.
         * msec * kbps = bits */
        volume = bytes * 8;
    }

    /* Find the band hit with the highest rate for each packet (if any). */
    for (int m = 0; m < meter->n_bands; m++) {
        band = &meter->bands[m];

        /* Drain the bucket for all the packets, if possible.  Fast path:
         * if the whole batch fits, no packet exceeds this band. */
        if (atomic_bound_sub(&band->bucket, volume, 0)) {
            continue;
        }

        /* Band limit hit, must process packet-by-packet. */
        DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
            uint64_t packet_volume = (meter->flags & OFPMF13_PKTPS)
                                     ? 1000 : (dp_packet_size(packet) * 8);

            if (!atomic_bound_sub(&band->bucket, packet_volume, 0)) {
                /* Update the exceeding band for the exceeding packet.
                 * Only one band will be fired by a packet, and that can
                 * be different for each packet. */
                if (band->rate > exceeded_rate[i]) {
                    exceeded_rate[i] = band->rate;
                    exceeded_band[i] = m;
                    exceeded = true;
                }
            }
        }
    }

    /* No need to iterate over packets if there are no drops. */
    if (!exceeded) {
        return;
    }

    /* Fire the highest rate band exceeded by each packet, and drop
     * packets if needed. */

    memset(band_packets, 0, sizeof band_packets);
    memset(band_bytes,   0, sizeof band_bytes);

    /* Rebuild the batch in place, keeping only non-dropped packets. */
    size_t j;
    DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) {
        uint32_t m = exceeded_band[j];

        if (m != UINT32_MAX) {
            /* Meter drop packet. */
            band_packets[m]++;
            band_bytes[m] += dp_packet_size(packet);
            dp_packet_delete(packet);
        } else {
            /* Meter accepts packet. */
            dp_packet_batch_refill(packets_, packet, j);
        }
    }

    /* Fold the per-band drop tallies into the shared band stats. */
    for (int m = 0; m < meter->n_bands; m++) {
        if (!band_packets[m]) {
            continue;
        }
        band = &meter->bands[m];
        atomic_add_relaxed(&band->packet_count, band_packets[m], &old);
        atomic_add_relaxed(&band->byte_count,   band_bytes[m],   &old);
        COVERAGE_ADD(datapath_drop_meter, band_packets[m]);
    }
}
6918
6919
/* Meter set/get/del processing is still single-threaded. */
6920
static int
6921
dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id,
6922
                      struct ofputil_meter_config *config)
6923
0
{
6924
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
6925
0
    uint32_t mid = meter_id.uint32;
6926
0
    struct dp_meter *meter;
6927
0
    int i;
6928
6929
0
    if (mid >= MAX_METERS) {
6930
0
        return EFBIG; /* Meter_id out of range. */
6931
0
    }
6932
6933
0
    if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) {
6934
0
        return EBADF; /* Unsupported flags set */
6935
0
    }
6936
6937
0
    if (config->n_bands > MAX_BANDS) {
6938
0
        return EINVAL;
6939
0
    }
6940
6941
0
    for (i = 0; i < config->n_bands; ++i) {
6942
0
        switch (config->bands[i].type) {
6943
0
        case OFPMBT13_DROP:
6944
0
            break;
6945
0
        default:
6946
0
            return ENODEV; /* Unsupported band type */
6947
0
        }
6948
0
    }
6949
6950
    /* Allocate meter */
6951
0
    meter = xzalloc(sizeof *meter
6952
0
                    + config->n_bands * sizeof(struct dp_meter_band));
6953
6954
0
    meter->flags = config->flags;
6955
0
    meter->n_bands = config->n_bands;
6956
0
    meter->max_delta_t = 0;
6957
0
    meter->id = mid;
6958
0
    atomic_init(&meter->used, time_msec());
6959
6960
    /* set up bands */
6961
0
    for (i = 0; i < config->n_bands; ++i) {
6962
0
        uint32_t band_max_delta_t;
6963
0
        uint64_t bucket_size;
6964
6965
        /* Set burst size to a workable value if none specified. */
6966
0
        if (config->bands[i].burst_size == 0) {
6967
0
            config->bands[i].burst_size = config->bands[i].rate;
6968
0
        }
6969
6970
0
        meter->bands[i].rate = config->bands[i].rate;
6971
0
        meter->bands[i].burst_size = config->bands[i].burst_size;
6972
        /* Start with a full bucket. */
6973
0
        bucket_size = meter->bands[i].burst_size * 1000ULL;
6974
0
        atomic_init(&meter->bands[i].bucket, bucket_size);
6975
6976
        /* Figure out max delta_t that is enough to fill any bucket. */
6977
0
        band_max_delta_t = bucket_size / meter->bands[i].rate;
6978
0
        if (band_max_delta_t > meter->max_delta_t) {
6979
0
            meter->max_delta_t = band_max_delta_t;
6980
0
        }
6981
0
    }
6982
6983
0
    ovs_mutex_lock(&dp->meters_lock);
6984
6985
0
    dp_meter_detach_free(&dp->meters, mid); /* Free existing meter, if any. */
6986
0
    dp_meter_attach(&dp->meters, meter);
6987
6988
0
    ovs_mutex_unlock(&dp->meters_lock);
6989
6990
0
    return 0;
6991
0
}
6992
6993
static int
6994
dpif_netdev_meter_get(const struct dpif *dpif,
6995
                      ofproto_meter_id meter_id_,
6996
                      struct ofputil_meter_stats *stats, uint16_t n_bands)
6997
0
{
6998
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
6999
0
    uint32_t meter_id = meter_id_.uint32;
7000
0
    struct dp_meter *meter;
7001
7002
0
    if (meter_id >= MAX_METERS) {
7003
0
        return EFBIG;
7004
0
    }
7005
7006
0
    meter = dp_meter_lookup(&dp->meters, meter_id);
7007
0
    if (!meter) {
7008
0
        return ENOENT;
7009
0
    }
7010
7011
0
    if (stats) {
7012
0
        int i = 0;
7013
7014
0
        atomic_read_relaxed(&meter->packet_count, &stats->packet_in_count);
7015
0
        atomic_read_relaxed(&meter->byte_count, &stats->byte_in_count);
7016
7017
0
        for (i = 0; i < n_bands && i < meter->n_bands; ++i) {
7018
0
            atomic_read_relaxed(&meter->bands[i].packet_count,
7019
0
                                &stats->bands[i].packet_count);
7020
0
            atomic_read_relaxed(&meter->bands[i].byte_count,
7021
0
                                &stats->bands[i].byte_count);
7022
0
        }
7023
0
        stats->n_bands = i;
7024
0
    }
7025
7026
0
    return 0;
7027
0
}
7028
7029
static int
7030
dpif_netdev_meter_del(struct dpif *dpif,
7031
                      ofproto_meter_id meter_id_,
7032
                      struct ofputil_meter_stats *stats, uint16_t n_bands)
7033
0
{
7034
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
7035
0
    int error;
7036
7037
0
    error = dpif_netdev_meter_get(dpif, meter_id_, stats, n_bands);
7038
0
    if (!error) {
7039
0
        uint32_t meter_id = meter_id_.uint32;
7040
7041
0
        ovs_mutex_lock(&dp->meters_lock);
7042
0
        dp_meter_detach_free(&dp->meters, meter_id);
7043
0
        ovs_mutex_unlock(&dp->meters_lock);
7044
0
    }
7045
0
    return error;
7046
0
}
7047
7048

7049
/* dpif-provider hook: disables upcalls for this datapath.  Thread safety
 * analysis is suppressed because the matching unlock happens in a
 * different dpif callback. */
static void
dpif_netdev_disable_upcall(struct dpif *dpif)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    dp_netdev_disable_upcall(dp);
}
7056
7057
/* Re-enables upcalls for 'dp' by releasing the upcall rwlock taken in
 * dp_netdev_disable_upcall(). */
static void
dp_netdev_enable_upcall(struct dp_netdev *dp)
    OVS_RELEASES(dp->upcall_rwlock)
{
    fat_rwlock_unlock(&dp->upcall_rwlock);
}
7063
7064
/* dpif-provider hook: re-enables upcalls for this datapath.  Analysis is
 * suppressed because the lock was acquired in a different callback. */
static void
dpif_netdev_enable_upcall(struct dpif *dpif)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    dp_netdev_enable_upcall(dp);
}
7071
7072
/* Marks 'pmd' as done reloading: clears the reload request flags and
 * records the reload sequence number that was acted upon. */
static void
dp_netdev_pmd_reload_done(struct dp_netdev_pmd_thread *pmd)
{
    atomic_store_relaxed(&pmd->wait_for_reload, false);
    atomic_store_relaxed(&pmd->reload_tx_qid, false);
    pmd->last_reload_seq = seq_read(pmd->reload_seq);
    /* Release store last, so a thread reading 'reload' with acquire
     * ordering also observes the flag resets above. */
    atomic_store_explicit(&pmd->reload, false, memory_order_release);
}
7080
7081
/* Finds and refs the dp_netdev_pmd_thread on core 'core_id'.  Returns
7082
 * the pointer if succeeds, otherwise, NULL (it can return NULL even if
7083
 * 'core_id' is NON_PMD_CORE_ID).
7084
 *
7085
 * Caller must unrefs the returned reference.  */
7086
static struct dp_netdev_pmd_thread *
7087
dp_netdev_get_pmd(struct dp_netdev *dp, unsigned core_id)
7088
0
{
7089
0
    struct dp_netdev_pmd_thread *pmd;
7090
7091
0
    CMAP_FOR_EACH_WITH_HASH (pmd, node, hash_int(core_id, 0),
7092
0
                             &dp->poll_threads) {
7093
0
        if (pmd->core_id == core_id) {
7094
0
            return dp_netdev_pmd_try_ref(pmd) ? pmd : NULL;
7095
0
        }
7096
0
    }
7097
7098
0
    return NULL;
7099
0
}
7100
7101
/* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
7102
/* Sets the 'struct dp_netdev_pmd_thread' for non-pmd threads. */
static void
dp_netdev_set_nonpmd(struct dp_netdev *dp)
    OVS_REQ_WRLOCK(dp->port_rwlock)
{
    struct dp_netdev_pmd_thread *non_pmd;

    /* No dedicated thread is spawned for this entry; configure_pmd()
     * performs the NON_PMD_CORE_ID-specific initialization itself. */
    non_pmd = xzalloc(sizeof *non_pmd);
    dp_netdev_configure_pmd(non_pmd, dp, NON_PMD_CORE_ID, OVS_NUMA_UNSPEC);
}
7111
7112
/* Caller must have valid pointer to 'pmd'. */
7113
/* Caller must have valid pointer to 'pmd'.  Returns false if the
 * refcount already reached zero, i.e. the pmd is being destroyed. */
static bool
dp_netdev_pmd_try_ref(struct dp_netdev_pmd_thread *pmd)
{
    return ovs_refcount_try_ref_rcu(&pmd->ref_cnt);
}
7118
7119
/* Drops one reference on 'pmd' (NULL is a no-op).  The last reference
 * schedules destruction after the current RCU grace period. */
static void
dp_netdev_pmd_unref(struct dp_netdev_pmd_thread *pmd)
{
    if (pmd && ovs_refcount_unref(&pmd->ref_cnt) == 1) {
        ovsrcu_postpone(dp_netdev_destroy_pmd, pmd);
    }
}
7126
7127
/* Given cmap position 'pos', tries to ref the next node.  If try_ref()
7128
 * fails, keeps checking for next node until reaching the end of cmap.
7129
 *
7130
 * Caller must unrefs the returned reference. */
7131
static struct dp_netdev_pmd_thread *
7132
dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos)
7133
0
{
7134
0
    struct dp_netdev_pmd_thread *next;
7135
7136
0
    do {
7137
0
        struct cmap_node *node;
7138
7139
0
        node = cmap_next_position(&dp->poll_threads, pos);
7140
0
        next = node ? CONTAINER_OF(node, struct dp_netdev_pmd_thread, node)
7141
0
            : NULL;
7142
0
    } while (next && !dp_netdev_pmd_try_ref(next));
7143
7144
0
    return next;
7145
0
}
7146
7147
/* Configures the 'pmd' based on the input argument. */
7148
/* Configures the 'pmd' based on the input argument.  Initializes all of
 * the pmd's internal state and finally publishes it in
 * 'dp->poll_threads', after which other threads can find it. */
static void
dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
                        unsigned core_id, int numa_id)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    pmd->dp = dp;
    pmd->core_id = core_id;
    pmd->numa_id = numa_id;
    pmd->need_reload = false;
    pmd->n_output_batches = 0;

    ovs_refcount_init(&pmd->ref_cnt);
    atomic_init(&pmd->exit, false);
    pmd->reload_seq = seq_create();
    pmd->last_reload_seq = seq_read(pmd->reload_seq);
    atomic_init(&pmd->reload, false);
    ovs_mutex_init(&pmd->flow_mutex);
    ovs_mutex_init(&pmd->port_mutex);
    ovs_mutex_init(&pmd->bond_mutex);
    cmap_init(&pmd->flow_table);
    cmap_init(&pmd->classifiers);
    cmap_init(&pmd->simple_match_table);
    ccmap_init(&pmd->n_flows);
    ccmap_init(&pmd->n_simple_flows);
    pmd->ctx.last_rxq = NULL;
    pmd_thread_ctx_time_update(pmd);
    /* Schedule the periodic maintenance work relative to the current
     * thread-context time. */
    pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
    pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
    pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN;
    pmd->busy_cycles_intrvl = xzalloc(PMD_INTERVAL_MAX *
                                      sizeof *pmd->busy_cycles_intrvl);
    hmap_init(&pmd->poll_list);
    hmap_init(&pmd->tx_ports);
    hmap_init(&pmd->tnl_port_cache);
    hmap_init(&pmd->send_port_cache);
    cmap_init(&pmd->tx_bonds);

    pmd_init_max_sleep(dp, pmd);

    /* Initialize DPIF function pointer to the default configured version. */
    atomic_init(&pmd->netdev_input_func, dp_netdev_impl_get_default());

    /* Init default miniflow_extract function */
    atomic_init(&pmd->miniflow_extract_opt, dp_mfex_impl_get_default());

    /* init the 'flow_cache' since there is no
     * actual thread created for NON_PMD_CORE_ID. */
    if (core_id == NON_PMD_CORE_ID) {
        dfc_cache_init(&pmd->flow_cache);
        pmd_alloc_static_tx_qid(pmd);
    }
    pmd_perf_stats_init(&pmd->perf_stats);
    /* Publish the fully initialized pmd last. */
    cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
                hash_int(core_id, 0));
}
7203
7204
/* Frees all resources owned by 'pmd' and finally 'pmd' itself.  Runs as
 * an RCU callback from dp_netdev_pmd_unref(), so no other thread still
 * holds a reference. */
static void
dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    struct dpcls *cls;

    dp_netdev_pmd_flow_flush(pmd);
    hmap_destroy(&pmd->send_port_cache);
    hmap_destroy(&pmd->tnl_port_cache);
    hmap_destroy(&pmd->tx_ports);
    cmap_destroy(&pmd->tx_bonds);
    hmap_destroy(&pmd->poll_list);
    free(pmd->busy_cycles_intrvl);
    /* All flows (including their dpcls_rules) have been deleted already */
    CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
        dpcls_destroy(cls);
        /* Defer freeing the classifier memory past concurrent readers. */
        ovsrcu_postpone(free, cls);
    }
    cmap_destroy(&pmd->classifiers);
    cmap_destroy(&pmd->flow_table);
    cmap_destroy(&pmd->simple_match_table);
    ccmap_destroy(&pmd->n_flows);
    ccmap_destroy(&pmd->n_simple_flows);
    ovs_mutex_destroy(&pmd->flow_mutex);
    seq_destroy(pmd->reload_seq);
    ovs_mutex_destroy(&pmd->port_mutex);
    ovs_mutex_destroy(&pmd->bond_mutex);
    free(pmd->netdev_input_func_userdata);
    free(pmd);
}
7234
7235
/* Stops the pmd thread, removes it from the 'dp->poll_threads',
7236
 * and unrefs the struct. */
7237
/* Stops the pmd thread, removes it from the 'dp->poll_threads',
 * and unrefs the struct. */
static void
dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
{
    /* NON_PMD_CORE_ID doesn't have a thread, so we don't have to synchronize,
     * but extra cleanup is necessary */
    if (pmd->core_id == NON_PMD_CORE_ID) {
        ovs_mutex_lock(&dp->non_pmd_mutex);
        dfc_cache_uninit(&pmd->flow_cache);
        pmd_free_cached_ports(pmd);
        pmd_free_static_tx_qid(pmd);
        ovs_mutex_unlock(&dp->non_pmd_mutex);
    } else {
        /* Ask the thread to exit via its reload path, then wait for it. */
        atomic_store_relaxed(&pmd->exit, true);
        dp_netdev_reload_pmd__(pmd);
        xpthread_join(pmd->thread, NULL);
    }

    dp_netdev_pmd_clear_ports(pmd);

    /* Purges the 'pmd''s flows after stopping the thread, but before
     * destroying the flows, so that the flow stats can be collected. */
    if (dp->dp_purge_cb) {
        dp->dp_purge_cb(dp->dp_purge_aux, pmd->core_id);
    }
    cmap_remove(&pmd->dp->poll_threads, &pmd->node, hash_int(pmd->core_id, 0));
    dp_netdev_pmd_unref(pmd);
}
7264
7265
/* Destroys all pmd threads. If 'non_pmd' is true it also destroys the non pmd
7266
 * thread. */
7267
static void
7268
dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd)
7269
0
{
7270
0
    struct dp_netdev_pmd_thread *pmd;
7271
0
    struct dp_netdev_pmd_thread **pmd_list;
7272
0
    size_t k = 0, n_pmds;
7273
7274
0
    n_pmds = cmap_count(&dp->poll_threads);
7275
0
    pmd_list = xcalloc(n_pmds, sizeof *pmd_list);
7276
7277
0
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
7278
0
        if (!non_pmd && pmd->core_id == NON_PMD_CORE_ID) {
7279
0
            continue;
7280
0
        }
7281
        /* We cannot call dp_netdev_del_pmd(), since it alters
7282
         * 'dp->poll_threads' (while we're iterating it) and it
7283
         * might quiesce. */
7284
0
        ovs_assert(k < n_pmds);
7285
0
        pmd_list[k++] = pmd;
7286
0
    }
7287
7288
0
    for (size_t i = 0; i < k; i++) {
7289
0
        dp_netdev_del_pmd(dp, pmd_list[i]);
7290
0
    }
7291
0
    free(pmd_list);
7292
0
}
7293
7294
/* Deletes all rx queues from pmd->poll_list and all the ports from
7295
 * pmd->tx_ports. */
7296
static void
7297
dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
7298
0
{
7299
0
    struct rxq_poll *poll;
7300
0
    struct tx_port *port;
7301
0
    struct tx_bond *tx;
7302
7303
0
    ovs_mutex_lock(&pmd->port_mutex);
7304
0
    HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
7305
0
        free(poll);
7306
0
    }
7307
0
    HMAP_FOR_EACH_POP (port, node, &pmd->tx_ports) {
7308
0
        free(port->txq_pkts);
7309
0
        free(port);
7310
0
    }
7311
0
    ovs_mutex_unlock(&pmd->port_mutex);
7312
7313
0
    ovs_mutex_lock(&pmd->bond_mutex);
7314
0
    CMAP_FOR_EACH (tx, node, &pmd->tx_bonds) {
7315
0
        cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
7316
0
        ovsrcu_postpone(free, tx);
7317
0
    }
7318
0
    ovs_mutex_unlock(&pmd->bond_mutex);
7319
0
}
7320
7321
/* Adds rx queue to poll_list of PMD thread, if it's not there already. */
7322
static void
7323
dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
7324
                         struct dp_netdev_rxq *rxq)
7325
    OVS_REQUIRES(pmd->port_mutex)
7326
0
{
7327
0
    int qid = netdev_rxq_get_queue_id(rxq->rx);
7328
0
    uint32_t hash = hash_2words(odp_to_u32(rxq->port->port_no), qid);
7329
0
    struct rxq_poll *poll;
7330
7331
0
    HMAP_FOR_EACH_WITH_HASH (poll, node, hash, &pmd->poll_list) {
7332
0
        if (poll->rxq == rxq) {
7333
            /* 'rxq' is already polled by this thread. Do nothing. */
7334
0
            return;
7335
0
        }
7336
0
    }
7337
7338
0
    poll = xmalloc(sizeof *poll);
7339
0
    poll->rxq = rxq;
7340
0
    hmap_insert(&pmd->poll_list, &poll->node, hash);
7341
7342
0
    pmd->need_reload = true;
7343
0
}
7344
7345
/* Delete 'poll' from poll_list of PMD thread. */
7346
static void
7347
dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
7348
                           struct rxq_poll *poll)
7349
    OVS_REQUIRES(pmd->port_mutex)
7350
0
{
7351
0
    hmap_remove(&pmd->poll_list, &poll->node);
7352
0
    free(poll);
7353
7354
0
    pmd->need_reload = true;
7355
0
}
7356
7357
/* Add 'port' to the tx port cache of 'pmd', which must be reloaded for the
7358
 * changes to take effect. */
7359
static void
7360
dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
7361
                             struct dp_netdev_port *port)
7362
    OVS_REQUIRES(pmd->port_mutex)
7363
0
{
7364
0
    struct tx_port *tx;
7365
7366
0
    tx = tx_port_lookup(&pmd->tx_ports, port->port_no);
7367
0
    if (tx) {
7368
        /* 'port' is already on this thread tx cache. Do nothing. */
7369
0
        return;
7370
0
    }
7371
7372
0
    tx = xzalloc(sizeof *tx);
7373
7374
0
    tx->port = port;
7375
0
    tx->qid = -1;
7376
0
    tx->flush_time = 0LL;
7377
0
    dp_packet_batch_init(&tx->output_pkts);
7378
7379
0
    if (tx->port->txq_mode == TXQ_MODE_XPS_HASH) {
7380
0
        int i, n_txq = netdev_n_txq(tx->port->netdev);
7381
7382
0
        tx->txq_pkts = xzalloc(n_txq * sizeof *tx->txq_pkts);
7383
0
        for (i = 0; i < n_txq; i++) {
7384
0
            dp_packet_batch_init(&tx->txq_pkts[i]);
7385
0
        }
7386
0
    }
7387
7388
0
    hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
7389
0
    pmd->need_reload = true;
7390
0
}
7391
7392
/* Del 'tx' from the tx port cache of 'pmd', which must be reloaded for the
7393
 * changes to take effect. */
7394
static void
7395
dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
7396
                               struct tx_port *tx)
7397
    OVS_REQUIRES(pmd->port_mutex)
7398
0
{
7399
0
    hmap_remove(&pmd->tx_ports, &tx->node);
7400
0
    free(tx->txq_pkts);
7401
0
    free(tx);
7402
0
    pmd->need_reload = true;
7403
0
}
7404
7405
/* Add bond to the tx bond cmap of 'pmd'. */
7406
/* Add bond to the tx bond cmap of 'pmd'.  If an entry for 'bond->bond_id'
 * already exists it is left alone unless 'update' is true, in which case
 * it is replaced (carrying over the per-bucket stats). */
static void
dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
                             struct tx_bond *bond, bool update)
    OVS_EXCLUDED(pmd->bond_mutex)
{
    struct tx_bond *tx;

    ovs_mutex_lock(&pmd->bond_mutex);
    tx = tx_bond_lookup(&pmd->tx_bonds, bond->bond_id);

    if (tx && !update) {
        /* It's not an update and the entry already exists.  Do nothing. */
        goto unlock;
    }

    if (tx) {
        struct tx_bond *new_tx = xmemdup(bond, sizeof *bond);

        /* Copy the stats for each bucket. */
        for (int i = 0; i < BOND_BUCKETS; i++) {
            uint64_t n_packets, n_bytes;

            atomic_read_relaxed(&tx->member_buckets[i].n_packets, &n_packets);
            atomic_read_relaxed(&tx->member_buckets[i].n_bytes, &n_bytes);
            atomic_init(&new_tx->member_buckets[i].n_packets, n_packets);
            atomic_init(&new_tx->member_buckets[i].n_bytes, n_bytes);
        }
        /* Swap the entry in place so concurrent readers see either the old
         * or the new one; free the old entry after a grace period. */
        cmap_replace(&pmd->tx_bonds, &tx->node, &new_tx->node,
                     hash_bond_id(bond->bond_id));
        ovsrcu_postpone(free, tx);
    } else {
        tx = xmemdup(bond, sizeof *bond);
        cmap_insert(&pmd->tx_bonds, &tx->node, hash_bond_id(bond->bond_id));
    }
unlock:
    ovs_mutex_unlock(&pmd->bond_mutex);
}
7443
7444
/* Delete bond from the tx bond cmap of 'pmd'. */
7445
static void
7446
dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
7447
                               uint32_t bond_id)
7448
    OVS_EXCLUDED(pmd->bond_mutex)
7449
0
{
7450
0
    struct tx_bond *tx;
7451
7452
0
    ovs_mutex_lock(&pmd->bond_mutex);
7453
0
    tx = tx_bond_lookup(&pmd->tx_bonds, bond_id);
7454
0
    if (tx) {
7455
0
        cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
7456
0
        ovsrcu_postpone(free, tx);
7457
0
    }
7458
0
    ovs_mutex_unlock(&pmd->bond_mutex);
7459
0
}
7460

7461
/* Returns the version string of the userspace datapath.  The caller owns
 * the returned heap-allocated string and must free() it. */
static char *
dpif_netdev_get_datapath_version(void)
{
    return xstrdup("<built-in>");
}
7466
7467
/* Updates the usage statistics of 'netdev_flow' after a batch of 'cnt'
 * packets totalling 'size' bytes with accumulated TCP flags 'tcp_flags'
 * was processed at time 'now' (callers pass pmd->ctx.now / 1000).
 *
 * 'used' and 'tcp_flags' may be read by other threads, hence the relaxed
 * atomics; the packet/byte counters are updated with a plain (non-atomic)
 * add, so they are only consistent from the owning PMD thread's view. */
static void
dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
                    uint16_t tcp_flags, long long now)
{
    uint16_t flags;

    atomic_store_relaxed(&netdev_flow->stats.used, now);
    non_atomic_ullong_add(&netdev_flow->stats.packet_count, cnt);
    non_atomic_ullong_add(&netdev_flow->stats.byte_count, size);
    /* OR the batch's TCP flags into the stored set.  This read-modify-write
     * is not atomic as a whole; stats are best-effort. */
    atomic_read_relaxed(&netdev_flow->stats.tcp_flags, &flags);
    flags |= tcp_flags;
    atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
}
7480
7481
/* Hands 'packet_' up to the datapath's registered upcall callback with the
 * given flow, wildcards, ufid and upcall 'type'.
 *
 * Returns ENODEV if no upcall callback is registered, otherwise whatever
 * the callback returns.  'actions' and 'put_actions' are output buffers
 * for the callback to fill in.  When debug logging is enabled (and not
 * rate-limited), the flow key and the packet are logged first. */
static int
dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
                 struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
                 enum dpif_upcall_type type, const struct nlattr *userdata,
                 struct ofpbuf *actions, struct ofpbuf *put_actions)
{
    struct dp_netdev *dp = pmd->dp;

    if (OVS_UNLIKELY(!dp->upcall_cb)) {
        return ENODEV;
    }

    /* VLOG_DROP_DBG() returns false (i.e. "not dropped") only when debug
     * logging is on and the rate limiter allows another message. */
    if (OVS_UNLIKELY(!VLOG_DROP_DBG(&upcall_rl))) {
        struct ds ds = DS_EMPTY_INITIALIZER;
        char *packet_str;
        struct ofpbuf key;
        struct odp_flow_key_parms odp_parms = {
            .flow = flow,
            .mask = wc ? &wc->masks : NULL,
            .support = dp_netdev_support,
        };

        /* Serialize the flow (and mask, if any) to ODP key format purely
         * for logging purposes. */
        ofpbuf_init(&key, 0);
        odp_flow_key_from_flow(&odp_parms, &key);
        packet_str = ofp_dp_packet_to_string(packet_);

        odp_flow_key_format(key.data, key.size, &ds);

        VLOG_DBG("%s: %s upcall:\n%s\n%s", dp->name,
                 dpif_upcall_type_to_string(type), ds_cstr(&ds), packet_str);

        ofpbuf_uninit(&key);
        free(packet_str);

        ds_destroy(&ds);
    }

    /* Finalize pending offloaded work (e.g. checksums) before handing the
     * packet to slow-path consumers, except for flow-miss upcalls. */
    if (type != DPIF_UC_MISS) {
        dp_packet_ol_send_prepare(packet_, 0);
    }

    return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
                         actions, wc, put_actions, dp->upcall_aux);
}
7525
7526
static inline uint32_t
7527
dpif_netdev_packet_get_rss_hash(struct dp_packet *packet,
7528
                                const struct miniflow *mf)
7529
0
{
7530
0
    uint32_t hash, recirc_depth;
7531
7532
0
    if (OVS_LIKELY(dp_packet_rss_valid(packet))) {
7533
0
        hash = dp_packet_get_rss_hash(packet);
7534
0
    } else {
7535
0
        hash = miniflow_hash_5tuple(mf, 0);
7536
0
        dp_packet_set_rss_hash(packet, hash);
7537
0
    }
7538
7539
    /* The RSS hash must account for the recirculation depth to avoid
7540
     * collisions in the exact match cache */
7541
0
    recirc_depth = *recirc_depth_get_unsafe();
7542
0
    if (OVS_UNLIKELY(recirc_depth)) {
7543
0
        hash = hash_finish(hash, recirc_depth);
7544
0
    }
7545
0
    return hash;
7546
0
}
7547
7548
/* A group of packets from one receive batch that all matched the same
 * datapath flow, collected so that the flow's actions can be executed once
 * per flow rather than once per packet. */
struct packet_batch_per_flow {
    unsigned int byte_count;     /* Total bytes of all packets in 'array'. */
    uint16_t tcp_flags;          /* OR of the TCP flags of all packets. */
    struct dp_netdev_flow *flow; /* The flow all these packets matched. */

    struct dp_packet_batch array; /* The packets themselves. */
};
7555
7556
static inline void
7557
packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
7558
                             struct dp_packet *packet,
7559
                             uint16_t tcp_flags)
7560
0
{
7561
0
    batch->byte_count += dp_packet_size(packet);
7562
0
    batch->tcp_flags |= tcp_flags;
7563
0
    dp_packet_batch_add(&batch->array, packet);
7564
0
}
7565
7566
static inline void
7567
packet_batch_per_flow_init(struct packet_batch_per_flow *batch,
7568
                           struct dp_netdev_flow *flow)
7569
0
{
7570
0
    flow->batch = batch;
7571
7572
0
    batch->flow = flow;
7573
0
    dp_packet_batch_init(&batch->array);
7574
0
    batch->byte_count = 0;
7575
0
    batch->tcp_flags = 0;
7576
0
}
7577
7578
/* Executes the actions of 'batch->flow' on every packet accumulated in
 * 'batch', after crediting the batch to the flow's statistics.  The
 * packets are stolen by action execution (third argument 'true'), so the
 * batch must be re-initialized before reuse. */
static inline void
packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
                              struct dp_netdev_pmd_thread *pmd)
{
    struct dp_netdev_actions *actions;
    struct dp_netdev_flow *flow = batch->flow;

    /* pmd->ctx.now / 1000 converts the PMD clock to the coarser unit used
     * by flow stats. */
    dp_netdev_flow_used(flow, dp_packet_batch_size(&batch->array),
                        batch->byte_count,
                        batch->tcp_flags, pmd->ctx.now / 1000);

    actions = dp_netdev_flow_get_actions(flow);

    dp_netdev_execute_actions(pmd, &batch->array, true, &flow->flow,
                              actions->actions, actions->size);
}
7594
7595
void
7596
dp_netdev_batch_execute(struct dp_netdev_pmd_thread *pmd,
7597
                        struct dp_packet_batch *packets,
7598
                        struct dpcls_rule *rule,
7599
                        uint32_t bytes,
7600
                        uint16_t tcp_flags)
7601
0
{
7602
    /* Gets action* from the rule. */
7603
0
    struct dp_netdev_flow *flow = dp_netdev_flow_cast(rule);
7604
0
    struct dp_netdev_actions *actions = dp_netdev_flow_get_actions(flow);
7605
7606
0
    dp_netdev_flow_used(flow, dp_packet_batch_size(packets), bytes,
7607
0
                        tcp_flags, pmd->ctx.now / 1000);
7608
0
    const uint32_t steal = 1;
7609
0
    dp_netdev_execute_actions(pmd, packets, steal, &flow->flow,
7610
0
                              actions->actions, actions->size);
7611
0
}
7612
7613
static inline void
7614
dp_netdev_queue_batches(struct dp_packet *pkt,
7615
                        struct dp_netdev_flow *flow, uint16_t tcp_flags,
7616
                        struct packet_batch_per_flow *batches,
7617
                        size_t *n_batches)
7618
0
{
7619
0
    struct packet_batch_per_flow *batch = flow->batch;
7620
7621
0
    if (OVS_UNLIKELY(!batch)) {
7622
0
        batch = &batches[(*n_batches)++];
7623
0
        packet_batch_per_flow_init(batch, flow);
7624
0
    }
7625
7626
0
    packet_batch_per_flow_update(batch, pkt, tcp_flags);
7627
0
}
7628
7629
static inline void
7630
packet_enqueue_to_flow_map(struct dp_packet *packet,
7631
                           struct dp_netdev_flow *flow,
7632
                           uint16_t tcp_flags,
7633
                           struct dp_packet_flow_map *flow_map,
7634
                           size_t index)
7635
0
{
7636
0
    struct dp_packet_flow_map *map = &flow_map[index];
7637
0
    map->flow = flow;
7638
0
    map->packet = packet;
7639
0
    map->tcp_flags = tcp_flags;
7640
0
}
7641
7642
/* SMC lookup function for a batch of packets.
 * By doing batching SMC lookup, we can use prefetch
 * to hide memory access latency.
 *
 * Packets that hit the SMC are enqueued into 'flow_map' at their original
 * receive position (taken from 'index_map') and probabilistically promoted
 * into the EMC.  Packets that miss are compacted at the front of
 * 'packets_', their keys exposed through 'missed_keys' and their original
 * positions rewritten into 'index_map' for the caller's fast path. */
static inline void
smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
            struct netdev_flow_key *keys,
            struct netdev_flow_key **missed_keys,
            struct dp_packet_batch *packets_,
            const int cnt,
            struct dp_packet_flow_map *flow_map,
            uint8_t *index_map)
{
    int i;
    struct dp_packet *packet;
    size_t n_smc_hit = 0, n_missed = 0;
    struct dfc_cache *cache = &pmd->flow_cache;
    struct smc_cache *smc_cache = &cache->smc_cache;
    const struct cmap_node *flow_node;
    int recv_idx;
    uint16_t tcp_flags;

    /* Prefetch buckets for all packets */
    for (i = 0; i < cnt; i++) {
        OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]);
    }

    DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
        struct dp_netdev_flow *flow = NULL;
        flow_node = smc_entry_get(pmd, keys[i].hash);
        bool hit = false;
        /* Get the original order of this packet in received batch. */
        recv_idx = index_map[i];

        if (OVS_LIKELY(flow_node != NULL)) {
            CMAP_NODE_FOR_EACH (flow, node, flow_node) {
                /* Since we dont have per-port megaflow to check the port
                 * number, we need to  verify that the input ports match. */
                if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) &&
                flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
                    tcp_flags = miniflow_get_tcp_flags(&keys[i].mf);

                    /* SMC hit and emc miss, we insert into EMC */
                    keys[i].len =
                        netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
                    emc_probabilistic_insert(pmd, &keys[i], flow);
                    /* Add these packets into the flow map in the same order
                     * as received.
                     */
                    packet_enqueue_to_flow_map(packet, flow, tcp_flags,
                                               flow_map, recv_idx);
                    n_smc_hit++;
                    hit = true;
                    break;
                }
            }
            if (hit) {
                continue;
            }
        }

        /* SMC missed. Group missed packets together at
         * the beginning of the 'packets' array. */
        dp_packet_batch_refill(packets_, packet, i);

        /* Preserve the order of packet for flow batching. */
        index_map[n_missed] = recv_idx;

        /* Put missed keys to the pointer arrays return to the caller */
        missed_keys[n_missed++] = &keys[i];
    }

    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
}
7716
7717
struct dp_netdev_flow *
7718
smc_lookup_single(struct dp_netdev_pmd_thread *pmd,
7719
                  struct dp_packet *packet,
7720
                  struct netdev_flow_key *key)
7721
0
{
7722
0
    const struct cmap_node *flow_node = smc_entry_get(pmd, key->hash);
7723
7724
0
    if (OVS_LIKELY(flow_node != NULL)) {
7725
0
        struct dp_netdev_flow *flow = NULL;
7726
7727
0
        CMAP_NODE_FOR_EACH (flow, node, flow_node) {
7728
            /* Since we dont have per-port megaflow to check the port
7729
             * number, we need to verify that the input ports match. */
7730
0
            if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, key) &&
7731
0
                flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
7732
7733
0
                return (void *) flow;
7734
0
            }
7735
0
        }
7736
0
    }
7737
7738
0
    return NULL;
7739
0
}
7740
7741
/* Asks the hardware-offload layer whether it has already classified
 * 'packet' and, if so, stores the matching datapath flow in '*flow'.
 *
 * Returns 0 on success ('*flow' may still be NULL if the netdev does not
 * support post-processing or no flow reference was produced), or -1 if
 * post-processing dropped or consumed the packet, in which case the caller
 * must not process it any further. */
inline int
dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd,
                  struct dp_packet *packet,
                  struct dp_netdev_flow **flow)
{
    struct dp_netdev_rxq *rxq = pmd->ctx.last_rxq;
    bool post_process_api_supported;
    void *flow_reference = NULL;
    int err;

    /* Support status may be updated concurrently; a relaxed read is
     * sufficient here. */
    atomic_read_relaxed(&rxq->port->netdev->hw_info.post_process_api_supported,
                        &post_process_api_supported);

    if (!post_process_api_supported) {
        *flow = NULL;
        return 0;
    }

    err = dpif_offload_netdev_hw_post_process(rxq->port->netdev, pmd->core_id,
                                              packet, &flow_reference);
    if (err && err != EOPNOTSUPP) {
        /* ECANCELED indicates the offload provider consumed the packet
         * itself; anything else means the packet was dropped on error.
         * EOPNOTSUPP falls through and is treated as "no offload flow". */
        if (err != ECANCELED) {
            COVERAGE_INC(datapath_drop_hw_post_process);
        } else {
            COVERAGE_INC(datapath_drop_hw_post_process_consumed);
        }
        return -1;
    }

    *flow = flow_reference;
    return 0;
}
7773
7774
/* Enqueues already classified packet into per-flow batches or the flow map,
7775
 * depending on the fact if batching enabled. */
7776
static inline void
7777
dfc_processing_enqueue_classified_packet(struct dp_packet *packet,
7778
                                         struct dp_netdev_flow *flow,
7779
                                         uint16_t tcp_flags,
7780
                                         bool batch_enable,
7781
                                         struct packet_batch_per_flow *batches,
7782
                                         size_t *n_batches,
7783
                                         struct dp_packet_flow_map *flow_map,
7784
                                         size_t *map_cnt)
7785
7786
0
{
7787
0
    if (OVS_LIKELY(batch_enable)) {
7788
0
        dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
7789
0
                                n_batches);
7790
0
    } else {
7791
        /* Flow batching should be performed only after fast-path
7792
         * processing is also completed for packets with emc miss
7793
         * or else it will result in reordering of packets with
7794
         * same datapath flows. */
7795
0
        packet_enqueue_to_flow_map(packet, flow, tcp_flags,
7796
0
                                   flow_map, (*map_cnt)++);
7797
0
    }
7798
7799
0
}
7800
7801
/* Try to process all ('cnt') the 'packets' using only the datapath flow cache
 * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
 * miniflow is copied into 'keys' and the packet pointer is moved at the
 * beginning of the 'packets' array. The pointers of missed keys are put in the
 * missed_keys pointer array for future processing.
 *
 * The function returns the number of packets that needs to be processed in the
 * 'packets' array (they have been moved to the beginning of the vector).
 *
 * For performance reasons a caller may choose not to initialize the metadata
 * in 'packets_'.  If 'md_is_valid' is false, the metadata in 'packets'
 * is not valid and must be initialized by this function using 'port_no'.
 * If 'md_is_valid' is true, the metadata is already valid and 'port_no'
 * will be ignored.
 */
static inline size_t
dfc_processing(struct dp_netdev_pmd_thread *pmd,
               struct dp_packet_batch *packets_,
               struct netdev_flow_key *keys,
               struct netdev_flow_key **missed_keys,
               struct packet_batch_per_flow batches[], size_t *n_batches,
               struct dp_packet_flow_map *flow_map,
               size_t *n_flows, uint8_t *index_map,
               bool md_is_valid, odp_port_t port_no)
{
    const bool offload_enabled = dpif_offload_enabled();
    const uint32_t recirc_depth = *recirc_depth_get();
    const size_t cnt = dp_packet_batch_size(packets_);
    size_t n_missed = 0, n_emc_hit = 0, n_phwol_hit = 0;
    size_t n_mfex_opt_hit = 0, n_simple_hit = 0;
    struct dfc_cache *cache = &pmd->flow_cache;
    struct netdev_flow_key *key = &keys[0];
    struct dp_packet *packet;
    size_t map_cnt = 0;
    /* True until the first cache miss; afterwards all classified packets go
     * through the flow map to preserve the receive order. */
    bool batch_enable = true;

    const bool simple_match_enabled =
        !md_is_valid && dp_netdev_simple_match_enabled(pmd, port_no);
    /* 'simple_match_table' is a full flow table.  If the flow is not there,
     * upcall is required, and there is no chance to find a match in caches. */
    const bool smc_enable_db = !simple_match_enabled && pmd->ctx.smc_enable_db;
    /* cur_min == 0 disables EMC lookup below. */
    const uint32_t cur_min = simple_match_enabled
                             ? 0 : pmd->ctx.emc_insert_min;

    pmd_perf_update_counter(&pmd->perf_stats,
                            md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
                            cnt);
    int i;
    DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
        struct dp_netdev_flow *flow = NULL;
        uint16_t tcp_flags;

        /* Runt packets cannot even hold an Ethernet header: drop them. */
        if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
            dp_packet_delete(packet);
            COVERAGE_INC(datapath_drop_rx_invalid_packet);
            continue;
        }

        if (i != cnt - 1) {
            struct dp_packet **packets = packets_->packets;
            /* Prefetch next packet data and metadata. */
            OVS_PREFETCH(dp_packet_data(packets[i+1]));
            pkt_metadata_prefetch_init(&packets[i+1]->md);
        }

        if (!md_is_valid) {
            pkt_metadata_init(&packet->md, port_no);
        }

        /* Hardware offload classification only applies to packets arriving
         * from a port, not to recirculated ones. */
        if (offload_enabled && recirc_depth == 0) {
            if (OVS_UNLIKELY(dp_netdev_hw_flow(pmd, packet, &flow))) {
                /* Packet restoration failed and it was dropped, do not
                 * continue processing.
                 */
                continue;
            }
            if (OVS_LIKELY(flow)) {
                tcp_flags = parse_tcp_flags(packet, NULL, NULL, NULL);
                n_phwol_hit++;
                dfc_processing_enqueue_classified_packet(
                        packet, flow, tcp_flags, batch_enable,
                        batches, n_batches, flow_map, &map_cnt);
                continue;
            }
        }

        if (!flow && simple_match_enabled) {
            ovs_be16 dl_type = 0, vlan_tci = 0;
            uint8_t nw_frag = 0;

            tcp_flags = parse_tcp_flags(packet, &dl_type, &nw_frag, &vlan_tci);
            flow = dp_netdev_simple_match_lookup(pmd, port_no, dl_type,
                                                 nw_frag, vlan_tci);
            if (OVS_LIKELY(flow)) {
                n_simple_hit++;
                dfc_processing_enqueue_classified_packet(
                        packet, flow, tcp_flags, batch_enable,
                        batches, n_batches, flow_map, &map_cnt);
                continue;
            }
        }

        miniflow_extract(packet, &key->mf);
        key->len = 0; /* Not computed yet. */
        key->hash =
                (md_is_valid == false)
                ? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf)
                : dpif_netdev_packet_get_rss_hash(packet, &key->mf);

        /* If EMC is disabled skip emc_lookup */
        flow = (cur_min != 0) ? emc_lookup(&cache->emc_cache, key) : NULL;
        if (OVS_LIKELY(flow)) {
            tcp_flags = miniflow_get_tcp_flags(&key->mf);
            n_emc_hit++;
            dfc_processing_enqueue_classified_packet(
                    packet, flow, tcp_flags, batch_enable,
                    batches, n_batches, flow_map, &map_cnt);
        } else {
            /* Exact match cache missed. Group missed packets together at
             * the beginning of the 'packets' array. */
            dp_packet_batch_refill(packets_, packet, i);

            /* Preserve the order of packet for flow batching. */
            index_map[n_missed] = map_cnt;
            flow_map[map_cnt++].flow = NULL;

            /* 'key[n_missed]' contains the key of the current packet and it
             * will be passed to SMC lookup. The next key should be extracted
             * to 'keys[n_missed + 1]'.
             * We also maintain a pointer array to keys missed both SMC and EMC
             * which will be returned to the caller for future processing. */
            missed_keys[n_missed] = key;
            key = &keys[++n_missed];

            /* Skip batching for subsequent packets to avoid reordering. */
            batch_enable = false;
        }
    }
    /* Count of packets which are not flow batched. */
    *n_flows = map_cnt;

    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_PHWOL_HIT, n_phwol_hit);
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MFEX_OPT_HIT,
                            n_mfex_opt_hit);
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SIMPLE_HIT,
                            n_simple_hit);
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);

    if (!smc_enable_db) {
        return dp_packet_batch_size(packets_);
    }

    /* Packets miss EMC will do a batch lookup in SMC if enabled */
    smc_lookup_batch(pmd, keys, missed_keys, packets_,
                     n_missed, flow_map, index_map);

    return dp_packet_batch_size(packets_);
}
7959
7960
/* Performs a flow-miss upcall for 'packet': asks the upcall callback for
 * actions, executes them on the packet, and - unless the callback returned
 * ENOSPC - installs the resulting flow into the PMD's flow table and the
 * SMC/EMC caches.
 *
 * Returns 0 on success, otherwise a positive errno value; on a real error
 * (anything other than ENOSPC) the packet has been deleted. */
static inline int
handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
                     struct dp_packet *packet,
                     const struct netdev_flow_key *key,
                     struct ofpbuf *actions, struct ofpbuf *put_actions)
{
    struct ofpbuf *add_actions;
    struct dp_packet_batch b;
    struct match match;
    ovs_u128 ufid;
    int error;
    uint64_t cycles = cycles_counter_update(&pmd->perf_stats);
    odp_port_t orig_in_port = packet->md.orig_in_port;

    match.tun_md.valid = false;
    miniflow_expand(&key->mf, &match.flow);
    memset(&match.wc, 0, sizeof match.wc);

    ofpbuf_clear(actions);
    ofpbuf_clear(put_actions);

    odp_flow_key_hash(&match.flow, sizeof match.flow, &ufid);
    error = dp_netdev_upcall(pmd, packet, &match.flow, &match.wc,
                             &ufid, DPIF_UC_MISS, NULL, actions,
                             put_actions);
    /* ENOSPC means "execute the actions but do not install a flow". */
    if (OVS_UNLIKELY(error && error != ENOSPC)) {
        dp_packet_delete(packet);
        COVERAGE_INC(datapath_drop_upcall_error);
        return error;
    }

    /* The Netlink encoding of datapath flow keys cannot express
     * wildcarding the presence of a VLAN tag. Instead, a missing VLAN
     * tag is interpreted as exact match on the fact that there is no
     * VLAN.  Unless we refactor a lot of code that translates between
     * Netlink and struct flow representations, we have to do the same
     * here.  This must be in sync with 'match' in dpif_netdev_flow_put(). */
    if (!match.wc.masks.vlans[0].tci) {
        match.wc.masks.vlans[0].tci = htons(VLAN_VID_MASK | VLAN_CFI);
    }

    /* We can't allow the packet batching in the next loop to execute
     * the actions.  Otherwise, if there are any slow path actions,
     * we'll send the packet up twice. */
    dp_packet_batch_init_packet(&b, packet);
    dp_netdev_execute_actions(pmd, &b, true, &match.flow,
                              actions->data, actions->size);

    /* Prefer the actions the callback asked us to install ('put_actions');
     * fall back to the executed actions if that buffer is empty. */
    add_actions = put_actions->size ? put_actions : actions;
    if (OVS_LIKELY(error != ENOSPC)) {
        struct dp_netdev_flow *netdev_flow;

        /* XXX: There's a race window where a flow covering this packet
         * could have already been installed since we last did the flow
         * lookup before upcall.  This could be solved by moving the
         * mutex lock outside the loop, but that's an awful long time
         * to be locking revalidators out of making flow modifications. */
        ovs_mutex_lock(&pmd->flow_mutex);
        netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL);
        if (OVS_LIKELY(!netdev_flow)) {
            netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
                                             add_actions->data,
                                             add_actions->size, orig_in_port);
        }
        ovs_mutex_unlock(&pmd->flow_mutex);
        uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid);
        smc_insert(pmd, key, hash);
        emc_probabilistic_insert(pmd, key, netdev_flow);
    }
    if (pmd_perf_metrics_enabled(pmd)) {
        /* Update upcall stats. */
        cycles = cycles_counter_update(&pmd->perf_stats) - cycles;
        struct pmd_perf_stats *s = &pmd->perf_stats;
        s->current.upcalls++;
        s->current.upcall_cycles += cycles;
        histogram_add_sample(&s->cycles_per_upcall, cycles);
    }
    return error;
}
8039
8040
/* Classifies the packets in 'packets_' (the EMC/SMC misses, all received on
 * 'in_port') against the port's dpcls.  Packets that also miss the
 * classifier trigger upcalls (or are dropped if upcalls are disabled).
 * Successfully classified packets are enqueued into 'flow_map' at the
 * original receive positions given by 'index_map', and their keys are
 * inserted into the SMC and, probabilistically, the EMC. */
static inline void
fast_path_processing(struct dp_netdev_pmd_thread *pmd,
                     struct dp_packet_batch *packets_,
                     struct netdev_flow_key **keys,
                     struct dp_packet_flow_map *flow_map,
                     uint8_t *index_map,
                     odp_port_t in_port)
{
    const size_t cnt = dp_packet_batch_size(packets_);
#if !defined(__CHECKER__) && !defined(_WIN32)
    const size_t PKT_ARRAY_SIZE = cnt;
#else
    /* Sparse or MSVC doesn't like variable length array. */
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
#endif
    struct dp_packet *packet;
    struct dpcls *cls;
    struct dpcls_rule *rules[PKT_ARRAY_SIZE];
    struct dp_netdev *dp = pmd->dp;
    int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
    int lookup_cnt = 0, add_lookup_cnt;
    bool any_miss;

    for (size_t i = 0; i < cnt; i++) {
        /* Key length is needed in all the cases, hash computed on demand. */
        keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf));
    }
    /* Get the classifier for the in_port */
    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
    if (OVS_LIKELY(cls)) {
        any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys,
                                rules, cnt, &lookup_cnt);
    } else {
        /* No classifier for this port yet: every packet is a miss. */
        any_miss = true;
        memset(rules, 0, sizeof(rules));
    }
    /* Only perform upcalls if the read lock can be taken; otherwise the
     * misses are dropped in the 'else if' branch below. */
    if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
        uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
        struct ofpbuf actions, put_actions;

        /* Stack-backed buffers to avoid heap allocation in the common
         * case; ofpbuf grows onto the heap only if needed. */
        ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
        ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);

        DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
            struct dp_netdev_flow *netdev_flow;

            if (OVS_LIKELY(rules[i])) {
                continue;
            }

            /* It's possible that an earlier slow path execution installed
             * a rule covering this flow.  In this case, it's a lot cheaper
             * to catch it here than execute a miss. */
            netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i],
                                                    &add_lookup_cnt);
            if (netdev_flow) {
                lookup_cnt += add_lookup_cnt;
                rules[i] = &netdev_flow->cr;
                continue;
            }

            int error = handle_packet_upcall(pmd, packet, keys[i],
                                             &actions, &put_actions);

            if (OVS_UNLIKELY(error)) {
                upcall_fail_cnt++;
            } else {
                upcall_ok_cnt++;
            }
        }

        ofpbuf_uninit(&actions);
        ofpbuf_uninit(&put_actions);
        fat_rwlock_unlock(&dp->upcall_rwlock);
    } else if (OVS_UNLIKELY(any_miss)) {
        /* Upcalls unavailable: drop every packet without a matching rule. */
        DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
            if (OVS_UNLIKELY(!rules[i])) {
                dp_packet_delete(packet);
                COVERAGE_INC(datapath_drop_lock_error);
                upcall_fail_cnt++;
            }
        }
    }

    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
        struct dp_netdev_flow *flow;
        /* Get the original order of this packet in received batch. */
        int recv_idx = index_map[i];
        uint16_t tcp_flags;

        if (OVS_UNLIKELY(!rules[i])) {
            continue;
        }

        flow = dp_netdev_flow_cast(rules[i]);
        uint32_t hash =  dp_netdev_flow_hash(&flow->ufid);
        smc_insert(pmd, keys[i], hash);

        emc_probabilistic_insert(pmd, keys[i], flow);
        /* Add these packets into the flow map in the same order
         * as received.
         */
        tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf);
        packet_enqueue_to_flow_map(packet, flow, tcp_flags,
                                   flow_map, recv_idx);
    }

    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
                            cnt - upcall_ok_cnt - upcall_fail_cnt);
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
                            lookup_cnt);
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
                            upcall_ok_cnt);
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
                            upcall_fail_cnt);
}
8156
8157
/* Packets enter the datapath from a port (or from recirculation) here.
8158
 *
8159
 * When 'md_is_valid' is true the metadata in 'packets' are already valid.
8160
 * When false the metadata in 'packets' need to be initialized. */
8161
/* Core datapath input path for one batch of packets on a PMD thread.
 *
 * First tries the fast classifiers via dfc_processing(); packets left in
 * 'packets' afterwards missed there and go through fast_path_processing().
 * Matched packets are collected in 'flow_map' in received order and then
 * batched per flow before the flow actions are executed. */
static void
dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
                  struct dp_packet_batch *packets,
                  bool md_is_valid, odp_port_t port_no)
{
#if !defined(__CHECKER__) && !defined(_WIN32)
    const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);
#else
    /* Sparse or MSVC doesn't like variable length array. */
    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
#endif
    /* Flow keys are cache-line aligned as they are on the per-packet hot
     * path. */
    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
        struct netdev_flow_key keys[PKT_ARRAY_SIZE];
    struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE];
    struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
    size_t n_batches;
    struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE];
    uint8_t index_map[PKT_ARRAY_SIZE];
    size_t n_flows, i;

    odp_port_t in_port;

    n_batches = 0;
    /* EMC/SMC stage; fills 'missed_keys' for packets that need the
     * classifier and records matches into 'flow_map'. */
    dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches,
                   flow_map, &n_flows, index_map, md_is_valid, port_no);

    if (!dp_packet_batch_is_empty(packets)) {
        /* Get ingress port from first packet's metadata. */
        in_port = packets->packets[0]->md.in_port.odp_port;
        fast_path_processing(pmd, packets, missed_keys,
                             flow_map, index_map, in_port);
    }

    /* Batch rest of packets which are in flow map. */
    for (i = 0; i < n_flows; i++) {
        struct dp_packet_flow_map *map = &flow_map[i];

        /* Slots without a flow correspond to dropped/upcalled packets. */
        if (OVS_UNLIKELY(!map->flow)) {
            continue;
        }
        dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags,
                                batches, &n_batches);
     }

    /* All the flow batches need to be reset before any call to
     * packet_batch_per_flow_execute() as it could potentially trigger
     * recirculation. When a packet matching flow 'j' happens to be
     * recirculated, the nested call to dp_netdev_input__() could potentially
     * classify the packet as matching another flow - say 'k'. It could happen
     * that in the previous call to dp_netdev_input__() that same flow 'k' had
     * already its own batches[k] still waiting to be served.  So if its
     * 'batch' member is not reset, the recirculated packet would be wrongly
     * appended to batches[k] of the 1st call to dp_netdev_input__(). */
    for (i = 0; i < n_batches; i++) {
        batches[i].flow->batch = NULL;
    }

    for (i = 0; i < n_batches; i++) {
        packet_batch_per_flow_execute(&batches[i], pmd);
    }
}
8222
8223
int32_t
8224
dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
8225
                struct dp_packet_batch *packets,
8226
                odp_port_t port_no)
8227
0
{
8228
0
    dp_netdev_input__(pmd, packets, false, port_no);
8229
0
    return 0;
8230
0
}
8231
8232
static void
8233
dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd,
8234
                      struct dp_packet_batch *packets)
8235
0
{
8236
0
    dp_netdev_input__(pmd, packets, true, 0);
8237
0
}
8238
8239
/* Per-call context handed to odp_execute_actions() and passed back into
 * dp_execute_cb() for each action. */
struct dp_netdev_execute_aux {
    struct dp_netdev_pmd_thread *pmd;  /* Thread executing the actions. */
    const struct flow *flow;           /* Flow the packets matched. */
};
8243
8244
static void
8245
dpif_netdev_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb,
8246
                                 void *aux)
8247
0
{
8248
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8249
0
    dp->dp_purge_aux = aux;
8250
0
    dp->dp_purge_cb = cb;
8251
0
}
8252
8253
static void
8254
dpif_netdev_register_upcall_cb(struct dpif *dpif, upcall_callback *cb,
8255
                               void *aux)
8256
0
{
8257
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8258
0
    dp->upcall_aux = aux;
8259
0
    dp->upcall_cb = cb;
8260
0
}
8261
8262
/* Releases dynamically-assigned (XPS) tx queue ids held by this PMD thread.
 *
 * A queue id is released when it has been idle for at least XPS_TIMEOUT, or
 * unconditionally when 'purge' is true.  Only ports in TXQ_MODE_XPS are
 * considered. */
static void
dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
                               bool purge)
{
    struct tx_port *tx;
    struct dp_netdev_port *port;
    long long interval;

    HMAP_FOR_EACH (tx, node, &pmd->send_port_cache) {
        if (tx->port->txq_mode != TXQ_MODE_XPS) {
            continue;
        }
        /* Time since this cached queue id was last used. */
        interval = pmd->ctx.now - tx->last_used;
        if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
            port = tx->port;
            /* txq_used[] is shared between PMD threads; adjust it under the
             * port's mutex. */
            ovs_mutex_lock(&port->txq_used_mutex);
            port->txq_used[tx->qid]--;
            ovs_mutex_unlock(&port->txq_used_mutex);
            /* -1 marks "no queue assigned". */
            tx->qid = -1;
        }
    }
}
8284
8285
static int
8286
dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
8287
                           struct tx_port *tx)
8288
0
{
8289
0
    struct dp_netdev_port *port;
8290
0
    long long interval;
8291
0
    int i, min_cnt, min_qid;
8292
8293
0
    interval = pmd->ctx.now - tx->last_used;
8294
0
    tx->last_used = pmd->ctx.now;
8295
8296
0
    if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
8297
0
        return tx->qid;
8298
0
    }
8299
8300
0
    port = tx->port;
8301
8302
0
    ovs_mutex_lock(&port->txq_used_mutex);
8303
0
    if (tx->qid >= 0) {
8304
0
        port->txq_used[tx->qid]--;
8305
0
        tx->qid = -1;
8306
0
    }
8307
8308
0
    min_cnt = -1;
8309
0
    min_qid = 0;
8310
0
    for (i = 0; i < netdev_n_txq(port->netdev); i++) {
8311
0
        if (port->txq_used[i] < min_cnt || min_cnt == -1) {
8312
0
            min_cnt = port->txq_used[i];
8313
0
            min_qid = i;
8314
0
        }
8315
0
    }
8316
8317
0
    port->txq_used[min_qid]++;
8318
0
    tx->qid = min_qid;
8319
8320
0
    ovs_mutex_unlock(&port->txq_used_mutex);
8321
8322
0
    dpif_netdev_xps_revalidate_pmd(pmd, false);
8323
8324
0
    VLOG_DBG("Core %d: New TX queue ID %d for port \'%s\'.",
8325
0
             pmd->core_id, tx->qid, netdev_get_name(tx->port->netdev));
8326
0
    return min_qid;
8327
0
}
8328
8329
static struct tx_port *
8330
pmd_tnl_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
8331
                          odp_port_t port_no)
8332
0
{
8333
0
    return tx_port_lookup(&pmd->tnl_port_cache, port_no);
8334
0
}
8335
8336
static struct tx_port *
8337
pmd_send_port_cache_lookup(const struct dp_netdev_pmd_thread *pmd,
8338
                           odp_port_t port_no)
8339
0
{
8340
0
    return tx_port_lookup(&pmd->send_port_cache, port_no);
8341
0
}
8342
8343
static int
8344
push_tnl_action(const struct dp_netdev_pmd_thread *pmd,
8345
                const struct nlattr *attr,
8346
                struct dp_packet_batch *batch)
8347
0
{
8348
0
    struct tx_port *tun_port;
8349
0
    const struct ovs_action_push_tnl *data;
8350
0
    int err;
8351
8352
0
    data = nl_attr_get(attr);
8353
8354
0
    tun_port = pmd_tnl_port_cache_lookup(pmd, data->tnl_port);
8355
0
    if (!tun_port) {
8356
0
        err = -EINVAL;
8357
0
        goto error;
8358
0
    }
8359
0
    err = netdev_push_header(tun_port->port->netdev, batch, data);
8360
0
    if (!err) {
8361
0
        return 0;
8362
0
    }
8363
0
error:
8364
0
    dp_packet_delete_batch(batch, true);
8365
0
    return err;
8366
0
}
8367
8368
static void
8369
dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
8370
                            struct dp_packet *packet, bool should_steal,
8371
                            struct flow *flow, ovs_u128 *ufid,
8372
                            struct ofpbuf *actions,
8373
                            const struct nlattr *userdata)
8374
0
{
8375
0
    struct dp_packet_batch b;
8376
0
    int error;
8377
8378
0
    ofpbuf_clear(actions);
8379
8380
0
    error = dp_netdev_upcall(pmd, packet, flow, NULL, ufid,
8381
0
                             DPIF_UC_ACTION, userdata, actions,
8382
0
                             NULL);
8383
0
    if (!error || error == ENOSPC) {
8384
0
        dp_packet_batch_init_packet(&b, packet);
8385
0
        dp_netdev_execute_actions(pmd, &b, should_steal, flow,
8386
0
                                  actions->data, actions->size);
8387
0
    } else if (should_steal) {
8388
0
        dp_packet_delete(packet);
8389
0
        COVERAGE_INC(datapath_drop_userspace_action_error);
8390
0
    }
8391
0
}
8392
8393
/* Queues the packets in 'packets_' for output on 'port_no'.
 *
 * Packets are appended to the port's pending output batch rather than sent
 * immediately; the batch is flushed elsewhere (or below, when it would
 * overflow NETDEV_MAX_BURST).  Returns false if the port is unknown (the
 * batch is then dropped and accounted), true otherwise. */
static bool
dp_execute_output_action(struct dp_netdev_pmd_thread *pmd,
                         struct dp_packet_batch *packets_,
                         bool should_steal, odp_port_t port_no)
{
    struct tx_port *p = pmd_send_port_cache_lookup(pmd, port_no);
    struct dp_packet_batch out;

    if (!OVS_LIKELY(p)) {
        COVERAGE_ADD(datapath_drop_invalid_port,
                     dp_packet_batch_size(packets_));
        dp_packet_delete_batch(packets_, should_steal);
        return false;
    }
    /* Without ownership, work on a clone so the caller keeps the
     * originals; cutlen on the originals is reset since the clone takes it
     * over. */
    if (!should_steal) {
        dp_packet_batch_clone(&out, packets_);
        dp_packet_batch_reset_cutlen(packets_);
        packets_ = &out;
    }
    dp_packet_batch_apply_cutlen(packets_);
    if (dp_packet_batch_size(&p->output_pkts)
        + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
        /* Flush here to avoid overflow. */
        dp_netdev_pmd_flush_output_on_port(pmd, p);
    }
    /* Starting a new pending batch on this port. */
    if (dp_packet_batch_is_empty(&p->output_pkts)) {
        pmd->n_output_batches++;
    }

    struct dp_packet *packet;
    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
        /* Remember which rx queue each queued packet came from. */
        p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
            pmd->ctx.last_rxq;
        dp_packet_batch_add(&p->output_pkts, packet);
    }
    return true;
}
8430
8431
/* Outputs 'packets_' via the balanced-tcp bond identified by 'bond'.
 *
 * Each packet's RSS hash selects a bucket in the bond's hash table, which
 * names the member port the packet is forwarded to.  Member byte/packet
 * counters are updated on successful output. */
static void
dp_execute_lb_output_action(struct dp_netdev_pmd_thread *pmd,
                            struct dp_packet_batch *packets_,
                            bool should_steal, uint32_t bond)
{
    struct tx_bond *p_bond = tx_bond_lookup(&pmd->tx_bonds, bond);
    struct dp_packet_batch out;
    struct dp_packet *packet;

    if (!p_bond) {
        COVERAGE_ADD(datapath_drop_invalid_bond,
                     dp_packet_batch_size(packets_));
        dp_packet_delete_batch(packets_, should_steal);
        return;
    }
    /* Work on a clone when we do not own the packets; the clone is always
     * stolen by the per-packet output below. */
    if (!should_steal) {
        dp_packet_batch_clone(&out, packets_);
        dp_packet_batch_reset_cutlen(packets_);
        packets_ = &out;
    }
    dp_packet_batch_apply_cutlen(packets_);

    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
        /*
         * Lookup the bond-hash table using hash to get the member.
         */
        uint32_t hash = dp_packet_get_rss_hash(packet);
        struct member_entry *s_entry
            = &p_bond->member_buckets[hash & BOND_MASK];
        odp_port_t bond_member = s_entry->member_id;
        /* Size is read before output, which consumes the packet. */
        uint32_t size = dp_packet_size(packet);
        struct dp_packet_batch output_pkt;

        dp_packet_batch_init_packet(&output_pkt, packet);
        if (OVS_LIKELY(dp_execute_output_action(pmd, &output_pkt, true,
                                                bond_member))) {
            /* Update member stats. */
            non_atomic_ullong_add(&s_entry->n_packets, 1);
            non_atomic_ullong_add(&s_entry->n_bytes, size);
        }
    }
}
8473
8474
/* Callback invoked by odp_execute_actions() for each datapath action that
 * the generic executor cannot handle itself.
 *
 * Cases that fully consume or hand off the packets 'return' directly;
 * cases that 'break' fall through to the final dp_packet_delete_batch(),
 * which frees the batch only when we own it ('should_steal'). */
static void
dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
              const struct nlattr *a, bool should_steal)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    struct dp_netdev_execute_aux *aux = aux_;
    uint32_t *depth = recirc_depth_get();
    struct dp_netdev_pmd_thread *pmd = aux->pmd;
    struct dp_netdev *dp = pmd->dp;
    int type = nl_attr_type(a);
    struct tx_port *p;
    uint32_t packet_count, packets_dropped;

    switch ((enum ovs_action_attr)type) {
    case OVS_ACTION_ATTR_OUTPUT:
        dp_execute_output_action(pmd, packets_, should_steal,
                                 nl_attr_get_odp_port(a));
        return;

    case OVS_ACTION_ATTR_LB_OUTPUT:
        dp_execute_lb_output_action(pmd, packets_, should_steal,
                                    nl_attr_get_u32(a));
        return;

    case OVS_ACTION_ATTR_TUNNEL_PUSH:
        if (should_steal) {
            /* We're requested to push tunnel header, but also we need to take
             * the ownership of these packets. Thus, we can avoid performing
             * the action, because the caller will not use the result anyway.
             * Just break to free the batch. */
            break;
        }
        dp_packet_batch_apply_cutlen(packets_);
        /* Capture the size first: push_tnl_action() consumes the batch on
         * failure. */
        packet_count = dp_packet_batch_size(packets_);
        if (push_tnl_action(pmd, a, packets_)) {
            COVERAGE_ADD(datapath_drop_tunnel_push_error,
                         packet_count);
        }
        return;

    case OVS_ACTION_ATTR_TUNNEL_POP:
        if (*depth < MAX_RECIRC_DEPTH) {
            struct dp_packet_batch *orig_packets_ = packets_;
            odp_port_t portno = nl_attr_get_odp_port(a);

            p = pmd_tnl_port_cache_lookup(pmd, portno);
            if (p) {
                struct dp_packet_batch tnl_pkt;

                /* Clone when we do not own the packets; the clone is
                 * consumed by the recirculation below. */
                if (!should_steal) {
                    dp_packet_batch_clone(&tnl_pkt, packets_);
                    packets_ = &tnl_pkt;
                    dp_packet_batch_reset_cutlen(orig_packets_);
                }

                dp_packet_batch_apply_cutlen(packets_);

                /* netdev_pop_header() may drop malformed packets; account
                 * for the difference. */
                packet_count = dp_packet_batch_size(packets_);
                netdev_pop_header(p->port->netdev, packets_);
                packets_dropped =
                   packet_count - dp_packet_batch_size(packets_);
                if (packets_dropped) {
                    COVERAGE_ADD(datapath_drop_tunnel_pop_error,
                                 packets_dropped);
                }
                if (dp_packet_batch_is_empty(packets_)) {
                    return;
                }

                /* Decapsulated packets now logically arrive on the tunnel
                 * port. */
                struct dp_packet *packet;
                DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
                    packet->md.in_port.odp_port = portno;
                }

                (*depth)++;
                dp_netdev_recirculate(pmd, packets_);
                (*depth)--;
                return;
            }
            COVERAGE_ADD(datapath_drop_invalid_tnl_port,
                         dp_packet_batch_size(packets_));
        } else {
            COVERAGE_ADD(datapath_drop_recirc_error,
                         dp_packet_batch_size(packets_));
        }
        break;

    case OVS_ACTION_ATTR_USERSPACE:
        /* Upcalls may be disabled; tryrdlock failing means "drop". */
        if (!fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
            struct dp_packet_batch *orig_packets_ = packets_;
            const struct nlattr *userdata;
            struct dp_packet_batch usr_pkt;
            struct ofpbuf actions;
            struct flow flow;
            ovs_u128 ufid;
            bool clone = false;

            userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
            ofpbuf_init(&actions, 0);

            /* Truncation requires modifying packet data, so clone when we
             * do not own the packets. */
            if (packets_->trunc) {
                if (!should_steal) {
                    dp_packet_batch_clone(&usr_pkt, packets_);
                    packets_ = &usr_pkt;
                    clone = true;
                    dp_packet_batch_reset_cutlen(orig_packets_);
                }

                dp_packet_batch_apply_cutlen(packets_);
            }

            /* One upcall per packet; 'actions' is reused as scratch. */
            struct dp_packet *packet;
            DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
                flow_extract(packet, &flow);
                odp_flow_key_hash(&flow, sizeof flow, &ufid);
                dp_execute_userspace_action(pmd, packet, should_steal, &flow,
                                            &ufid, &actions, userdata);
            }

            if (clone) {
                dp_packet_delete_batch(packets_, true);
            }

            ofpbuf_uninit(&actions);
            fat_rwlock_unlock(&dp->upcall_rwlock);

            return;
        }
        COVERAGE_ADD(datapath_drop_lock_error,
                     dp_packet_batch_size(packets_));
        break;

    case OVS_ACTION_ATTR_RECIRC:
        if (*depth < MAX_RECIRC_DEPTH) {
            struct dp_packet_batch recirc_pkts;

            if (!should_steal) {
               dp_packet_batch_clone(&recirc_pkts, packets_);
               packets_ = &recirc_pkts;
            }

            struct dp_packet *packet;
            DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
                packet->md.recirc_id = nl_attr_get_u32(a);
            }

            (*depth)++;
            dp_netdev_recirculate(pmd, packets_);
            (*depth)--;

            return;
        }

        COVERAGE_ADD(datapath_drop_recirc_error,
                     dp_packet_batch_size(packets_));
        VLOG_WARN("Packet dropped. Max recirculation depth exceeded.");
        break;

    case OVS_ACTION_ATTR_CT: {
        const struct nlattr *b;
        bool force = false;
        bool commit = false;
        unsigned int left;
        uint16_t zone = 0;
        uint32_t tp_id = 0;
        const char *helper = NULL;
        const uint32_t *setmark = NULL;
        const struct ovs_key_ct_labels *setlabel = NULL;
        struct nat_action_info_t nat_action_info;
        struct nat_action_info_t *nat_action_info_ref = NULL;
        bool nat_config = false;

        /* Parse the nested CT sub-attributes into locals, then hand the
         * whole batch to conntrack_execute() below. */
        NL_ATTR_FOR_EACH_UNSAFE (b, left, nl_attr_get(a),
                                 nl_attr_get_size(a)) {
            enum ovs_ct_attr sub_type = nl_attr_type(b);

            switch(sub_type) {
            case OVS_CT_ATTR_FORCE_COMMIT:
                force = true;
                /* fall through. */
            case OVS_CT_ATTR_COMMIT:
                commit = true;
                break;
            case OVS_CT_ATTR_ZONE:
                zone = nl_attr_get_u16(b);
                break;
            case OVS_CT_ATTR_HELPER:
                helper = nl_attr_get_string(b);
                break;
            case OVS_CT_ATTR_MARK:
                setmark = nl_attr_get(b);
                break;
            case OVS_CT_ATTR_LABELS:
                setlabel = nl_attr_get(b);
                break;
            case OVS_CT_ATTR_EVENTMASK:
                /* Silently ignored, as userspace datapath does not generate
                 * netlink events. */
                break;
            case OVS_CT_ATTR_TIMEOUT:
                if (!str_to_uint(nl_attr_get_string(b), 10, &tp_id)) {
                    VLOG_WARN("Invalid Timeout Policy ID: %s.",
                              nl_attr_get_string(b));
                    tp_id = DEFAULT_TP_ID;
                }
                break;
            case OVS_CT_ATTR_NAT: {
                const struct nlattr *b_nest;
                unsigned int left_nest;
                bool ip_min_specified = false;
                bool proto_num_min_specified = false;
                bool ip_max_specified = false;
                bool proto_num_max_specified = false;
                memset(&nat_action_info, 0, sizeof nat_action_info);
                nat_action_info_ref = &nat_action_info;

                NL_NESTED_FOR_EACH_UNSAFE (b_nest, left_nest, b) {
                    enum ovs_nat_attr sub_type_nest = nl_attr_type(b_nest);

                    switch (sub_type_nest) {
                    case OVS_NAT_ATTR_SRC:
                    case OVS_NAT_ATTR_DST:
                        nat_config = true;
                        nat_action_info.nat_action |=
                            ((sub_type_nest == OVS_NAT_ATTR_SRC)
                                ? NAT_ACTION_SRC : NAT_ACTION_DST);
                        break;
                    case OVS_NAT_ATTR_IP_MIN:
                        memcpy(&nat_action_info.min_addr,
                               nl_attr_get(b_nest),
                               nl_attr_get_size(b_nest));
                        ip_min_specified = true;
                        break;
                    case OVS_NAT_ATTR_IP_MAX:
                        memcpy(&nat_action_info.max_addr,
                               nl_attr_get(b_nest),
                               nl_attr_get_size(b_nest));
                        ip_max_specified = true;
                        break;
                    case OVS_NAT_ATTR_PROTO_MIN:
                        nat_action_info.min_port =
                            nl_attr_get_u16(b_nest);
                        proto_num_min_specified = true;
                        break;
                    case OVS_NAT_ATTR_PROTO_MAX:
                        nat_action_info.max_port =
                            nl_attr_get_u16(b_nest);
                        proto_num_max_specified = true;
                        break;
                    case OVS_NAT_ATTR_PROTO_RANDOM:
                        nat_action_info.nat_flags |= NAT_RANGE_RANDOM;
                        break;
                    case OVS_NAT_ATTR_PERSISTENT:
                        nat_action_info.nat_flags |= NAT_PERSISTENT;
                        break;
                    case OVS_NAT_ATTR_PROTO_HASH:
                        break;
                    case OVS_NAT_ATTR_UNSPEC:
                    case __OVS_NAT_ATTR_MAX:
                        OVS_NOT_REACHED();
                    }
                }

                /* A missing max collapses the range to the min value. */
                if (ip_min_specified && !ip_max_specified) {
                    nat_action_info.max_addr = nat_action_info.min_addr;
                }
                if (proto_num_min_specified && !proto_num_max_specified) {
                    nat_action_info.max_port = nat_action_info.min_port;
                }
                if (proto_num_min_specified || proto_num_max_specified) {
                    if (nat_action_info.nat_action & NAT_ACTION_SRC) {
                        nat_action_info.nat_action |= NAT_ACTION_SRC_PORT;
                    } else if (nat_action_info.nat_action & NAT_ACTION_DST) {
                        nat_action_info.nat_action |= NAT_ACTION_DST_PORT;
                    }
                }
                break;
            }
            case OVS_CT_ATTR_UNSPEC:
            case __OVS_CT_ATTR_MAX:
                OVS_NOT_REACHED();
            }
        }

        /* We won't be able to function properly in this case, hence
         * complain loudly. */
        if (nat_config && !commit) {
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
            VLOG_WARN_RL(&rl, "NAT specified without commit.");
        }

        conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force,
                          commit, zone, setmark, setlabel, helper,
                          nat_action_info_ref, pmd->ctx.now / 1000, tp_id);
        break;
    }

    case OVS_ACTION_ATTR_METER:
        dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a),
                            pmd->ctx.now / 1000);
        break;

    /* The remaining actions are handled generically by odp-execute and
     * must never reach this callback. */
    case OVS_ACTION_ATTR_PUSH_VLAN:
    case OVS_ACTION_ATTR_POP_VLAN:
    case OVS_ACTION_ATTR_PUSH_MPLS:
    case OVS_ACTION_ATTR_POP_MPLS:
    case OVS_ACTION_ATTR_SET:
    case OVS_ACTION_ATTR_SET_MASKED:
    case OVS_ACTION_ATTR_SAMPLE:
    case OVS_ACTION_ATTR_HASH:
    case OVS_ACTION_ATTR_UNSPEC:
    case OVS_ACTION_ATTR_TRUNC:
    case OVS_ACTION_ATTR_PUSH_ETH:
    case OVS_ACTION_ATTR_POP_ETH:
    case OVS_ACTION_ATTR_CLONE:
    case OVS_ACTION_ATTR_PUSH_NSH:
    case OVS_ACTION_ATTR_POP_NSH:
    case OVS_ACTION_ATTR_CT_CLEAR:
    case OVS_ACTION_ATTR_CHECK_PKT_LEN:
    case OVS_ACTION_ATTR_DROP:
    case OVS_ACTION_ATTR_ADD_MPLS:
    case OVS_ACTION_ATTR_DEC_TTL:
    case OVS_ACTION_ATTR_PSAMPLE:
    case __OVS_ACTION_ATTR_MAX:
        OVS_NOT_REACHED();
    }

    /* Frees the batch if we own it; no-op otherwise. */
    dp_packet_delete_batch(packets_, should_steal);
}
8803
8804
static void
8805
dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
8806
                          struct dp_packet_batch *packets,
8807
                          bool should_steal, const struct flow *flow,
8808
                          const struct nlattr *actions, size_t actions_len)
8809
0
{
8810
0
    struct dp_netdev_execute_aux aux = { pmd, flow };
8811
8812
0
    odp_execute_actions(&aux, packets, should_steal, actions,
8813
0
                        actions_len, dp_execute_cb);
8814
0
}
8815
8816
/* State for an in-progress conntrack dump over the dpif interface. */
struct dp_netdev_ct_dump {
    struct ct_dpif_dump_state up;  /* Base class; handed back to callers. */
    struct conntrack_dump dump;    /* Underlying conntrack iterator. */
    struct conntrack *ct;          /* Connection tracker being dumped. */
    struct dp_netdev *dp;          /* Owning datapath. */
};
8822
8823
static int
8824
dpif_netdev_ct_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump_,
8825
                          const uint16_t *pzone, int *ptot_bkts)
8826
0
{
8827
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8828
0
    struct dp_netdev_ct_dump *dump;
8829
8830
0
    dump = xzalloc(sizeof *dump);
8831
0
    dump->dp = dp;
8832
0
    dump->ct = dp->conntrack;
8833
8834
0
    conntrack_dump_start(dp->conntrack, &dump->dump, pzone, ptot_bkts);
8835
8836
0
    *dump_ = &dump->up;
8837
8838
0
    return 0;
8839
0
}
8840
8841
static int
8842
dpif_netdev_ct_dump_next(struct dpif *dpif OVS_UNUSED,
8843
                         struct ct_dpif_dump_state *dump_,
8844
                         struct ct_dpif_entry *entry)
8845
0
{
8846
0
    struct dp_netdev_ct_dump *dump;
8847
8848
0
    INIT_CONTAINER(dump, dump_, up);
8849
8850
0
    return conntrack_dump_next(&dump->dump, entry);
8851
0
}
8852
8853
static int
8854
dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED,
8855
                         struct ct_dpif_dump_state *dump_)
8856
0
{
8857
0
    struct dp_netdev_ct_dump *dump;
8858
0
    int err;
8859
8860
0
    INIT_CONTAINER(dump, dump_, up);
8861
8862
0
    err = conntrack_dump_done(&dump->dump);
8863
8864
0
    free(dump);
8865
8866
0
    return err;
8867
0
}
8868
8869
static int
8870
dpif_netdev_ct_exp_dump_start(struct dpif *dpif,
8871
                              struct ct_dpif_dump_state **dump_,
8872
                              const uint16_t *pzone)
8873
0
{
8874
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8875
0
    struct dp_netdev_ct_dump *dump;
8876
8877
0
    dump = xzalloc(sizeof *dump);
8878
0
    dump->dp = dp;
8879
0
    dump->ct = dp->conntrack;
8880
8881
0
    conntrack_exp_dump_start(dp->conntrack, &dump->dump, pzone);
8882
8883
0
    *dump_ = &dump->up;
8884
8885
0
    return 0;
8886
0
}
8887
8888
static int
8889
dpif_netdev_ct_exp_dump_next(struct dpif *dpif OVS_UNUSED,
8890
                             struct ct_dpif_dump_state *dump_,
8891
                             struct ct_dpif_exp *entry)
8892
0
{
8893
0
    struct dp_netdev_ct_dump *dump;
8894
8895
0
    INIT_CONTAINER(dump, dump_, up);
8896
8897
0
    return conntrack_exp_dump_next(&dump->dump, entry);
8898
0
}
8899
8900
static int
8901
dpif_netdev_ct_exp_dump_done(struct dpif *dpif OVS_UNUSED,
8902
                             struct ct_dpif_dump_state *dump_)
8903
0
{
8904
0
    struct dp_netdev_ct_dump *dump;
8905
0
    int err;
8906
8907
0
    INIT_CONTAINER(dump, dump_, up);
8908
8909
0
    err = conntrack_exp_dump_done(&dump->dump);
8910
8911
0
    free(dump);
8912
8913
0
    return err;
8914
0
}
8915
8916
static int
8917
dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone,
8918
                     const struct ct_dpif_tuple *tuple)
8919
0
{
8920
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8921
8922
0
    if (tuple) {
8923
0
        return conntrack_flush_tuple(dp->conntrack, tuple, zone ? *zone : 0);
8924
0
    }
8925
0
    return conntrack_flush(dp->conntrack, zone);
8926
0
}
8927
8928
static int
8929
dpif_netdev_ct_set_maxconns(struct dpif *dpif, uint32_t maxconns)
8930
0
{
8931
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8932
8933
0
    return conntrack_set_maxconns(dp->conntrack, maxconns);
8934
0
}
8935
8936
static int
8937
dpif_netdev_ct_get_maxconns(struct dpif *dpif, uint32_t *maxconns)
8938
0
{
8939
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8940
8941
0
    return conntrack_get_maxconns(dp->conntrack, maxconns);
8942
0
}
8943
8944
static int
8945
dpif_netdev_ct_get_nconns(struct dpif *dpif, uint32_t *nconns)
8946
0
{
8947
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8948
8949
0
    return conntrack_get_nconns(dp->conntrack, nconns);
8950
0
}
8951
8952
static int
8953
dpif_netdev_ct_set_tcp_seq_chk(struct dpif *dpif, bool enabled)
8954
0
{
8955
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8956
8957
0
    return conntrack_set_tcp_seq_chk(dp->conntrack, enabled);
8958
0
}
8959
8960
static int
8961
dpif_netdev_ct_get_tcp_seq_chk(struct dpif *dpif, bool *enabled)
8962
0
{
8963
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8964
0
    *enabled = conntrack_get_tcp_seq_chk(dp->conntrack);
8965
0
    return 0;
8966
0
}
8967
8968
static int
8969
dpif_netdev_ct_set_sweep_interval(struct dpif *dpif, uint32_t ms)
8970
0
{
8971
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8972
0
    return conntrack_set_sweep_interval(dp->conntrack, ms);
8973
0
}
8974
8975
static int
8976
dpif_netdev_ct_get_sweep_interval(struct dpif *dpif, uint32_t *ms)
8977
0
{
8978
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8979
0
    *ms = conntrack_get_sweep_interval(dp->conntrack);
8980
0
    return 0;
8981
0
}
8982
8983
static int
8984
dpif_netdev_ct_set_limits(struct dpif *dpif,
8985
                           const struct ovs_list *zone_limits)
8986
0
{
8987
0
    int err = 0;
8988
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
8989
8990
0
    struct ct_dpif_zone_limit *zone_limit;
8991
0
    LIST_FOR_EACH (zone_limit, node, zone_limits) {
8992
0
        err = zone_limit_update(dp->conntrack, zone_limit->zone,
8993
0
                                zone_limit->limit);
8994
0
        if (err != 0) {
8995
0
            break;
8996
0
        }
8997
0
    }
8998
0
    return err;
8999
0
}
9000
9001
static int
9002
dpif_netdev_ct_get_limits(struct dpif *dpif,
9003
                           const struct ovs_list *zone_limits_request,
9004
                           struct ovs_list *zone_limits_reply)
9005
0
{
9006
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9007
0
    struct conntrack_zone_info czl;
9008
9009
0
    if (!ovs_list_is_empty(zone_limits_request)) {
9010
0
        struct ct_dpif_zone_limit *zone_limit;
9011
0
        LIST_FOR_EACH (zone_limit, node, zone_limits_request) {
9012
0
            czl = zone_limit_get(dp->conntrack, zone_limit->zone);
9013
0
            if (czl.zone == zone_limit->zone || czl.zone == DEFAULT_ZONE) {
9014
0
                ct_dpif_push_zone_limit(zone_limits_reply, zone_limit->zone,
9015
0
                                        czl.limit,
9016
0
                                        czl.count);
9017
0
            } else {
9018
0
                return EINVAL;
9019
0
            }
9020
0
        }
9021
0
    } else {
9022
0
        czl = zone_limit_get(dp->conntrack, DEFAULT_ZONE);
9023
0
        if (czl.zone == DEFAULT_ZONE) {
9024
0
            ct_dpif_push_zone_limit(zone_limits_reply, DEFAULT_ZONE,
9025
0
                                    czl.limit, 0);
9026
0
        }
9027
9028
0
        for (int z = MIN_ZONE; z <= MAX_ZONE; z++) {
9029
0
            czl = zone_limit_get(dp->conntrack, z);
9030
0
            if (czl.zone == z) {
9031
0
                ct_dpif_push_zone_limit(zone_limits_reply, z, czl.limit,
9032
0
                                        czl.count);
9033
0
            }
9034
0
        }
9035
0
    }
9036
9037
0
    return 0;
9038
0
}
9039
9040
static int
9041
dpif_netdev_ct_del_limits(struct dpif *dpif,
9042
                           const struct ovs_list *zone_limits)
9043
0
{
9044
0
    int err = 0;
9045
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9046
0
    struct ct_dpif_zone_limit *zone_limit;
9047
0
    LIST_FOR_EACH (zone_limit, node, zone_limits) {
9048
0
        err = zone_limit_delete(dp->conntrack, zone_limit->zone);
9049
0
        if (err != 0) {
9050
0
            break;
9051
0
        }
9052
0
    }
9053
9054
0
    return err;
9055
0
}
9056
9057
static int
9058
dpif_netdev_ct_get_features(struct dpif *dpif OVS_UNUSED,
9059
                            enum ct_features *features)
9060
0
{
9061
0
    if (features != NULL) {
9062
0
        *features = CONNTRACK_F_ZERO_SNAT;
9063
0
    }
9064
0
    return 0;
9065
0
}
9066
9067
static int
9068
dpif_netdev_ct_set_timeout_policy(struct dpif *dpif,
9069
                                  const struct ct_dpif_timeout_policy *dpif_tp)
9070
0
{
9071
0
    struct timeout_policy tp;
9072
0
    struct dp_netdev *dp;
9073
9074
0
    dp = get_dp_netdev(dpif);
9075
0
    memcpy(&tp.policy, dpif_tp, sizeof tp.policy);
9076
0
    return timeout_policy_update(dp->conntrack, &tp);
9077
0
}
9078
9079
static int
9080
dpif_netdev_ct_get_timeout_policy(struct dpif *dpif, uint32_t tp_id,
9081
                                  struct ct_dpif_timeout_policy *dpif_tp)
9082
0
{
9083
0
    struct timeout_policy *tp;
9084
0
    struct dp_netdev *dp;
9085
0
    int err = 0;
9086
9087
0
    dp = get_dp_netdev(dpif);
9088
0
    tp = timeout_policy_get(dp->conntrack, tp_id);
9089
0
    if (!tp) {
9090
0
        return ENOENT;
9091
0
    }
9092
0
    memcpy(dpif_tp, &tp->policy, sizeof tp->policy);
9093
0
    return err;
9094
0
}
9095
9096
static int
9097
dpif_netdev_ct_del_timeout_policy(struct dpif *dpif,
9098
                                  uint32_t tp_id)
9099
0
{
9100
0
    struct dp_netdev *dp;
9101
0
    int err = 0;
9102
9103
0
    dp = get_dp_netdev(dpif);
9104
0
    err = timeout_policy_delete(dp->conntrack, tp_id);
9105
0
    return err;
9106
0
}
9107
9108
static int
9109
dpif_netdev_ct_get_timeout_policy_name(struct dpif *dpif OVS_UNUSED,
9110
                                       uint32_t tp_id,
9111
                                       uint16_t dl_type OVS_UNUSED,
9112
                                       uint8_t nw_proto OVS_UNUSED,
9113
                                       char **tp_name, bool *is_generic)
9114
0
{
9115
0
    struct ds ds = DS_EMPTY_INITIALIZER;
9116
9117
0
    ds_put_format(&ds, "%"PRIu32, tp_id);
9118
0
    *tp_name = ds_steal_cstr(&ds);
9119
0
    *is_generic = true;
9120
0
    return 0;
9121
0
}
9122
9123
static int
9124
dpif_netdev_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable)
9125
0
{
9126
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9127
0
    return ipf_set_enabled(conntrack_ipf_ctx(dp->conntrack), v6, enable);
9128
0
}
9129
9130
static int
9131
dpif_netdev_ipf_set_min_frag(struct dpif *dpif, bool v6, uint32_t min_frag)
9132
0
{
9133
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9134
0
    return ipf_set_min_frag(conntrack_ipf_ctx(dp->conntrack), v6, min_frag);
9135
0
}
9136
9137
static int
9138
dpif_netdev_ipf_set_max_nfrags(struct dpif *dpif, uint32_t max_frags)
9139
0
{
9140
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9141
0
    return ipf_set_max_nfrags(conntrack_ipf_ctx(dp->conntrack), max_frags);
9142
0
}
9143
9144
/* Adjust this function if 'dpif_ipf_status' and 'ipf_status' were to
9145
 * diverge. */
9146
static int
9147
dpif_netdev_ipf_get_status(struct dpif *dpif,
9148
                           struct dpif_ipf_status *dpif_ipf_status)
9149
0
{
9150
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9151
0
    ipf_get_status(conntrack_ipf_ctx(dp->conntrack),
9152
0
                   (struct ipf_status *) dpif_ipf_status);
9153
0
    return 0;
9154
0
}
9155
9156
static int
9157
dpif_netdev_ipf_dump_start(struct dpif *dpif OVS_UNUSED,
9158
                           struct ipf_dump_ctx **ipf_dump_ctx)
9159
0
{
9160
0
    return ipf_dump_start(ipf_dump_ctx);
9161
0
}
9162
9163
static int
9164
dpif_netdev_ipf_dump_next(struct dpif *dpif, void *ipf_dump_ctx, char **dump)
9165
0
{
9166
0
    struct dp_netdev *dp = get_dp_netdev(dpif);
9167
0
    return ipf_dump_next(conntrack_ipf_ctx(dp->conntrack), ipf_dump_ctx,
9168
0
                         dump);
9169
0
}
9170
9171
static int
9172
dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx)
9173
0
{
9174
0
    return ipf_dump_done(ipf_dump_ctx);
9175
9176
0
}
9177
9178
/* Adds, or replaces, the tx-bond mapping for 'bond_id' in 'dpif''s datapath
 * and pushes the new mapping to every PMD thread.  'member_map' holds one
 * member port number per hash bucket (BOND_BUCKETS entries).
 * Always returns 0. */
static int
dpif_netdev_bond_add(struct dpif *dpif, uint32_t bond_id,
                     odp_port_t *member_map)
{
    struct tx_bond *new_tx = xzalloc(sizeof *new_tx);
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_pmd_thread *pmd;

    /* Prepare new bond mapping. */
    new_tx->bond_id = bond_id;
    for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
        new_tx->member_buckets[bucket].member_id = member_map[bucket];
    }

    ovs_mutex_lock(&dp->bond_mutex);
    /* Check if bond already existed. */
    struct tx_bond *old_tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
    if (old_tx) {
        /* Swap the entry in place; the old one is freed only after an RCU
         * grace period because readers may still hold a reference. */
        cmap_replace(&dp->tx_bonds, &old_tx->node, &new_tx->node,
                     hash_bond_id(bond_id));
        ovsrcu_postpone(free, old_tx);
    } else {
        cmap_insert(&dp->tx_bonds, &new_tx->node, hash_bond_id(bond_id));
    }
    ovs_mutex_unlock(&dp->bond_mutex);

    /* Update all PMDs with new bond mapping. */
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        dp_netdev_add_bond_tx_to_pmd(pmd, new_tx, true);
    }
    return 0;
}
9210
9211
/* Removes the tx-bond mapping for 'bond_id' from 'dpif''s datapath and
 * from every PMD thread.  Returns ENOENT when no such bond exists,
 * otherwise 0. */
static int
dpif_netdev_bond_del(struct dpif *dpif, uint32_t bond_id)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_pmd_thread *pmd;
    struct tx_bond *tx;

    ovs_mutex_lock(&dp->bond_mutex);
    /* Check if bond existed. */
    tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
    if (tx) {
        cmap_remove(&dp->tx_bonds, &tx->node, hash_bond_id(bond_id));
        /* Defer the free past an RCU grace period; concurrent readers of
         * the cmap may still reference 'tx'. */
        ovsrcu_postpone(free, tx);
    } else {
        /* Bond is not present. */
        ovs_mutex_unlock(&dp->bond_mutex);
        return ENOENT;
    }
    ovs_mutex_unlock(&dp->bond_mutex);

    /* Remove the bond map in all pmds. */
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        dp_netdev_del_bond_tx_from_pmd(pmd, bond_id);
    }
    return 0;
}
9237
9238
/* Accumulates the per-bucket byte counters of bond 'bond_id' from every
 * PMD thread into 'n_bytes' (BOND_BUCKETS entries).  Returns ENOENT when
 * the bond is unknown to the datapath.
 *
 * NOTE(review): buckets are accumulated with '+=', so this assumes the
 * caller supplies a zeroed 'n_bytes' array -- confirm against callers. */
static int
dpif_netdev_bond_stats_get(struct dpif *dpif, uint32_t bond_id,
                           uint64_t *n_bytes)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_pmd_thread *pmd;

    if (!tx_bond_lookup(&dp->tx_bonds, bond_id)) {
        return ENOENT;
    }

    /* Search the bond in all PMDs. */
    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        struct tx_bond *pmd_bond_entry
            = tx_bond_lookup(&pmd->tx_bonds, bond_id);

        /* A PMD that does not carry this bond contributes nothing. */
        if (!pmd_bond_entry) {
            continue;
        }

        /* Read bond stats. */
        for (int i = 0; i < BOND_BUCKETS; i++) {
            uint64_t pmd_n_bytes;

            /* Counters are updated by the PMD thread; read them without
             * ordering guarantees, a momentary snapshot is sufficient. */
            atomic_read_relaxed(&pmd_bond_entry->member_buckets[i].n_bytes,
                                &pmd_n_bytes);
            n_bytes[i] += pmd_n_bytes;
        }
    }
    return 0;
}
9269
9270
/* The userspace datapath provider.  This is a positional initializer, so
 * the entry order must match the member order of 'struct dpif_class'
 * (dpif-provider.h); NULL marks an operation this datapath does not
 * implement. */
const struct dpif_class dpif_netdev_class = {
    "netdev",
    true,                       /* cleanup_required */
    dpif_netdev_init,
    dpif_netdev_enumerate,
    dpif_netdev_port_open_type,
    dpif_netdev_open,
    dpif_netdev_close,
    dpif_netdev_destroy,
    dpif_netdev_run,
    dpif_netdev_wait,
    dpif_netdev_get_stats,
    NULL,                      /* set_features */
    NULL,                      /* get_features */
    dpif_netdev_port_add,
    dpif_netdev_port_del,
    dpif_netdev_port_set_config,
    dpif_netdev_port_query_by_number,
    dpif_netdev_port_query_by_name,
    NULL,                       /* port_get_pid */
    dpif_netdev_port_dump_start,
    dpif_netdev_port_dump_next,
    dpif_netdev_port_dump_done,
    dpif_netdev_port_poll,
    dpif_netdev_port_poll_wait,
    dpif_netdev_flow_flush,
    dpif_netdev_flow_dump_create,
    dpif_netdev_flow_dump_destroy,
    dpif_netdev_flow_dump_thread_create,
    dpif_netdev_flow_dump_thread_destroy,
    dpif_netdev_flow_dump_next,
    dpif_netdev_operate,
    NULL,                       /* recv_set */
    NULL,                       /* handlers_set */
    dpif_netdev_number_handlers_required,
    dpif_netdev_set_config,
    dpif_netdev_queue_to_priority,
    NULL,                       /* recv */
    NULL,                       /* recv_wait */
    NULL,                       /* recv_purge */
    dpif_netdev_register_dp_purge_cb,
    dpif_netdev_register_upcall_cb,
    dpif_netdev_enable_upcall,
    dpif_netdev_disable_upcall,
    dpif_netdev_get_datapath_version,
    dpif_netdev_ct_dump_start,
    dpif_netdev_ct_dump_next,
    dpif_netdev_ct_dump_done,
    dpif_netdev_ct_exp_dump_start,
    dpif_netdev_ct_exp_dump_next,
    dpif_netdev_ct_exp_dump_done,
    dpif_netdev_ct_flush,
    dpif_netdev_ct_set_maxconns,
    dpif_netdev_ct_get_maxconns,
    dpif_netdev_ct_get_nconns,
    dpif_netdev_ct_set_tcp_seq_chk,
    dpif_netdev_ct_get_tcp_seq_chk,
    dpif_netdev_ct_set_sweep_interval,
    dpif_netdev_ct_get_sweep_interval,
    dpif_netdev_ct_set_limits,
    dpif_netdev_ct_get_limits,
    dpif_netdev_ct_del_limits,
    dpif_netdev_ct_set_timeout_policy,
    dpif_netdev_ct_get_timeout_policy,
    dpif_netdev_ct_del_timeout_policy,
    NULL,                       /* ct_timeout_policy_dump_start */
    NULL,                       /* ct_timeout_policy_dump_next */
    NULL,                       /* ct_timeout_policy_dump_done */
    dpif_netdev_ct_get_timeout_policy_name,
    dpif_netdev_ct_get_features,
    dpif_netdev_ipf_set_enabled,
    dpif_netdev_ipf_set_min_frag,
    dpif_netdev_ipf_set_max_nfrags,
    dpif_netdev_ipf_get_status,
    dpif_netdev_ipf_dump_start,
    dpif_netdev_ipf_dump_next,
    dpif_netdev_ipf_dump_done,
    dpif_netdev_meter_get_features,
    dpif_netdev_meter_set,
    dpif_netdev_meter_get,
    dpif_netdev_meter_del,
    dpif_netdev_bond_add,
    dpif_netdev_bond_del,
    dpif_netdev_bond_stats_get,
    NULL,                       /* cache_get_supported_levels */
    NULL,                       /* cache_get_name */
    NULL,                       /* cache_get_size */
    NULL,                       /* cache_set_size */
};
9359
9360
/* unixctl handler for "dpif-dummy/change-port-number": renumbers port
 * argv[2] of dummy datapath argv[1] to port number argv[3].  Replies with
 * an error string when the datapath is unknown/not a dummy, the port name
 * is unknown, the number is invalid, or the number is already taken. */
static void
dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
                              const char *argv[], void *aux OVS_UNUSED)
{
    struct dp_netdev_port *port;
    struct dp_netdev *dp;
    odp_port_t port_no;

    ovs_mutex_lock(&dp_netdev_mutex);
    dp = shash_find_data(&dp_netdevs, argv[1]);
    if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
        ovs_mutex_unlock(&dp_netdev_mutex);
        unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
        return;
    }
    /* Take a reference so 'dp' stays alive after dropping the global
     * mutex below. */
    ovs_refcount_ref(&dp->ref_cnt);
    ovs_mutex_unlock(&dp_netdev_mutex);

    ovs_rwlock_wrlock(&dp->port_rwlock);
    if (get_port_by_name(dp, argv[2], &port)) {
        unixctl_command_reply_error(conn, "unknown port");
        goto exit;
    }

    port_no = u32_to_odp(atoi(argv[3]));
    if (!port_no || port_no == ODPP_NONE) {
        unixctl_command_reply_error(conn, "bad port number");
        goto exit;
    }
    if (dp_netdev_lookup_port(dp, port_no)) {
        unixctl_command_reply_error(conn, "port number already in use");
        goto exit;
    }

    /* Remove port. */
    hmap_remove(&dp->ports, &port->node);
    reconfigure_datapath(dp);

    /* Reinsert with new port number. */
    port->port_no = port_no;
    hmap_insert(&dp->ports, &port->node, hash_port_no(port_no));
    reconfigure_datapath(dp);

    seq_change(dp->port_seq);
    unixctl_command_reply(conn, NULL);

exit:
    ovs_rwlock_unlock(&dp->port_rwlock);
    dp_netdev_unref(dp);
}
9410
9411
static void
9412
dpif_dummy_register__(const char *type)
9413
0
{
9414
0
    struct dpif_class *class;
9415
9416
0
    class = xmalloc(sizeof *class);
9417
0
    *class = dpif_netdev_class;
9418
0
    class->type = xstrdup(type);
9419
0
    dp_register_provider(class);
9420
0
}
9421
9422
/* Replaces the existing provider for 'type' with a dummy clone.
 * EAFNOSUPPORT from unregistering is tolerated so that
 * --enable-dummy=system keeps working on a userland-only build; this is
 * useful for the testsuite. */
static void
dpif_dummy_override(const char *type)
{
    int error = dp_unregister_provider(type);

    if (!error || error == EAFNOSUPPORT) {
        dpif_dummy_register__(type);
    }
}
9436
9437
void
9438
dpif_dummy_register(enum dummy_level level)
9439
0
{
9440
0
    if (level == DUMMY_OVERRIDE_ALL) {
9441
0
        struct sset types;
9442
0
        const char *type;
9443
9444
0
        sset_init(&types);
9445
0
        dp_enumerate_types(&types);
9446
0
        SSET_FOR_EACH (type, &types) {
9447
0
            dpif_dummy_override(type);
9448
0
        }
9449
0
        sset_destroy(&types);
9450
0
    } else if (level == DUMMY_OVERRIDE_SYSTEM) {
9451
0
        dpif_dummy_override("system");
9452
0
    }
9453
9454
0
    dpif_dummy_register__("dummy");
9455
9456
0
    unixctl_command_register("dpif-dummy/change-port-number",
9457
0
                             "dp port new-number",
9458
0
                             3, 3, dpif_dummy_change_port_number, NULL);
9459
0
}
9460

9461
/* Datapath Classifier. */
9462
9463
/* RCU callback that finishes destruction of 'subtable' once no reader can
 * still reference it: destroys the rules cmap, then defers freeing of the
 * mask array and the subtable itself to a further grace period. */
static void
dpcls_subtable_destroy_cb(struct dpcls_subtable *subtable)
{
    cmap_destroy(&subtable->rules);
    ovsrcu_postpone(free, subtable->mf_masks);
    ovsrcu_postpone(free, subtable);
}
9470
9471
/* Initializes 'cls' as a classifier that initially contains no classification
9472
 * rules. */
9473
static void
9474
dpcls_init(struct dpcls *cls)
9475
0
{
9476
0
    cmap_init(&cls->subtables_map);
9477
0
    pvector_init(&cls->subtables);
9478
0
}
9479
9480
/* Unlinks 'subtable' from 'cls' and schedules its destruction.  The
 * subtable is removed from both the priority vector and the cmap before
 * the RCU-deferred teardown, and its lookup-implementation usage counter
 * is dropped. */
static void
dpcls_destroy_subtable(struct dpcls *cls, struct dpcls_subtable *subtable)
{
    VLOG_DBG("Destroying subtable %p for in_port %d", subtable, cls->in_port);
    pvector_remove(&cls->subtables, subtable);
    cmap_remove(&cls->subtables_map, &subtable->cmap_node,
                subtable->mask.hash);
    dpcls_info_dec_usage(subtable->lookup_func_info);
    /* Actual freeing happens after an RCU grace period; readers may still
     * be traversing this subtable. */
    ovsrcu_postpone(dpcls_subtable_destroy_cb, subtable);
}
9490
9491
/* Destroys 'cls'.  Rules within 'cls', if any, are not freed; this is the
9492
 * caller's responsibility.
9493
 * May only be called after all the readers have been terminated. */
9494
static void
9495
dpcls_destroy(struct dpcls *cls)
9496
0
{
9497
0
    if (cls) {
9498
0
        struct dpcls_subtable *subtable;
9499
9500
0
        CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
9501
0
            ovs_assert(cmap_count(&subtable->rules) == 0);
9502
0
            dpcls_destroy_subtable(cls, subtable);
9503
0
        }
9504
0
        cmap_destroy(&cls->subtables_map);
9505
0
        pvector_destroy(&cls->subtables);
9506
0
    }
9507
0
}
9508
9509
/* Creates a new subtable in 'cls' for rules matching under 'mask' and
 * returns it.  The subtable is sized so its embedded mask key holds
 * exactly 'mask->len' bytes of miniflow data. */
static struct dpcls_subtable *
dpcls_create_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
{
    struct dpcls_subtable *subtable;

    /* Need to add one. */
    subtable = xmalloc(sizeof *subtable
                       - sizeof subtable->mask.mf + mask->len);
    cmap_init(&subtable->rules);
    subtable->hit_cnt = 0;
    netdev_flow_key_clone(&subtable->mask, mask);

    /* The count of bits in the mask defines the space required for masks.
     * Then call gen_masks() to create the appropriate masks, avoiding the cost
     * of doing runtime calculations. */
    uint32_t unit0 = count_1bits(mask->mf.map.bits[0]);
    uint32_t unit1 = count_1bits(mask->mf.map.bits[1]);
    subtable->mf_bits_set_unit0 = unit0;
    subtable->mf_bits_set_unit1 = unit1;
    subtable->mf_masks = xmalloc(sizeof(uint64_t) * (unit0 + unit1));
    dpcls_flow_key_gen_masks(mask, subtable->mf_masks, unit0, unit1);

    /* Get the preferred subtable search function for this (u0,u1) subtable.
     * The function is guaranteed to always return a valid implementation, and
     * possibly an ISA optimized, and/or specialized implementation. Initialize
     * the subtable search function atomically to avoid garbage data being read
     * by the PMD thread.
     */
    atomic_init(&subtable->lookup_func,
                dpcls_subtable_get_best_impl(unit0, unit1,
                                             &subtable->lookup_func_info));
    dpcls_info_inc_usage(subtable->lookup_func_info);

    cmap_insert(&cls->subtables_map, &subtable->cmap_node, mask->hash);
    /* Add the new subtable at the end of the pvector (with no hits yet) */
    pvector_insert(&cls->subtables, subtable, 0);
    VLOG_DBG("Creating %"PRIuSIZE". subtable %p for in_port %d",
             cmap_count(&cls->subtables_map), subtable, cls->in_port);
    pvector_publish(&cls->subtables);

    return subtable;
}
9551
9552
static inline struct dpcls_subtable *
9553
dpcls_find_subtable(struct dpcls *cls, const struct netdev_flow_key *mask)
9554
0
{
9555
0
    struct dpcls_subtable *subtable;
9556
9557
0
    CMAP_FOR_EACH_WITH_HASH (subtable, cmap_node, mask->hash,
9558
0
                             &cls->subtables_map) {
9559
0
        if (netdev_flow_key_equal(&subtable->mask, mask)) {
9560
0
            return subtable;
9561
0
        }
9562
0
    }
9563
0
    return dpcls_create_subtable(cls, mask);
9564
0
}
9565
9566
/* Checks for the best available implementation for each subtable lookup
9567
 * function, and assigns it as the lookup function pointer for each subtable.
9568
 * Returns the number of subtables that have changed lookup implementation.
9569
 * This function requires holding a flow_mutex when called. This is to make
9570
 * sure modifications done by this function are not overwritten. This could
9571
 * happen if dpcls_sort_subtable_vector() is called at the same time as this
9572
 * function.
9573
 */
9574
static uint32_t
dpcls_subtable_lookup_reprobe(struct dpcls *cls)
{
    struct pvector *pvec = &cls->subtables;
    uint32_t subtables_changed = 0;
    struct dpcls_subtable *subtable = NULL;

    PVECTOR_FOR_EACH (subtable, pvec) {
        uint32_t u0_bits = subtable->mf_bits_set_unit0;
        uint32_t u1_bits = subtable->mf_bits_set_unit1;
        /* Remember the current function and info so we can detect whether
         * the reprobe picked a different implementation. */
        void *old_func = subtable->lookup_func;
        struct dpcls_subtable_lookup_info_t *old_info;
        old_info = subtable->lookup_func_info;
        /* Set the subtable lookup function atomically to avoid garbage data
         * being read by the PMD thread. */
        atomic_store_relaxed(&subtable->lookup_func,
                dpcls_subtable_get_best_impl(u0_bits, u1_bits,
                                             &subtable->lookup_func_info));
        if (old_func != subtable->lookup_func) {
            subtables_changed += 1;
        }

        if (old_info != subtable->lookup_func_info) {
            /* In theory, functions can be shared between implementations, so
             * do an explicit check on the function info structures. */
            dpcls_info_dec_usage(old_info);
            dpcls_info_inc_usage(subtable->lookup_func_info);
        }
    }

    return subtables_changed;
}
9606
9607
/* Periodically sort the dpcls subtable vectors according to hit counts */
9608
static void
9609
dpcls_sort_subtable_vector(struct dpcls *cls)
9610
0
{
9611
0
    struct pvector *pvec = &cls->subtables;
9612
0
    struct dpcls_subtable *subtable;
9613
9614
0
    PVECTOR_FOR_EACH (subtable, pvec) {
9615
0
        pvector_change_priority(pvec, subtable, subtable->hit_cnt);
9616
0
        subtable->hit_cnt = 0;
9617
0
    }
9618
0
    pvector_publish(pvec);
9619
0
}
9620
9621
/* Periodic PMD housekeeping, called from the PMD main loop: once per
 * cycle-storage interval it snapshots idle/busy/sleep cycle counters
 * (feeding the auto load balancer's overload detection) and records
 * per-rxq processing cycles; once per optimization interval it re-sorts
 * every dpcls subtable vector by hit count. */
static inline void
dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
                           struct polled_queue *poll_list, int poll_cnt)
{
    struct dpcls *cls;
    uint64_t tot_idle = 0, tot_proc = 0, tot_sleep = 0;
    unsigned int pmd_load = 0;

    if (pmd->ctx.now > pmd->next_cycle_store) {
        uint64_t curr_tsc;
        uint8_t rebalance_load_trigger;
        struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb;
        unsigned int idx;

        /* Only compute deltas when the counters have not gone backwards
         * (e.g. after a stats reset). */
        if (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >=
                pmd->prev_stats[PMD_CYCLES_ITER_IDLE] &&
            pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >=
                pmd->prev_stats[PMD_CYCLES_ITER_BUSY]) {
            tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
                       pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
            tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
                       pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
            tot_sleep = pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP] -
                        pmd->prev_stats[PMD_CYCLES_SLEEP];

            if (pmd_alb->is_enabled && !pmd->isolated) {
                /* Busy cycles as a percentage of the whole interval. */
                if (tot_proc) {
                    pmd_load = ((tot_proc * 100) /
                                    (tot_idle + tot_proc + tot_sleep));
                }

                atomic_read_relaxed(&pmd_alb->rebalance_load_thresh,
                                    &rebalance_load_trigger);
                /* Track consecutive overloaded intervals; reset on any
                 * interval below the trigger. */
                if (pmd_load >= rebalance_load_trigger) {
                    atomic_count_inc(&pmd->pmd_overloaded);
                } else {
                    atomic_count_set(&pmd->pmd_overloaded, 0);
                }
            }
        }

        pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
        pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
        pmd->prev_stats[PMD_CYCLES_SLEEP] =
                        pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP];

        /* Get the cycles that were used to process each queue and store. */
        for (unsigned i = 0; i < poll_cnt; i++) {
            uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
                                                        RXQ_CYCLES_PROC_CURR);
            dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
            dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
                                     0);
        }
        curr_tsc = cycles_counter_update(&pmd->perf_stats);
        if (pmd->intrvl_tsc_prev) {
            /* There is a prev timestamp, store a new intrvl cycle count. */
            atomic_store_relaxed(&pmd->intrvl_cycles,
                                 curr_tsc - pmd->intrvl_tsc_prev);
        }
        /* Advance the circular interval index and record this interval's
         * busy cycles. */
        idx = atomic_count_inc(&pmd->intrvl_idx) % PMD_INTERVAL_MAX;
        atomic_store_relaxed(&pmd->busy_cycles_intrvl[idx], tot_proc);
        pmd->intrvl_tsc_prev = curr_tsc;
        /* Start new measuring interval */
        pmd->next_cycle_store = pmd->ctx.now + PMD_INTERVAL_LEN;
    }

    if (pmd->ctx.now > pmd->next_optimization) {
        /* Try to obtain the flow lock to block out revalidator threads.
         * If not possible, just try next time. */
        if (!ovs_mutex_trylock(&pmd->flow_mutex)) {
            /* Optimize each classifier */
            CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
                dpcls_sort_subtable_vector(cls);
            }
            ovs_mutex_unlock(&pmd->flow_mutex);
            /* Start new measuring interval */
            pmd->next_optimization = pmd->ctx.now
                                     + DPCLS_OPTIMIZATION_INTERVAL;
        }
    }
}
9705
9706
/* Returns the sum of a specified number of newest to
9707
 * oldest interval values. 'cur_idx' is where the next
9708
 * write will be and wrap around needs to be handled.
9709
 */
9710
static uint64_t
9711
get_interval_values(atomic_ullong *source, atomic_count *cur_idx,
9712
0
                    int num_to_read) {
9713
0
    unsigned int i;
9714
0
    uint64_t total = 0;
9715
9716
0
    i = atomic_count_get(cur_idx) % PMD_INTERVAL_MAX;
9717
0
    for (int read = 0; read < num_to_read; read++) {
9718
0
        uint64_t interval_value;
9719
9720
0
        i = i ? i - 1 : PMD_INTERVAL_MAX - 1;
9721
0
        atomic_read_relaxed(&source[i], &interval_value);
9722
0
        total += interval_value;
9723
0
    }
9724
0
    return total;
9725
0
}
9726
9727
/* Insert 'rule' into 'cls'. */
9728
static void
dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule,
             const struct netdev_flow_key *mask)
{
    /* Find (or create) the subtable whose mask matches 'mask'. */
    struct dpcls_subtable *subtable = dpcls_find_subtable(cls, mask);

    /* Refer to subtable's mask, also for later removal. */
    rule->mask = &subtable->mask;
    /* Publishing into the cmap makes the rule visible to lookups. */
    cmap_insert(&subtable->rules, &rule->cmap_node, rule->flow.hash);
}
9738
9739
/* Removes 'rule' from 'cls', also destructing the 'rule'. */
9740
static void
dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
{
    struct dpcls_subtable *subtable;

    /* The rule must have been inserted (dpcls_insert sets 'mask'). */
    ovs_assert(rule->mask);

    /* Get subtable from reference in rule->mask. */
    INIT_CONTAINER(subtable, rule->mask, mask);
    /* cmap_remove() returns the remaining rule count. */
    if (cmap_remove(&subtable->rules, &rule->cmap_node, rule->flow.hash)
        == 0) {
        /* Delete empty subtable. */
        dpcls_destroy_subtable(cls, subtable);
        pvector_publish(&cls->subtables);
    }
}
9756
9757
/* Inner loop for mask generation of a unit, see dpcls_flow_key_gen_masks. */
9758
/* Inner loop for mask generation of a unit, see dpcls_flow_key_gen_masks.
 * For each of the 'count' set bits of 'iter' (lowest first), stores a mask
 * covering all bits below that bit into mf_masks[]. */
static inline void
dpcls_flow_key_gen_mask_unit(uint64_t iter, const uint64_t count,
                             uint64_t *mf_masks)
{
    for (uint64_t i = 0; i < count; i++) {
        uint64_t lowest_bit = iter & -iter;   /* Isolate lowest set bit. */

        mf_masks[i] = lowest_bit - 1;         /* All bits below it. */
        iter &= ~lowest_bit;
    }
    /* Checks that count has covered all bits in the iter bitmap. */
    ovs_assert(iter == 0);
}
9771
9772
/* Generate a mask for each block in the miniflow, based on the bits set. This
9773
 * allows easily masking packets with the generated array here, without
9774
 * calculations. This replaces runtime-calculating the masks.
9775
 * @param key The table to generate the mf_masks for
9776
 * @param mf_masks Pointer to a u64 array of at least *mf_bits* in size
9777
 * @param mf_bits_total Number of bits set in the whole miniflow (both units)
9778
 * @param mf_bits_unit0 Number of bits set in unit0 of the miniflow
9779
 */
9780
void
9781
dpcls_flow_key_gen_masks(const struct netdev_flow_key *tbl,
9782
                         uint64_t *mf_masks,
9783
                         const uint32_t mf_bits_u0,
9784
                         const uint32_t mf_bits_u1)
9785
0
{
9786
0
    uint64_t iter_u0 = tbl->mf.map.bits[0];
9787
0
    uint64_t iter_u1 = tbl->mf.map.bits[1];
9788
9789
0
    dpcls_flow_key_gen_mask_unit(iter_u0, mf_bits_u0, &mf_masks[0]);
9790
0
    dpcls_flow_key_gen_mask_unit(iter_u1, mf_bits_u1, &mf_masks[mf_bits_u0]);
9791
0
}
9792
9793
/* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
9794
 * in 'mask' the values in 'key' and 'target' are the same. */
9795
inline bool
dpcls_rule_matches_key(const struct dpcls_rule *rule,
                       const struct netdev_flow_key *target)
{
    /* The rule's key and mask miniflows have identical maps, so their
     * value arrays advance in lockstep. */
    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
    uint64_t value;

    /* Iterate the 'target' values selected by the rule's flowmap;
     * mismatch on any masked block means no match. */
    NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
        if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
            return false;
        }
    }
    return true;
}
9810
9811
/* For each miniflow in 'keys' performs a classifier lookup writing the result
9812
 * into the corresponding slot in 'rules'.  If a particular entry in 'keys' is
9813
 * NULL it is skipped.
9814
 *
9815
 * This function is optimized for use in the userspace datapath and therefore
9816
 * does not implement a lot of features available in the standard
9817
 * classifier_lookup() function.  Specifically, it does not implement
9818
 * priorities, instead returning any rule which matches the flow.
9819
 *
9820
 * Returns true if all miniflows found a corresponding rule. */
9821
bool
9822
dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
9823
             struct dpcls_rule **rules, const size_t cnt,
9824
             int *num_lookups_p)
9825
0
{
9826
    /* The received 'cnt' miniflows are the search-keys that will be processed
9827
     * to find a matching entry into the available subtables.
9828
     * The number of bits in map_type is equal to NETDEV_MAX_BURST. */
9829
0
#define MAP_BITS (sizeof(uint32_t) * CHAR_BIT)
9830
0
    BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);
9831
9832
0
    struct dpcls_subtable *subtable;
9833
0
    uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */
9834
9835
0
    if (cnt != MAP_BITS) {
9836
0
        keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
9837
0
    }
9838
0
    memset(rules, 0, cnt * sizeof *rules);
9839
9840
0
    int lookups_match = 0, subtable_pos = 1;
9841
0
    uint32_t found_map;
9842
9843
    /* The Datapath classifier - aka dpcls - is composed of subtables.
9844
     * Subtables are dynamically created as needed when new rules are inserted.
9845
     * Each subtable collects rules with matches on a specific subset of packet
9846
     * fields as defined by the subtable's mask.  We proceed to process every
9847
     * search-key against each subtable, but when a match is found for a
9848
     * search-key, the search for that key can stop because the rules are
9849
     * non-overlapping. */
9850
0
    PVECTOR_FOR_EACH (subtable, &cls->subtables) {
9851
        /* Call the subtable specific lookup function. */
9852
0
        found_map = subtable->lookup_func(subtable, keys_map, keys, rules);
9853
9854
        /* Count the number of subtables searched for this packet match. This
9855
         * estimates the "spread" of subtables looked at per matched packet. */
9856
0
        uint32_t pkts_matched = count_1bits(found_map);
9857
0
        lookups_match += pkts_matched * subtable_pos;
9858
9859
        /* Clear the found rules, and return early if all packets are found. */
9860
0
        keys_map &= ~found_map;
9861
0
        if (!keys_map) {
9862
0
            if (num_lookups_p) {
9863
0
                *num_lookups_p = lookups_match;
9864
0
            }
9865
0
            return true;
9866
0
        }
9867
0
        subtable_pos++;
9868
0
    }
9869
9870
0
    if (num_lookups_p) {
9871
0
        *num_lookups_p = lookups_match;
9872
0
    }
9873
    return false;
9874
0
}