Coverage Report

Created: 2026-02-09 06:06

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/openvswitch/lib/netdev-linux.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at:
7
 *
8
 *     http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16
17
#include <config.h>
18
19
#include "netdev-linux.h"
20
#include "netdev-linux-private.h"
21
22
#include <errno.h>
23
#include <fcntl.h>
24
#include <sys/types.h>
25
#include <netinet/in.h>
26
#include <arpa/inet.h>
27
#include <inttypes.h>
28
#include <math.h>
29
#include <linux/filter.h>
30
#include <linux/gen_stats.h>
31
#include <linux/if_ether.h>
32
#include <linux/if_packet.h>
33
#include <linux/if_tun.h>
34
#include <linux/types.h>
35
#include <linux/ethtool.h>
36
#include <linux/mii.h>
37
#include <linux/rtnetlink.h>
38
#include <linux/sockios.h>
39
#include <linux/virtio_net.h>
40
#include <sys/ioctl.h>
41
#include <sys/socket.h>
42
#include <sys/uio.h>
43
#include <net/if.h>
44
#include <net/if_arp.h>
45
#include <net/route.h>
46
#include <poll.h>
47
#include <stdlib.h>
48
#include <string.h>
49
#include <unistd.h>
50
51
#include "coverage.h"
52
#include "dp-packet.h"
53
#include "dpif-netlink.h"
54
#include "dpif-netdev.h"
55
#include "dpif-offload.h"
56
#include "openvswitch/dynamic-string.h"
57
#include "fatal-signal.h"
58
#include "hash.h"
59
#include "openvswitch/hmap.h"
60
#include "netdev-afxdp.h"
61
#include "netdev-provider.h"
62
#include "netdev-vport.h"
63
#include "netlink-notifier.h"
64
#include "netlink-socket.h"
65
#include "netlink.h"
66
#include "netnsid.h"
67
#include "openvswitch/ofpbuf.h"
68
#include "openflow/openflow.h"
69
#include "ovs-atomic.h"
70
#include "ovs-numa.h"
71
#include "packets.h"
72
#include "openvswitch/poll-loop.h"
73
#include "rtnetlink.h"
74
#include "openvswitch/shash.h"
75
#include "socket-util.h"
76
#include "sset.h"
77
#include "tc.h"
78
#include "timer.h"
79
#include "unaligned.h"
80
#include "openvswitch/vlog.h"
81
#include "userspace-tso.h"
82
#include "util.h"
83
84
VLOG_DEFINE_THIS_MODULE(netdev_linux);
85
86
COVERAGE_DEFINE(netdev_set_policing);
87
COVERAGE_DEFINE(netdev_arp_lookup);
88
COVERAGE_DEFINE(netdev_get_ifindex);
89
COVERAGE_DEFINE(netdev_get_hwaddr);
90
COVERAGE_DEFINE(netdev_set_hwaddr);
91
COVERAGE_DEFINE(netdev_get_ethtool);
92
COVERAGE_DEFINE(netdev_set_ethtool);
93
COVERAGE_DEFINE(netdev_linux_invalid_l4_csum);
94
COVERAGE_DEFINE(netdev_linux_unknown_l4_csum);
95
96

97
#ifndef IFLA_IF_NETNSID
98
0
#define IFLA_IF_NETNSID 0x45
99
#endif
100
/* These were introduced in Linux 2.6.14, so they might be missing if we have
101
 * old headers. */
102
#ifndef ADVERTISED_Pause
103
#define ADVERTISED_Pause                (1 << 13)
104
#endif
105
#ifndef ADVERTISED_Asym_Pause
106
#define ADVERTISED_Asym_Pause           (1 << 14)
107
#endif
108
109
/* These were introduced in Linux 2.6.24, so they might be missing if we
110
 * have old headers. */
111
#ifndef ETHTOOL_GFLAGS
112
#define ETHTOOL_GFLAGS       0x00000025 /* Get flags bitmap(ethtool_value) */
113
#endif
114
#ifndef ETHTOOL_SFLAGS
115
#define ETHTOOL_SFLAGS       0x00000026 /* Set flags bitmap(ethtool_value) */
116
#endif
117
118
/* This was introduced in Linux 2.6.25, so it might be missing if we have old
119
 * headers. */
120
#ifndef TC_RTAB_SIZE
121
#define TC_RTAB_SIZE 1024
122
#endif
123
124
/* Linux 2.6.21 introduced struct tpacket_auxdata.
125
 * Linux 2.6.27 added the tp_vlan_tci member.
126
 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
127
 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
128
 * TP_STATUS_VLAN_TPID_VALID.
129
 *
130
 * With all this churn it's easiest to unconditionally define a replacement
131
 * structure that has everything we want.
132
 */
133
#ifndef PACKET_AUXDATA
134
#define PACKET_AUXDATA                  8
135
#endif
136
#ifndef TP_STATUS_VLAN_VALID
137
#define TP_STATUS_VLAN_VALID            (1 << 4)
138
#endif
139
#ifndef TP_STATUS_VLAN_TPID_VALID
140
#define TP_STATUS_VLAN_TPID_VALID       (1 << 6)
141
#endif
142
#undef tpacket_auxdata
143
#define tpacket_auxdata rpl_tpacket_auxdata
144
struct tpacket_auxdata {
145
    uint32_t tp_status;
146
    uint32_t tp_len;
147
    uint32_t tp_snaplen;
148
    uint16_t tp_mac;
149
    uint16_t tp_net;
150
    uint16_t tp_vlan_tci;
151
    uint16_t tp_vlan_tpid;
152
};
153
154
/* Linux 2.6.27 introduced ethtool_cmd_speed
155
 *
156
 * To avoid revisiting problems reported with using configure to detect
157
 * compatibility (see report at
158
 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
159
 * unconditionally replace ethtool_cmd_speed. */
160
0
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
161
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
162
0
{
163
0
        return ep->speed | (ep->speed_hi << 16);
164
0
}
165
166
/* Linux 2.6.30 introduced supported and advertised flags for
167
 * 1G base KX, and 10G base KX4, KR and R. */
168
#ifndef SUPPORTED_1000baseKX_Full
169
#define SUPPORTED_1000baseKX_Full      (1 << 17)
170
#define SUPPORTED_10000baseKX4_Full    (1 << 18)
171
#define SUPPORTED_10000baseKR_Full     (1 << 19)
172
#define SUPPORTED_10000baseR_FEC       (1 << 20)
173
#define ADVERTISED_1000baseKX_Full     (1 << 17)
174
#define ADVERTISED_10000baseKX4_Full   (1 << 18)
175
#define ADVERTISED_10000baseKR_Full    (1 << 19)
176
#define ADVERTISED_10000baseR_FEC      (1 << 20)
177
#endif
178
179
/* Linux 3.2 introduced "unknown" speed and duplex. */
180
#ifndef SPEED_UNKNOWN
181
#define SPEED_UNKNOWN -1
182
#endif
183
#ifndef DUPLEX_UNKNOWN
184
#define DUPLEX_UNKNOWN 0xff
185
#endif
186
187
/* Linux 3.5 introduced supported and advertised flags for
188
 * 40G base KR4, CR4, SR4 and LR4. */
189
#ifndef SUPPORTED_40000baseKR4_Full
190
#define SUPPORTED_40000baseKR4_Full    (1 << 23)
191
#define SUPPORTED_40000baseCR4_Full    (1 << 24)
192
#define SUPPORTED_40000baseSR4_Full    (1 << 25)
193
#define SUPPORTED_40000baseLR4_Full    (1 << 26)
194
#define ADVERTISED_40000baseKR4_Full   (1 << 23)
195
#define ADVERTISED_40000baseCR4_Full   (1 << 24)
196
#define ADVERTISED_40000baseSR4_Full   (1 << 25)
197
#define ADVERTISED_40000baseLR4_Full   (1 << 26)
198
#endif
199
200
/* Linux 3.19 introduced speed for 40G. */
201
#ifndef SPEED_40000
202
#define SPEED_40000 40000
203
#endif
204
205
/* Linux 4.2 introduced speed for 100G. */
206
#ifndef SPEED_100000
207
#define SPEED_100000 100000
208
#endif
209
210
/* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
211
 *
212
 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
213
 * 2.6.32-431.29.2.el6.x86_64 (see report at
214
 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
215
 * Maybe if_link.h is not self-contained on those kernels.  It is easiest to
216
 * unconditionally define a replacement. */
217
#ifndef IFLA_STATS64
218
0
#define IFLA_STATS64 23
219
#endif
220
#define rtnl_link_stats64 rpl_rtnl_link_stats64
221
struct rtnl_link_stats64 {
222
    uint64_t rx_packets;
223
    uint64_t tx_packets;
224
    uint64_t rx_bytes;
225
    uint64_t tx_bytes;
226
    uint64_t rx_errors;
227
    uint64_t tx_errors;
228
    uint64_t rx_dropped;
229
    uint64_t tx_dropped;
230
    uint64_t multicast;
231
    uint64_t collisions;
232
233
    uint64_t rx_length_errors;
234
    uint64_t rx_over_errors;
235
    uint64_t rx_crc_errors;
236
    uint64_t rx_frame_errors;
237
    uint64_t rx_fifo_errors;
238
    uint64_t rx_missed_errors;
239
240
    uint64_t tx_aborted_errors;
241
    uint64_t tx_carrier_errors;
242
    uint64_t tx_fifo_errors;
243
    uint64_t tx_heartbeat_errors;
244
    uint64_t tx_window_errors;
245
246
    uint64_t rx_compressed;
247
    uint64_t tx_compressed;
248
};
249
250
/* Linux 3.19 introduced virtio_types.h.  It might be missing
251
 * if we are using old kernel. */
252
#ifndef HAVE_VIRTIO_TYPES
253
typedef __u16 __bitwise__ __virtio16;
254
typedef __u32 __bitwise__ __virtio32;
255
typedef __u64 __bitwise__ __virtio64;
256
#endif
257
258
enum {
259
    VALID_IFINDEX           = 1 << 0,
260
    VALID_ETHERADDR         = 1 << 1,
261
    VALID_IN                = 1 << 2,
262
    VALID_MTU               = 1 << 3,
263
    VALID_POLICING          = 1 << 4,
264
    VALID_VPORT_STAT_ERROR  = 1 << 5,
265
    VALID_DRVINFO           = 1 << 6,
266
    VALID_FEATURES          = 1 << 7,
267
    VALID_NUMA_ID           = 1 << 8,
268
};
269
270
/* Linux 4.4 introduced the ability to skip the internal stats gathering
271
 * that netlink does via an external filter mask that can be passed into
272
 * a netlink request.
273
 */
274
#ifndef RTEXT_FILTER_SKIP_STATS
275
#define RTEXT_FILTER_SKIP_STATS (1 << 3)
276
#endif
277
278
/* Use one for the packet buffer and another for the aux buffer to receive
279
 * TSO packets. */
280
0
#define IOV_STD_SIZE 1
281
0
#define IOV_TSO_SIZE 2
282
283
enum {
284
    IOV_PACKET = 0,
285
    IOV_AUXBUF = 1,
286
};
287

288
struct linux_lag_member {
289
   uint32_t block_id;
290
   struct shash_node *node;
291
};
292
293
/* Protects 'lag_shash' and the mutable members of struct linux_lag_member. */
294
static struct ovs_mutex lag_mutex = OVS_MUTEX_INITIALIZER;
295
296
/* All members whose LAG primary interfaces are OVS network devices. */
297
static struct shash lag_shash OVS_GUARDED_BY(lag_mutex)
298
    = SHASH_INITIALIZER(&lag_shash);
299
300
/* Traffic control. */
301
302
/* An instance of a traffic control class.  Always associated with a particular
303
 * network device.
304
 *
305
 * Each TC implementation subclasses this with whatever additional data it
306
 * needs. */
307
struct tc {
308
    const struct tc_ops *ops;
309
    struct hmap queues;         /* Contains "struct tc_queue"s.
310
                                 * Read by generic TC layer.
311
                                 * Written only by TC implementation. */
312
};
313
314
0
#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
315
316
/* One traffic control queue.
317
 *
318
 * Each TC implementation subclasses this with whatever additional data it
319
 * needs. */
320
struct tc_queue {
321
    struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
322
    unsigned int queue_id;      /* OpenFlow queue ID. */
323
    long long int created;      /* Time queue was created, in msecs. */
324
};
325
326
/* A particular kind of traffic control.  Each implementation generally maps to
327
 * one particular Linux qdisc class.
328
 *
329
 * The functions below return 0 if successful or a positive errno value on
330
 * failure, except where otherwise noted.  All of them must be provided, except
331
 * where otherwise noted. */
332
struct tc_ops {
333
    /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
334
     * This is null for tc_ops_default and tc_ops_other, for which there are no
335
     * appropriate values. */
336
    const char *linux_name;
337
338
    /* Name used in OVS database, e.g. "linux-htb".  Must be nonnull. */
339
    const char *ovs_name;
340
341
    /* Number of supported OpenFlow queues, 0 for qdiscs that have no
342
     * queues.  The queues are numbered 0 through n_queues - 1. */
343
    unsigned int n_queues;
344
345
    /* Called to install this TC class on 'netdev'.  The implementation should
346
     * make the Netlink calls required to set up 'netdev' with the right qdisc
347
     * and configure it according to 'details'.  The implementation may assume
348
     * that the current qdisc is the default; that is, there is no need for it
349
     * to delete the current qdisc before installing itself.
350
     *
351
     * The contents of 'details' should be documented as valid for 'ovs_name'
352
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
353
     * (which is built as ovs-vswitchd.conf.db(8)).
354
     *
355
     * This function must return 0 if and only if it sets 'netdev->tc' to an
356
     * initialized 'struct tc'.
357
     *
358
     * (This function is null for tc_ops_other, which cannot be installed.  For
359
     * other TC classes it should always be nonnull.) */
360
    int (*tc_install)(struct netdev *netdev, const struct smap *details);
361
362
    /* Called when the netdev code determines (through a Netlink query) that
363
     * this TC class's qdisc is installed on 'netdev', but we didn't install
364
     * it ourselves and so don't know any of the details.
365
     *
366
     * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
367
     * 'netdev'.  The TCA_KIND attribute of 'nlmsg' is 'linux_name'.  The
368
     * implementation should parse the other attributes of 'nlmsg' as
369
     * necessary to determine its configuration.  If necessary it should also
370
     * use Netlink queries to determine the configuration of queues on
371
     * 'netdev'.
372
     *
373
     * This function must return 0 if and only if it sets 'netdev->tc' to an
374
     * initialized 'struct tc'. */
375
    int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
376
377
    /* Destroys the data structures allocated by the implementation as part of
378
     * 'tc'.  (This includes destroying 'tc->queues' by calling
379
     * tc_destroy(tc).)
380
     *
381
     * The implementation should not need to perform any Netlink calls.  If
382
     * desirable, the caller is responsible for deconfiguring the kernel qdisc.
383
     * (But it may not be desirable.)
384
     *
385
     * This function may be null if 'tc' is trivial. */
386
    void (*tc_destroy)(struct tc *tc);
387
388
    /* Retrieves details of 'netdev->tc' configuration into 'details'.
389
     *
390
     * The implementation should not need to perform any Netlink calls, because
391
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
392
     * cached the configuration.
393
     *
394
     * The contents of 'details' should be documented as valid for 'ovs_name'
395
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
396
     * (which is built as ovs-vswitchd.conf.db(8)).
397
     *
398
     * This function may be null if 'tc' is not configurable.
399
     */
400
    int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
401
402
    /* Reconfigures 'netdev->tc' according to 'details', performing any
403
     * required Netlink calls to complete the reconfiguration.
404
     *
405
     * The contents of 'details' should be documented as valid for 'ovs_name'
406
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
407
     * (which is built as ovs-vswitchd.conf.db(8)).
408
     *
409
     * This function may be null if 'tc' is not configurable.
410
     */
411
    int (*qdisc_set)(struct netdev *, const struct smap *details);
412
413
    /* Retrieves details of 'queue' on 'netdev->tc' into 'details'.  'queue' is
414
     * one of the 'struct tc_queue's within 'netdev->tc->queues'.
415
     *
416
     * The contents of 'details' should be documented as valid for 'ovs_name'
417
     * in the "other_config" column in the "Queue" table in
418
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
419
     *
420
     * The implementation should not need to perform any Netlink calls, because
421
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
422
     * cached the queue configuration.
423
     *
424
     * This function may be null if 'tc' does not have queues ('n_queues' is
425
     * 0). */
426
    int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
427
                     struct smap *details);
428
429
    /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
430
     * 'details', performing any required Netlink calls to complete the
431
     * reconfiguration.  The caller ensures that 'queue_id' is less than
432
     * 'n_queues'.
433
     *
434
     * The contents of 'details' should be documented as valid for 'ovs_name'
435
     * in the "other_config" column in the "Queue" table in
436
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
437
     *
438
     * This function may be null if 'tc' does not have queues or its queues are
439
     * not configurable. */
440
    int (*class_set)(struct netdev *, unsigned int queue_id,
441
                     const struct smap *details);
442
443
    /* Deletes 'queue' from 'netdev->tc'.  'queue' is one of the 'struct
444
     * tc_queue's within 'netdev->tc->queues'.
445
     *
446
     * This function may be null if 'tc' does not have queues or its queues
447
     * cannot be deleted. */
448
    int (*class_delete)(struct netdev *, struct tc_queue *queue);
449
450
    /* Obtains stats for 'queue' from 'netdev->tc'.  'queue' is one of the
451
     * 'struct tc_queue's within 'netdev->tc->queues'.
452
     *
453
     * On success, initializes '*stats'.
454
     *
455
     * This function may be null if 'tc' does not have queues or if it cannot
456
     * report queue statistics. */
457
    int (*class_get_stats)(const struct netdev *netdev,
458
                           const struct tc_queue *queue,
459
                           struct netdev_queue_stats *stats);
460
461
    /* Extracts queue stats from 'nlmsg', which is a response to a
462
     * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
463
     *
464
     * This function may be null if 'tc' does not have queues or if it cannot
465
     * report queue statistics. */
466
    int (*class_dump_stats)(const struct netdev *netdev,
467
                            const struct ofpbuf *nlmsg,
468
                            netdev_dump_queue_stats_cb *cb, void *aux);
469
};
470
471
static void
472
tc_init(struct tc *tc, const struct tc_ops *ops)
473
0
{
474
0
    tc->ops = ops;
475
0
    hmap_init(&tc->queues);
476
0
}
477
478
static void
479
tc_destroy(struct tc *tc)
480
0
{
481
0
    hmap_destroy(&tc->queues);
482
0
}
483
484
static const struct tc_ops tc_ops_htb;
485
static const struct tc_ops tc_ops_hfsc;
486
static const struct tc_ops tc_ops_codel;
487
static const struct tc_ops tc_ops_fqcodel;
488
static const struct tc_ops tc_ops_sfq;
489
static const struct tc_ops tc_ops_netem;
490
static const struct tc_ops tc_ops_default;
491
static const struct tc_ops tc_ops_noop;
492
static const struct tc_ops tc_ops_other;
493
494
static const struct tc_ops *const tcs[] = {
495
    &tc_ops_htb,                /* Hierarchy token bucket (see tc-htb(8)). */
496
    &tc_ops_hfsc,               /* Hierarchical fair service curve. */
497
    &tc_ops_codel,              /* Controlled delay */
498
    &tc_ops_fqcodel,            /* Fair queue controlled delay */
499
    &tc_ops_sfq,                /* Stochastic fair queueing */
500
    &tc_ops_netem,              /* Network Emulator */
501
    &tc_ops_noop,               /* Non operating qos type. */
502
    &tc_ops_default,            /* Default qdisc (see tc-pfifo_fast(8)). */
503
    &tc_ops_other,              /* Some other qdisc. */
504
    NULL
505
};
506
507
static unsigned int tc_ticks_to_bytes(uint64_t rate, unsigned int ticks);
508
static unsigned int tc_bytes_to_ticks(uint64_t rate, unsigned int size);
509
static unsigned int tc_buffer_per_jiffy(uint64_t rate);
510
static uint32_t tc_time_to_ticks(uint32_t time);
511
512
static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *,
513
                                                  int type,
514
                                                  unsigned int flags,
515
                                                  struct ofpbuf *);
516
517
static int tc_add_policer(struct netdev *, uint64_t kbits_rate,
518
                          uint32_t kbits_burst, uint32_t kpkts_rate,
519
                          uint32_t kpkts_burst);
520
521
static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
522
                          struct nlattr **options);
523
static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
524
                          struct nlattr **options,
525
                          struct netdev_queue_stats *);
526
static int tc_query_class(const struct netdev *,
527
                          unsigned int handle, unsigned int parent,
528
                          struct ofpbuf **replyp);
529
static int tc_delete_class(const struct netdev *, unsigned int handle);
530
531
static int tc_del_qdisc(struct netdev *netdev);
532
static int tc_query_qdisc(const struct netdev *netdev);
533
static void tc_policer_init(struct tc_police *tc_police, uint64_t kbits_rate,
534
                            uint64_t kbits_burst);
535
536
void
537
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate,
538
            uint64_t rate64);
539
static int tc_calc_cell_log(unsigned int mtu);
540
static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
541
static int tc_calc_buffer(uint64_t Bps, int mtu, uint64_t burst_bytes);
542

543
544
/* This is set pretty low because we probably won't learn anything from the
545
 * additional log messages. */
546
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
547
548
/* Polling miimon status for all ports causes performance degradation when
549
 * handling a large number of ports. If there are no devices using miimon, then
550
 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
551
 *
552
 * Readers do not depend on this variable synchronizing with the related
553
 * changes in the device miimon status, so we can use atomic_count. */
554
static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
555
556
/* Very old kernels from the 2.6 era don't support vnet headers with the tun
557
 * device. We can detect this while constructing a netdev, but need this for
558
 * packet rx/tx. */
559
static bool tap_supports_vnet_hdr = true;
560
561
static int netdev_linux_parse_vnet_hdr(struct dp_packet *b);
562
static int netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu);
563
static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
564
                                   int cmd, const char *cmd_name);
565
static int get_flags(const struct netdev *, unsigned int *flags);
566
static int set_flags(const char *, unsigned int flags);
567
static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
568
                        enum netdev_flags on, enum netdev_flags *old_flagsp)
569
    OVS_REQUIRES(netdev->mutex);
570
static int get_ifindex(const struct netdev *, int *ifindexp);
571
static int do_set_addr(struct netdev *netdev,
572
                       int ioctl_nr, const char *ioctl_name,
573
                       struct in_addr addr);
574
static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
575
static int set_etheraddr(const char *netdev_name, const struct eth_addr);
576
static int af_packet_sock(void);
577
static bool netdev_linux_miimon_enabled(void);
578
static void netdev_linux_miimon_run(void);
579
static void netdev_linux_miimon_wait(void);
580
static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
581
static void netdev_linux_set_ol(struct netdev *netdev);
582
583
static bool
584
is_tap_netdev(const struct netdev *netdev)
585
0
{
586
0
    return netdev_get_class(netdev) == &netdev_tap_class;
587
0
}
588

589
static int
590
netdev_linux_netnsid_update__(struct netdev_linux *netdev)
591
0
{
592
0
    struct dpif_netlink_vport reply;
593
0
    struct ofpbuf *buf;
594
0
    int error;
595
596
0
    error = dpif_netlink_vport_get(netdev_get_name(&netdev->up), &reply, &buf);
597
0
    if (error) {
598
0
        if (error == ENOENT) {
599
            /* Assume it is local if there is no API (e.g. if the openvswitch
600
             * kernel module is not loaded). */
601
0
            netnsid_set_local(&netdev->netnsid);
602
0
        } else {
603
0
            netnsid_unset(&netdev->netnsid);
604
0
        }
605
0
        return error;
606
0
    }
607
608
0
    netnsid_set(&netdev->netnsid, reply.netnsid);
609
0
    ofpbuf_delete(buf);
610
0
    return 0;
611
0
}
612
613
static int
614
netdev_linux_netnsid_update(struct netdev_linux *netdev)
615
0
{
616
0
    if (netnsid_is_unset(netdev->netnsid)) {
617
0
        if (netdev_get_class(&netdev->up) == &netdev_tap_class) {
618
0
            netnsid_set_local(&netdev->netnsid);
619
0
        } else {
620
0
            return netdev_linux_netnsid_update__(netdev);
621
0
        }
622
0
    }
623
624
0
    return 0;
625
0
}
626
627
static bool
628
netdev_linux_netnsid_is_eq(struct netdev_linux *netdev, int nsid)
629
0
{
630
0
    netdev_linux_netnsid_update(netdev);
631
0
    return netnsid_eq(netdev->netnsid, nsid);
632
0
}
633
634
static bool
635
netdev_linux_netnsid_is_remote(struct netdev_linux *netdev)
636
0
{
637
0
    netdev_linux_netnsid_update(netdev);
638
0
    return netnsid_is_remote(netdev->netnsid);
639
0
}
640
641
static int netdev_linux_update_via_netlink(struct netdev_linux *);
642
static void netdev_linux_update(struct netdev_linux *netdev, int,
643
                                const struct rtnetlink_change *)
644
    OVS_REQUIRES(netdev->mutex);
645
static void netdev_linux_changed(struct netdev_linux *netdev,
646
                                 unsigned int ifi_flags, unsigned int mask)
647
    OVS_REQUIRES(netdev->mutex);
648
649
/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
650
 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
651
 * if no such socket could be created. */
652
static struct nl_sock *
653
netdev_linux_notify_sock(void)
654
0
{
655
0
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
656
0
    static struct nl_sock *sock;
657
0
    unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
658
0
                                RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
659
660
0
    if (ovsthread_once_start(&once)) {
661
0
        int error;
662
663
0
        error = nl_sock_create(NETLINK_ROUTE, &sock);
664
0
        if (!error) {
665
0
            size_t i;
666
667
0
            nl_sock_listen_all_nsid(sock, true);
668
0
            for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
669
0
                error = nl_sock_join_mcgroup(sock, mcgroups[i]);
670
0
                if (error) {
671
0
                    nl_sock_destroy(sock);
672
0
                    sock = NULL;
673
0
                    break;
674
0
                }
675
0
            }
676
0
        }
677
0
        ovsthread_once_done(&once);
678
0
    }
679
680
0
    return sock;
681
0
}
682
683
static bool
684
netdev_linux_miimon_enabled(void)
685
0
{
686
0
    return atomic_count_get(&miimon_cnt) > 0;
687
0
}
688
689
static bool
690
netdev_linux_kind_is_lag(const char *kind)
691
0
{
692
0
    if (!strcmp(kind, "bond") || !strcmp(kind, "team")) {
693
0
        return true;
694
0
    }
695
696
0
    return false;
697
0
}
698
699
static void
700
netdev_linux_update_lag(struct rtnetlink_change *change)
701
    OVS_REQUIRES(lag_mutex)
702
0
{
703
0
    struct linux_lag_member *lag;
704
705
0
    if (change->sub && netdev_linux_kind_is_lag(change->sub)) {
706
0
        lag = shash_find_data(&lag_shash, change->ifname);
707
708
0
        if (!lag) {
709
0
            struct netdev *primary_netdev;
710
0
            char primary_name[IFNAMSIZ];
711
0
            uint32_t block_id;
712
0
            int error = 0;
713
714
0
            if (!if_indextoname(change->master_ifindex, primary_name)) {
715
0
                return;
716
0
            }
717
0
            primary_netdev = netdev_from_name(primary_name);
718
0
            if (!primary_netdev) {
719
0
                return;
720
0
            }
721
722
            /* If LAG primary member is not attached to ovs,
723
             * ingress block on LAG members should not be updated. */
724
0
            if (!primary_netdev->auto_classified &&
725
0
                is_netdev_linux_class(primary_netdev->netdev_class)) {
726
0
                block_id = netdev_get_block_id(primary_netdev);
727
0
                if (!block_id) {
728
0
                    netdev_close(primary_netdev);
729
0
                    return;
730
0
                }
731
732
0
                lag = xmalloc(sizeof *lag);
733
0
                lag->block_id = block_id;
734
0
                lag->node = shash_add(&lag_shash, change->ifname, lag);
735
736
                /* delete ingress block in case it exists */
737
0
                tc_add_del_qdisc(change->if_index, false, 0, TC_INGRESS);
738
                /* LAG primary is linux netdev so add member to same block. */
739
0
                error = tc_add_del_qdisc(change->if_index, true, block_id,
740
0
                                         TC_INGRESS);
741
0
                if (error) {
742
0
                    VLOG_WARN("failed to bind LAG member %s to "
743
0
                              "primary's block", change->ifname);
744
0
                    shash_delete(&lag_shash, lag->node);
745
0
                    free(lag);
746
0
                }
747
0
            }
748
749
0
            netdev_close(primary_netdev);
750
0
        }
751
0
    } else if (change->master_ifindex == 0) {
752
        /* Check if this was a lag member that has been removed. */
753
0
        lag = shash_find_data(&lag_shash, change->ifname);
754
755
0
        if (lag) {
756
0
            tc_add_del_qdisc(change->if_index, false, lag->block_id,
757
0
                             TC_INGRESS);
758
0
            shash_delete(&lag_shash, lag->node);
759
0
            free(lag);
760
0
        }
761
0
    }
762
0
}
763
764
void
765
netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
766
0
{
767
0
    struct nl_sock *sock;
768
0
    int error;
769
770
0
    if (netdev_linux_miimon_enabled()) {
771
0
        netdev_linux_miimon_run();
772
0
    }
773
774
0
    sock = netdev_linux_notify_sock();
775
0
    if (!sock) {
776
0
        return;
777
0
    }
778
779
0
    do {
780
0
        uint64_t buf_stub[4096 / 8];
781
0
        int nsid;
782
0
        struct ofpbuf buf;
783
784
0
        ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
785
0
        error = nl_sock_recv(sock, &buf, &nsid, false);
786
0
        if (!error) {
787
0
            struct rtnetlink_change change;
788
789
0
            if (rtnetlink_parse(&buf, &change) && !change.irrelevant) {
790
0
                struct netdev *netdev_ = NULL;
791
0
                char dev_name[IFNAMSIZ];
792
793
0
                if (!change.ifname) {
794
0
                     change.ifname = if_indextoname(change.if_index, dev_name);
795
0
                }
796
797
0
                if (change.ifname) {
798
0
                    netdev_ = netdev_from_name(change.ifname);
799
0
                }
800
0
                if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
801
0
                    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
802
803
0
                    ovs_mutex_lock(&netdev->mutex);
804
0
                    netdev_linux_update(netdev, nsid, &change);
805
0
                    ovs_mutex_unlock(&netdev->mutex);
806
0
                }
807
808
0
                if (change.ifname &&
809
0
                    rtnetlink_type_is_rtnlgrp_link(change.nlmsg_type)) {
810
811
                    /* Need to try updating the LAG information. */
812
0
                    ovs_mutex_lock(&lag_mutex);
813
0
                    netdev_linux_update_lag(&change);
814
0
                    ovs_mutex_unlock(&lag_mutex);
815
0
                }
816
0
                netdev_close(netdev_);
817
0
            }
818
0
        } else if (error == ENOBUFS) {
819
0
            struct shash device_shash;
820
0
            struct shash_node *node;
821
822
0
            nl_sock_drain(sock);
823
824
0
            shash_init(&device_shash);
825
0
            netdev_get_devices(&netdev_linux_class, &device_shash);
826
0
            SHASH_FOR_EACH (node, &device_shash) {
827
0
                struct netdev *netdev_ = node->data;
828
0
                struct netdev_linux *netdev = netdev_linux_cast(netdev_);
829
0
                unsigned int flags;
830
831
0
                ovs_mutex_lock(&netdev->mutex);
832
0
                get_flags(netdev_, &flags);
833
0
                netdev_linux_changed(netdev, flags, 0);
834
0
                ovs_mutex_unlock(&netdev->mutex);
835
836
0
                netdev_close(netdev_);
837
0
            }
838
0
            shash_destroy(&device_shash);
839
0
        } else if (error != EAGAIN) {
840
0
            static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
841
0
            VLOG_WARN_RL(&rll, "error reading or parsing netlink (%s)",
842
0
                         ovs_strerror(error));
843
0
        }
844
0
        ofpbuf_uninit(&buf);
845
0
    } while (!error);
846
0
}
847
848
static void
849
netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
850
0
{
851
0
    struct nl_sock *sock;
852
853
0
    if (netdev_linux_miimon_enabled()) {
854
0
        netdev_linux_miimon_wait();
855
0
    }
856
0
    sock = netdev_linux_notify_sock();
857
0
    if (sock) {
858
0
        nl_sock_wait(sock, POLLIN);
859
0
    }
860
0
}
861
862
static void
863
netdev_linux_changed(struct netdev_linux *dev,
864
                     unsigned int ifi_flags, unsigned int mask)
865
    OVS_REQUIRES(dev->mutex)
866
0
{
867
0
    netdev_change_seq_changed(&dev->up);
868
869
0
    if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
870
0
        dev->carrier_resets++;
871
0
    }
872
0
    dev->ifi_flags = ifi_flags;
873
874
0
    dev->cache_valid &= mask;
875
0
    if (!(mask & VALID_IN)) {
876
0
        netdev_get_addrs_list_flush();
877
0
    }
878
0
}
879
880
/* Applies the parsed rtnetlink message 'change' to 'dev'.
 *
 * Link-group (RTM_NEWLINK/RTM_DELLINK) messages update or tear down the
 * cached link state; address-group messages only invalidate the cached
 * in4/in6 addresses.  Any other message type is a caller bug
 * (OVS_NOT_REACHED). */
static void
netdev_linux_update__(struct netdev_linux *dev,
                      const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
        if (change->nlmsg_type == RTM_NEWLINK) {
            /* Keep drv-info, ip addresses, and NUMA id. */
            netdev_linux_changed(dev, change->ifi_flags,
                                 VALID_DRVINFO | VALID_IN | VALID_NUMA_ID);

            /* Update netdev from rtnl-change msg. */
            if (change->mtu) {
                /* A zero MTU in the message means "not reported", so the
                 * cached MTU is only refreshed when a value is present. */
                dev->mtu = change->mtu;
                dev->cache_valid |= VALID_MTU;
                dev->netdev_mtu_error = 0;
            }

            if (!eth_addr_is_zero(change->mac)) {
                /* An all-zero MAC likewise means "not reported". */
                dev->etheraddr = change->mac;
                dev->cache_valid |= VALID_ETHERADDR;
                dev->ether_addr_error = 0;

                /* The mac addr has been changed, report it now. */
                rtnetlink_report_link();
            }

            if (change->primary && netdev_linux_kind_is_lag(change->primary)) {
                /* Device is enslaved to a bond/team primary; remember it.
                 * Note: this flag is never cleared here. */
                dev->is_lag_primary = true;
            }

            dev->ifindex = change->if_index;
            dev->cache_valid |= VALID_IFINDEX;
            dev->get_ifindex_error = 0;
            dev->present = true;
        } else {
            /* FIXME */
            /* RTM_DELLINK: the kernel device is gone.  Invalidate the whole
             * cache (mask 0) and forget the namespace id. */
            netdev_linux_changed(dev, change->ifi_flags, 0);
            dev->present = false;
            netnsid_unset(&dev->netnsid);
        }
    } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
        /* Invalidates in4, in6. */
        netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
    } else {
        OVS_NOT_REACHED();
    }
}
928
929
static void
930
netdev_linux_update(struct netdev_linux *dev, int nsid,
931
                    const struct rtnetlink_change *change)
932
    OVS_REQUIRES(dev->mutex)
933
0
{
934
0
    if (netdev_linux_netnsid_is_eq(dev, nsid)) {
935
0
        netdev_linux_update__(dev, change);
936
0
    }
937
0
}
938
939
static struct netdev *
940
netdev_linux_alloc(void)
941
0
{
942
0
    struct netdev_linux *netdev = xzalloc(sizeof *netdev);
943
0
    return &netdev->up;
944
0
}
945
946
static int
947
netdev_linux_common_construct(struct netdev *netdev_)
948
0
{
949
    /* Prevent any attempt to create (or open) a network device named "default"
950
     * or "all".  These device names are effectively reserved on Linux because
951
     * /proc/sys/net/ipv4/conf/ always contains directories by these names.  By
952
     * itself this wouldn't call for any special treatment, but in practice if
953
     * a program tries to create devices with these names, it causes the kernel
954
     * to fire a "new device" notification event even though creation failed,
955
     * and in turn that causes OVS to wake up and try to create them again,
956
     * which ends up as a 100% CPU loop. */
957
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
958
0
    const char *name = netdev_->name;
959
0
    if (!strcmp(name, "default") || !strcmp(name, "all")) {
960
0
        static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 1);
961
0
        VLOG_WARN_RL(&rll, "%s: Linux forbids network device with this name",
962
0
                     name);
963
0
        return EINVAL;
964
0
    }
965
966
    /* The device could be in the same network namespace or in another one. */
967
0
    netnsid_unset(&netdev->netnsid);
968
0
    ovs_mutex_init(&netdev->mutex);
969
970
0
    return 0;
971
0
}
972
973
/* Creates system and internal devices. */
974
int
975
netdev_linux_construct(struct netdev *netdev_)
976
0
{
977
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
978
0
    int error = netdev_linux_common_construct(netdev_);
979
0
    if (error) {
980
0
        return error;
981
0
    }
982
983
0
    if (userspace_tso_enabled()) {
984
        /* The AF_PACKET socket interface uses the same option to facilitate
985
         * both csum and segmentation offloading. However, these features can
986
         * be toggled off or on individually at the interface level. The netdev
987
         * flags are set based on the features indicated by ethtool. */
988
0
        netdev_linux_set_ol(netdev_);
989
0
    }
990
991
0
    error = get_flags(&netdev->up, &netdev->ifi_flags);
992
0
    if (error == ENODEV) {
993
0
        if (netdev->up.netdev_class != &netdev_internal_class) {
994
            /* The device does not exist, so don't allow it to be opened. */
995
0
            return ENODEV;
996
0
        } else {
997
            /* "Internal" netdevs have to be created as netdev objects before
998
             * they exist in the kernel, because creating them in the kernel
999
             * happens by passing a netdev object to dpif_port_add().
1000
             * Therefore, ignore the error. */
1001
0
        }
1002
0
    }
1003
1004
0
    return 0;
1005
0
}
1006
1007
/* For most types of netdevs we open the device for each call of
1008
 * netdev_open().  However, this is not the case with tap devices,
1009
 * since it is only possible to open the device once.  In this
1010
 * situation we share a single file descriptor, and consequently
1011
 * buffers, across all readers.  Therefore once data is read it will
1012
 * be unavailable to other reads for tap devices. */
1013
static int
1014
netdev_linux_construct_tap(struct netdev *netdev_)
1015
0
{
1016
0
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1017
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1018
0
    static const char tap_dev[] = "/dev/net/tun";
1019
0
    const char *name = netdev_->name;
1020
0
    unsigned long oflags;
1021
0
    unsigned int up;
1022
0
    struct ifreq ifr;
1023
1024
0
    int error = netdev_linux_common_construct(netdev_);
1025
0
    if (error) {
1026
0
        return error;
1027
0
    }
1028
1029
    /* Open tap device. */
1030
0
    netdev->tap_fd = open(tap_dev, O_RDWR);
1031
0
    if (netdev->tap_fd < 0) {
1032
0
        error = errno;
1033
0
        VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
1034
0
        return error;
1035
0
    }
1036
1037
    /* Create tap device. */
1038
0
    get_flags(&netdev->up, &netdev->ifi_flags);
1039
1040
0
    if (ovsthread_once_start(&once)) {
1041
0
        if (ioctl(netdev->tap_fd, TUNGETFEATURES, &up) == -1) {
1042
0
            VLOG_WARN("%s: querying tap features failed: %s", name,
1043
0
                      ovs_strerror(errno));
1044
0
            tap_supports_vnet_hdr = false;
1045
0
        } else if (!(up & IFF_VNET_HDR)) {
1046
0
            VLOG_WARN("TAP interfaces do not support virtio-net headers");
1047
0
            tap_supports_vnet_hdr = false;
1048
0
        }
1049
0
        ovsthread_once_done(&once);
1050
0
    }
1051
1052
0
    memset(&ifr, 0, sizeof ifr);
1053
1054
0
    ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
1055
0
    if (tap_supports_vnet_hdr) {
1056
0
        ifr.ifr_flags |= IFF_VNET_HDR;
1057
0
    }
1058
1059
0
    ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
1060
0
    if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
1061
0
        VLOG_WARN("%s: creating tap device failed: %s", name,
1062
0
                  ovs_strerror(errno));
1063
0
        error = errno;
1064
0
        goto error_close;
1065
0
    }
1066
1067
    /* Make non-blocking. */
1068
0
    error = set_nonblocking(netdev->tap_fd);
1069
0
    if (error) {
1070
0
        goto error_close;
1071
0
    }
1072
1073
0
    if (ioctl(netdev->tap_fd, TUNSETPERSIST, 1)) {
1074
0
        VLOG_WARN("%s: creating tap device failed (persist): %s", name,
1075
0
                  ovs_strerror(errno));
1076
0
        error = errno;
1077
0
        goto error_close;
1078
0
    }
1079
1080
0
    oflags = TUN_F_CSUM;
1081
0
    if (userspace_tso_enabled()) {
1082
0
        oflags |= (TUN_F_TSO4 | TUN_F_TSO6);
1083
0
    }
1084
1085
0
    if (tap_supports_vnet_hdr
1086
0
        && ioctl(netdev->tap_fd, TUNSETOFFLOAD, oflags) == 0) {
1087
0
        netdev_->ol_flags |= (NETDEV_TX_OFFLOAD_TCP_CKSUM
1088
0
                              | NETDEV_TX_OFFLOAD_UDP_CKSUM);
1089
1090
0
        if (userspace_tso_enabled()) {
1091
0
            netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
1092
0
        }
1093
0
    } else {
1094
0
       VLOG_INFO("%s: Disabling checksum and segment offloading due to "
1095
0
                 "missing kernel support", name);
1096
0
    }
1097
1098
0
    netdev->present = true;
1099
0
    return 0;
1100
1101
0
error_close:
1102
0
    close(netdev->tap_fd);
1103
0
    return error;
1104
0
}
1105
1106
static void
1107
netdev_linux_destruct(struct netdev *netdev_)
1108
0
{
1109
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1110
1111
0
    if (netdev->tc && netdev->tc->ops->tc_destroy) {
1112
0
        netdev->tc->ops->tc_destroy(netdev->tc);
1113
0
    }
1114
1115
0
    if (netdev_get_class(netdev_) == &netdev_tap_class
1116
0
        && netdev->tap_fd >= 0)
1117
0
    {
1118
0
        ioctl(netdev->tap_fd, TUNSETPERSIST, 0);
1119
0
        close(netdev->tap_fd);
1120
0
    }
1121
1122
0
    if (netdev->miimon_interval > 0) {
1123
0
        atomic_count_dec(&miimon_cnt);
1124
0
    }
1125
1126
0
    ovs_mutex_destroy(&netdev->mutex);
1127
0
}
1128
1129
/* netdev_class 'dealloc' hook: frees the containing 'struct netdev_linux'
 * allocated by netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
1135
1136
static struct netdev_rxq *
1137
netdev_linux_rxq_alloc(void)
1138
0
{
1139
0
    struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
1140
0
    return &rx->up;
1141
0
}
1142
1143
/* rxq 'construct' hook.  For a tap device, the rx queue reuses the shared
 * tap fd.  For other devices it creates a bound AF_PACKET raw socket with
 * auxdata (VLAN info), optional vnet headers for TSO, non-blocking mode,
 * and a BPF filter that accepts inbound packets only (so packets OVS sends
 * through its own AF_PACKET tx socket are not received back).
 * Returns 0 or a positive errno; on error any created socket is closed. */
static int
netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    struct netdev *netdev_ = rx->up.netdev;
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    rx->is_tap = is_tap_netdev(netdev_);
    if (rx->is_tap) {
        /* Tap devices share one fd for all readers; see the comment on
         * netdev_linux_construct_tap(). */
        rx->fd = netdev->tap_fd;
    } else {
        struct sockaddr_ll sll;
        int ifindex, val;
        /* Result of tcpdump -dd inbound */
        static const struct sock_filter filt[] = {
            { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
            { 0x15, 0, 1, 0x00000004 }, /* jeq #4     jt 2  jf 3 */
            { 0x6, 0, 0, 0x00000000 },  /* ret #0 */
            { 0x6, 0, 0, 0x0000ffff }   /* ret #65535 */
        };
        static const struct sock_fprog fprog = {
            ARRAY_SIZE(filt), (struct sock_filter *) filt
        };

        /* Create file descriptor. */
        rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
        if (rx->fd < 0) {
            error = errno;
            VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
            goto error;
        }

        /* Request VLAN tag/TPID metadata as ancillary data on recvmsg. */
        val = 1;
        if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
            error = errno;
            VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* With userspace TSO, packets carry a virtio_net_hdr prefix.
         * NOTE(review): the message below says "txq raw socket" although
         * this is the rx queue constructor — looks like a copy/paste from
         * the tx path; confirm before relying on the wording. */
        if (userspace_tso_enabled()
            && setsockopt(rx->fd, SOL_PACKET, PACKET_VNET_HDR, &val,
                          sizeof val)) {
            error = errno;
            VLOG_ERR("%s: failed to enable vnet hdr in txq raw socket: %s",
                     netdev_get_name(netdev_), ovs_strerror(errno));
            goto error;
        }

        /* Set non-blocking mode. */
        error = set_nonblocking(rx->fd);
        if (error) {
            goto error;
        }

        /* Get ethernet device index. */
        error = get_ifindex(&netdev->up, &ifindex);
        if (error) {
            goto error;
        }

        /* Bind to specific ethernet device. */
        memset(&sll, 0, sizeof sll);
        sll.sll_family = AF_PACKET;
        sll.sll_ifindex = ifindex;
        sll.sll_protocol = htons(ETH_P_ALL);
        if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
            error = errno;
            VLOG_ERR("%s: failed to bind raw socket (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Filter for only inbound packets. */
        error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
                           sizeof fprog);
        if (error) {
            error = errno;
            VLOG_ERR("%s: failed to attach filter (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;

error:
    /* rx->fd may be a negative value if socket() itself failed. */
    if (rx->fd >= 0) {
        close(rx->fd);
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1239
1240
static void
1241
netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
1242
0
{
1243
0
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1244
0
    int i;
1245
1246
0
    if (!rx->is_tap) {
1247
0
        close(rx->fd);
1248
0
    }
1249
1250
0
    for (i = 0; i < NETDEV_MAX_BURST; i++) {
1251
0
        dp_packet_delete(rx->aux_bufs[i]);
1252
0
    }
1253
0
}
1254
1255
/* rxq 'dealloc' hook: frees the containing 'struct netdev_rxq_linux'. */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
1262
1263
static ovs_be16
1264
auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
1265
0
{
1266
0
    if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1267
0
        return htons(aux->tp_vlan_tpid);
1268
0
    } else if (double_tagged) {
1269
0
        return htons(ETH_TYPE_VLAN_8021AD);
1270
0
    } else {
1271
0
        return htons(ETH_TYPE_VLAN_8021Q);
1272
0
    }
1273
0
}
1274
1275
/* Returns true if 'aux' describes a stripped VLAN tag.  Older kernels do
 * not set TP_STATUS_VLAN_VALID, so a non-zero TCI alone also counts. */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    if (aux->tp_status & TP_STATUS_VLAN_VALID) {
        return true;
    }

    return aux->tp_vlan_tci != 0;
}
1280
1281
/*
1282
 * Receive packets from raw socket in batch process for better performance,
1283
 * it can receive NETDEV_MAX_BURST packets at most once, the received
1284
 * packets are added into *batch. The return value is 0 or errno.
1285
 *
1286
 * It also used recvmmsg to reduce multiple syscalls overhead;
1287
 */
1288
/* Receives up to NETDEV_MAX_BURST packets from the AF_PACKET socket in
 * 'rx' with one recvmmsg() call and appends them to 'batch'.
 * Returns 0 or a positive errno (EAGAIN when nothing is ready).
 *
 * Each message reads into the packet's own buffer (std_len bytes) plus,
 * when userspace TSO is enabled, a second iovec pointing at the
 * preallocated rx->aux_bufs[i] for oversized (TSO) data. */
static int
netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu,
                                 struct dp_packet_batch *batch)
{
    int iovlen;
    size_t std_len;
    ssize_t retval;
    int virtio_net_hdr_size;
    struct iovec iovs[NETDEV_MAX_BURST][IOV_TSO_SIZE];
    struct cmsghdr *cmsg;
    union {
        struct cmsghdr cmsg;
        char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
    } cmsg_buffers[NETDEV_MAX_BURST];
    struct mmsghdr mmsgs[NETDEV_MAX_BURST];
    struct dp_packet *buffers[NETDEV_MAX_BURST];
    int i;

    if (userspace_tso_enabled()) {
        /* Use the buffer from the allocated packet below to receive MTU
         * sized packets and an aux_buf for extra TSO data. */
        iovlen = IOV_TSO_SIZE;
        virtio_net_hdr_size = sizeof(struct virtio_net_hdr);
    } else {
        /* Use only the buffer from the allocated packet. */
        iovlen = IOV_STD_SIZE;
        virtio_net_hdr_size = 0;
    }

    /* The length here needs to be accounted in the same way when the
     * aux_buf is allocated so that it can be prepended to TSO buffer. */
    std_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN + mtu;
    for (i = 0; i < NETDEV_MAX_BURST; i++) {
        buffers[i] = dp_packet_new_with_headroom(std_len, DP_NETDEV_HEADROOM);
        iovs[i][IOV_PACKET].iov_base = dp_packet_data(buffers[i]);
        iovs[i][IOV_PACKET].iov_len = std_len;
        if (iovlen == IOV_TSO_SIZE) {
            iovs[i][IOV_AUXBUF].iov_base = dp_packet_data(rx->aux_bufs[i]);
            iovs[i][IOV_AUXBUF].iov_len = dp_packet_tailroom(rx->aux_bufs[i]);
        }

        mmsgs[i].msg_hdr.msg_name = NULL;
        mmsgs[i].msg_hdr.msg_namelen = 0;
        mmsgs[i].msg_hdr.msg_iov = iovs[i];
        mmsgs[i].msg_hdr.msg_iovlen = iovlen;
        /* Ancillary data buffer for PACKET_AUXDATA (VLAN tag info). */
        mmsgs[i].msg_hdr.msg_control = &cmsg_buffers[i];
        mmsgs[i].msg_hdr.msg_controllen = sizeof cmsg_buffers[i];
        mmsgs[i].msg_hdr.msg_flags = 0;
    }

    /* MSG_TRUNC makes the kernel flag messages that did not fit, so
     * oversized packets can be detected and dropped below. */
    do {
        retval = recvmmsg(rx->fd, mmsgs, NETDEV_MAX_BURST, MSG_TRUNC, NULL);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        retval = errno;
        for (i = 0; i < NETDEV_MAX_BURST; i++) {
            dp_packet_delete(buffers[i]);
        }

        return retval;
    }

    /* On success, 'retval' is the number of messages received. */
    for (i = 0; i < retval; i++) {
        struct dp_packet *pkt;

        if (mmsgs[i].msg_hdr.msg_flags & MSG_TRUNC
            || mmsgs[i].msg_len < ETH_HEADER_LEN) {
            struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            /* The rx->aux_bufs[i] will be re-used next time. */
            dp_packet_delete(buffers[i]);
            netdev->rx_dropped += 1;
            if (mmsgs[i].msg_hdr.msg_flags & MSG_TRUNC) {
                /* Data is truncated, so the packet is corrupted, and needs
                 * to be dropped. This can happen if TSO/GRO is enabled in
                 * the kernel, but not in userspace, i.e. there is no dp
                 * buffer to store the full packet. */
                VLOG_WARN_RL(&rl,
                             "%s: Dropped packet: Too big. GRO/TSO enabled?",
                             netdev_get_name(netdev_));
            } else {
                VLOG_WARN_RL(&rl,
                             "%s: Dropped packet: less than ether hdr size",
                             netdev_get_name(netdev_));
            }

            continue;
        }

        if (mmsgs[i].msg_len > std_len) {
            /* Build a single linear TSO packet by prepending the data from
             * std_len buffer to the aux_buf. */
            pkt = rx->aux_bufs[i];
            dp_packet_set_size(pkt, mmsgs[i].msg_len - std_len);
            dp_packet_push(pkt, dp_packet_data(buffers[i]), std_len);
            /* The headroom should be the same in buffers[i], pkt and
             * DP_NETDEV_HEADROOM. */
            dp_packet_resize(pkt, DP_NETDEV_HEADROOM, 0);
            dp_packet_delete(buffers[i]);
            /* Consumed; netdev_linux_rxq_recv() allocates a fresh one. */
            rx->aux_bufs[i] = NULL;
         } else {
            dp_packet_set_size(buffers[i], mmsgs[i].msg_len);
            pkt = buffers[i];
         }

        if (virtio_net_hdr_size) {
            int ret = netdev_linux_parse_vnet_hdr(pkt);
            if (OVS_UNLIKELY(ret)) {
                struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
                struct netdev_linux *netdev = netdev_linux_cast(netdev_);

                /* Unexpected error situation: the virtio header is not
                 * present or corrupted or contains unsupported features.
                 * Drop the packet but continue in case next ones are
                 * correct. */
                dp_packet_delete(pkt);
                netdev->rx_dropped += 1;
                VLOG_WARN_RL(&rl, "%s: Dropped packet: vnet header is missing "
                             "or corrupt: %s", netdev_get_name(netdev_),
                             ovs_strerror(ret));
                continue;
            }
        }

        /* Scan ancillary data for PACKET_AUXDATA and, if the kernel
         * stripped a VLAN tag, push it back into the packet data. */
        for (cmsg = CMSG_FIRSTHDR(&mmsgs[i].msg_hdr); cmsg;
                 cmsg = CMSG_NXTHDR(&mmsgs[i].msg_hdr, cmsg)) {
            const struct tpacket_auxdata *aux;

            if (cmsg->cmsg_level != SOL_PACKET
                || cmsg->cmsg_type != PACKET_AUXDATA
                || cmsg->cmsg_len <
                       CMSG_LEN(sizeof(struct tpacket_auxdata))) {
                continue;
            }

            aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
            if (auxdata_has_vlan_tci(aux)) {
                struct eth_header *eth;
                bool double_tagged;

                /* If the frame already carries an 802.1Q tag, the stripped
                 * outer tag makes it double-tagged (802.1ad outer TPID). */
                eth = dp_packet_data(pkt);
                double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);

                eth_push_vlan(pkt,
                              auxdata_to_vlan_tpid(aux, double_tagged),
                              htons(aux->tp_vlan_tci));
                break;
            }
        }
        dp_packet_batch_add(batch, pkt);
    }

    /* Delete unused buffers. */
    for (; i < NETDEV_MAX_BURST; i++) {
        dp_packet_delete(buffers[i]);
    }

    return 0;
}
1449
1450
/*
1451
 * Receive packets from tap by batch process for better performance,
1452
 * it can receive NETDEV_MAX_BURST packets at most once, the received
1453
 * packets are added into *batch. The return value is 0 or errno.
1454
 */
1455
/* Receives up to NETDEV_MAX_BURST packets from the tap fd in 'rx', one
 * readv() per packet, and appends them to 'batch'.  Returns 0 or a
 * positive errno; an error is reported to the caller only when no packet
 * at all was read (partial batches return 0). */
static int
netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu,
                                struct dp_packet_batch *batch)
{
    int virtio_net_hdr_size;
    ssize_t retval;
    size_t std_len;
    int iovlen;
    int i;

    if (userspace_tso_enabled()) {
        /* Use the buffer from the allocated packet below to receive MTU
         * sized packets and an aux_buf for extra TSO data. */
        iovlen = IOV_TSO_SIZE;
    } else {
        /* Use only the buffer from the allocated packet. */
        iovlen = IOV_STD_SIZE;
    }
    /* Unlike the AF_PACKET path, the tap path prepends a vnet header
     * whenever the kernel tap supports it, independent of TSO. */
    if (OVS_LIKELY(tap_supports_vnet_hdr)) {
        virtio_net_hdr_size = sizeof(struct virtio_net_hdr);
    } else {
        virtio_net_hdr_size = 0;
    }

    /* The length here needs to be accounted in the same way when the
     * aux_buf is allocated so that it can be prepended to TSO buffer. */
    std_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN + mtu;
    for (i = 0; i < NETDEV_MAX_BURST; i++) {
        struct dp_packet *buffer;
        struct dp_packet *pkt;
        struct iovec iov[IOV_TSO_SIZE];

        /* Assume Ethernet port. No need to set packet_type. */
        buffer = dp_packet_new_with_headroom(std_len, DP_NETDEV_HEADROOM);
        iov[IOV_PACKET].iov_base = dp_packet_data(buffer);
        iov[IOV_PACKET].iov_len = std_len;
        if (iovlen == IOV_TSO_SIZE) {
            iov[IOV_AUXBUF].iov_base = dp_packet_data(rx->aux_bufs[i]);
            iov[IOV_AUXBUF].iov_len = dp_packet_tailroom(rx->aux_bufs[i]);
        }

        do {
            retval = readv(rx->fd, iov, iovlen);
        } while (retval < 0 && errno == EINTR);

        if (retval < 0) {
            /* No more data (or a real error); errno is examined below. */
            dp_packet_delete(buffer);
            break;
        }

        /* retval >= 0 here, so the signed/unsigned comparison is safe. */
        if (retval > std_len) {
            /* Build a single linear TSO packet by prepending the data from
             * std_len buffer to the aux_buf. */
            pkt = rx->aux_bufs[i];
            dp_packet_set_size(pkt, retval - std_len);
            dp_packet_push(pkt, dp_packet_data(buffer), std_len);
            /* The headroom should be the same in buffers[i], pkt and
             * DP_NETDEV_HEADROOM. */
            dp_packet_resize(pkt, DP_NETDEV_HEADROOM, 0);
            dp_packet_delete(buffer);
            /* Consumed; netdev_linux_rxq_recv() allocates a fresh one. */
            rx->aux_bufs[i] = NULL;
        } else {
            dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
            pkt = buffer;
        }

        if (OVS_LIKELY(virtio_net_hdr_size) &&
            netdev_linux_parse_vnet_hdr(pkt)) {
            struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            /* Unexpected error situation: the virtio header is not present
             * or corrupted. Drop the packet but continue in case next ones
             * are correct. */
            dp_packet_delete(pkt);
            netdev->rx_dropped += 1;
            VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net header",
                         netdev_get_name(netdev_));
            continue;
        }

        dp_packet_batch_add(batch, pkt);
    }

    /* Report the errno only if the very first read failed; otherwise the
     * packets gathered so far are delivered with a 0 return. */
    if ((i == 0) && (retval < 0)) {
        return errno;
    }

    return 0;
}
1545
1546
static int
1547
netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
1548
                      int *qfill)
1549
0
{
1550
0
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1551
0
    struct netdev *netdev = rx->up.netdev;
1552
0
    ssize_t retval;
1553
0
    int mtu;
1554
1555
0
    if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1556
0
        mtu = ETH_PAYLOAD_MAX;
1557
0
    }
1558
1559
0
    if (userspace_tso_enabled()) {
1560
        /* Allocate TSO packets. The packet has enough headroom to store
1561
         * a full non-TSO packet. When a TSO packet is received, the data
1562
         * from non-TSO buffer (std_len) is prepended to the TSO packet
1563
         * (aux_buf). */
1564
0
        size_t std_len = sizeof(struct virtio_net_hdr) + VLAN_ETH_HEADER_LEN
1565
0
                         + DP_NETDEV_HEADROOM + mtu;
1566
0
        size_t data_len = LINUX_RXQ_TSO_MAX_LEN - std_len;
1567
0
        for (int i = 0; i < NETDEV_MAX_BURST; i++) {
1568
0
            if (rx->aux_bufs[i]) {
1569
0
                continue;
1570
0
            }
1571
1572
0
            rx->aux_bufs[i] = dp_packet_new_with_headroom(data_len, std_len);
1573
0
        }
1574
0
    }
1575
1576
0
    dp_packet_batch_init(batch);
1577
0
    retval = (rx->is_tap
1578
0
              ? netdev_linux_batch_rxq_recv_tap(rx, mtu, batch)
1579
0
              : netdev_linux_batch_rxq_recv_sock(rx, mtu, batch));
1580
1581
0
    if (retval) {
1582
0
        if (retval != EAGAIN && retval != EMSGSIZE) {
1583
0
            VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1584
0
                         netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1585
0
        }
1586
0
    }
1587
1588
0
    if (qfill) {
1589
0
        *qfill = -ENOTSUP;
1590
0
    }
1591
1592
0
    return retval;
1593
0
}
1594
1595
static void
1596
netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1597
0
{
1598
0
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1599
0
    poll_fd_wait(rx->fd, POLLIN);
1600
0
}
1601
1602
static int
1603
netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1604
0
{
1605
0
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1606
0
    if (rx->is_tap) {
1607
0
        struct ifreq ifr;
1608
0
        int error;
1609
1610
0
        memset(&ifr, 0, sizeof ifr);
1611
0
        error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1612
0
                                    SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1613
0
        if (error) {
1614
0
            return error;
1615
0
        }
1616
0
        drain_fd(rx->fd, ifr.ifr_qlen);
1617
0
        return 0;
1618
0
    } else {
1619
0
        return drain_rcvbuf(rx->fd);
1620
0
    }
1621
0
}
1622
1623
/* Sends 'batch' on the AF_PACKET socket 'sock' bound via 'ifindex', using
 * a single (resumed) sendmmsg() call.  When 'tso' is set, a virtio-net
 * header sized for 'mtu' is prepended to each packet first; packets whose
 * header cannot be built are counted as tx drops and skipped.  Returns 0
 * or a positive errno from the first failing sendmmsg(). */
static int
netdev_linux_sock_batch_send(struct netdev *netdev_, int sock, int ifindex,
                             bool tso, int mtu, struct dp_packet_batch *batch)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const size_t size = dp_packet_batch_size(batch);
    /* We don't bother setting most fields in sockaddr_ll because the
     * kernel ignores them for SOCK_RAW. */
    struct sockaddr_ll sll = { .sll_family = AF_PACKET,
                               .sll_ifindex = ifindex };

    struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * size);
    struct iovec *iov = xmalloc(sizeof(*iov) * size);
    struct dp_packet *packet;
    /* 'cnt' counts the packets actually queued; it can be smaller than
     * 'size' when TSO header preparation drops some. */
    int cnt = 0;

    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        if (tso) {
            int ret = netdev_linux_prepend_vnet_hdr(packet, mtu);

            if (OVS_UNLIKELY(ret)) {
                netdev->tx_dropped += 1;
                VLOG_WARN_RL(&rl, "%s: Prepend vnet hdr failed, packet "
                                  "dropped. %s", netdev_get_name(netdev_),
                             ovs_strerror(ret));
                continue;
            }
         }

        /* One iovec per packet; each mmsghdr addresses the same 'sll'. */
        iov[cnt].iov_base = dp_packet_data(packet);
        iov[cnt].iov_len = dp_packet_size(packet);
        mmsg[cnt].msg_hdr = (struct msghdr) { .msg_name = &sll,
                                              .msg_namelen = sizeof sll,
                                              .msg_iov = &iov[cnt],
                                              .msg_iovlen = 1 };
        cnt++;
    }

    /* sendmmsg() may send fewer than requested; resume from the first
     * unsent message ('ofs') until all are out or an error occurs. */
    int error = 0;
    for (uint32_t ofs = 0; ofs < cnt;) {
        ssize_t retval;
        do {
            retval = sendmmsg(sock, mmsg + ofs, cnt - ofs, 0);
            error = retval < 0 ? errno : 0;
        } while (error == EINTR);
        if (error) {
            break;
        }
        ofs += retval;
    }

    free(mmsg);
    free(iov);
    return error;
}
1678
1679
/* Use the tap fd to send 'batch' to tap device 'netdev'.  Using the tap fd is
 * essential, because packets sent to a tap device with an AF_PACKET socket
 * will loop back to be *received* again on the tap device.  This doesn't occur
 * on other interface types because we attach a socket filter to the rx
 * socket.
 *
 * Returns 0 on success (including packets dropped because the device is down
 * or a vnet header could not be prepended — those are counted in
 * 'tx_dropped', not reported as errors), otherwise a positive errno value.
 * EMSGSIZE is returned on a partial write. */
static int
netdev_linux_tap_batch_send(struct netdev *netdev_, int mtu,
                            struct dp_packet_batch *batch)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct dp_packet *packet;

    /* The Linux tap driver returns EIO if the device is not up,
     * so if the device is not up, don't waste time sending it.
     * However, if the device is in another network namespace
     * then OVS can't retrieve the state. In that case, send the
     * packets anyway. */
    if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
        netdev->tx_dropped += dp_packet_batch_size(batch);
        return 0;
    }

    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        size_t size;
        ssize_t retval;
        int error;

        if (OVS_LIKELY(tap_supports_vnet_hdr)) {
            /* When vnet headers are enabled on the tap fd, every write must
             * begin with a virtio-net header. */
            error = netdev_linux_prepend_vnet_hdr(packet, mtu);
            if (OVS_UNLIKELY(error)) {
                netdev->tx_dropped++;
                VLOG_WARN_RL(&rl, "%s: Prepend vnet hdr failed, packet "
                             "dropped. %s", netdev_get_name(netdev_),
                             ovs_strerror(error));
                continue;
            }
        }

        size = dp_packet_size(packet);
        do {
            /* Retry writes interrupted by signals. */
            retval = write(netdev->tap_fd, dp_packet_data(packet), size);
            error = retval < 0 ? errno : 0;
        } while (error == EINTR);

        if (error) {
            /* The Linux tap driver returns EIO if the device is not up.  From
             * the OVS side this is not an error, so we ignore it; otherwise,
             * return the error. */
            if (error != EIO) {
                return error;
            }
        } else if (retval != size) {
            /* A short write means the packet was truncated on the wire. */
            VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" "
                         "bytes of %"PRIuSIZE") on %s",
                         retval, size, netdev_get_name(netdev_));
            return EMSGSIZE;
        }
    }
    return 0;
}
1739
1740
/* Returns the NUMA node id of 'netdev''s underlying device, or 0 when it
 * cannot be determined (single-NUMA system, invalid port name, virtual
 * device, or unreadable sysfs entry).  The result is cached under
 * VALID_NUMA_ID, so the sysfs lookup happens at most once per device. */
static int
netdev_linux_get_numa_id__(struct netdev_linux *netdev)
    OVS_REQUIRES(netdev->mutex)
{
    char *numa_node_path;
    const char *name;
    int node_id;
    FILE *stream;

    if (netdev->cache_valid & VALID_NUMA_ID) {
        return netdev->numa_id;
    }

    /* Mark the cache valid up front with the default of 0; early returns
     * below then stick as "numa_id 0" on subsequent calls. */
    netdev->numa_id = 0;
    netdev->cache_valid |= VALID_NUMA_ID;

    if (ovs_numa_get_n_numas() < 2) {
        /* No need to check on system with a single NUMA node. */
        return 0;
    }

    name = netdev_get_name(&netdev->up);
    if (strpbrk(name, "/\\")) {
        /* Path separators in the port name could escape the sysfs
         * directory below, so refuse them. */
        VLOG_ERR_RL(&rl, "\"%s\" is not a valid name for a port. "
                    "A valid name must not include '/' or '\\'."
                    "Using numa_id 0", name);
        return 0;
    }

    numa_node_path = xasprintf("/sys/class/net/%s/device/numa_node", name);

    stream = fopen(numa_node_path, "r");
    if (!stream) {
        /* Virtual device does not have this info. */
        VLOG_INFO_RL(&rl, "%s: Can't open '%s': %s, using numa_id 0",
                     name, numa_node_path, ovs_strerror(errno));
        free(numa_node_path);
        return 0;
    }

    if (fscanf(stream, "%d", &node_id) != 1
        || !ovs_numa_numa_id_is_valid(node_id))  {
        VLOG_WARN_RL(&rl, "%s: Can't detect NUMA node, using numa_id 0", name);
        node_id = 0;
    }

    netdev->numa_id = node_id;
    fclose(stream);
    free(numa_node_path);
    return node_id;
}
1791
1792
static int OVS_UNUSED
1793
netdev_linux_get_numa_id(const struct netdev *netdev_)
1794
0
{
1795
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1796
0
    int numa_id;
1797
0
1798
0
    ovs_mutex_lock(&netdev->mutex);
1799
0
    numa_id = netdev_linux_get_numa_id__(netdev);
1800
0
    ovs_mutex_unlock(&netdev->mutex);
1801
0
1802
0
    return numa_id;
1803
0
}
1804
1805
/* Sends 'batch' on 'netdev'.  Returns 0 if successful, otherwise a positive
 * errno value.  Returns EAGAIN without blocking if the packet cannot be queued
 * immediately.  Returns EMSGSIZE if a partial packet was transmitted or if
 * the packet is too big or too small to transmit on the device.
 *
 * The kernel maintains a packet transmission queue, so the caller is not
 * expected to do additional queuing of packets.
 *
 * Ownership of 'batch' is taken: it is freed on every path, success or
 * failure. */
static int
netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
                  struct dp_packet_batch *batch,
                  bool concurrent_txq OVS_UNUSED)
{
    bool tso = userspace_tso_enabled();
    int mtu = ETH_PAYLOAD_MAX;
    int error = 0;
    int sock = 0;

    if (tso) {
        /* TSO paths need the real device MTU to build vnet headers. */
        netdev_linux_get_mtu__(netdev_linux_cast(netdev_), &mtu);
    }

    if (!is_tap_netdev(netdev_)) {
        /* Non-tap devices transmit through the shared AF_PACKET socket,
         * which only works for devices in the local network namespace. */
        if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_))) {
            error = EOPNOTSUPP;
            goto free_batch;
        }

        sock = af_packet_sock();
        if (sock < 0) {
            /* af_packet_sock() returns a negative errno on failure. */
            error = -sock;
            goto free_batch;
        }

        int ifindex = netdev_get_ifindex(netdev_);
        if (ifindex < 0) {
            /* netdev_get_ifindex() returns a negative errno on failure. */
            error = -ifindex;
            goto free_batch;
        }

        error = netdev_linux_sock_batch_send(netdev_, sock, ifindex, tso, mtu,
                                             batch);
    } else {
        /* Tap devices must use the tap fd to avoid packet loopback. */
        error = netdev_linux_tap_batch_send(netdev_, mtu, batch);
    }
    if (error) {
        if (error == ENOBUFS) {
            /* The Linux AF_PACKET implementation never blocks waiting
             * for room for packets, instead returning ENOBUFS.
             * Translate this into EAGAIN for the caller. */
            error = EAGAIN;
        } else {
            VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
    }

free_batch:
    dp_packet_delete_batch(batch, true);
    return error;
}
1865
1866
/* Registers with the poll loop to wake up from the next call to poll_block()
1867
 * when the packet transmission queue has sufficient room to transmit a packet
1868
 * with netdev_send().
1869
 *
1870
 * The kernel maintains a packet transmission queue, so the client is not
1871
 * expected to do additional queuing of packets.  Thus, this function is
1872
 * unlikely to ever be used.  It is included for completeness. */
1873
static void
1874
netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1875
0
{
1876
0
    if (is_tap_netdev(netdev)) {
1877
        /* TAP device always accepts packets.*/
1878
0
        poll_immediate_wake();
1879
0
    }
1880
0
}
1881
1882
/* Attempts to set 'netdev''s MAC address to 'mac'.  Returns 0 if successful,
 * otherwise a positive errno value.  EOPNOTSUPP is returned for devices in a
 * remote network namespace.  The result (including ENODEV) is cached under
 * VALID_ETHERADDR so repeated calls with the same address are cheap. */
static int
netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    enum netdev_flags old_flags = 0;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    if (netdev->cache_valid & VALID_ETHERADDR) {
        error = netdev->ether_addr_error;
        if (error || eth_addr_equals(netdev->etheraddr, mac)) {
            /* Either the last attempt failed (report the same error) or the
             * requested address is already set; nothing to do. */
            goto exit;
        }
        netdev->cache_valid &= ~VALID_ETHERADDR;
    }

    /* Tap devices must be brought down before setting the address. */
    if (is_tap_netdev(netdev_)) {
        update_flags(netdev, NETDEV_UP, 0, &old_flags);
    }
    error = set_etheraddr(netdev_get_name(netdev_), mac);
    if (!error || error == ENODEV) {
        /* Cache the outcome; ENODEV is cached too so we do not keep
         * retrying a device that has disappeared. */
        netdev->ether_addr_error = error;
        netdev->cache_valid |= VALID_ETHERADDR;
        if (!error) {
            netdev->etheraddr = mac;
        }
    }

    /* Bring the tap device back up if we took it down above. */
    if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
        update_flags(netdev, 0, NETDEV_UP, &old_flags);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1926
1927
/* Copies 'netdev''s MAC address to 'mac' which is passed as param.
 * Returns 0 on success, otherwise a positive errno value.  The address is
 * fetched via netlink first and via ioctl as a fallback, then cached under
 * VALID_ETHERADDR. */
static int
netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_ETHERADDR)) {
        /* Primary path: refresh the cache from netlink. */
        netdev_linux_update_via_netlink(netdev);
    }

    if (!(netdev->cache_valid & VALID_ETHERADDR)) {
        /* Fall back to ioctl if netlink fails */
        netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
                                                 &netdev->etheraddr);
        netdev->cache_valid |= VALID_ETHERADDR;
    }

    /* Report the cached error (possibly from the fallback above). */
    error = netdev->ether_addr_error;
    if (!error) {
        *mac = netdev->etheraddr;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1954
1955
/* Retrieves 'netdev''s MTU into '*mtup'.  Returns 0 on success, otherwise a
 * positive errno value.  Caller must hold 'netdev->mutex'.  The MTU is read
 * via netlink with an ioctl fallback and cached under VALID_MTU. */
static int
netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
{
    int error;

    if (!(netdev->cache_valid & VALID_MTU)) {
        /* Primary path: refresh the cache from netlink. */
        netdev_linux_update_via_netlink(netdev);
    }

    if (!(netdev->cache_valid & VALID_MTU)) {
        /* Fall back to ioctl if netlink fails */
        struct ifreq ifr;

        memset(&ifr, 0, sizeof ifr);
        netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
            netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
        /* On ioctl failure ifr_mtu is 0 from the memset; the cached error
         * below prevents that value from being reported to callers. */
        netdev->mtu = ifr.ifr_mtu;
        netdev->cache_valid |= VALID_MTU;
    }

    error = netdev->netdev_mtu_error;
    if (!error) {
        *mtup = netdev->mtu;
    }

    return error;
}
1982
1983
/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1984
 * in bytes, not including the hardware header; thus, this is typically 1500
1985
 * bytes for Ethernet devices. */
1986
static int
1987
netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1988
0
{
1989
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1990
0
    int error;
1991
1992
0
    ovs_mutex_lock(&netdev->mutex);
1993
0
    error = netdev_linux_get_mtu__(netdev, mtup);
1994
0
    ovs_mutex_unlock(&netdev->mutex);
1995
1996
0
    return error;
1997
0
}
1998
1999
/* Sets the maximum size of transmitted (MTU) for given device using linux
 * networking ioctl interface.
 *
 * Returns 0 on success, otherwise a positive errno value.  EOPNOTSUPP is
 * returned for devices in a remote network namespace.  The result (including
 * ENODEV) is cached under VALID_MTU. */
static int
netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ifreq ifr;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

#ifdef HAVE_AF_XDP
    /* AF_XDP imposes additional frame-size limits; reject MTUs that the
     * XDP datapath cannot handle before touching the kernel. */
    if (netdev_get_class(netdev_) == &netdev_afxdp_class) {
        error = netdev_afxdp_verify_mtu_size(netdev_, mtu);
        if (error) {
            goto exit;
        }
    }
#endif

    if (netdev->cache_valid & VALID_MTU) {
        error = netdev->netdev_mtu_error;
        if (error || netdev->mtu == mtu) {
            /* Either the last attempt failed (report the same error) or the
             * requested MTU is already set; nothing to do. */
            goto exit;
        }
        netdev->cache_valid &= ~VALID_MTU;
    }

    memset(&ifr, 0, sizeof ifr);
    ifr.ifr_mtu = mtu;

    error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
                                SIOCSIFMTU, "SIOCSIFMTU");
    if (!error || error == ENODEV) {
        /* Cache the outcome; ENODEV is cached too so we do not keep
         * retrying a device that has disappeared. */
        netdev->netdev_mtu_error = error;
        netdev->mtu = ifr.ifr_mtu;
        netdev->cache_valid |= VALID_MTU;
    }
exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2046
2047
/* Returns the ifindex of 'netdev', if successful, as a positive number.
2048
 * On failure, returns a negative errno value. */
2049
static int
2050
netdev_linux_get_ifindex(const struct netdev *netdev_)
2051
0
{
2052
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2053
0
    int ifindex, error;
2054
2055
0
    ovs_mutex_lock(&netdev->mutex);
2056
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
2057
0
        error = EOPNOTSUPP;
2058
0
        goto exit;
2059
0
    }
2060
0
    error = get_ifindex(netdev_, &ifindex);
2061
2062
0
exit:
2063
0
    ovs_mutex_unlock(&netdev->mutex);
2064
0
    return error ? -error : ifindex;
2065
0
}
2066
2067
static int
2068
netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
2069
0
{
2070
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2071
2072
0
    ovs_mutex_lock(&netdev->mutex);
2073
0
    if (netdev->miimon_interval > 0) {
2074
0
        *carrier = netdev->miimon;
2075
0
    } else {
2076
0
        *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
2077
0
    }
2078
0
    ovs_mutex_unlock(&netdev->mutex);
2079
2080
0
    return 0;
2081
0
}
2082
2083
static long long int
2084
netdev_linux_get_carrier_resets(const struct netdev *netdev_)
2085
0
{
2086
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2087
0
    long long int carrier_resets;
2088
2089
0
    ovs_mutex_lock(&netdev->mutex);
2090
0
    carrier_resets = netdev->carrier_resets;
2091
0
    ovs_mutex_unlock(&netdev->mutex);
2092
2093
0
    return carrier_resets;
2094
0
}
2095
2096
/* Issues the MII ioctl 'cmd' ('cmd_name' is used for logging) on device
 * 'name', copying '*data' in and the kernel's reply back out.  Returns 0 on
 * success, otherwise a positive errno value.
 *
 * NOTE(review): the struct mii_ioctl_data is memcpy'd over the ifr_data
 * member rather than pointed to by it — this matches the Linux convention
 * where the MII payload is embedded in the ifreq union; assumes
 * sizeof *data fits in the ifreq storage — TODO confirm against
 * linux/mii.h on all supported ABIs. */
static int
netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
                       struct mii_ioctl_data *data)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    memcpy(&ifr.ifr_data, data, sizeof *data);
    error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
    /* Copy the kernel's reply back out regardless of 'error'. */
    memcpy(data, &ifr.ifr_data, sizeof *data);

    return error;
}
2110
2111
/* Queries link status for device 'name' into '*miimon'.  Tries the MII
 * registers first (SIOCGMIIPHY then SIOCGMIIREG/BMSR); if either ioctl
 * fails, falls back to ethtool's ETHTOOL_GLINK.  '*miimon' defaults to
 * false.  Returns 0 on success, otherwise a positive errno value from the
 * last method attempted. */
static int
netdev_linux_get_miimon(const char *name, bool *miimon)
{
    struct mii_ioctl_data data;
    int error;

    *miimon = false;

    memset(&data, 0, sizeof data);
    error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
    if (!error) {
        /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
        data.reg_num = MII_BMSR;
        error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
                                       &data);

        if (!error) {
            /* BMSR_LSTATUS is the latched link-status bit. */
            *miimon = !!(data.val_out & BMSR_LSTATUS);
        }
    }
    if (error) {
        struct ethtool_cmd ecmd;

        VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
                    name);

        COVERAGE_INC(netdev_get_ethtool);
        memset(&ecmd, 0, sizeof ecmd);
        error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
                                        "ETHTOOL_GLINK");
        if (!error) {
            struct ethtool_value eval;

            /* ETHTOOL_GLINK replies with a struct ethtool_value laid over
             * the same buffer; extract it with memcpy to avoid aliasing. */
            memcpy(&eval, &ecmd, sizeof eval);
            *miimon = !!eval.data;
        } else {
            VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
        }
    }

    return error;
}
2153
2154
static int
2155
netdev_linux_set_miimon_interval(struct netdev *netdev_,
2156
                                 long long int interval)
2157
0
{
2158
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2159
2160
0
    ovs_mutex_lock(&netdev->mutex);
2161
0
    interval = interval > 0 ? MAX(interval, 100) : 0;
2162
0
    if (netdev->miimon_interval != interval) {
2163
0
        if (interval && !netdev->miimon_interval) {
2164
0
            atomic_count_inc(&miimon_cnt);
2165
0
        } else if (!interval && netdev->miimon_interval) {
2166
0
            atomic_count_dec(&miimon_cnt);
2167
0
        }
2168
2169
0
        netdev->miimon_interval = interval;
2170
0
        timer_set_expired(&netdev->miimon_timer);
2171
0
    }
2172
0
    ovs_mutex_unlock(&netdev->mutex);
2173
2174
0
    return 0;
2175
0
}
2176
2177
/* Polls miimon link status for every netdev-linux device whose miimon timer
 * has expired, updating the cached state and notifying via
 * netdev_linux_changed() when the link state flips. */
static void
netdev_linux_miimon_run(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);
        bool miimon;

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
            /* 'miimon' is always written (get_miimon defaults it to false
             * before querying), even if the query itself fails. */
            netdev_linux_get_miimon(dev->up.name, &miimon);
            if (miimon != dev->miimon) {
                dev->miimon = miimon;
                netdev_linux_changed(dev, dev->ifi_flags, 0);
            }

            /* Re-arm the timer for the next polling round. */
            timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* netdev_get_devices() takes a reference on each device. */
        netdev_close(netdev);
    }

    shash_destroy(&device_shash);
}
2206
2207
static void
2208
netdev_linux_miimon_wait(void)
2209
0
{
2210
0
    struct shash device_shash;
2211
0
    struct shash_node *node;
2212
2213
0
    shash_init(&device_shash);
2214
0
    netdev_get_devices(&netdev_linux_class, &device_shash);
2215
0
    SHASH_FOR_EACH (node, &device_shash) {
2216
0
        struct netdev *netdev = node->data;
2217
0
        struct netdev_linux *dev = netdev_linux_cast(netdev);
2218
2219
0
        ovs_mutex_lock(&dev->mutex);
2220
0
        if (dev->miimon_interval > 0) {
2221
0
            timer_wait(&dev->miimon_timer);
2222
0
        }
2223
0
        ovs_mutex_unlock(&dev->mutex);
2224
0
        netdev_close(netdev);
2225
0
    }
2226
0
    shash_destroy(&device_shash);
2227
0
}
2228
2229
/* Exchanges the values stored at '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved = *b;

    *b = *a;
    *a = saved;
}
2236
2237
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * 'src' is allowed to be misaligned.
 *
 * Only the basic packet/byte/error/drop counters and upcall counters exist
 * in vport stats; every other netdev_stats field is zeroed rather than left
 * uninitialized. */
static void
netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
                                  const struct dpif_netlink_vport *vport)
{
    /* get_32aligned_u64() reads 64-bit values that may only be 32-bit
     * aligned in the netlink message. */
    dst->rx_packets = get_32aligned_u64(&vport->stats->rx_packets);
    dst->tx_packets = get_32aligned_u64(&vport->stats->tx_packets);
    dst->rx_bytes = get_32aligned_u64(&vport->stats->rx_bytes);
    dst->tx_bytes = get_32aligned_u64(&vport->stats->tx_bytes);
    dst->rx_errors = get_32aligned_u64(&vport->stats->rx_errors);
    dst->tx_errors = get_32aligned_u64(&vport->stats->tx_errors);
    dst->rx_dropped = get_32aligned_u64(&vport->stats->rx_dropped);
    dst->tx_dropped = get_32aligned_u64(&vport->stats->tx_dropped);
    dst->multicast = 0;
    dst->collisions = 0;
    dst->rx_length_errors = 0;
    dst->rx_over_errors = 0;
    dst->rx_crc_errors = 0;
    dst->rx_frame_errors = 0;
    dst->rx_fifo_errors = 0;
    dst->rx_missed_errors = 0;
    dst->tx_aborted_errors = 0;
    dst->tx_carrier_errors = 0;
    dst->tx_fifo_errors = 0;
    dst->tx_heartbeat_errors = 0;
    dst->tx_window_errors = 0;
    dst->upcall_packets = vport->upcall_success;
    dst->upcall_errors = vport->upcall_fail;
}
2268
2269
static int
2270
get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
2271
0
{
2272
0
    struct dpif_netlink_vport reply;
2273
0
    struct ofpbuf *buf;
2274
0
    int error;
2275
2276
0
    error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
2277
0
    if (error) {
2278
0
        return error;
2279
0
    } else if (!reply.stats) {
2280
0
        ofpbuf_delete(buf);
2281
0
        return EOPNOTSUPP;
2282
0
    }
2283
2284
0
    netdev_stats_from_ovs_vport_stats(stats, &reply);
2285
2286
0
    ofpbuf_delete(buf);
2287
2288
0
    return 0;
2289
0
}
2290
2291
/* Fills '*stats' from the datapath vport layer when possible, recording the
 * outcome in 'netdev->vport_stats_error' (cached under
 * VALID_VPORT_STAT_ERROR).  Once a persistent error has been cached, the
 * vport is not queried again.  Callers inspect 'vport_stats_error' to learn
 * whether '*stats' was filled. */
static void
get_stats_via_vport(const struct netdev *netdev_,
                    struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    /* Retry while the last query succeeded or no outcome is cached yet. */
    if (!netdev->vport_stats_error ||
        !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
        int error;

        error = get_stats_via_vport__(netdev_, stats);
        if (error && error != ENOENT && error != ENODEV) {
            /* ENOENT/ENODEV just mean the port is not in the datapath;
             * anything else is worth logging. */
            VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
                         "(%s)",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
        netdev->vport_stats_error = error;
        netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
    }
}
2311
2312
/* Retrieves current device stats for 'netdev-linux'.
 *
 * Combines two sources: datapath vport stats (switch perspective) and kernel
 * netdev stats.  When both are available, vport stats are kept and the
 * kernel-only counters (multicast, collisions, detailed error breakdowns)
 * are added on top.  When only one source works, that source is used alone;
 * only the failure of both is reported as an error. */
static int
netdev_linux_get_stats(const struct netdev *netdev_,
                       struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        if (!netdev->vport_stats_error) {
            /* Vport stats succeeded; a netlink failure is not fatal. */
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* stats not available from OVS then use netdev stats. */
        *stats = dev_stats;
    } else {
        /* Both sources succeeded: augment the vport counters with fields
         * only the kernel tracks. */
        stats->multicast           += dev_stats.multicast;
        stats->collisions          += dev_stats.collisions;
        stats->rx_length_errors    += dev_stats.rx_length_errors;
        stats->rx_over_errors      += dev_stats.rx_over_errors;
        stats->rx_crc_errors       += dev_stats.rx_crc_errors;
        stats->rx_frame_errors     += dev_stats.rx_frame_errors;
        stats->rx_fifo_errors      += dev_stats.rx_fifo_errors;
        stats->rx_missed_errors    += dev_stats.rx_missed_errors;
        stats->tx_aborted_errors   += dev_stats.tx_aborted_errors;
        stats->tx_carrier_errors   += dev_stats.tx_carrier_errors;
        stats->tx_fifo_errors      += dev_stats.tx_fifo_errors;
        stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
        stats->tx_window_errors    += dev_stats.tx_window_errors;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2350
2351
/* Retrieves current device stats for 'netdev-tap' netdev or
 * netdev-internal.
 *
 * Like netdev_linux_get_stats(), but kernel counters are reported from the
 * switch's perspective (rx/tx swapped) and locally dropped packets are
 * added in at the end. */
static int
netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        if (!netdev->vport_stats_error) {
            /* Vport stats succeeded; a netlink failure is not fatal. */
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* Transmit and receive stats will appear to be swapped relative to the
         * other ports since we are the one sending the data, not a remote
         * computer.  For consistency, we swap them back here. This does not
         * apply if we are getting stats from the vport layer because it always
         * tracks stats from the perspective of the switch. */

        *stats = dev_stats;
        swap_uint64(&stats->rx_packets, &stats->tx_packets);
        swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
        swap_uint64(&stats->rx_errors, &stats->tx_errors);
        swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
        /* The detailed error breakdowns have no swapped-direction meaning
         * for a local device, so zero them. */
        stats->rx_length_errors = 0;
        stats->rx_over_errors = 0;
        stats->rx_crc_errors = 0;
        stats->rx_frame_errors = 0;
        stats->rx_fifo_errors = 0;
        stats->rx_missed_errors = 0;
        stats->tx_aborted_errors = 0;
        stats->tx_carrier_errors = 0;
        stats->tx_fifo_errors = 0;
        stats->tx_heartbeat_errors = 0;
        stats->tx_window_errors = 0;
    } else {
        /* Use kernel netdev's packet and byte counts since vport counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO
         * are enabled. */
        stats->rx_packets = dev_stats.tx_packets;
        stats->rx_bytes = dev_stats.tx_bytes;
        stats->tx_packets = dev_stats.rx_packets;
        stats->tx_bytes = dev_stats.rx_bytes;

        stats->rx_dropped          += dev_stats.tx_dropped;
        stats->tx_dropped          += dev_stats.rx_dropped;

        stats->rx_errors           += dev_stats.tx_errors;
        stats->tx_errors           += dev_stats.rx_errors;

        stats->multicast           += dev_stats.multicast;
        stats->collisions          += dev_stats.collisions;
    }
    /* Add drops accounted locally by the userspace send/receive paths. */
    stats->tx_dropped += netdev->tx_dropped;
    stats->rx_dropped += netdev->rx_dropped;
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2414
2415
static int
2416
netdev_internal_get_stats(const struct netdev *netdev_,
2417
                          struct netdev_stats *stats)
2418
0
{
2419
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2420
0
    int error;
2421
2422
0
    ovs_mutex_lock(&netdev->mutex);
2423
0
    get_stats_via_vport(netdev_, stats);
2424
0
    error = netdev->vport_stats_error;
2425
0
    ovs_mutex_unlock(&netdev->mutex);
2426
2427
0
    return error;
2428
0
}
2429
2430
/* Queries, via ETHTOOL_GSSET_INFO, how many strings the ETH_SS_FEATURES
 * string set contains for 'netdev', storing the count in '*len'.
 *
 * NOTE(review): on an unsupported string set this returns a *negative*
 * EOPNOTSUPP, unlike the positive errno convention used elsewhere in this
 * file; the visible caller only tests for nonzero, but confirm before
 * relying on the sign. */
static int
netdev_linux_read_stringset_info(struct netdev_linux *netdev, uint32_t *len)
{
    /* struct ethtool_sset_info ends in a variable-length array of set
     * lengths.  The anonymous struct overlays that array: 'pad' skips the
     * fixed header (cmd/reserved/sset_mask) and 'sset_len[0]' lines up with
     * the first returned length. */
    union {
        struct ethtool_cmd ecmd;
        struct ethtool_sset_info hdr;
        struct {
            uint64_t pad[2];
            uint32_t sset_len[1];
        };
    } sset_info;
    int error;

    sset_info.hdr.cmd = ETHTOOL_GSSET_INFO;
    sset_info.hdr.reserved = 0;
    /* Request only the features string set. */
    sset_info.hdr.sset_mask = 1ULL << ETH_SS_FEATURES;

    error = netdev_linux_do_ethtool(netdev_get_name(&netdev->up),
                                    (struct ethtool_cmd *) &sset_info,
                                    ETHTOOL_GSSET_INFO, "ETHTOOL_GSSET_INFO");
    if (error) {
        return error;
    }
    /* The kernel clears mask bits for sets it does not support. */
    if (sset_info.hdr.sset_mask & (1ULL << ETH_SS_FEATURES)) {
        *len = sset_info.sset_len[0];
        return 0;
    } else {
        /* ETH_SS_FEATURES is not supported. */
        return -EOPNOTSUPP;
    }
}
2461
2462
2463
/* Reads the kernel's ETH_SS_FEATURES string table for 'netdev' into a
 * freshly allocated ethtool_gstrings, storing it in '*pstrings' on success
 * (caller owns and must free it).  On failure '*pstrings' is set to NULL and
 * an error is returned (negative EOPNOTSUPP when the feature set is empty or
 * unsupported, mirroring netdev_linux_read_stringset_info()). */
static int
netdev_linux_read_definitions(struct netdev_linux *netdev,
                              struct ethtool_gstrings **pstrings)
{
    struct ethtool_gstrings *strings = NULL;
    uint32_t len = 0;
    int error = 0;

    error = netdev_linux_read_stringset_info(netdev, &len);
    if (error) {
        return error;
    } else if (!len) {
        return -EOPNOTSUPP;
    }

    /* Trailing flexible buffer holds 'len' fixed-width string slots. */
    strings = xzalloc(sizeof *strings + len * ETH_GSTRING_LEN);

    strings->cmd = ETHTOOL_GSTRINGS;
    strings->string_set = ETH_SS_FEATURES;
    strings->len = len;
    error = netdev_linux_do_ethtool(netdev_get_name(&netdev->up),
                                    (struct ethtool_cmd *) strings,
                                    ETHTOOL_GSTRINGS, "ETHTOOL_GSTRINGS");
    if (error) {
        goto out;
    }

    /* Force NUL termination of every slot; the kernel does not guarantee
     * it for maximum-length names. */
    for (int i = 0; i < len; i++) {
        strings->data[(i + 1) * ETH_GSTRING_LEN - 1] = 0;
    }

    *pstrings = strings;

    return 0;
out:
    *pstrings = NULL;
    free(strings);
    return error;
}
2502
2503
/* Refreshes 'netdev_->ol_flags' from the kernel's currently-active ethtool
 * features: reads the feature name table, queries the active feature bits
 * with ETHTOOL_GFEATURES, and maps the checksum/TSO feature names onto the
 * corresponding NETDEV_TX_OFFLOAD_* flags.  On any error the flags are left
 * as they were (or zeroed, if the error occurs after the GFEATURES query
 * started). */
static void
netdev_linux_set_ol(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ethtool_gfeatures *features = NULL;
    struct ethtool_gstrings *names = NULL;
    int error;

    COVERAGE_INC(netdev_get_ethtool);

    error = netdev_linux_read_definitions(netdev, &names);
    if (error) {
        return;
    }

    /* One 32-bit feature block per 32 feature names. */
    features = xzalloc(sizeof *features +
                       DIV_ROUND_UP(names->len, 32) *
                       sizeof features->features[0]);

    features->cmd = ETHTOOL_GFEATURES;
    features->size = DIV_ROUND_UP(names->len, 32);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_),
                                    (struct ethtool_cmd *) features,
                                    ETHTOOL_GFEATURES, "ETHTOOL_GFEATURES");

    if (error) {
        goto out;
    }

/* Helpers for indexing the bit 'index' within the array of 32-bit feature
 * blocks returned by ETHTOOL_GFEATURES.  (Note: these macros remain defined
 * for the rest of the translation unit.) */
#define FEATURE_WORD(blocks, index, field)  ((blocks)[(index) / 32U].field)
#define FEATURE_FIELD_FLAG(index)       (1U << (index) % 32U)
#define FEATURE_BIT_IS_SET(blocks, index, field)        \
    (FEATURE_WORD(blocks, index, field) & FEATURE_FIELD_FLAG(index))

    netdev->up.ol_flags = 0;
    /* Mapping from kernel feature-name strings to OVS offload flags. */
    static const struct {
        char *string;
        uint32_t value;
    } t_list[] = {
        {"tx-checksum-ipv4", NETDEV_TX_OFFLOAD_TCP_CKSUM |
                             NETDEV_TX_OFFLOAD_UDP_CKSUM},
        {"tx-checksum-ipv6", NETDEV_TX_OFFLOAD_TCP_CKSUM |
                             NETDEV_TX_OFFLOAD_UDP_CKSUM},
        {"tx-checksum-ip-generic", NETDEV_TX_OFFLOAD_TCP_CKSUM |
                                   NETDEV_TX_OFFLOAD_UDP_CKSUM},
        {"tx-checksum-sctp", NETDEV_TX_OFFLOAD_SCTP_CKSUM},
        {"tx-tcp-segmentation", NETDEV_TX_OFFLOAD_TCP_TSO},
    };

    /* For each known feature name, find its index in the name table and
     * check the matching 'active' bit. */
    for (int j = 0; j < ARRAY_SIZE(t_list); j++) {
        for (int i = 0; i < names->len; i++) {
            char *name = (char *) names->data + i * ETH_GSTRING_LEN;
            if (strcmp(t_list[j].string, name) == 0) {
                if (FEATURE_BIT_IS_SET(features->features, i, active)) {
                    netdev_->ol_flags |= t_list[j].value;
                }
                break;
            }
        }
    }

out:
    free(names);
    free(features);
}
2568
2569
/* Queries link capabilities with ETHTOOL_GSET and caches the results in
 * 'netdev': the supported and advertised NETDEV_F_* bitmaps, the current
 * speed and duplex, and the derived 'current' feature set.  Results are
 * memoized under the VALID_FEATURES cache bit, which is set even on
 * failure; 'get_features_error' records the outcome (0 on success) for the
 * accessor functions that follow. */
static void
netdev_linux_read_features(struct netdev_linux *netdev)
{
    struct ethtool_cmd ecmd;
    int error;

    /* Return the cached result, success or failure, if we have one. */
    if (netdev->cache_valid & VALID_FEATURES) {
        return;
    }

    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto out;
    }

    /* Supported features. */
    netdev->supported = 0;
    if (ecmd.supported & SUPPORTED_10baseT_Half) {
        netdev->supported |= NETDEV_F_10MB_HD;
    }
    if (ecmd.supported & SUPPORTED_10baseT_Full) {
        netdev->supported |= NETDEV_F_10MB_FD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Half)  {
        netdev->supported |= NETDEV_F_100MB_HD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Full) {
        netdev->supported |= NETDEV_F_100MB_FD;
    }
    if (ecmd.supported & SUPPORTED_1000baseT_Half) {
        netdev->supported |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
        (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
        netdev->supported |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
        (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
        netdev->supported |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
        netdev->supported |= NETDEV_F_40GB_FD;
    }
    if (ecmd.supported & SUPPORTED_TP) {
        netdev->supported |= NETDEV_F_COPPER;
    }
    if (ecmd.supported & SUPPORTED_FIBRE) {
        netdev->supported |= NETDEV_F_FIBER;
    }
    if (ecmd.supported & SUPPORTED_Autoneg) {
        netdev->supported |= NETDEV_F_AUTONEG;
    }
    if (ecmd.supported & SUPPORTED_Pause) {
        netdev->supported |= NETDEV_F_PAUSE;
    }
    if (ecmd.supported & SUPPORTED_Asym_Pause) {
        netdev->supported |= NETDEV_F_PAUSE_ASYM;
    }

    /* Advertised features. */
    netdev->advertised = 0;
    if (ecmd.advertising & ADVERTISED_10baseT_Half) {
        netdev->advertised |= NETDEV_F_10MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_10baseT_Full) {
        netdev->advertised |= NETDEV_F_10MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Half) {
        netdev->advertised |= NETDEV_F_100MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Full) {
        netdev->advertised |= NETDEV_F_100MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
        netdev->advertised |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
        netdev->advertised |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
        netdev->advertised |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
        netdev->advertised |= NETDEV_F_40GB_FD;
    }
    if (ecmd.advertising & ADVERTISED_TP) {
        netdev->advertised |= NETDEV_F_COPPER;
    }
    if (ecmd.advertising & ADVERTISED_FIBRE) {
        netdev->advertised |= NETDEV_F_FIBER;
    }
    if (ecmd.advertising & ADVERTISED_Autoneg) {
        netdev->advertised |= NETDEV_F_AUTONEG;
    }
    if (ecmd.advertising & ADVERTISED_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE;
    }
    if (ecmd.advertising & ADVERTISED_Asym_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE_ASYM;
    }

    /* Current settings. */
    netdev->current_speed = ethtool_cmd_speed(&ecmd);
    if (netdev->current_speed == SPEED_10) {
        netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
    } else if (netdev->current_speed == SPEED_100) {
        netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
    } else if (netdev->current_speed == SPEED_1000) {
        netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
    } else if (netdev->current_speed == SPEED_10000) {
        netdev->current = NETDEV_F_10GB_FD;
    } else if (netdev->current_speed == SPEED_40000) {
        netdev->current = NETDEV_F_40GB_FD;
    } else if (netdev->current_speed == SPEED_100000) {
        netdev->current = NETDEV_F_100GB_FD;
    } else if (netdev->current_speed == 1000000) {
        /* 1,000,000 Mb/s, i.e. 1 Tb/s; no SPEED_* macro is used here. */
        netdev->current = NETDEV_F_1TB_FD;
    } else if (netdev->current_speed
               && netdev->current_speed != SPEED_UNKNOWN) {
        /* Any other known, nonzero speed maps to "other". */
        netdev->current = NETDEV_F_OTHER;
    } else {
        netdev->current = 0;
    }
    netdev->current_duplex = ecmd.duplex;

    if (ecmd.port == PORT_TP) {
        netdev->current |= NETDEV_F_COPPER;
    } else if (ecmd.port == PORT_FIBRE) {
        netdev->current |= NETDEV_F_FIBER;
    }

    if (ecmd.autoneg) {
        netdev->current |= NETDEV_F_AUTONEG;
    }

out:
    /* Cache the outcome, even on error, so callers do not re-query until
     * the cache is invalidated. */
    netdev->cache_valid |= VALID_FEATURES;
    netdev->get_features_error = error;
}
2723
2724
/* Stores the features supported by 'netdev' into of '*current', '*advertised',
2725
 * '*supported', and '*peer'.  Each value is a bitmap of NETDEV_* bits.
2726
 * Returns 0 if successful, otherwise a positive errno value. */
2727
static int
2728
netdev_linux_get_features(const struct netdev *netdev_,
2729
                          enum netdev_features *current,
2730
                          enum netdev_features *advertised,
2731
                          enum netdev_features *supported,
2732
                          enum netdev_features *peer)
2733
0
{
2734
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2735
0
    int error;
2736
2737
0
    ovs_mutex_lock(&netdev->mutex);
2738
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
2739
0
        error = EOPNOTSUPP;
2740
0
        goto exit;
2741
0
    }
2742
2743
0
    netdev_linux_read_features(netdev);
2744
0
    if (!netdev->get_features_error) {
2745
0
        *current = netdev->current;
2746
0
        *advertised = netdev->advertised;
2747
0
        *supported = netdev->supported;
2748
0
        *peer = 0;              /* XXX */
2749
0
    }
2750
0
    error = netdev->get_features_error;
2751
2752
0
exit:
2753
0
    ovs_mutex_unlock(&netdev->mutex);
2754
0
    return error;
2755
0
}
2756
2757
static int
2758
netdev_linux_get_speed_locked(struct netdev_linux *netdev,
2759
                              uint32_t *current, uint32_t *max)
2760
0
{
2761
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
2762
0
        *current = *max = 0;
2763
0
        return EOPNOTSUPP;
2764
0
    }
2765
2766
0
    netdev_linux_read_features(netdev);
2767
0
    if (!netdev->get_features_error) {
2768
0
        *current = netdev->current_speed == SPEED_UNKNOWN
2769
0
                   ? 0 : netdev->current_speed;
2770
0
        *max = MIN(UINT32_MAX,
2771
0
                   netdev_features_to_bps(netdev->supported, 0) / 1000000ULL);
2772
0
    } else {
2773
0
        *current = *max = 0;
2774
0
    }
2775
0
    return netdev->get_features_error;
2776
0
}
2777
2778
static int
2779
netdev_linux_get_speed(const struct netdev *netdev_, uint32_t *current,
2780
                       uint32_t *max)
2781
0
{
2782
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2783
0
    int error;
2784
2785
0
    ovs_mutex_lock(&netdev->mutex);
2786
0
    error = netdev_linux_get_speed_locked(netdev, current, max);
2787
0
    ovs_mutex_unlock(&netdev->mutex);
2788
0
    return error;
2789
0
}
2790
2791
/* Stores true in '*full_duplex' if 'netdev_' currently runs at full duplex,
 * false for half duplex.  Returns 0 on success; EOPNOTSUPP if the device is
 * in a remote network namespace or its duplex is unknown; otherwise the
 * positive errno recorded by netdev_linux_read_features(). */
static int
netdev_linux_get_duplex(const struct netdev *netdev_, bool *full_duplex)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int err;

    ovs_mutex_lock(&netdev->mutex);

    if (netdev_linux_netnsid_is_remote(netdev)) {
        err = EOPNOTSUPP;
        goto exit;
    }

    netdev_linux_read_features(netdev);
    err = netdev->get_features_error;
    if (!err && netdev->current_duplex == DUPLEX_UNKNOWN) {
        err = EOPNOTSUPP;
        goto exit;
    }
    /* NOTE(review): when 'err' is nonzero we still fall through and write
     * '*full_duplex' from the cached duplex value; callers presumably
     * ignore the output on error. */
    *full_duplex = netdev->current_duplex == DUPLEX_FULL;

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return err;
}
2816
2817
/* Set the features advertised by 'netdev' to 'advertise' (a bitmap of
 * NETDEV_F_* bits), translated to ethtool ADVERTISED_* bits and applied
 * with ETHTOOL_SSET.  Returns 0 on success, otherwise a positive errno
 * value. */
static int
netdev_linux_set_advertisements(struct netdev *netdev_,
                                enum netdev_features advertise)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ethtool_cmd ecmd;
    int error;

    ovs_mutex_lock(&netdev->mutex);

    COVERAGE_INC(netdev_get_ethtool);

    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    /* Read the current settings first so that only the 'advertising'
     * field is modified below. */
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto exit;
    }

    ecmd.advertising = 0;
    if (advertise & NETDEV_F_10MB_HD) {
        ecmd.advertising |= ADVERTISED_10baseT_Half;
    }
    if (advertise & NETDEV_F_10MB_FD) {
        ecmd.advertising |= ADVERTISED_10baseT_Full;
    }
    if (advertise & NETDEV_F_100MB_HD) {
        ecmd.advertising |= ADVERTISED_100baseT_Half;
    }
    if (advertise & NETDEV_F_100MB_FD) {
        ecmd.advertising |= ADVERTISED_100baseT_Full;
    }
    if (advertise & NETDEV_F_1GB_HD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Half;
    }
    if (advertise & NETDEV_F_1GB_FD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Full;
    }
    if (advertise & NETDEV_F_10GB_FD) {
        ecmd.advertising |= ADVERTISED_10000baseT_Full;
    }
    if (advertise & NETDEV_F_COPPER) {
        ecmd.advertising |= ADVERTISED_TP;
    }
    if (advertise & NETDEV_F_FIBER) {
        ecmd.advertising |= ADVERTISED_FIBRE;
    }
    if (advertise & NETDEV_F_AUTONEG) {
        ecmd.advertising |= ADVERTISED_Autoneg;
    }
    if (advertise & NETDEV_F_PAUSE) {
        ecmd.advertising |= ADVERTISED_Pause;
    }
    if (advertise & NETDEV_F_PAUSE_ASYM) {
        ecmd.advertising |= ADVERTISED_Asym_Pause;
    }
    COVERAGE_INC(netdev_set_ethtool);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_SSET, "ETHTOOL_SSET");

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2887
2888
/* Opens the netlink nests for a TC "police" action in 'request': the
 * numbered action nest at 'prio' (returned in '*act_offset') and the
 * TCA_ACT_OPTIONS nest (returned in '*offset').  The caller must close
 * both with nl_msg_act_police_end_nest(). */
static void
nl_msg_act_police_start_nest(struct ofpbuf *request, uint32_t prio,
                             size_t *offset, size_t *act_offset,
                             bool single_action)
{
    *act_offset = nl_msg_start_nested(request, prio);
    nl_msg_put_string(request, TCA_ACT_KIND, "police");

    /* If police action is added independently from filter, we need to
     * add action flag according to tc-policy. */
    if (single_action) {
        nl_msg_put_act_tc_policy_flag(request);
    }
    *offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
}
2903
2904
/* Closes the nests opened by nl_msg_act_police_start_nest(), after
 * recording 'notexceed_act' (the verdict for conforming packets) as
 * TCA_POLICE_RESULT.  'offset' and 'act_offset' are the values produced by
 * the matching start call, closed in inner-to-outer order. */
static void
nl_msg_act_police_end_nest(struct ofpbuf *request, size_t offset,
                           size_t act_offset, uint32_t notexceed_act)
{
    nl_msg_put_u32(request, TCA_POLICE_RESULT, notexceed_act);
    nl_msg_end_nested(request, offset);
    nl_msg_end_nested(request, act_offset);
}
2912
2913
/* Appends a TC "police" action to 'request', rate-limiting to
 * 'kbits_rate'/'kbits_burst' and/or 'pkts_rate'/'pkts_burst'.  'index'
 * identifies the action instance, 'notexceed_act' is the verdict for
 * conforming packets, and 'single_action' marks an action added
 * independently of a filter.  Does nothing if both rates are zero. */
static void
nl_msg_put_act_police(struct ofpbuf *request, uint32_t index,
                      uint64_t kbits_rate, uint64_t kbits_burst,
                      uint64_t pkts_rate, uint64_t pkts_burst,
                      uint32_t notexceed_act, bool single_action)
{
    uint64_t bytes_rate = kbits_rate / 8 * 1000;  /* kbits/s -> bytes/s. */
    size_t offset, act_offset;
    struct tc_police police;
    uint32_t prio = 0;

    if (!kbits_rate && !pkts_rate) {
        return;
    }

    tc_policer_init(&police, kbits_rate, kbits_burst);
    police.index = index;

    nl_msg_act_police_start_nest(request, ++prio, &offset, &act_offset,
                                 single_action);
    if (police.rate.rate) {
        tc_put_rtab(request, TCA_POLICE_RATE, &police.rate, bytes_rate);
    }
#ifdef HAVE_TCA_POLICE_PKTRATE64
    /* Rates beyond 32 bits need the 64-bit attribute in addition. */
    if (bytes_rate > UINT32_MAX) {
        nl_msg_put_u64(request, TCA_POLICE_RATE64, bytes_rate);
    }
#endif
    if (pkts_rate) {
        uint64_t pkt_burst_ticks;
        /* Here tc_bytes_to_ticks is used to convert packets rather than bytes
           to ticks. */
        pkt_burst_ticks = tc_bytes_to_ticks(pkts_rate, pkts_burst);
        nl_msg_put_u64(request, TCA_POLICE_PKTRATE64, pkts_rate);
        nl_msg_put_u64(request, TCA_POLICE_PKTBURST64, pkt_burst_ticks);
    }
    nl_msg_put_unspec(request, TCA_POLICE_TBF, &police, sizeof police);
    nl_msg_act_police_end_nest(request, offset, act_offset, notexceed_act);
}
2952
2953
/* Installs a "matchall" classifier with a police action on 'netdev''s
 * ingress qdisc at the reserved policing priority, limiting traffic to
 * 'kbits_rate'/'kbits_burst' and/or 'kpkts_rate'/'kpkts_burst'
 * (kilo-packets).  Returns 0 on success, otherwise a positive errno
 * value. */
static int
tc_add_matchall_policer(struct netdev *netdev, uint64_t kbits_rate,
                        uint32_t kbits_burst, uint32_t kpkts_rate,
                        uint32_t kpkts_burst)
{
    uint16_t eth_type = (OVS_FORCE uint16_t) htons(ETH_P_ALL);
    size_t basic_offset, action_offset;
    uint16_t prio = TC_RESERVED_PRIORITY_POLICE;
    int ifindex, err = 0;
    struct ofpbuf request;
    struct ofpbuf *reply;
    struct tcmsg *tcmsg;
    uint32_t handle = 1;

    err = get_ifindex(netdev, &ifindex);
    if (err) {
        return err;
    }

    /* NLM_F_ECHO asks the kernel to echo the new filter back; the reply is
     * sanity-checked below. */
    tcmsg = tc_make_request(ifindex, RTM_NEWTFILTER, NLM_F_CREATE | NLM_F_ECHO,
                            &request);
    tcmsg->tcm_parent = TC_INGRESS_PARENT;
    tcmsg->tcm_info = tc_make_handle(prio, eth_type);
    tcmsg->tcm_handle = handle;

    nl_msg_put_string(&request, TCA_KIND, "matchall");
    basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    action_offset = nl_msg_start_nested(&request, TCA_MATCHALL_ACT);
    /* Convert kilo-packets to packets for the police action. */
    nl_msg_put_act_police(&request, 0, kbits_rate, kbits_burst,
                          kpkts_rate * 1000ULL, kpkts_burst * 1000ULL,
                          TC_ACT_UNSPEC, false);
    nl_msg_end_nested(&request, action_offset);
    nl_msg_end_nested(&request, basic_offset);

    err = tc_transact(&request, &reply);
    if (!err) {
        /* Require at least the netlink and tc headers in the echoed reply
         * before declaring success. */
        struct ofpbuf b = ofpbuf_const_initializer(reply->data, reply->size);
        struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg);
        struct tcmsg *tc = ofpbuf_try_pull(&b, sizeof *tc);

        if (!nlmsg || !tc) {
            VLOG_ERR_RL(&rl,
                        "Failed to add match all policer, malformed reply");
            ofpbuf_delete(reply);
            return EPROTO;
        }
        ofpbuf_delete(reply);
    }

    return err;
}
3004
3005
static int
3006
tc_del_matchall_policer(struct netdev *netdev)
3007
0
{
3008
0
    int prio = TC_RESERVED_PRIORITY_POLICE;
3009
0
    uint32_t block_id = 0;
3010
0
    struct tcf_id id;
3011
0
    int ifindex;
3012
0
    int err;
3013
3014
0
    err = get_ifindex(netdev, &ifindex);
3015
0
    if (err) {
3016
0
        return err;
3017
0
    }
3018
3019
0
    id = tc_make_tcf_id(ifindex, block_id, prio, TC_INGRESS);
3020
0
    err = tc_del_filter(&id, "matchall");
3021
0
    if (err) {
3022
0
        return err;
3023
0
    }
3024
3025
0
    return 0;
3026
0
}
3027
3028
/* Attempts to set input rate limiting (policing) policy.  Returns 0 if
 * successful, otherwise a positive errno value.  A rate of 0 disables
 * policing for that dimension; a burst of 0 selects a default burst
 * (8000 kbits / 16 kpkts).  Results are cached under VALID_POLICING so
 * unchanged settings are not re-applied. */
static int
netdev_linux_set_policing(struct netdev *netdev_, uint32_t kbits_rate,
                          uint32_t kbits_burst, uint32_t kpkts_rate,
                          uint32_t kpkts_burst)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *netdev_name = netdev_get_name(netdev_);
    int ifindex;
    int error;

    kbits_burst = (!kbits_rate ? 0       /* Force to 0 if no rate specified. */
                   : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
                   : kbits_burst);       /* Stick with user-specified value. */

    kpkts_burst = (!kpkts_rate ? 0       /* Force to 0 if no rate specified. */
                   : !kpkts_burst ? 16   /* Default to 16 kpkts if 0. */
                   : kpkts_burst);       /* Stick with user-specified value. */

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto out;
    }

    if (netdev->cache_valid & VALID_POLICING) {
        error = netdev->netdev_policing_error;
        if (error || (netdev->kbits_rate == kbits_rate &&
                      netdev->kpkts_rate == kpkts_rate &&
                      netdev->kbits_burst == kbits_burst &&
                      netdev->kpkts_burst == kpkts_burst)) {
            /* Assume that settings haven't changed since we last set them. */
            goto out;
        }
        netdev->cache_valid &= ~VALID_POLICING;
    }

    COVERAGE_INC(netdev_set_policing);

    /* Use matchall for policing when offloading ovs with tc-flower. */
    if (dpif_offload_enabled()) {
        /* Always delete any previous policer, then re-add if a rate is
         * requested. */
        error = tc_del_matchall_policer(netdev_);
        if (kbits_rate || kpkts_rate) {
            error = tc_add_matchall_policer(netdev_, kbits_rate, kbits_burst,
                                            kpkts_rate, kpkts_burst);
        }
        goto out;
    }

    error = get_ifindex(netdev_, &ifindex);
    if (error) {
        goto out;
    }

    /* Remove any existing ingress qdisc. */
    error = tc_add_del_qdisc(ifindex, false, 0, TC_INGRESS);
    if (error) {
        VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
                     netdev_name, ovs_strerror(error));
        goto out;
    }

    if (kbits_rate || kpkts_rate) {
        const char *cls_name = "matchall";

        error = tc_add_del_qdisc(ifindex, true, 0, TC_INGRESS);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }

        error = tc_add_matchall_policer(netdev_, kbits_rate, kbits_burst,
                                        kpkts_rate, kpkts_burst);
        if (error == ENOENT) {
            cls_name = "basic";
            /* This error is returned when the matchall classifier is missing.
             * Fall back to the basic classifier.  */
            error = tc_add_policer(netdev_, kbits_rate, kbits_burst,
                                   kpkts_rate, kpkts_burst);
        }
        if (error){
            VLOG_WARN_RL(&rl, "%s: adding cls_%s policing action failed: %s",
                         netdev_name, cls_name, ovs_strerror(error));
            goto out;
        }
    }

out:
    if (!error) {
        netdev->kbits_rate = kbits_rate;
        netdev->kbits_burst = kbits_burst;
        netdev->kpkts_rate = kpkts_rate;
        netdev->kpkts_burst = kpkts_burst;
    }

    /* Cache success and ENODEV (device gone); other errors are retried on
     * the next call. */
    if (!error || error == ENODEV) {
        netdev->netdev_policing_error = error;
        netdev->cache_valid |= VALID_POLICING;
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3132
3133
static int
3134
netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
3135
                           struct sset *types)
3136
0
{
3137
0
    const struct tc_ops *const *opsp;
3138
0
    for (opsp = tcs; *opsp != NULL; opsp++) {
3139
0
        const struct tc_ops *ops = *opsp;
3140
0
        if (ops->tc_install && ops->ovs_name[0] != '\0') {
3141
0
            sset_add(types, ops->ovs_name);
3142
0
        }
3143
0
    }
3144
0
    return 0;
3145
0
}
3146
3147
static const struct tc_ops *
3148
tc_lookup_ovs_name(const char *name)
3149
0
{
3150
0
    const struct tc_ops *const *opsp;
3151
3152
0
    for (opsp = tcs; *opsp != NULL; opsp++) {
3153
0
        const struct tc_ops *ops = *opsp;
3154
0
        if (!strcmp(name, ops->ovs_name)) {
3155
0
            return ops;
3156
0
        }
3157
0
    }
3158
0
    return NULL;
3159
0
}
3160
3161
static const struct tc_ops *
3162
tc_lookup_linux_name(const char *name)
3163
0
{
3164
0
    const struct tc_ops *const *opsp;
3165
3166
0
    for (opsp = tcs; *opsp != NULL; opsp++) {
3167
0
        const struct tc_ops *ops = *opsp;
3168
0
        if (ops->linux_name && !strcmp(name, ops->linux_name)) {
3169
0
            return ops;
3170
0
        }
3171
0
    }
3172
0
    return NULL;
3173
0
}
3174
3175
/* Searches 'netdev_''s queue hash table for a queue with 'queue_id', using
 * the caller-supplied 'hash' of the id.  Returns the queue, or NULL if
 * there is no such queue. */
static struct tc_queue *
tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
                size_t hash)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct tc_queue *queue;

    /* Only the bucket for 'hash' needs scanning; ids that collide are
     * disambiguated by the comparison below. */
    HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
        if (queue->queue_id == queue_id) {
            return queue;
        }
    }
    return NULL;
}
3189
3190
static struct tc_queue *
3191
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
3192
0
{
3193
0
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
3194
0
}
3195
3196
static int
3197
netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
3198
                                  const char *type,
3199
                                  struct netdev_qos_capabilities *caps)
3200
0
{
3201
0
    const struct tc_ops *ops = tc_lookup_ovs_name(type);
3202
0
    if (!ops) {
3203
0
        return EOPNOTSUPP;
3204
0
    }
3205
0
    caps->n_queues = ops->n_queues;
3206
0
    return 0;
3207
0
}
3208
3209
static int
3210
netdev_linux_get_qos(const struct netdev *netdev_,
3211
                     const char **typep, struct smap *details)
3212
0
{
3213
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3214
0
    int error;
3215
3216
0
    ovs_mutex_lock(&netdev->mutex);
3217
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
3218
0
        error = EOPNOTSUPP;
3219
0
        goto exit;
3220
0
    }
3221
3222
0
    error = tc_query_qdisc(netdev_);
3223
0
    if (!error) {
3224
0
        *typep = netdev->tc->ops->ovs_name;
3225
0
        error = (netdev->tc->ops->qdisc_get
3226
0
                 ? netdev->tc->ops->qdisc_get(netdev_, details)
3227
0
                 : 0);
3228
0
    }
3229
3230
0
exit:
3231
0
    ovs_mutex_unlock(&netdev->mutex);
3232
0
    return error;
3233
0
}
3234
3235
/* Installs or reconfigures the QoS discipline 'type' (an OVS-level name
 * understood by tc_lookup_ovs_name()) on 'netdev_' using the parameters in
 * 'details'.  Returns 0 on success, otherwise a positive errno value. */
static int
netdev_linux_set_qos(struct netdev *netdev_,
                     const char *type, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const struct tc_ops *new_ops;
    int error;

    new_ops = tc_lookup_ovs_name(type);
    if (!new_ops || !new_ops->tc_install) {
        return EOPNOTSUPP;
    }

    /* The noop type is handled specially, without taking the mutex or
     * querying the kernel qdisc. */
    if (new_ops == &tc_ops_noop) {
        return new_ops->tc_install(netdev_, details);
    }

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (error) {
        goto exit;
    }

    if (new_ops == netdev->tc->ops) {
        /* Same type as the current qdisc: just reconfigure if supported. */
        error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
    } else {
        /* Delete existing qdisc. */
        error = tc_del_qdisc(netdev_);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: Failed to delete existing qdisc: %s",
                         netdev_get_name(netdev_), ovs_strerror(error));
            goto exit;
        }
        ovs_assert(netdev->tc == NULL);

        /* Install new qdisc. */
        error = new_ops->tc_install(netdev_, details);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: Failed to install new qdisc: %s",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
        /* 'netdev->tc' must be set exactly when installation succeeded. */
        ovs_assert((error == 0) == (netdev->tc != NULL));
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3288
3289
static int
3290
netdev_linux_get_queue(const struct netdev *netdev_,
3291
                       unsigned int queue_id, struct smap *details)
3292
0
{
3293
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3294
0
    int error;
3295
3296
0
    ovs_mutex_lock(&netdev->mutex);
3297
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
3298
0
        error = EOPNOTSUPP;
3299
0
        goto exit;
3300
0
    }
3301
3302
0
    error = tc_query_qdisc(netdev_);
3303
0
    if (!error) {
3304
0
        struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
3305
0
        error = (queue
3306
0
                ? netdev->tc->ops->class_get(netdev_, queue, details)
3307
0
                : ENOENT);
3308
0
    }
3309
3310
0
exit:
3311
0
    ovs_mutex_unlock(&netdev->mutex);
3312
0
    return error;
3313
0
}
3314
3315
static int
3316
netdev_linux_set_queue(struct netdev *netdev_,
3317
                       unsigned int queue_id, const struct smap *details)
3318
0
{
3319
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3320
0
    int error;
3321
3322
0
    ovs_mutex_lock(&netdev->mutex);
3323
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
3324
0
        error = EOPNOTSUPP;
3325
0
        goto exit;
3326
0
    }
3327
3328
0
    error = tc_query_qdisc(netdev_);
3329
0
    if (!error) {
3330
0
        error = (queue_id < netdev->tc->ops->n_queues
3331
0
                 && netdev->tc->ops->class_set
3332
0
                 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
3333
0
                 : EINVAL);
3334
0
    }
3335
3336
0
exit:
3337
0
    ovs_mutex_unlock(&netdev->mutex);
3338
0
    return error;
3339
0
}
3340
3341
static int
3342
netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
3343
0
{
3344
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3345
0
    int error;
3346
3347
0
    ovs_mutex_lock(&netdev->mutex);
3348
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
3349
0
        error = EOPNOTSUPP;
3350
0
        goto exit;
3351
0
    }
3352
3353
0
    error = tc_query_qdisc(netdev_);
3354
0
    if (!error) {
3355
0
        if (netdev->tc->ops->class_delete) {
3356
0
            struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
3357
0
            error = (queue
3358
0
                     ? netdev->tc->ops->class_delete(netdev_, queue)
3359
0
                     : ENOENT);
3360
0
        } else {
3361
0
            error = EINVAL;
3362
0
        }
3363
0
    }
3364
3365
0
exit:
3366
0
    ovs_mutex_unlock(&netdev->mutex);
3367
0
    return error;
3368
0
}
3369
3370
static int
3371
netdev_linux_get_queue_stats(const struct netdev *netdev_,
3372
                             unsigned int queue_id,
3373
                             struct netdev_queue_stats *stats)
3374
0
{
3375
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3376
0
    int error;
3377
3378
0
    ovs_mutex_lock(&netdev->mutex);
3379
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
3380
0
        error = EOPNOTSUPP;
3381
0
        goto exit;
3382
0
    }
3383
3384
0
    error = tc_query_qdisc(netdev_);
3385
0
    if (!error) {
3386
0
        if (netdev->tc->ops->class_get_stats) {
3387
0
            const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
3388
0
            if (queue) {
3389
0
                stats->created = queue->created;
3390
0
                error = netdev->tc->ops->class_get_stats(netdev_, queue,
3391
0
                                                         stats);
3392
0
            } else {
3393
0
                error = ENOENT;
3394
0
            }
3395
0
        } else {
3396
0
            error = EOPNOTSUPP;
3397
0
        }
3398
0
    }
3399
3400
0
exit:
3401
0
    ovs_mutex_unlock(&netdev->mutex);
3402
0
    return error;
3403
0
}
3404
3405
/* State for an in-progress rtnetlink dump of a device's traffic classes
 * (RTM_GETTCLASS); see start_queue_dump() and finish_queue_dump(). */
struct queue_dump_state {
    struct nl_dump dump;    /* Ongoing netlink dump operation. */
    struct ofpbuf buf;      /* Reusable receive buffer for dump replies. */
};
3409
3410
/* Begins a netlink dump of all traffic classes on 'netdev', initializing
 * '*state'.  Returns true on success; false if the request could not even be
 * composed (e.g. the device's ifindex is unavailable). */
static bool
start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
    if (!tcmsg) {
        return false;
    }
    tcmsg->tcm_parent = 0;      /* Dump classes under every parent. */
    nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
    /* 'request' is no longer needed once the dump has been started. */
    ofpbuf_uninit(&request);

    ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
    return true;
}
3427
3428
static int
3429
finish_queue_dump(struct queue_dump_state *state)
3430
0
{
3431
0
    ofpbuf_uninit(&state->buf);
3432
0
    return nl_dump_done(&state->dump);
3433
0
}
3434
3435
/* Iterator state shared by netdev_linux_queue_dump_{start,next,done}(). */
struct netdev_linux_queue_state {
    unsigned int *queues;   /* Snapshot of queue IDs, malloc()'d array. */
    size_t cur_queue;       /* Index of the next entry in 'queues' to visit. */
    size_t n_queues;        /* Number of elements in 'queues'. */
};
3440
3441
/* Begins iteration over 'netdev_''s queues, storing iterator state in
 * '*statep'.  Takes a snapshot of the queue IDs up front so that iteration is
 * robust against concurrent queue additions and deletions.  Returns 0 on
 * success, otherwise a positive errno value (and '*statep' is not set). */
static int
netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get) {
            struct netdev_linux_queue_state *state;
            struct tc_queue *queue;
            size_t i;

            *statep = state = xmalloc(sizeof *state);
            state->n_queues = hmap_count(&netdev->tc->queues);
            state->cur_queue = 0;
            /* Snapshot every known queue ID; dump_next re-validates each one
             * with tc_find_queue() in case it disappears meanwhile. */
            state->queues = xmalloc(state->n_queues * sizeof *state->queues);

            i = 0;
            HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
                state->queues[i++] = queue->queue_id;
            }
        } else {
            /* The current qdisc cannot report per-class configuration. */
            error = EOPNOTSUPP;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3478
3479
static int
3480
netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
3481
                             unsigned int *queue_idp, struct smap *details)
3482
0
{
3483
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3484
0
    struct netdev_linux_queue_state *state = state_;
3485
0
    int error = EOF;
3486
3487
0
    ovs_mutex_lock(&netdev->mutex);
3488
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
3489
0
        error = EOPNOTSUPP;
3490
0
        goto exit;
3491
0
    }
3492
3493
0
    while (state->cur_queue < state->n_queues) {
3494
0
        unsigned int queue_id = state->queues[state->cur_queue++];
3495
0
        struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
3496
3497
0
        if (queue) {
3498
0
            *queue_idp = queue_id;
3499
0
            error = netdev->tc->ops->class_get(netdev_, queue, details);
3500
0
            break;
3501
0
        }
3502
0
    }
3503
3504
0
exit:
3505
0
    ovs_mutex_unlock(&netdev->mutex);
3506
0
    return error;
3507
0
}
3508
3509
static int
3510
netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
3511
                             void *state_)
3512
0
{
3513
0
    struct netdev_linux_queue_state *state = state_;
3514
3515
0
    free(state->queues);
3516
0
    free(state);
3517
0
    return 0;
3518
0
}
3519
3520
/* Dumps statistics for all of 'netdev_''s queues by walking a netlink
 * RTM_GETTCLASS dump and invoking 'cb' (with 'aux') for each class the
 * qdisc's class_dump_stats callback reports.  Returns 0 on success or a
 * positive errno value; when several steps fail, the last failure wins. */
static int
netdev_linux_dump_queue_stats(const struct netdev *netdev_,
                              netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        struct queue_dump_state state;

        if (!netdev->tc->ops->class_dump_stats) {
            error = EOPNOTSUPP;
        } else if (!start_queue_dump(netdev_, &state)) {
            error = ENODEV;
        } else {
            struct ofpbuf msg;
            int retval;

            /* Process every class message in the dump; remember the last
             * per-class failure but keep going. */
            while (nl_dump_next(&state.dump, &msg, &state.buf)) {
                retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
                                                           cb, aux);
                if (retval) {
                    error = retval;
                }
            }

            /* A failure completing the dump also overrides 'error'. */
            retval = finish_queue_dump(&state);
            if (retval) {
                error = retval;
            }
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3564
3565
static int
3566
netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
3567
                     struct in_addr netmask)
3568
0
{
3569
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3570
0
    int error;
3571
3572
0
    ovs_mutex_lock(&netdev->mutex);
3573
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
3574
0
        error = EOPNOTSUPP;
3575
0
        goto exit;
3576
0
    }
3577
3578
0
    error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
3579
0
    if (!error) {
3580
0
        if (address.s_addr != INADDR_ANY) {
3581
0
            error = do_set_addr(netdev_, SIOCSIFNETMASK,
3582
0
                                "SIOCSIFNETMASK", netmask);
3583
0
        }
3584
0
    }
3585
3586
0
exit:
3587
0
    ovs_mutex_unlock(&netdev->mutex);
3588
0
    return error;
3589
0
}
3590
3591
/* Retrieves the IP addresses currently assigned to 'netdev_' via
 * netdev_get_addrs(), storing the addresses in '*addr', the corresponding
 * netmasks in '*mask', and the entry count in '*n_cnt'.  Returns 0 on
 * success or a positive errno value. */
static int
3595
netdev_linux_get_addr_list(const struct netdev *netdev_,
3596
                          struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
3597
0
{
3598
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3599
0
    int error;
3600
3601
0
    ovs_mutex_lock(&netdev->mutex);
3602
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
3603
0
        error = EOPNOTSUPP;
3604
0
        goto exit;
3605
0
    }
3606
3607
0
    error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
3608
3609
0
exit:
3610
0
    ovs_mutex_unlock(&netdev->mutex);
3611
0
    return error;
3612
0
}
3613
3614
/* Fills '*sa' with an AF_INET sockaddr carrying 'addr' and port 0; any bytes
 * of '*sa' beyond the sockaddr_in portion are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin = {
        .sin_family = AF_INET,
        .sin_addr = addr,
        .sin_port = 0,
    };

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
3626
3627
static int
3628
do_set_addr(struct netdev *netdev,
3629
            int ioctl_nr, const char *ioctl_name, struct in_addr addr)
3630
0
{
3631
0
    struct ifreq ifr;
3632
3633
0
    memset(&ifr, 0, sizeof ifr);
3634
0
    make_in4_sockaddr(&ifr.ifr_addr, addr);
3635
0
    return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
3636
0
                               ioctl_name);
3637
0
}
3638
3639
/* Adds 'router' as a default IP gateway via the SIOCADDRT ioctl.  The
 * 'netdev' argument is unused.  Returns 0 on success or a positive errno
 * value (already logged). */
static int
netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
{
    struct in_addr any = { INADDR_ANY };
    struct rtentry rt;
    int error;

    memset(&rt, 0, sizeof rt);
    /* Destination 0.0.0.0 with mask 0.0.0.0 is the default route. */
    make_in4_sockaddr(&rt.rt_dst, any);
    make_in4_sockaddr(&rt.rt_gateway, router);
    make_in4_sockaddr(&rt.rt_genmask, any);
    rt.rt_flags = RTF_UP | RTF_GATEWAY;
    error = af_inet_ioctl(SIOCADDRT, &rt);
    if (error) {
        VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
    }
    return error;
}
3658
3659
/* Finds the route to 'host' by scanning /proc/net/route.  On success, stores
 * the next-hop gateway in '*next_hop' (0.0.0.0 if the host is directly
 * reachable), a malloc()'d copy of the egress interface name in
 * '*netdev_name', and returns 0.  Otherwise returns a positive errno value;
 * ENXIO means no matching route was found. */
static int
netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
                          char **netdev_name)
{
    static const char fn[] = "/proc/net/route";
    FILE *stream;
    char line[256];
    int ln;

    *netdev_name = NULL;
    stream = fopen(fn, "r");
    if (stream == NULL) {
        VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
        return errno;
    }

    ln = 0;
    while (fgets(line, sizeof line, stream)) {
        if (++ln >= 2) {    /* Line 1 is the column-header line; skip it. */
            char iface[17];
            ovs_be32 dest, gateway, mask;
            int refcnt, metric, mtu;
            unsigned int flags, use, window, irtt;

            /* Field layout matches the kernel's /proc/net/route format. */
            if (!ovs_scan(line,
                          "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
                          " %d %u %u\n",
                          iface, &dest, &gateway, &flags, &refcnt,
                          &use, &metric, &mask, &mtu, &window, &irtt)) {
                VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
                        fn, ln, line);
                continue;
            }
            if (!(flags & RTF_UP)) {
                /* Skip routes that aren't up. */
                continue;
            }

            /* The output of 'dest', 'mask', and 'gateway' were given in
             * network byte order, so we don't need any endian conversions
             * here. */
            if ((dest & mask) == (host->s_addr & mask)) {
                if (!gateway) {
                    /* The host is directly reachable. */
                    next_hop->s_addr = 0;
                } else {
                    /* To reach the host, we must go through a gateway. */
                    next_hop->s_addr = gateway;
                }
                *netdev_name = xstrdup(iface);
                fclose(stream);
                return 0;
            }
        }
    }

    fclose(stream);
    return ENXIO;
}
3718
3719
/* Fills 'smap' with "driver_name", "driver_version", and "firmware_version"
 * for 'netdev_', querying the kernel with ETHTOOL_GDRVINFO on first use and
 * caching the result (VALID_DRVINFO).  Returns 0 on success or a positive
 * errno value from the ethtool query. */
int
netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_DRVINFO)) {
        /* The drvinfo buffer is passed through the generic ethtool_cmd
         * interface used by netdev_linux_do_ethtool(). */
        struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;

        COVERAGE_INC(netdev_get_ethtool);
        memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
        error = netdev_linux_do_ethtool(netdev->up.name,
                                        cmd,
                                        ETHTOOL_GDRVINFO,
                                        "ETHTOOL_GDRVINFO");
        if (!error) {
            netdev->cache_valid |= VALID_DRVINFO;
        }
    }

    if (!error) {
        smap_add(smap, "driver_name", netdev->drvinfo.driver);
        smap_add(smap, "driver_version", netdev->drvinfo.version);
        smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
3749
3750
static int
3751
netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
3752
                           struct smap *smap)
3753
0
{
3754
0
    smap_add(smap, "driver_name", "openvswitch");
3755
0
    return 0;
3756
0
}
3757
3758
/* Returns the TC block ID for 'netdev_': its ifindex when the device is a
 * LAG primary member, otherwise 0 (meaning no block). */
static uint32_t
netdev_linux_get_block_id(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    uint32_t block_id = 0;

    ovs_mutex_lock(&netdev->mutex);
    /* Ensure the linux netdev has had its fields populated. */
    if (!(netdev->cache_valid & VALID_IFINDEX)) {
        netdev_linux_update_via_netlink(netdev);
    }

    /* Only assigning block ids to linux netdevs that are
     * LAG primary members. */
    if (netdev->is_lag_primary) {
        block_id = netdev->ifindex;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return block_id;
}
3779
3780
/* Looks up the ARP table entry for 'ip' on 'netdev'.  If one exists and can be
3781
 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
3782
 * returns 0.  Otherwise, it returns a positive errno value; in particular,
3783
 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
3784
static int
netdev_linux_arp_lookup(const struct netdev *netdev,
                        ovs_be32 ip, struct eth_addr *mac)
{
    struct arpreq r;
    struct sockaddr_in sin;
    int retval;

    /* Build the SIOCGARP request: protocol address 'ip' on device 'netdev'. */
    memset(&r, 0, sizeof r);
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr.s_addr = ip;
    sin.sin_port = 0;
    memcpy(&r.arp_pa, &sin, sizeof sin);
    r.arp_ha.sa_family = ARPHRD_ETHER;
    r.arp_flags = 0;
    ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
    COVERAGE_INC(netdev_arp_lookup);
    retval = af_inet_ioctl(SIOCGARP, &r);
    if (!retval) {
        memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
    } else if (retval != ENXIO) {
        /* ENXIO just means there is no entry; anything else is unexpected
         * and worth logging (rate-limited). */
        VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
                     netdev_get_name(netdev), IP_ARGS(ip),
                     ovs_strerror(retval));
    }
    return retval;
}
3812
3813
static unsigned int
3814
nd_to_iff_flags(enum netdev_flags nd)
3815
0
{
3816
0
    unsigned int iff = 0;
3817
0
    if (nd & NETDEV_UP) {
3818
0
        iff |= IFF_UP;
3819
0
    }
3820
0
    if (nd & NETDEV_PROMISC) {
3821
0
        iff |= IFF_PROMISC;
3822
0
    }
3823
0
    if (nd & NETDEV_LOOPBACK) {
3824
0
        iff |= IFF_LOOPBACK;
3825
0
    }
3826
0
    return iff;
3827
0
}
3828
3829
static int
3830
iff_to_nd_flags(unsigned int iff)
3831
0
{
3832
0
    enum netdev_flags nd = 0;
3833
0
    if (iff & IFF_UP) {
3834
0
        nd |= NETDEV_UP;
3835
0
    }
3836
0
    if (iff & IFF_PROMISC) {
3837
0
        nd |= NETDEV_PROMISC;
3838
0
    }
3839
0
    if (iff & IFF_LOOPBACK) {
3840
0
        nd |= NETDEV_LOOPBACK;
3841
0
    }
3842
0
    return nd;
3843
0
}
3844
3845
/* Applies the flag change (clear 'off', then set 'on') to 'netdev' and
 * stores the previous flags in '*old_flagsp'.  Caller must hold
 * netdev->mutex.  Returns 0 or the positive errno value from set_flags();
 * the cached flags are refreshed from the kernel either way. */
static int
update_flags(struct netdev_linux *netdev, enum netdev_flags off,
             enum netdev_flags on, enum netdev_flags *old_flagsp)
    OVS_REQUIRES(netdev->mutex)
{
    unsigned int old_flags, new_flags;
    int error = 0;

    old_flags = netdev->ifi_flags;
    *old_flagsp = iff_to_nd_flags(old_flags);
    new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
    if (new_flags != old_flags) {
        error = set_flags(netdev_get_name(&netdev->up), new_flags);
        /* Re-read from the kernel so the cache reflects what actually took
         * effect, even if set_flags() partially failed. */
        get_flags(&netdev->up, &netdev->ifi_flags);
    }

    return error;
}
3863
3864
/* netdev update_flags callback: when 'on' or 'off' request a change, applies
 * it (local network namespace only); when both are zero, just reads the
 * current flags, preferring netlink and falling back to ioctl. */
static int
netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
                          enum netdev_flags on, enum netdev_flags *old_flagsp)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (on || off) {
        /* Changing flags over netlink isn't supported yet. */
        if (netdev_linux_netnsid_is_remote(netdev)) {
            error = EOPNOTSUPP;
            goto exit;
        }
        error = update_flags(netdev, off, on, old_flagsp);
    } else {
        /* Try reading flags over netlink, or fall back to ioctl. */
        if (!netdev_linux_update_via_netlink(netdev)) {
            *old_flagsp = iff_to_nd_flags(netdev->ifi_flags);
        } else {
            error = update_flags(netdev, off, on, old_flagsp);
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3892
3893
/* Function-table entries shared by every Linux-based netdev class defined
 * below; each class then supplies its own construct/destruct, stats, status,
 * and rx/tx entry points. */
#define NETDEV_LINUX_CLASS_COMMON                               \
    .run = netdev_linux_run,                                    \
    .wait = netdev_linux_wait,                                  \
    .alloc = netdev_linux_alloc,                                \
    .dealloc = netdev_linux_dealloc,                            \
    .send_wait = netdev_linux_send_wait,                        \
    .set_etheraddr = netdev_linux_set_etheraddr,                \
    .get_etheraddr = netdev_linux_get_etheraddr,                \
    .get_mtu = netdev_linux_get_mtu,                            \
    .set_mtu = netdev_linux_set_mtu,                            \
    .get_ifindex = netdev_linux_get_ifindex,                    \
    .get_carrier = netdev_linux_get_carrier,                    \
    .get_carrier_resets = netdev_linux_get_carrier_resets,      \
    .set_miimon_interval = netdev_linux_set_miimon_interval,    \
    .set_advertisements = netdev_linux_set_advertisements,      \
    .set_policing = netdev_linux_set_policing,                  \
    .get_qos_types = netdev_linux_get_qos_types,                \
    .get_qos_capabilities = netdev_linux_get_qos_capabilities,  \
    .get_qos = netdev_linux_get_qos,                            \
    .set_qos = netdev_linux_set_qos,                            \
    .get_queue = netdev_linux_get_queue,                        \
    .set_queue = netdev_linux_set_queue,                        \
    .delete_queue = netdev_linux_delete_queue,                  \
    .get_queue_stats = netdev_linux_get_queue_stats,            \
    .queue_dump_start = netdev_linux_queue_dump_start,          \
    .queue_dump_next = netdev_linux_queue_dump_next,            \
    .queue_dump_done = netdev_linux_queue_dump_done,            \
    .dump_queue_stats = netdev_linux_dump_queue_stats,          \
    .set_in4 = netdev_linux_set_in4,                            \
    .get_addr_list = netdev_linux_get_addr_list,                \
    .add_router = netdev_linux_add_router,                      \
    .get_next_hop = netdev_linux_get_next_hop,                  \
    .arp_lookup = netdev_linux_arp_lookup,                      \
    .update_flags = netdev_linux_update_flags,                  \
    .rxq_alloc = netdev_linux_rxq_alloc,                        \
    .rxq_dealloc = netdev_linux_rxq_dealloc,                    \
    .rxq_wait = netdev_linux_rxq_wait,                          \
    .rxq_drain = netdev_linux_rxq_drain

/* Ordinary Linux network devices ("system" ports). */
const struct netdev_class netdev_linux_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "system",
    .is_pmd = false,
    .construct = netdev_linux_construct,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_linux_get_stats,
    .get_features = netdev_linux_get_features,
    .get_speed = netdev_linux_get_speed,
    .get_duplex = netdev_linux_get_duplex,
    .get_status = netdev_linux_get_status,
    .get_block_id = netdev_linux_get_block_id,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};

/* Linux tap devices; only construct and stats differ from "system". */
const struct netdev_class netdev_tap_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "tap",
    .is_pmd = false,
    .construct = netdev_linux_construct_tap,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_tap_get_stats,
    .get_features = netdev_linux_get_features,
    .get_speed = netdev_linux_get_speed,
    .get_duplex = netdev_linux_get_duplex,
    .get_status = netdev_linux_get_status,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};

/* OVS "internal" devices. */
const struct netdev_class netdev_internal_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "internal",
    .is_pmd = false,
    .construct = netdev_linux_construct,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_internal_get_stats,
    .get_status = netdev_internal_get_status,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};

#ifdef HAVE_AF_XDP
/* Entries shared by the two AF_XDP classes below. */
#define NETDEV_AFXDP_CLASS_COMMON                               \
    .construct = netdev_afxdp_construct,                        \
    .destruct = netdev_afxdp_destruct,                          \
    .get_stats = netdev_afxdp_get_stats,                        \
    .get_custom_stats = netdev_afxdp_get_custom_stats,          \
    .get_status = netdev_afxdp_get_status,                      \
    .set_config = netdev_afxdp_set_config,                      \
    .get_config = netdev_afxdp_get_config,                      \
    .reconfigure = netdev_afxdp_reconfigure,                    \
    .get_numa_id = netdev_linux_get_numa_id,                    \
    .send = netdev_afxdp_batch_send,                            \
    .rxq_construct = netdev_afxdp_rxq_construct,                \
    .rxq_destruct = netdev_afxdp_rxq_destruct,                  \
    .rxq_recv = netdev_afxdp_rxq_recv

/* AF_XDP device driven by a PMD thread. */
const struct netdev_class netdev_afxdp_class = {
    NETDEV_LINUX_CLASS_COMMON,
    NETDEV_AFXDP_CLASS_COMMON,
    .type = "afxdp",
    .is_pmd = true,
};

/* AF_XDP device serviced without a dedicated PMD thread. */
const struct netdev_class netdev_afxdp_nonpmd_class = {
    NETDEV_LINUX_CLASS_COMMON,
    NETDEV_AFXDP_CLASS_COMMON,
    .type = "afxdp-nonpmd",
    .is_pmd = false,
};
#endif
4011

4012
4013
/* CoDel is classless, so it exposes no configurable queues. */
#define CODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_CODEL_TARGET   1
#define TCA_CODEL_LIMIT    2
#define TCA_CODEL_INTERVAL 3

/* In-memory representation of a CoDel qdisc configuration.  The parameter
 * values are passed straight through as the TCA_CODEL_* netlink attributes
 * (units are whatever the kernel's CoDel implementation uses — presumably
 * microseconds for target/interval and packets for limit; confirm against
 * tc-codel(8)). */
struct codel {
    struct tc tc;           /* Base tc state; see codel_get__(). */
    uint32_t target;        /* TCA_CODEL_TARGET value. */
    uint32_t limit;         /* TCA_CODEL_LIMIT value. */
    uint32_t interval;      /* TCA_CODEL_INTERVAL value. */
};
4029
4030
static struct codel *
4031
codel_get__(const struct netdev *netdev_)
4032
0
{
4033
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4034
0
    return CONTAINER_OF(netdev->tc, struct codel, tc);
4035
0
}
4036
4037
static void
4038
codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
4039
                uint32_t interval)
4040
0
{
4041
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4042
0
    struct codel *codel;
4043
4044
0
    codel = xmalloc(sizeof *codel);
4045
0
    tc_init(&codel->tc, &tc_ops_codel);
4046
0
    codel->target = target;
4047
0
    codel->limit = limit;
4048
0
    codel->interval = interval;
4049
4050
0
    netdev->tc = &codel->tc;
4051
0
}
4052
4053
/* Replaces 'netdev''s root qdisc with a CoDel qdisc configured with the given
 * parameters (0 selects a default: target 5000, limit 10240, interval
 * 100000).  Returns 0 on success or a positive errno value. */
static int
codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                    uint32_t interval)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval;
    int error;

    /* Remove any existing qdisc first; the new one is created from scratch. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Substitute defaults for unset (zero) parameters. */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;

    nl_msg_put_string(&request, TCA_KIND, "codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "target %u, limit %u, interval %u error %d(%s)",
                     netdev_get_name(netdev),
                     otarget, olimit, ointerval,
                     error, ovs_strerror(error));
    }
    return error;
}
4094
4095
static void
4096
codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
4097
                            const struct smap *details, struct codel *codel)
4098
0
{
4099
0
    codel->target = smap_get_ullong(details, "target", 0);
4100
0
    codel->limit = smap_get_ullong(details, "limit", 0);
4101
0
    codel->interval = smap_get_ullong(details, "interval", 0);
4102
4103
0
    if (!codel->target) {
4104
0
        codel->target = 5000;
4105
0
    }
4106
0
    if (!codel->limit) {
4107
0
        codel->limit = 10240;
4108
0
    }
4109
0
    if (!codel->interval) {
4110
0
        codel->interval = 100000;
4111
0
    }
4112
0
}
4113
4114
static int
4115
codel_tc_install(struct netdev *netdev, const struct smap *details)
4116
0
{
4117
0
    int error;
4118
0
    struct codel codel;
4119
4120
0
    codel_parse_qdisc_details__(netdev, details, &codel);
4121
0
    error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
4122
0
                                codel.interval);
4123
0
    if (!error) {
4124
0
        codel_install__(netdev, codel.target, codel.limit, codel.interval);
4125
0
    }
4126
0
    return error;
4127
0
}
4128
4129
/* Parses the nested TCA_OPTIONS payload 'nl_options' of a kernel "codel"
 * qdisc message into 'codel'.  Returns 0 on success, EPROTO if any required
 * attribute is missing or malformed. */
static int
codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
{
    /* All three attributes are mandatory u32s. */
    static const struct nl_policy tca_codel_policy[] = {
        [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
        [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
        [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];

    if (!nl_parse_nested(nl_options, tca_codel_policy,
                         attrs, ARRAY_SIZE(tca_codel_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
        return EPROTO;
    }

    codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
    codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
    codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
    return 0;
}
4151
4152
static int
4153
codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4154
0
{
4155
0
    struct nlattr *nlattr;
4156
0
    const char * kind;
4157
0
    int error;
4158
0
    struct codel codel;
4159
4160
0
    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4161
0
    if (error != 0) {
4162
0
        return error;
4163
0
    }
4164
4165
0
    error = codel_parse_tca_options__(nlattr, &codel);
4166
0
    if (error != 0) {
4167
0
        return error;
4168
0
    }
4169
4170
0
    codel_install__(netdev, codel.target, codel.limit, codel.interval);
4171
0
    return 0;
4172
0
}
4173
4174
4175
static void
4176
codel_tc_destroy(struct tc *tc)
4177
0
{
4178
0
    struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
4179
0
    tc_destroy(tc);
4180
0
    free(codel);
4181
0
}
4182
4183
static int
4184
codel_qdisc_get(const struct netdev *netdev, struct smap *details)
4185
0
{
4186
0
    const struct codel *codel = codel_get__(netdev);
4187
0
    smap_add_format(details, "target", "%u", codel->target);
4188
0
    smap_add_format(details, "limit", "%u", codel->limit);
4189
0
    smap_add_format(details, "interval", "%u", codel->interval);
4190
0
    return 0;
4191
0
}
4192
4193
static int
4194
codel_qdisc_set(struct netdev *netdev, const struct smap *details)
4195
0
{
4196
0
    struct codel codel;
4197
4198
0
    codel_parse_qdisc_details__(netdev, details, &codel);
4199
0
    codel_install__(netdev, codel.target, codel.limit, codel.interval);
4200
0
    codel_get__(netdev)->target = codel.target;
4201
0
    codel_get__(netdev)->limit = codel.limit;
4202
0
    codel_get__(netdev)->interval = codel.interval;
4203
0
    return 0;
4204
0
}
4205
4206
/* tc_ops for the CoDel qdisc: "codel" to the kernel, "linux-codel" in the
 * OVS QoS database.  CoDel is classless (n_queues is 0), so no class
 * callbacks are provided. */
static const struct tc_ops tc_ops_codel = {
    .linux_name = "codel",
    .ovs_name = "linux-codel",
    .n_queues = CODEL_N_QUEUES,
    .tc_install = codel_tc_install,
    .tc_load = codel_tc_load,
    .tc_destroy = codel_tc_destroy,
    .qdisc_get = codel_qdisc_get,
    .qdisc_set = codel_qdisc_set,
};
4216

4217
/* FQ-CoDel traffic control class. */

/* FQ-CoDel is classless, so it exposes no configurable queues. */
#define FQCODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_FQ_CODEL_TARGET     1
#define TCA_FQ_CODEL_LIMIT      2
#define TCA_FQ_CODEL_INTERVAL   3
#define TCA_FQ_CODEL_ECN        4
#define TCA_FQ_CODEL_FLOWS      5
#define TCA_FQ_CODEL_QUANTUM    6

/* In-memory representation of an FQ-CoDel qdisc configuration; each field is
 * passed through as the corresponding TCA_FQ_CODEL_* netlink attribute. */
struct fqcodel {
    struct tc tc;           /* Base tc state; see fqcodel_get__(). */
    uint32_t target;        /* TCA_FQ_CODEL_TARGET value. */
    uint32_t limit;         /* TCA_FQ_CODEL_LIMIT value. */
    uint32_t interval;      /* TCA_FQ_CODEL_INTERVAL value. */
    uint32_t flows;         /* TCA_FQ_CODEL_FLOWS value. */
    uint32_t quantum;       /* TCA_FQ_CODEL_QUANTUM value. */
};
4240
4241
static struct fqcodel *
4242
fqcodel_get__(const struct netdev *netdev_)
4243
0
{
4244
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4245
0
    return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
4246
0
}
4247
4248
static void
4249
fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
4250
                  uint32_t interval, uint32_t flows, uint32_t quantum)
4251
0
{
4252
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4253
0
    struct fqcodel *fqcodel;
4254
4255
0
    fqcodel = xmalloc(sizeof *fqcodel);
4256
0
    tc_init(&fqcodel->tc, &tc_ops_fqcodel);
4257
0
    fqcodel->target = target;
4258
0
    fqcodel->limit = limit;
4259
0
    fqcodel->interval = interval;
4260
0
    fqcodel->flows = flows;
4261
0
    fqcodel->quantum = quantum;
4262
4263
0
    netdev->tc = &fqcodel->tc;
4264
0
}
4265
4266
/* Replaces 'netdev's root qdisc with an fq_codel qdisc configured from the
 * given parameters.  A zero parameter selects the corresponding default
 * (shown inline below).  Returns 0 on success, a positive errno otherwise. */
static int
fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                      uint32_t interval, uint32_t flows, uint32_t quantum)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval, oflows,  oquantum;
    int error;

    /* Remove any existing root qdisc first; the new qdisc is created with
     * NLM_F_EXCL so it would otherwise fail if one exists. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Substitute defaults for unspecified (zero) parameters. */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;
    oflows = flows ? flows : 1024;
    oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
                                            not mtu */

    nl_msg_put_string(&request, TCA_KIND, "fq_codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
        "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
        netdev_get_name(netdev),
        otarget, olimit, ointerval, oflows, oquantum,
        error, ovs_strerror(error));
    }
    return error;
}
4312
4313
static void
4314
fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
4315
                          const struct smap *details, struct fqcodel *fqcodel)
4316
0
{
4317
0
    fqcodel->target = smap_get_ullong(details, "target", 0);
4318
0
    fqcodel->limit = smap_get_ullong(details, "limit", 0);
4319
0
    fqcodel->interval = smap_get_ullong(details, "interval", 0);
4320
0
    fqcodel->flows = smap_get_ullong(details, "flows", 0);
4321
0
    fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
4322
4323
0
    if (!fqcodel->target) {
4324
0
        fqcodel->target = 5000;
4325
0
    }
4326
0
    if (!fqcodel->limit) {
4327
0
        fqcodel->limit = 10240;
4328
0
    }
4329
0
    if (!fqcodel->interval) {
4330
0
        fqcodel->interval = 1000000;
4331
0
    }
4332
0
    if (!fqcodel->flows) {
4333
0
        fqcodel->flows = 1024;
4334
0
    }
4335
0
    if (!fqcodel->quantum) {
4336
0
        fqcodel->quantum = 1514;
4337
0
    }
4338
0
}
4339
4340
static int
4341
fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
4342
0
{
4343
0
    int error;
4344
0
    struct fqcodel fqcodel;
4345
4346
0
    fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
4347
0
    error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
4348
0
                                  fqcodel.interval, fqcodel.flows,
4349
0
                                  fqcodel.quantum);
4350
0
    if (!error) {
4351
0
        fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
4352
0
                          fqcodel.interval, fqcodel.flows, fqcodel.quantum);
4353
0
    }
4354
0
    return error;
4355
0
}
4356
4357
static int
4358
fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
4359
0
{
4360
0
    static const struct nl_policy tca_fqcodel_policy[] = {
4361
0
        [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
4362
0
        [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
4363
0
        [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
4364
0
        [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
4365
0
        [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
4366
0
    };
4367
4368
0
    struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
4369
4370
0
    if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
4371
0
                         attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
4372
0
        VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
4373
0
        return EPROTO;
4374
0
    }
4375
4376
0
    fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
4377
0
    fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
4378
0
    fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
4379
0
    fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
4380
0
    fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
4381
0
    return 0;
4382
0
}
4383
4384
static int
4385
fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4386
0
{
4387
0
    struct nlattr *nlattr;
4388
0
    const char * kind;
4389
0
    int error;
4390
0
    struct fqcodel fqcodel;
4391
4392
0
    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4393
0
    if (error != 0) {
4394
0
        return error;
4395
0
    }
4396
4397
0
    error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
4398
0
    if (error != 0) {
4399
0
        return error;
4400
0
    }
4401
4402
0
    fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
4403
0
                      fqcodel.flows, fqcodel.quantum);
4404
0
    return 0;
4405
0
}
4406
4407
static void
4408
fqcodel_tc_destroy(struct tc *tc)
4409
0
{
4410
0
    struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
4411
0
    tc_destroy(tc);
4412
0
    free(fqcodel);
4413
0
}
4414
4415
static int
4416
fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
4417
0
{
4418
0
    const struct fqcodel *fqcodel = fqcodel_get__(netdev);
4419
0
    smap_add_format(details, "target", "%u", fqcodel->target);
4420
0
    smap_add_format(details, "limit", "%u", fqcodel->limit);
4421
0
    smap_add_format(details, "interval", "%u", fqcodel->interval);
4422
0
    smap_add_format(details, "flows", "%u", fqcodel->flows);
4423
0
    smap_add_format(details, "quantum", "%u", fqcodel->quantum);
4424
0
    return 0;
4425
0
}
4426
4427
static int
4428
fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
4429
0
{
4430
0
    struct fqcodel fqcodel;
4431
4432
0
    fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
4433
0
    fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
4434
0
                      fqcodel.flows, fqcodel.quantum);
4435
0
    fqcodel_get__(netdev)->target = fqcodel.target;
4436
0
    fqcodel_get__(netdev)->limit = fqcodel.limit;
4437
0
    fqcodel_get__(netdev)->interval = fqcodel.interval;
4438
0
    fqcodel_get__(netdev)->flows = fqcodel.flows;
4439
0
    fqcodel_get__(netdev)->quantum = fqcodel.quantum;
4440
0
    return 0;
4441
0
}
4442
4443
/* tc_ops implementation for the Linux "fq_codel" qdisc, exposed to the
 * database as QoS type "linux-fq_codel".  No class callbacks are provided. */
static const struct tc_ops tc_ops_fqcodel = {
    .linux_name = "fq_codel",
    .ovs_name = "linux-fq_codel",
    .n_queues = FQCODEL_N_QUEUES,
    .tc_install = fqcodel_tc_install,
    .tc_load = fqcodel_tc_load,
    .tc_destroy = fqcodel_tc_destroy,
    .qdisc_get = fqcodel_qdisc_get,
    .qdisc_set = fqcodel_qdisc_set,
};
4453

4454
/* SFQ traffic control class. */
4455
4456
#define SFQ_N_QUEUES 0x0000

/* Cached configuration of an SFQ qdisc (see tc-sfq(8) for the kernel's
 * interpretation of the parameters). */
struct sfq {
    struct tc tc;           /* Common tc state; netdev->tc points here. */
    uint32_t quantum;       /* Per-flow dequeue quantum; defaults to MTU. */
    uint32_t perturb;       /* Hash perturbation period. */
};
4463
4464
static struct sfq *
4465
sfq_get__(const struct netdev *netdev_)
4466
0
{
4467
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4468
0
    return CONTAINER_OF(netdev->tc, struct sfq, tc);
4469
0
}
4470
4471
static void
4472
sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
4473
0
{
4474
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4475
0
    struct sfq *sfq;
4476
4477
0
    sfq = xmalloc(sizeof *sfq);
4478
0
    tc_init(&sfq->tc, &tc_ops_sfq);
4479
0
    sfq->perturb = perturb;
4480
0
    sfq->quantum = quantum;
4481
4482
0
    netdev->tc = &sfq->tc;
4483
0
}
4484
4485
/* Replaces 'netdev's root qdisc with an SFQ qdisc.  A zero 'quantum' falls
 * back to the device MTU (or the kernel default if the MTU is unknown); a
 * zero 'perturb' falls back to 10.  Returns 0 on success, a positive errno
 * otherwise. */
static int
sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
{
    struct tc_sfq_qopt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int mtu;
    int mtu_error, error;
    /* Fetch the MTU up front; the result is only consulted when 'quantum'
     * was not supplied. */
    mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);

    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* A zero-filled tc_sfq_qopt field tells the kernel to use its own
     * default for that parameter. */
    memset(&opt, 0, sizeof opt);
    if (!quantum) {
        if (!mtu_error) {
            opt.quantum = mtu; /* if we cannot find mtu, use default */
        }
    } else {
        opt.quantum = quantum;
    }

    if (!perturb) {
        opt.perturb_period = 10;
    } else {
        opt.perturb_period = perturb;
    }

    nl_msg_put_string(&request, TCA_KIND, "sfq");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "quantum %u, perturb %u error %d(%s)",
                     netdev_get_name(netdev),
                     opt.quantum, opt.perturb_period,
                     error, ovs_strerror(error));
    }
    return error;
}
4533
4534
static void
4535
sfq_parse_qdisc_details__(struct netdev *netdev,
4536
                          const struct smap *details, struct sfq *sfq)
4537
0
{
4538
0
    sfq->perturb = smap_get_ullong(details, "perturb", 0);
4539
0
    sfq->quantum = smap_get_ullong(details, "quantum", 0);
4540
4541
0
    if (!sfq->perturb) {
4542
0
        sfq->perturb = 10;
4543
0
    }
4544
4545
0
    if (!sfq->quantum) {
4546
0
        int mtu;
4547
0
        if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
4548
0
            sfq->quantum = mtu;
4549
0
        } else {
4550
0
            VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
4551
0
                         "device without mtu");
4552
0
        }
4553
0
    }
4554
0
}
4555
4556
static int
4557
sfq_tc_install(struct netdev *netdev, const struct smap *details)
4558
0
{
4559
0
    int error;
4560
0
    struct sfq sfq;
4561
4562
0
    sfq_parse_qdisc_details__(netdev, details, &sfq);
4563
0
    error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
4564
0
    if (!error) {
4565
0
        sfq_install__(netdev, sfq.quantum, sfq.perturb);
4566
0
    }
4567
0
    return error;
4568
0
}
4569
4570
static int
4571
sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4572
0
{
4573
0
    const struct tc_sfq_qopt *sfq;
4574
0
    struct nlattr *nlattr;
4575
0
    const char * kind;
4576
0
    int error;
4577
4578
0
    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4579
0
    if (error == 0) {
4580
0
        sfq = nl_attr_get(nlattr);
4581
0
        sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
4582
0
        return 0;
4583
0
    }
4584
4585
0
    return error;
4586
0
}
4587
4588
static void
4589
sfq_tc_destroy(struct tc *tc)
4590
0
{
4591
0
    struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
4592
0
    tc_destroy(tc);
4593
0
    free(sfq);
4594
0
}
4595
4596
static int
4597
sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
4598
0
{
4599
0
    const struct sfq *sfq = sfq_get__(netdev);
4600
0
    smap_add_format(details, "quantum", "%u", sfq->quantum);
4601
0
    smap_add_format(details, "perturb", "%u", sfq->perturb);
4602
0
    return 0;
4603
0
}
4604
4605
static int
4606
sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
4607
0
{
4608
0
    struct sfq sfq;
4609
4610
0
    sfq_parse_qdisc_details__(netdev, details, &sfq);
4611
0
    sfq_install__(netdev, sfq.quantum, sfq.perturb);
4612
0
    sfq_get__(netdev)->quantum = sfq.quantum;
4613
0
    sfq_get__(netdev)->perturb = sfq.perturb;
4614
0
    return 0;
4615
0
}
4616
4617
/* tc_ops implementation for the Linux "sfq" qdisc, exposed to the database
 * as QoS type "linux-sfq".  No class callbacks are provided. */
static const struct tc_ops tc_ops_sfq = {
    .linux_name = "sfq",
    .ovs_name = "linux-sfq",
    .n_queues = SFQ_N_QUEUES,
    .tc_install = sfq_tc_install,
    .tc_load = sfq_tc_load,
    .tc_destroy = sfq_tc_destroy,
    .qdisc_get = sfq_qdisc_get,
    .qdisc_set = sfq_qdisc_set,
};
4627

4628
/* netem traffic control class. */
4629
4630
/* Cached configuration of a netem qdisc.  'latency' and 'jitter' are
 * converted with tc_time_to_ticks() before being handed to the kernel;
 * 'loss' is a percentage in [0, 100] (see netem_setup_qdisc__()). */
struct netem {
    struct tc tc;           /* Common tc state; netdev->tc points here. */
    uint32_t latency;       /* Added delay. */
    uint32_t limit;         /* Queue limit; 0 means the default of 1000. */
    uint32_t loss;          /* Packet loss percentage, 0-100. */
    uint32_t jitter;        /* Delay variation. */
};
4637
4638
static struct netem *
4639
netem_get__(const struct netdev *netdev_)
4640
0
{
4641
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4642
0
    return CONTAINER_OF(netdev->tc, struct netem, tc);
4643
0
}
4644
4645
static void
4646
netem_install__(struct netdev *netdev_, uint32_t latency,
4647
                uint32_t limit, uint32_t loss, uint32_t jitter)
4648
0
{
4649
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4650
0
    struct netem *netem;
4651
4652
0
    netem = xmalloc(sizeof *netem);
4653
0
    tc_init(&netem->tc, &tc_ops_netem);
4654
0
    netem->latency = latency;
4655
0
    netem->limit = limit;
4656
0
    netem->loss = loss;
4657
0
    netem->jitter = jitter;
4658
4659
0
    netdev->tc = &netem->tc;
4660
0
}
4661
4662
/* Replaces 'netdev's root qdisc with a netem qdisc configured from the
 * given parameters.  'loss' is a percentage in [0, 100]; values above 100
 * are rejected with EINVAL.  Returns 0 on success, a positive errno
 * otherwise. */
static int
netem_setup_qdisc__(struct netdev *netdev, uint32_t latency,
                    uint32_t limit, uint32_t loss, uint32_t jitter)
{
    struct tc_netem_qopt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);

    if (!limit) {
        opt.limit = 1000;
    } else {
        opt.limit = limit;
    }

    if (loss) {
        if (loss > 100) {
            VLOG_WARN_RL(&rl,
                         "loss should be a percentage value between 0 to 100, "
                         "loss was %u", loss);
            return EINVAL;
        }
        /* The kernel expects loss as a fraction of UINT32_MAX rather than a
         * percentage, so rescale. */
        opt.loss = floor(UINT32_MAX * (loss / 100.0));
    }

    /* Kernel netem time fields are in ticks, not the units callers use. */
    opt.latency = tc_time_to_ticks(latency);
    opt.jitter = tc_time_to_ticks(jitter);

    nl_msg_put_string(&request, TCA_KIND, "netem");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                          "latency %u, limit %u, loss %u, jitter %u "
                          "error %d(%s)",
                     netdev_get_name(netdev),
                     opt.latency, opt.limit, opt.loss, opt.jitter,
                     error, ovs_strerror(error));
    }
    return error;
}
4716
4717
static void
4718
netem_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
4719
                          const struct smap *details, struct netem *netem)
4720
0
{
4721
0
    netem->latency = smap_get_ullong(details, "latency", 0);
4722
0
    netem->limit = smap_get_ullong(details, "limit", 0);
4723
0
    netem->loss = smap_get_ullong(details, "loss", 0);
4724
0
    netem->jitter = smap_get_ullong(details, "jitter", 0);
4725
4726
0
    if (!netem->limit) {
4727
0
        netem->limit = 1000;
4728
0
    }
4729
0
}
4730
4731
static int
4732
netem_tc_install(struct netdev *netdev, const struct smap *details)
4733
0
{
4734
0
    int error;
4735
0
    struct netem netem;
4736
4737
0
    netem_parse_qdisc_details__(netdev, details, &netem);
4738
0
    error = netem_setup_qdisc__(netdev, netem.latency,
4739
0
                                netem.limit, netem.loss, netem.jitter);
4740
0
    if (!error) {
4741
0
        netem_install__(netdev, netem.latency,
4742
0
                        netem.limit, netem.loss, netem.jitter);
4743
0
    }
4744
0
    return error;
4745
0
}
4746
4747
static int
4748
netem_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4749
0
{
4750
0
    const struct tc_netem_qopt *netem;
4751
0
    struct nlattr *nlattr;
4752
0
    const char *kind;
4753
0
    int error;
4754
4755
0
    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4756
0
    if (error == 0) {
4757
0
        netem = nl_attr_get(nlattr);
4758
0
        netem_install__(netdev, netem->latency,
4759
0
                        netem->limit, netem->loss, netem->jitter);
4760
0
        return 0;
4761
0
    }
4762
4763
0
    return error;
4764
0
}
4765
4766
static void
4767
netem_tc_destroy(struct tc *tc)
4768
0
{
4769
0
    struct netem *netem = CONTAINER_OF(tc, struct netem, tc);
4770
0
    tc_destroy(tc);
4771
0
    free(netem);
4772
0
}
4773
4774
static int
4775
netem_qdisc_get(const struct netdev *netdev, struct smap *details)
4776
0
{
4777
0
    const struct netem *netem = netem_get__(netdev);
4778
0
    smap_add_format(details, "latency", "%u", netem->latency);
4779
0
    smap_add_format(details, "limit", "%u", netem->limit);
4780
0
    smap_add_format(details, "loss", "%u", netem->loss);
4781
0
    smap_add_format(details, "jitter", "%u", netem->jitter);
4782
0
    return 0;
4783
0
}
4784
4785
static int
4786
netem_qdisc_set(struct netdev *netdev, const struct smap *details)
4787
0
{
4788
0
    struct netem netem;
4789
4790
0
    netem_parse_qdisc_details__(netdev, details, &netem);
4791
0
    netem_install__(netdev, netem.latency,
4792
0
                    netem.limit, netem.loss, netem.jitter);
4793
0
    netem_get__(netdev)->latency = netem.latency;
4794
0
    netem_get__(netdev)->limit = netem.limit;
4795
0
    netem_get__(netdev)->loss = netem.loss;
4796
0
    netem_get__(netdev)->jitter = netem.jitter;
4797
0
    return 0;
4798
0
}
4799
4800
/* tc_ops implementation for the Linux "netem" qdisc, exposed to the
 * database as QoS type "linux-netem".  No class callbacks are provided. */
static const struct tc_ops tc_ops_netem = {
    .linux_name = "netem",
    .ovs_name = "linux-netem",
    .n_queues = 0,
    .tc_install = netem_tc_install,
    .tc_load = netem_tc_load,
    .tc_destroy = netem_tc_destroy,
    .qdisc_get = netem_qdisc_get,
    .qdisc_set = netem_qdisc_set,
};
4810

4811
/* HTB traffic control class. */
4812
4813
0
#define HTB_N_QUEUES 0xf000
4814
0
#define HTB_RATE2QUANTUM 10
4815
4816
struct htb {
4817
    struct tc tc;
4818
    uint64_t max_rate;          /* In bytes/s. */
4819
};
4820
4821
struct htb_class {
4822
    struct tc_queue tc_queue;
4823
    uint64_t min_rate;          /* In bytes/s. */
4824
    uint64_t max_rate;          /* In bytes/s. */
4825
    unsigned int burst;         /* In bytes. */
4826
    unsigned int priority;      /* Lower values are higher priorities. */
4827
};
4828
4829
static struct htb *
4830
htb_get__(const struct netdev *netdev_)
4831
0
{
4832
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4833
0
    return CONTAINER_OF(netdev->tc, struct htb, tc);
4834
0
}
4835
4836
static void
4837
htb_install__(struct netdev *netdev_, uint64_t max_rate)
4838
0
{
4839
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4840
0
    struct htb *htb;
4841
4842
0
    htb = xmalloc(sizeof *htb);
4843
0
    tc_init(&htb->tc, &tc_ops_htb);
4844
0
    htb->max_rate = max_rate;
4845
4846
0
    netdev->tc = &htb->tc;
4847
0
}
4848
4849
/* Create an HTB qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
static int
htb_setup_qdisc__(struct netdev *netdev)
{
    size_t opt_offset;
    struct tc_htb_glob opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    /* Remove any existing root qdisc; NLM_F_EXCL would otherwise make the
     * request fail if one exists. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    nl_msg_put_string(&request, TCA_KIND, "htb");

    memset(&opt, 0, sizeof opt);
    opt.rate2quantum = HTB_RATE2QUANTUM;
    opt.version = 3;
    opt.defcls = 1;     /* Unclassified traffic goes to class 1:1. */

    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
    nl_msg_end_nested(&request, opt_offset);

    return tc_transact(&request, NULL);
}
4883
4884
/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * Returns 0 on success, a positive errno otherwise.  Requires the device to
 * have a known MTU. */
static int
htb_setup_class__(struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *class)
{
    size_t opt_offset;
    struct tc_htb_opt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;
    int mtu;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    memset(&opt, 0, sizeof opt);
    tc_fill_rate(&opt.rate, class->min_rate, mtu);
    tc_fill_rate(&opt.ceil, class->max_rate, mtu);
    /* Makes sure the quantum is at least MTU.  Setting quantum will
     * make htb ignore the r2q for this class. */
    if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
        opt.quantum = mtu;
    }
    opt.buffer = tc_calc_buffer(class->min_rate, mtu, class->burst);
    opt.cbuffer = tc_calc_buffer(class->max_rate, mtu, class->burst);
    opt.prio = class->priority;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    nl_msg_put_string(&request, TCA_KIND, "htb");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);

#ifdef HAVE_TCA_HTB_RATE64
    /* The 32-bit rate fields in tc_htb_opt saturate above 4 Gbps; newer
     * kernels accept the full 64-bit rates through separate attributes. */
    if (class->min_rate > UINT32_MAX) {
        nl_msg_put_u64(&request, TCA_HTB_RATE64, class->min_rate);
    }
    if (class->max_rate > UINT32_MAX) {
        nl_msg_put_u64(&request, TCA_HTB_CEIL64, class->max_rate);
    }
#endif
    nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);

    tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate, class->min_rate);
    tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil, class->max_rate);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min_rate=%"PRIu64" max_rate=%"PRIu64" burst=%u prio=%u "
                     "(%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate,
                     class->burst, class->priority, ovs_strerror(error));
    }
    return error;
}
4954
4955
/* Parses Netlink attributes in 'options' for HTB parameters and stores a
 * description of them into 'details'.  The description complies with the
 * specification given in the vswitch database documentation for linux-htb
 * queue details. */
static int
htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
{
    static const struct nl_policy tca_htb_policy[] = {
        [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
                            .min_len = sizeof(struct tc_htb_opt) },
#ifdef HAVE_TCA_HTB_RATE64
        [TCA_HTB_RATE64] = { .type = NL_A_U64, .optional = true },
        [TCA_HTB_CEIL64] = { .type = NL_A_U64, .optional = true },
#endif
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
    const struct tc_htb_opt *htb;

    if (!nl_parse_nested(nl_options, tca_htb_policy,
                         attrs, ARRAY_SIZE(tca_htb_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HTB class options");
        return EPROTO;
    }

    htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
    class->min_rate = htb->rate.rate;
    class->max_rate = htb->ceil.rate;
#ifdef HAVE_TCA_HTB_RATE64
    /* 64-bit attributes, when present, supersede the 32-bit fields of
     * tc_htb_opt (which saturate above 4 Gbps). */
    if (attrs[TCA_HTB_RATE64]) {
        class->min_rate = nl_attr_get_u64(attrs[TCA_HTB_RATE64]);
    }
    if (attrs[TCA_HTB_CEIL64]) {
        class->max_rate = nl_attr_get_u64(attrs[TCA_HTB_CEIL64]);
    }
#endif
    class->burst = tc_ticks_to_bytes(class->min_rate, htb->buffer);
    class->priority = htb->prio;
    return 0;
}
4995
4996
static int
4997
htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4998
                  struct htb_class *options,
4999
                  struct netdev_queue_stats *stats)
5000
0
{
5001
0
    struct nlattr *nl_options;
5002
0
    unsigned int handle;
5003
0
    int error;
5004
5005
0
    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
5006
0
    if (!error && queue_id) {
5007
0
        unsigned int major = tc_get_major(handle);
5008
0
        unsigned int minor = tc_get_minor(handle);
5009
0
        if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
5010
0
            *queue_id = minor - 1;
5011
0
        } else {
5012
0
            error = EPROTO;
5013
0
        }
5014
0
    }
5015
0
    if (!error && options) {
5016
0
        error = htb_parse_tca_options__(nl_options, options);
5017
0
    }
5018
0
    return error;
5019
0
}
5020
5021
static void
5022
htb_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
5023
                          struct htb_class *hc)
5024
0
{
5025
0
    hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
5026
0
    if (!hc->max_rate) {
5027
0
        uint32_t current_speed;
5028
0
        uint32_t max_speed OVS_UNUSED;
5029
5030
0
        netdev_linux_get_speed_locked(netdev_linux_cast(netdev),
5031
0
                                      &current_speed, &max_speed);
5032
0
        hc->max_rate = current_speed ? current_speed / 8 * 1000000ULL
5033
0
                                     : NETDEV_DEFAULT_BPS / 8;
5034
0
    }
5035
0
    hc->min_rate = hc->max_rate;
5036
0
    hc->burst = 0;
5037
0
    hc->priority = 0;
5038
0
}
5039
5040
/* Fills 'hc' with per-queue HTB configuration from 'details', clamping the
 * rates into the [MTU, qdisc max-rate] range.  Returns 0 on success, a
 * positive errno if the device MTU is unknown. */
static int
htb_parse_class_details__(struct netdev *netdev,
                          const struct smap *details, struct htb_class *hc)
{
    const struct htb *htb = htb_get__(netdev);
    int mtu, error;
    unsigned long long int max_rate_bit;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    /* HTB requires at least an mtu sized min-rate to send any traffic even
     * on uncongested links. */
    hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
    hc->min_rate = MAX(hc->min_rate, mtu);
    hc->min_rate = MIN(hc->min_rate, htb->max_rate);

    /* max-rate */
    max_rate_bit = smap_get_ullong(details, "max-rate", 0);
    hc->max_rate = max_rate_bit ? max_rate_bit / 8 : htb->max_rate;
    hc->max_rate = MAX(hc->max_rate, hc->min_rate);
    hc->max_rate = MIN(hc->max_rate, htb->max_rate);

    /* burst
     *
     * According to hints in the documentation that I've read, it is important
     * that 'burst' be at least as big as the largest frame that might be
     * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
     * but having it a bit too small is a problem.  Since netdev_get_mtu()
     * doesn't include the Ethernet header, we need to add at least 14 (18?) to
     * the MTU.  We actually add 64, instead of 14, as a guard against
     * additional headers get tacked on somewhere that we're not aware of. */
    hc->burst = smap_get_ullong(details, "burst", 0) / 8;
    hc->burst = MAX(hc->burst, mtu + 64);

    /* priority */
    hc->priority = smap_get_ullong(details, "priority", 0);

    return 0;
}
5084
5085
static int
5086
htb_query_class__(const struct netdev *netdev, unsigned int handle,
5087
                  unsigned int parent, struct htb_class *options,
5088
                  struct netdev_queue_stats *stats)
5089
0
{
5090
0
    struct ofpbuf *reply;
5091
0
    int error;
5092
5093
0
    error = tc_query_class(netdev, handle, parent, &reply);
5094
0
    if (!error) {
5095
0
        error = htb_parse_tcmsg__(reply, NULL, options, stats);
5096
0
        ofpbuf_delete(reply);
5097
0
    }
5098
0
    return error;
5099
0
}
5100
5101
static int
5102
htb_tc_install(struct netdev *netdev, const struct smap *details)
5103
0
{
5104
0
    int error;
5105
5106
0
    error = htb_setup_qdisc__(netdev);
5107
0
    if (!error) {
5108
0
        struct htb_class hc;
5109
5110
0
        htb_parse_qdisc_details__(netdev, details, &hc);
5111
0
        error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5112
0
                                  tc_make_handle(1, 0), &hc);
5113
0
        if (!error) {
5114
0
            htb_install__(netdev, hc.max_rate);
5115
0
        }
5116
0
    }
5117
0
    return error;
5118
0
}
5119
5120
static struct htb_class *
5121
htb_class_cast__(const struct tc_queue *queue)
5122
0
{
5123
0
    return CONTAINER_OF(queue, struct htb_class, tc_queue);
5124
0
}
5125
5126
static void
5127
htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
5128
                   const struct htb_class *hc)
5129
0
{
5130
0
    struct htb *htb = htb_get__(netdev);
5131
0
    size_t hash = hash_int(queue_id, 0);
5132
0
    struct tc_queue *queue;
5133
0
    struct htb_class *hcp;
5134
5135
0
    queue = tc_find_queue__(netdev, queue_id, hash);
5136
0
    if (queue) {
5137
0
        hcp = htb_class_cast__(queue);
5138
0
    } else {
5139
0
        hcp = xmalloc(sizeof *hcp);
5140
0
        queue = &hcp->tc_queue;
5141
0
        queue->queue_id = queue_id;
5142
0
        queue->created = time_msec();
5143
0
        hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
5144
0
    }
5145
5146
0
    hcp->min_rate = hc->min_rate;
5147
0
    hcp->max_rate = hc->max_rate;
5148
0
    hcp->burst = hc->burst;
5149
0
    hcp->priority = hc->priority;
5150
0
}
5151
5152
static int
5153
htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
5154
0
{
5155
0
    struct ofpbuf msg;
5156
0
    struct queue_dump_state state;
5157
0
    struct htb_class hc;
5158
5159
    /* Get qdisc options. */
5160
0
    hc.max_rate = 0;
5161
0
    htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
5162
0
    htb_install__(netdev, hc.max_rate);
5163
5164
    /* Get queues. */
5165
0
    if (!start_queue_dump(netdev, &state)) {
5166
0
        return ENODEV;
5167
0
    }
5168
0
    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
5169
0
        unsigned int queue_id;
5170
5171
0
        if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
5172
0
            htb_update_queue__(netdev, queue_id, &hc);
5173
0
        }
5174
0
    }
5175
0
    finish_queue_dump(&state);
5176
5177
0
    return 0;
5178
0
}
5179
5180
static void
5181
htb_tc_destroy(struct tc *tc)
5182
0
{
5183
0
    struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
5184
0
    struct htb_class *hc;
5185
5186
0
    HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
5187
0
        free(hc);
5188
0
    }
5189
0
    tc_destroy(tc);
5190
0
    free(htb);
5191
0
}
5192
5193
/* Reports the HTB qdisc-level configuration of 'netdev' into 'details'.
 * max_rate is stored internally in bytes/s; "max-rate" is exposed in bits/s.
 * Always returns 0. */
static int
htb_qdisc_get(const struct netdev *netdev, struct smap *details)
{
    const struct htb *htb = htb_get__(netdev);
    smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
    return 0;
}
5200
5201
static int
5202
htb_qdisc_set(struct netdev *netdev, const struct smap *details)
5203
0
{
5204
0
    struct htb_class hc;
5205
0
    int error;
5206
5207
0
    htb_parse_qdisc_details__(netdev, details, &hc);
5208
0
    error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5209
0
                              tc_make_handle(1, 0), &hc);
5210
0
    if (!error) {
5211
0
        htb_get__(netdev)->max_rate = hc.max_rate;
5212
0
    }
5213
0
    return error;
5214
0
}
5215
5216
static int
5217
htb_class_get(const struct netdev *netdev OVS_UNUSED,
5218
              const struct tc_queue *queue, struct smap *details)
5219
0
{
5220
0
    const struct htb_class *hc = htb_class_cast__(queue);
5221
5222
0
    smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
5223
0
    if (hc->min_rate != hc->max_rate) {
5224
0
        smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
5225
0
    }
5226
0
    smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
5227
0
    if (hc->priority) {
5228
0
        smap_add_format(details, "priority", "%u", hc->priority);
5229
0
    }
5230
0
    return 0;
5231
0
}
5232
5233
static int
5234
htb_class_set(struct netdev *netdev, unsigned int queue_id,
5235
              const struct smap *details)
5236
0
{
5237
0
    struct htb_class hc;
5238
0
    int error;
5239
5240
0
    error = htb_parse_class_details__(netdev, details, &hc);
5241
0
    if (error) {
5242
0
        return error;
5243
0
    }
5244
5245
0
    error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
5246
0
                              tc_make_handle(1, 0xfffe), &hc);
5247
0
    if (error) {
5248
0
        return error;
5249
0
    }
5250
5251
0
    htb_update_queue__(netdev, queue_id, &hc);
5252
0
    return 0;
5253
0
}
5254
5255
static int
5256
htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
5257
0
{
5258
0
    struct htb_class *hc = htb_class_cast__(queue);
5259
0
    struct htb *htb = htb_get__(netdev);
5260
0
    int error;
5261
5262
0
    error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
5263
0
    if (!error) {
5264
0
        hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
5265
0
        free(hc);
5266
0
    }
5267
0
    return error;
5268
0
}
5269
5270
/* Queries the kernel for the statistics of 'queue' on 'netdev' and stores
 * them in 'stats'.  Returns 0 on success, otherwise a positive errno. */
static int
htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
                    struct netdev_queue_stats *stats)
{
    /* Queue N maps to kernel class 1:(N+1) under the default class 1:fffe. */
    return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
                             tc_make_handle(1, 0xfffe), NULL, stats);
}
5277
5278
static int
5279
htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
5280
                     const struct ofpbuf *nlmsg,
5281
                     netdev_dump_queue_stats_cb *cb, void *aux)
5282
0
{
5283
0
    struct netdev_queue_stats stats;
5284
0
    unsigned int handle, major, minor;
5285
0
    int error;
5286
5287
0
    error = tc_parse_class(nlmsg, &handle, NULL, &stats);
5288
0
    if (error) {
5289
0
        return error;
5290
0
    }
5291
5292
0
    major = tc_get_major(handle);
5293
0
    minor = tc_get_minor(handle);
5294
0
    if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
5295
0
        (*cb)(minor - 1, &stats, aux);
5296
0
    }
5297
0
    return 0;
5298
0
}
5299
5300
/* Callback table implementing the "linux-htb" QoS type on top of the Linux
 * "htb" qdisc. */
static const struct tc_ops tc_ops_htb = {
    .linux_name = "htb",
    .ovs_name = "linux-htb",
    .n_queues = HTB_N_QUEUES,
    .tc_install = htb_tc_install,
    .tc_load = htb_tc_load,
    .tc_destroy = htb_tc_destroy,
    .qdisc_get = htb_qdisc_get,
    .qdisc_set = htb_qdisc_set,
    .class_get = htb_class_get,
    .class_set = htb_class_set,
    .class_delete = htb_class_delete,
    .class_get_stats = htb_class_get_stats,
    .class_dump_stats = htb_class_dump_stats
};
5315

5316
/* "linux-hfsc" traffic control class. */
5317
5318
0
/* Maximum number of queues supported by the "linux-hfsc" class. */
#define HFSC_N_QUEUES 0xf000

/* Per-netdev state for an HFSC qdisc.  Obtained from a netdev's generic tc
 * pointer via hfsc_get__(). */
struct hfsc {
    struct tc tc;               /* Embedded generic tc state. */
    uint32_t max_rate;          /* Qdisc-level maximum rate, in bytes/s. */
};

/* Locally cached configuration of one HFSC queue (kernel class). */
struct hfsc_class {
    struct tc_queue tc_queue;   /* Embedded generic queue state. */
    uint32_t min_rate;          /* Guaranteed rate, in bytes/s. */
    uint32_t max_rate;          /* Upper-limit rate, in bytes/s. */
};
5330
5331
/* Returns the 'struct hfsc' embedding 'netdev_''s tc state.  Only valid when
 * the netdev's qdisc is managed by tc_ops_hfsc. */
static struct hfsc *
hfsc_get__(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct hfsc, tc);
}
5337
5338
/* Returns the 'struct hfsc_class' in which 'queue' is embedded. */
static struct hfsc_class *
hfsc_class_cast__(const struct tc_queue *queue)
{
    return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
}
5343
5344
static void
5345
hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
5346
0
{
5347
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5348
0
    struct hfsc *hfsc;
5349
5350
0
    hfsc = xmalloc(sizeof *hfsc);
5351
0
    tc_init(&hfsc->tc, &tc_ops_hfsc);
5352
0
    hfsc->max_rate = max_rate;
5353
0
    netdev->tc = &hfsc->tc;
5354
0
}
5355
5356
static void
5357
hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
5358
                    const struct hfsc_class *hc)
5359
0
{
5360
0
    size_t hash;
5361
0
    struct hfsc *hfsc;
5362
0
    struct hfsc_class *hcp;
5363
0
    struct tc_queue *queue;
5364
5365
0
    hfsc = hfsc_get__(netdev);
5366
0
    hash = hash_int(queue_id, 0);
5367
5368
0
    queue = tc_find_queue__(netdev, queue_id, hash);
5369
0
    if (queue) {
5370
0
        hcp = hfsc_class_cast__(queue);
5371
0
    } else {
5372
0
        hcp             = xmalloc(sizeof *hcp);
5373
0
        queue           = &hcp->tc_queue;
5374
0
        queue->queue_id = queue_id;
5375
0
        queue->created  = time_msec();
5376
0
        hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
5377
0
    }
5378
5379
0
    hcp->min_rate = hc->min_rate;
5380
0
    hcp->max_rate = hc->max_rate;
5381
0
}
5382
5383
/* Parses the TCA_OPTIONS of an HFSC class ('nl_options') into 'class'.
 *
 * Only the subset of HFSC configurations that this module itself creates is
 * accepted: linear service curves (m1 == 0, d == 0) with identical real-time
 * and link-sharing curves, and a min rate no greater than the upper-limit
 * rate.  Returns 0 on success, EPROTO if the options are malformed or use an
 * unsupported configuration. */
static int
hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
{
    const struct tc_service_curve *rsc, *fsc, *usc;
    /* All three curves (real-time, link-share, upper-limit) are required. */
    static const struct nl_policy tca_hfsc_policy[] = {
        [TCA_HFSC_RSC] = {
            .type      = NL_A_UNSPEC,
            .optional  = false,
            .min_len   = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_FSC] = {
            .type      = NL_A_UNSPEC,
            .optional  = false,
            .min_len   = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_USC] = {
            .type      = NL_A_UNSPEC,
            .optional  = false,
            .min_len   = sizeof(struct tc_service_curve),
        },
    };
    struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];

    if (!nl_parse_nested(nl_options, tca_hfsc_policy,
                         attrs, ARRAY_SIZE(tca_hfsc_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
        return EPROTO;
    }

    rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
    fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
    usc = nl_attr_get(attrs[TCA_HFSC_USC]);

    /* A nonzero first slope (m1) or delay (d) means a non-linear curve. */
    if (rsc->m1 != 0 || rsc->d != 0 ||
        fsc->m1 != 0 || fsc->d != 0 ||
        usc->m1 != 0 || usc->d != 0) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Non-linear service curves are not supported.");
        return EPROTO;
    }

    /* This module always writes identical RSC and FSC curves (see
     * hfsc_setup_class__()); anything else was configured externally. */
    if (rsc->m2 != fsc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Real-time service curves are not supported ");
        return EPROTO;
    }

    if (rsc->m2 > usc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Min-rate service curve is greater than "
                     "the max-rate service curve.");
        return EPROTO;
    }

    class->min_rate = fsc->m2;
    class->max_rate = usc->m2;
    return 0;
}
5441
5442
static int
5443
hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
5444
                   struct hfsc_class *options,
5445
                   struct netdev_queue_stats *stats)
5446
0
{
5447
0
    int error;
5448
0
    unsigned int handle;
5449
0
    struct nlattr *nl_options;
5450
5451
0
    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
5452
0
    if (error) {
5453
0
        return error;
5454
0
    }
5455
5456
0
    if (queue_id) {
5457
0
        unsigned int major, minor;
5458
5459
0
        major = tc_get_major(handle);
5460
0
        minor = tc_get_minor(handle);
5461
0
        if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
5462
0
            *queue_id = minor - 1;
5463
0
        } else {
5464
0
            return EPROTO;
5465
0
        }
5466
0
    }
5467
5468
0
    if (options) {
5469
0
        error = hfsc_parse_tca_options__(nl_options, options);
5470
0
    }
5471
5472
0
    return error;
5473
0
}
5474
5475
static int
5476
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
5477
                   unsigned int parent, struct hfsc_class *options,
5478
                   struct netdev_queue_stats *stats)
5479
0
{
5480
0
    int error;
5481
0
    struct ofpbuf *reply;
5482
5483
0
    error = tc_query_class(netdev, handle, parent, &reply);
5484
0
    if (error) {
5485
0
        return error;
5486
0
    }
5487
5488
0
    error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
5489
0
    ofpbuf_delete(reply);
5490
0
    return error;
5491
0
}
5492
5493
/* Extracts the qdisc-level rate from 'details' into 'class', converting the
 * "max-rate" key (bits/s) to bytes/s.  If absent or zero, falls back to the
 * link's reported speed or, failing that, NETDEV_DEFAULT_BPS.  Both curves
 * of 'class' are set to the resulting rate. */
static void
hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
                           struct hfsc_class *class)
{
    uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
    if (!max_rate) {
        uint32_t current_speed;
        uint32_t max_speed OVS_UNUSED;

        netdev_linux_get_speed_locked(netdev_linux_cast(netdev),
                                      &current_speed, &max_speed);
        /* current_speed appears to be in Mbps (scaled by 1e6 below) --
         * NOTE(review): confirm against netdev_linux_get_speed_locked(). */
        max_rate = current_speed ? current_speed / 8 * 1000000ULL
                                 : NETDEV_DEFAULT_BPS / 8;
    }

    class->min_rate = max_rate;
    class->max_rate = max_rate;
}
5511
5512
static int
5513
hfsc_parse_class_details__(struct netdev *netdev,
5514
                           const struct smap *details,
5515
                           struct hfsc_class * class)
5516
0
{
5517
0
    const struct hfsc *hfsc;
5518
0
    uint32_t min_rate, max_rate;
5519
5520
0
    hfsc       = hfsc_get__(netdev);
5521
5522
0
    min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
5523
0
    min_rate = MAX(min_rate, 1);
5524
0
    min_rate = MIN(min_rate, hfsc->max_rate);
5525
5526
0
    max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
5527
0
    max_rate = MAX(max_rate, min_rate);
5528
0
    max_rate = MIN(max_rate, hfsc->max_rate);
5529
5530
0
    class->min_rate = min_rate;
5531
0
    class->max_rate = max_rate;
5532
5533
0
    return 0;
5534
0
}
5535
5536
/* Create an HFSC qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1".
 * Returns 0 on success, ENODEV if the device's ifindex cannot be resolved,
 * or another positive errno from the netlink transaction. */
static int
hfsc_setup_qdisc__(struct netdev * netdev)
{
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_hfsc_qopt opt;

    /* Remove any existing root qdisc first; errors are ignored. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);

    if (!tcmsg) {
        return ENODEV;
    }

    /* Attach as handle 1:0 at the root of the device. */
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    opt.defcls = 1;             /* "default 1": unclassified traffic. */

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    return tc_transact(&request, NULL);
}
5566
5567
/* Create an HFSC class.
 *
 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
 * sc rate <min_rate> ul rate <max_rate>".
 *
 * Only linear curves are written (m1 = 0, d = 0): the real-time and
 * link-share curves both carry 'class->min_rate' and the upper-limit curve
 * carries 'class->max_rate', matching what hfsc_parse_tca_options__()
 * accepts.  Returns 0 on success, ENODEV if the device's ifindex cannot be
 * resolved, or another positive errno from the netlink transaction (logged,
 * rate-limited). */
static int
hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *class)
{
    int error;
    size_t opt_offset;
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_service_curve min, max;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    /* Linear curve at the guaranteed rate. */
    min.m1 = 0;
    min.d  = 0;
    min.m2 = class->min_rate;

    /* Linear curve at the upper-limit rate. */
    max.m1 = 0;
    max.d  = 0;
    max.m2 = class->max_rate;

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min-rate %ubps, max-rate %ubps (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate, ovs_strerror(error));
    }

    return error;
}
5618
5619
static int
5620
hfsc_tc_install(struct netdev *netdev, const struct smap *details)
5621
0
{
5622
0
    int error;
5623
0
    struct hfsc_class class;
5624
5625
0
    error = hfsc_setup_qdisc__(netdev);
5626
5627
0
    if (error) {
5628
0
        return error;
5629
0
    }
5630
5631
0
    hfsc_parse_qdisc_details__(netdev, details, &class);
5632
0
    error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5633
0
                               tc_make_handle(1, 0), &class);
5634
5635
0
    if (error) {
5636
0
        return error;
5637
0
    }
5638
5639
0
    hfsc_install__(netdev, class.max_rate);
5640
0
    return 0;
5641
0
}
5642
5643
static int
5644
hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
5645
0
{
5646
0
    struct ofpbuf msg;
5647
0
    struct queue_dump_state state;
5648
0
    struct hfsc_class hc;
5649
5650
0
    hc.max_rate = 0;
5651
0
    hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
5652
0
    hfsc_install__(netdev, hc.max_rate);
5653
5654
0
    if (!start_queue_dump(netdev, &state)) {
5655
0
        return ENODEV;
5656
0
    }
5657
5658
0
    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
5659
0
        unsigned int queue_id;
5660
5661
0
        if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
5662
0
            hfsc_update_queue__(netdev, queue_id, &hc);
5663
0
        }
5664
0
    }
5665
5666
0
    finish_queue_dump(&state);
5667
0
    return 0;
5668
0
}
5669
5670
static void
5671
hfsc_tc_destroy(struct tc *tc)
5672
0
{
5673
0
    struct hfsc *hfsc;
5674
0
    struct hfsc_class *hc;
5675
5676
0
    hfsc = CONTAINER_OF(tc, struct hfsc, tc);
5677
5678
0
    HMAP_FOR_EACH_SAFE (hc, tc_queue.hmap_node, &hfsc->tc.queues) {
5679
0
        hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
5680
0
        free(hc);
5681
0
    }
5682
5683
0
    tc_destroy(tc);
5684
0
    free(hfsc);
5685
0
}
5686
5687
/* Reports the HFSC qdisc-level configuration of 'netdev' into 'details'.
 * max_rate is stored internally in bytes/s; "max-rate" is exposed in bits/s.
 * Always returns 0. */
static int
hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
{
    const struct hfsc *hfsc;
    hfsc = hfsc_get__(netdev);
    smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
    return 0;
}
5695
5696
static int
5697
hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
5698
0
{
5699
0
    int error;
5700
0
    struct hfsc_class class;
5701
5702
0
    hfsc_parse_qdisc_details__(netdev, details, &class);
5703
0
    error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5704
0
                               tc_make_handle(1, 0), &class);
5705
5706
0
    if (!error) {
5707
0
        hfsc_get__(netdev)->max_rate = class.max_rate;
5708
0
    }
5709
5710
0
    return error;
5711
0
}
5712
5713
static int
5714
hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
5715
              const struct tc_queue *queue, struct smap *details)
5716
0
{
5717
0
    const struct hfsc_class *hc;
5718
5719
0
    hc = hfsc_class_cast__(queue);
5720
0
    smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
5721
0
    if (hc->min_rate != hc->max_rate) {
5722
0
        smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
5723
0
    }
5724
0
    return 0;
5725
0
}
5726
5727
static int
5728
hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
5729
               const struct smap *details)
5730
0
{
5731
0
    int error;
5732
0
    struct hfsc_class class;
5733
5734
0
    error = hfsc_parse_class_details__(netdev, details, &class);
5735
0
    if (error) {
5736
0
        return error;
5737
0
    }
5738
5739
0
    error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
5740
0
                               tc_make_handle(1, 0xfffe), &class);
5741
0
    if (error) {
5742
0
        return error;
5743
0
    }
5744
5745
0
    hfsc_update_queue__(netdev, queue_id, &class);
5746
0
    return 0;
5747
0
}
5748
5749
static int
5750
hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
5751
0
{
5752
0
    int error;
5753
0
    struct hfsc *hfsc;
5754
0
    struct hfsc_class *hc;
5755
5756
0
    hc   = hfsc_class_cast__(queue);
5757
0
    hfsc = hfsc_get__(netdev);
5758
5759
0
    error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
5760
0
    if (!error) {
5761
0
        hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
5762
0
        free(hc);
5763
0
    }
5764
0
    return error;
5765
0
}
5766
5767
/* Queries the kernel for the statistics of 'queue' on 'netdev' and stores
 * them in 'stats'.  Returns 0 on success, otherwise a positive errno. */
static int
hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
                     struct netdev_queue_stats *stats)
{
    /* Queue N maps to kernel class 1:(N+1) under the default class 1:fffe. */
    return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
                             tc_make_handle(1, 0xfffe), NULL, stats);
}
5774
5775
static int
5776
hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
5777
                      const struct ofpbuf *nlmsg,
5778
                      netdev_dump_queue_stats_cb *cb, void *aux)
5779
0
{
5780
0
    struct netdev_queue_stats stats;
5781
0
    unsigned int handle, major, minor;
5782
0
    int error;
5783
5784
0
    error = tc_parse_class(nlmsg, &handle, NULL, &stats);
5785
0
    if (error) {
5786
0
        return error;
5787
0
    }
5788
5789
0
    major = tc_get_major(handle);
5790
0
    minor = tc_get_minor(handle);
5791
0
    if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
5792
0
        (*cb)(minor - 1, &stats, aux);
5793
0
    }
5794
0
    return 0;
5795
0
}
5796
5797
/* Callback table implementing the "linux-hfsc" QoS type on top of the Linux
 * "hfsc" qdisc. */
static const struct tc_ops tc_ops_hfsc = {
    .linux_name = "hfsc",
    .ovs_name = "linux-hfsc",
    .n_queues = HFSC_N_QUEUES,
    .tc_install = hfsc_tc_install,
    .tc_load = hfsc_tc_load,
    .tc_destroy = hfsc_tc_destroy,
    .qdisc_get = hfsc_qdisc_get,
    .qdisc_set = hfsc_qdisc_set,
    .class_get = hfsc_class_get,
    .class_set = hfsc_class_set,
    .class_delete = hfsc_class_delete,
    .class_get_stats = hfsc_class_get_stats,
    .class_dump_stats = hfsc_class_dump_stats,
};
5812

5813
/* "linux-noop" traffic control class. */
5814
5815
/* Points 'netdev_' at a shared, immutable do-nothing tc instance.
 *
 * Nothing but a tc class implementation is allowed to write to a tc, and
 * this class never does, so a single static const object suffices. */
static void
noop_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    netdev->tc = CONST_CAST(struct tc *, &tc);
}
5823
5824
/* tc_install callback for "linux-noop": there is nothing to configure in the
 * kernel, so just record the noop state.  Always returns 0. */
static int
noop_tc_install(struct netdev *netdev,
                   const struct smap *details OVS_UNUSED)
{
    noop_install__(netdev);
    return 0;
}
5831
5832
/* tc_load callback for "linux-noop": no kernel state to read back.  Always
 * returns 0. */
static int
noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    noop_install__(netdev);
    return 0;
}
5838
5839
/* Callback table for the "linux-noop" QoS type, which leaves the kernel
 * qdisc untouched.  Only install and load are meaningful. */
static const struct tc_ops tc_ops_noop = {
    .ovs_name = "linux-noop",               /* ovs_name */
    .tc_install = noop_tc_install,
    .tc_load = noop_tc_load,
};
5844

5845
/* "linux-default" traffic control class.
5846
 *
5847
 * This class represents the default, unnamed Linux qdisc.  It corresponds to
5848
 * the "" (empty string) QoS type in the OVS database. */
5849
5850
/* Points 'netdev_' at a shared, immutable tc instance for the default
 * (unnamed) Linux qdisc. */
static void
default_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
}
5860
5861
/* tc_install callback for "linux-default": nothing to configure in the
 * kernel; just record the default state.  Always returns 0. */
static int
default_tc_install(struct netdev *netdev,
                   const struct smap *details OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
5868
5869
/* tc_load callback for "linux-default": no kernel state to read back.
 * Always returns 0. */
static int
default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
5875
5876
/* Callback table for the default, unnamed Linux qdisc, selected by the ""
 * (empty string) QoS type in the OVS database. */
static const struct tc_ops tc_ops_default = {
    .ovs_name = "",                         /* ovs_name */
    .tc_install = default_tc_install,
    .tc_load = default_tc_load,
};
5881

5882
/* "linux-other" traffic control class.
 *
 * Represents a qdisc that this module does not otherwise model.  It can only
 * be loaded from existing kernel state, never installed or configured. */
5885
5886
/* tc_load callback for "linux-other": records a shared, read-only tc
 * instance for a qdisc this module does not otherwise model.  Always
 * returns 0. */
static int
other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
    return 0;
}
5897
5898
/* Callback table for the "linux-other" QoS type.  Only loading is supported;
 * there are no install or configuration callbacks. */
static const struct tc_ops tc_ops_other = {
    .ovs_name = "linux-other",
    .tc_load = other_tc_load,
};
5902

5903
/* Traffic control. */
5904
5905
/* Number of kernel "tc" ticks per second. */
5906
static double ticks_per_s;
5907
5908
/* Number of kernel "jiffies" per second.  This is used for the purpose of
5909
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
5910
 * one jiffy's worth of data.
5911
 *
5912
 * There are two possibilities here:
5913
 *
5914
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
5915
 *      approximate range of 100 to 1024.  That means that we really need to
5916
 *      make sure that the qdisc can buffer that much data.
5917
 *
5918
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
5919
 *      has finely granular timers and there's no need to fudge additional room
5920
 *      for buffers.  (There's no extra effort needed to implement that: the
5921
 *      large 'buffer_hz' is used as a divisor, so practically any number will
5922
 *      come out as 0 in the division.  Small integer results in the case of
5923
 *      really high dividends won't have any real effect anyhow.)
5924
 */
5925
static unsigned int buffer_hz;
5926
5927
static struct tcmsg *
5928
netdev_linux_tc_make_request(const struct netdev *netdev, int type,
5929
                             unsigned int flags, struct ofpbuf *request)
5930
0
{
5931
0
    int ifindex;
5932
0
    int error;
5933
5934
0
    error = get_ifindex(netdev, &ifindex);
5935
0
    if (error) {
5936
0
        return NULL;
5937
0
    }
5938
5939
0
    return tc_make_request(ifindex, type, flags, request);
5940
0
}
5941
5942
/* Initializes '*tc_police' as a drop-on-exceed policer with rate
 * 'kbits_rate' (kilobits per second) and burst 'kbits_burst' (kilobits). */
static void
tc_policer_init(struct tc_police *tc_police, uint64_t kbits_rate,
                uint64_t kbits_burst)
{
    int mtu = 65535;    /* Largest possible frame size. */

    memset(tc_police, 0, sizeof *tc_police);

    tc_police->action = TC_POLICE_SHOT;         /* Drop on rate exceeded. */
    tc_police->mtu = mtu;
    /* Rate is configured in bytes/s: kbits * 1000 / 8. */
    tc_fill_rate(&tc_police->rate, kbits_rate * 1000 / 8, mtu);

    /* The following appears wrong in one way: In networking a kilobit is
     * usually 1000 bits but this uses 1024 bits.
     *
     * However if you "fix" those problems then "tc filter show ..." shows
     * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
     * 1,000,000 bits, whereas this actually ends up doing the right thing from
     * tc's point of view.  Whatever. */
    tc_police->burst = tc_bytes_to_ticks(
        tc_police->rate.rate, kbits_burst * 1024 / 8);
}
5964
5965
/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
5966
 * of 'kbits_burst', with a rate of 'kpkts_rate' and a burst size of
5967
 * 'kpkts_burst'.
5968
 *
5969
 * This function is equivalent to running:
5970
 *     /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
5971
 *              basic police rate <kbits_rate>kbit burst <kbits_burst>k
5972
 *              mtu 65535 drop
5973
 *
5974
 * The configuration and stats may be seen with the following command:
5975
 *     /sbin/tc -s filter show dev <devname> parent ffff:
5976
 *
5977
 * Returns 0 if successful, otherwise a positive errno value.
5978
 */
5979
static int
5980
tc_add_policer(struct netdev *netdev, uint64_t kbits_rate,
5981
               uint32_t kbits_burst, uint32_t kpkts_rate, uint32_t kpkts_burst)
5982
0
{
5983
0
    size_t basic_offset, police_offset;
5984
0
    struct ofpbuf request;
5985
0
    struct tcmsg *tcmsg;
5986
0
    int error;
5987
5988
0
    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER,
5989
0
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
5990
0
    if (!tcmsg) {
5991
0
        return ENODEV;
5992
0
    }
5993
0
    tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
5994
0
    tcmsg->tcm_info = tc_make_handle(49,
5995
0
                                     (OVS_FORCE uint16_t) htons(ETH_P_ALL));
5996
0
    nl_msg_put_string(&request, TCA_KIND, "basic");
5997
5998
0
    basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
5999
0
    police_offset = nl_msg_start_nested(&request, TCA_BASIC_ACT);
6000
0
    nl_msg_put_act_police(&request, 0, kbits_rate, kbits_burst,
6001
0
                          kpkts_rate * 1000ULL, kpkts_burst * 1000ULL,
6002
0
                          TC_ACT_UNSPEC, false);
6003
0
    nl_msg_end_nested(&request, police_offset);
6004
0
    nl_msg_end_nested(&request, basic_offset);
6005
6006
0
    error = tc_transact(&request, NULL);
6007
0
    if (error) {
6008
0
        return error;
6009
0
    }
6010
6011
0
    return 0;
6012
0
}
6013
6014
int
6015
tc_add_policer_action(uint32_t index, uint64_t kbits_rate,
6016
                      uint32_t kbits_burst, uint32_t pkts_rate,
6017
                      uint32_t pkts_burst, bool update)
6018
0
{
6019
0
    struct ofpbuf request;
6020
0
    struct tcamsg *tcamsg;
6021
0
    size_t offset;
6022
0
    int flags;
6023
0
    int error;
6024
6025
0
    flags = (update ? NLM_F_REPLACE : NLM_F_EXCL) | NLM_F_CREATE;
6026
0
    tcamsg = tc_make_action_request(RTM_NEWACTION, flags, &request);
6027
0
    if (!tcamsg) {
6028
0
        return ENODEV;
6029
0
    }
6030
6031
0
    offset = nl_msg_start_nested(&request, TCA_ACT_TAB);
6032
0
    nl_msg_put_act_police(&request, index, kbits_rate, kbits_burst, pkts_rate,
6033
0
                          pkts_burst, TC_ACT_PIPE, true);
6034
0
    nl_msg_end_nested(&request, offset);
6035
6036
0
    error = tc_transact(&request, NULL);
6037
0
    if (error) {
6038
0
        VLOG_ERR_RL(&rl, "Failed to %s police action, err=%d",
6039
0
                    update ? "update" : "add", error);
6040
0
    }
6041
6042
0
    return error;
6043
0
}
6044
6045
/* Parses the RTM_GETACTION/RTM_DELACTION reply in 'msg' and accumulates the
 * police action's counters into 'stats' (which may be NULL to skip the
 * update): software + hardware packet/byte counts go into the meter totals,
 * and dropped packets into band 0, if present.
 *
 * Always takes ownership of 'msg' and frees it before returning.
 *
 * Returns 0 if successful, otherwise a positive errno value (EPROTO on a
 * malformed reply). */
static int
tc_update_policer_action_stats(struct ofpbuf *msg,
                               struct ofputil_meter_stats *stats)
{
    struct ofpbuf b = ofpbuf_const_initializer(msg->data, msg->size);
    struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg);
    struct tcamsg *tca = ofpbuf_try_pull(&b, sizeof *tca);
    struct ovs_flow_stats stats_dropped;
    struct ovs_flow_stats stats_hw;
    struct ovs_flow_stats stats_sw;
    const struct nlattr *act;
    struct nlattr *prio;
    int error = 0;

    if (!stats) {
        goto exit;
    }

    if (!nlmsg || !tca) {
        VLOG_ERR_RL(&rl, "Failed to get action stats, size error");
        error = EPROTO;
        goto exit;
    }

    act = nl_attr_find(&b, 0, TCA_ACT_TAB);
    if (!act) {
        VLOG_ERR_RL(&rl, "Failed to get action stats, can't find attribute");
        error = EPROTO;
        goto exit;
    }

    /* Step past the TCA_ACT_TAB header to the first nested (priority)
     * attribute; tc_parse_action_stats() consumes attributes from there. */
    prio = (struct nlattr *) act + 1;
    memset(&stats_sw, 0, sizeof stats_sw);
    memset(&stats_hw, 0, sizeof stats_hw);
    memset(&stats_dropped, 0, sizeof stats_dropped);
    error = tc_parse_action_stats(prio, &stats_sw, &stats_hw, &stats_dropped);
    if (!error) {
        /* ovs_flow_stats fields are only 32-bit aligned, hence the
         * get_32aligned_u64() accessors. */
        stats->packet_in_count +=
            get_32aligned_u64(&stats_sw.n_packets);
        stats->byte_in_count += get_32aligned_u64(&stats_sw.n_bytes);
        stats->packet_in_count +=
            get_32aligned_u64(&stats_hw.n_packets);
        stats->byte_in_count += get_32aligned_u64(&stats_hw.n_bytes);
        if (stats->n_bands >= 1) {
            stats->bands[0].packet_count +=
                get_32aligned_u64(&stats_dropped.n_packets);
        }
    }

exit:
    ofpbuf_delete(msg);
    return error;
}
6098
6099
/* Queries the kernel (RTM_GETACTION) for the tc "police" action with
 * identifier 'index' and accumulates its counters into 'stats' via
 * tc_update_policer_action_stats().  'stats' may be NULL to discard them.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
int
tc_get_policer_action(uint32_t index, struct ofputil_meter_stats *stats)
{
    struct ofpbuf *replyp = NULL;
    struct ofpbuf request;
    struct tcamsg *tcamsg;
    size_t root_offset;
    size_t prio_offset;
    int error;

    tcamsg = tc_make_action_request(RTM_GETACTION, 0, &request);
    if (!tcamsg) {
        return ENODEV;
    }

    /* TCA_ACT_TAB holds a list of actions keyed by priority; we query a
     * single action at priority 1, selected by kind + index. */
    root_offset = nl_msg_start_nested(&request, TCA_ACT_TAB);
    prio_offset = nl_msg_start_nested(&request, 1);
    nl_msg_put_string(&request, TCA_ACT_KIND, "police");
    nl_msg_put_u32(&request, TCA_ACT_INDEX, index);
    nl_msg_end_nested(&request, prio_offset);
    nl_msg_end_nested(&request, root_offset);

    error = tc_transact(&request, &replyp);
    if (error) {
        VLOG_ERR_RL(&rl, "Failed to dump police action (index: %u), err=%d",
                    index, error);
        return error;
    }

    /* tc_update_policer_action_stats() frees 'replyp'. */
    return tc_update_policer_action_stats(replyp, stats);
}
6130
6131
int
6132
tc_del_policer_action(uint32_t index, struct ofputil_meter_stats *stats)
6133
0
{
6134
0
    struct ofpbuf *replyp = NULL;
6135
0
    struct ofpbuf request;
6136
0
    struct tcamsg *tcamsg;
6137
0
    size_t root_offset;
6138
0
    size_t prio_offset;
6139
0
    int error;
6140
6141
0
    tcamsg = tc_make_action_request(RTM_DELACTION, NLM_F_ACK, &request);
6142
0
    if (!tcamsg) {
6143
0
        return ENODEV;
6144
0
    }
6145
6146
0
    root_offset = nl_msg_start_nested(&request, TCA_ACT_TAB);
6147
0
    prio_offset = nl_msg_start_nested(&request, 1);
6148
0
    nl_msg_put_string(&request, TCA_ACT_KIND, "police");
6149
0
    nl_msg_put_u32(&request, TCA_ACT_INDEX, index);
6150
0
    nl_msg_end_nested(&request, prio_offset);
6151
0
    nl_msg_end_nested(&request, root_offset);
6152
6153
0
    error = tc_transact(&request, &replyp);
6154
0
    if (error) {
6155
0
        VLOG_ERR_RL(&rl, "Failed to delete police action (index: %u), err=%d",
6156
0
                    index, error);
6157
0
        return error;
6158
0
    }
6159
6160
0
    return tc_update_policer_action_stats(replyp, stats);
6161
0
}
6162
6163
/* Reads /proc/net/psched (once per process) to initialize the file-scope
 * globals 'ticks_per_s' and 'buffer_hz' used by the tc tick/byte conversion
 * helpers below.  On any failure it falls back to ticks_per_s=1.0 and
 * buffer_hz=100. */
static void
read_psched(void)
{
    /* The values in psched are not individually very meaningful, but they are
     * important.  The tables below show some values seen in the wild.
     *
     * Some notes:
     *
     *   - "c" has always been a constant 1000000 since at least Linux 2.4.14.
     *     (Before that, there are hints that it was 1000000000.)
     *
     *   - "d" can be unrealistically large, see the comment on 'buffer_hz'
     *     above.
     *
     *                        /proc/net/psched
     *     -----------------------------------
     * [1] 000c8000 000f4240 000f4240 00000064
     * [2] 000003e8 00000400 000f4240 3b9aca00
     * [3] 000003e8 00000400 000f4240 3b9aca00
     * [4] 000003e8 00000400 000f4240 00000064
     * [5] 000003e8 00000040 000f4240 3b9aca00
     * [6] 000003e8 00000040 000f4240 000000f9
     *
     *           a         b          c             d ticks_per_s     buffer_hz
     *     ------- --------- ---------- ------------- ----------- -------------
     * [1] 819,200 1,000,000  1,000,000           100     819,200           100
     * [2]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [3]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [4]   1,000     1,024  1,000,000           100     976,562           100
     * [5]   1,000        64  1,000,000 1,000,000,000  15,625,000 1,000,000,000
     * [6]   1,000        64  1,000,000           249  15,625,000           249
     *
     * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
     * [2] 2.6.26-1-686-bigmem from Debian lenny
     * [3] 2.6.26-2-sparc64 from Debian lenny
     * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
     * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
     * [6] 2.6.34 from kernel.org on KVM
     */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static const char fn[] = "/proc/net/psched";
    unsigned int a, b, c, d;
    FILE *stream;

    /* Only the first caller does the work; later calls return immediately. */
    if (!ovsthread_once_start(&once)) {
        return;
    }

    /* Fallback values used whenever the file is missing or malformed. */
    ticks_per_s = 1.0;
    buffer_hz = 100;

    stream = fopen(fn, "r");
    if (!stream) {
        VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
        goto exit;
    }

    if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
        VLOG_WARN("%s: read failed", fn);
        fclose(stream);
        goto exit;
    }
    VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
    fclose(stream);

    if (!a || !b || !c) {
        VLOG_WARN("%s: invalid scheduler parameters", fn);
        goto exit;
    }

    ticks_per_s = (double) a * c / b;
    if (c == 1000000) {
        buffer_hz = d;
    } else {
        VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
                  fn, a, b, c, d);
    }
    VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);

exit:
    ovsthread_once_done(&once);
}
6245
6246
/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
 * rate of 'rate' bytes per second.
 *
 * Ensures the psched globals are initialized before the conversion. */
static unsigned int
tc_ticks_to_bytes(uint64_t rate, unsigned int ticks)
{
    read_psched();
    /* 'ticks_per_s' is a double, so this divides in floating point before
     * converting back to unsigned int. */
    return (rate * ticks) / ticks_per_s;
}
6254
6255
/* Returns the number of ticks that it would take to transmit 'size' bytes at a
6256
 * rate of 'rate' bytes per second. */
6257
static unsigned int
6258
tc_bytes_to_ticks(uint64_t rate, unsigned int size)
6259
0
{
6260
0
    read_psched();
6261
0
    return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
6262
0
}
6263
6264
/* Returns the number of bytes that need to be reserved for qdisc buffering at
 * a transmission rate of 'rate' bytes per second.
 *
 * One scheduler jiffy's worth of data: 'buffer_hz' comes from
 * /proc/net/psched (see read_psched()). */
static unsigned int
tc_buffer_per_jiffy(uint64_t rate)
{
    read_psched();
    return rate / buffer_hz;
}
6272
6273
static uint32_t
6274
0
tc_time_to_ticks(uint32_t time) {
6275
0
    read_psched();
6276
0
    return time * (ticks_per_s / 1000000);
6277
0
}
6278
6279
/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
 * e.g. "htb", into '*kind' (if it is nonnull).  If 'options' is nonnull,
 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
 * stores NULL into it if it is absent.
 *
 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
 * 'msg'.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
               struct nlattr **options)
{
    /* TCA_KIND is mandatory in a qdisc reply; TCA_OPTIONS may be absent. */
    static const struct nl_policy tca_policy[] = {
        [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    /* Attributes start after the netlink header and the fixed tcmsg. */
    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse qdisc message");
        goto error;
    }

    if (kind) {
        *kind = nl_attr_get_string(ta[TCA_KIND]);
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    return 0;

error:
    /* On failure the outputs are NULLed so callers need not check 'error'
     * before inspecting them. */
    if (kind) {
        *kind = NULL;
    }
    if (options) {
        *options = NULL;
    }
    return EPROTO;
}
6323
6324
/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
 * into '*options', and its queue statistics into '*stats'.  Any of the output
 * arguments may be null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
               struct nlattr **options, struct netdev_queue_stats *stats)
{
    struct ofpbuf b = ofpbuf_const_initializer(msg->data, msg->size);
    struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg);
    struct tcmsg *tc = ofpbuf_try_pull(&b, sizeof *tc);
    static const struct nl_policy tca_policy[] = {
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
        [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    if (!nlmsg || !tc) {
        VLOG_ERR_RL(&rl, "failed to parse class message, malformed reply");
        goto error;
    }

    if (!nl_policy_parse(&b, 0, tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse class message");
        goto error;
    }

    if (handlep) {
        *handlep = tc->tcm_handle;
    }

    if (options) {
        /* Points into 'msg'; caller must keep 'msg' alive while using it. */
        *options = ta[TCA_OPTIONS];
    }

    if (stats) {
        const struct gnet_stats_queue *gsq;
        struct gnet_stats_basic gsb;

        static const struct nl_policy stats_policy[] = {
            [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof gsb },
            [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof *gsq },
        };
        struct nlattr *sa[ARRAY_SIZE(stats_policy)];

        if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
                             sa, ARRAY_SIZE(sa))) {
            VLOG_WARN_RL(&rl, "failed to parse class stats");
            goto error;
        }

        /* Alignment issues screw up the length of struct gnet_stats_basic on
         * some arch/bitsize combinations.  Newer versions of Linux have a
         * struct gnet_stats_basic_packed, but we can't depend on that.  The
         * easiest thing to do is just to make a copy. */
        memset(&gsb, 0, sizeof gsb);
        memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
               MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
        stats->tx_bytes = gsb.bytes;
        stats->tx_packets = gsb.packets;

        gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
        stats->tx_errors = gsq->drops;
    }

    return 0;

error:
    /* Leave well-defined outputs on failure: NULL options, zeroed stats. */
    if (options) {
        *options = NULL;
    }
    if (stats) {
        memset(stats, 0, sizeof *stats);
    }
    return EPROTO;
}
6404
6405
/* Queries the kernel for class with identifier 'handle' and parent 'parent'
 * on 'netdev'.
 *
 * On success, stores the kernel's reply in '*replyp' (caller frees) and
 * returns 0; otherwise returns a positive errno value. */
static int
tc_query_class(const struct netdev *netdev,
               unsigned int handle, unsigned int parent,
               struct ofpbuf **replyp)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    /* NLM_F_ECHO asks the kernel to echo the class back in its reply. */
    tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    error = tc_transact(&request, replyp);
    if (error) {
        VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     ovs_strerror(error));
    }
    return error;
}
6434
6435
/* Equivalent to "tc class del dev <name> handle <handle>".
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_delete_class(const struct netdev *netdev, unsigned int handle)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = 0;

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     ovs_strerror(error));
    }
    return error;
}
6459
6460
/* Equivalent to "tc qdisc del dev <name> root".
 *
 * On success also tears down the cached 'tc' state on 'netdev_' so that a
 * later tc_query_qdisc() re-probes the kernel.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_del_qdisc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_parent = TC_H_ROOT;

    error = tc_transact(&request, NULL);
    if (error == EINVAL || error == ENOENT) {
        /* EINVAL or ENOENT probably means that the default qdisc was in use,
         * in which case we've accomplished our purpose. */
        error = 0;
    }
    if (!error && netdev->tc) {
        /* Drop the cached qdisc state to match the kernel's. */
        if (netdev->tc->ops->tc_destroy) {
            netdev->tc->ops->tc_destroy(netdev->tc);
        }
        netdev->tc = NULL;
    }
    return error;
}
6489
6490
/* Returns true if it is safe to issue a plain RTM_GETQDISC on this kernel.
 * The decision is made once and cached; see the comment in tc_query_qdisc()
 * for why older kernels can OOPS on such a request. */
static bool
getqdisc_is_safe(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static bool safe = false;

    if (ovsthread_once_start(&once)) {
        /* Kernel 2.6.35+ carries the fix (see tc_query_qdisc()). */
        if (ovs_kernel_is_version_or_newer(2, 35)) {
            safe = true;
        } else {
            VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel");
        }
        ovsthread_once_done(&once);
    }
    return safe;
}
6506
6507
/* If 'netdev''s qdisc type and parameters are not yet known, queries the
 * kernel to determine what they are.  Returns 0 if successful, otherwise a
 * positive errno value.
 *
 * On return (success or failure), 'netdev->tc' has been instantiated with
 * the matching tc ops ("other"/"default" when the qdisc is unknown or
 * built-in). */
static int
tc_query_qdisc(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request, *qdisc;
    const struct tc_ops *ops;
    struct tcmsg *tcmsg;
    int load_error;
    int error;

    /* Cached state means the kernel was already queried. */
    if (netdev->tc) {
        return 0;
    }

    /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
     * 2.6.35 without that fix backported to it.
     *
     * To avoid the OOPS, we must not make a request that would attempt to dump
     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
     * few others.  There are a few ways that I can see to do this, but most of
     * them seem to be racy (and if you lose the race the kernel OOPSes).  The
     * technique chosen here is to assume that any non-default qdisc that we
     * create will have a class with handle 1:0.  The built-in qdiscs only have
     * a class with handle 0:0.
     *
     * On Linux 2.6.35+ we use the straightforward method because it allows us
     * to handle non-builtin qdiscs without handle 1:0 (e.g. codel).  However,
     * in such a case we get no response at all from the kernel (!) if a
     * builtin qdisc is in use (which is later caught by "!error &&
     * !qdisc->size"). */
    tcmsg = netdev_linux_tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
    tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;

    /* Figure out what tc class to instantiate. */
    error = tc_transact(&request, &qdisc);
    if (!error && qdisc->size) {
        const char *kind;

        error = tc_parse_qdisc(qdisc, &kind, NULL);
        if (error) {
            ops = &tc_ops_other;
        } else {
            ops = tc_lookup_linux_name(kind);
            if (!ops) {
                static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
                VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);

                ops = &tc_ops_other;
            }
        }
    } else if ((!error && !qdisc->size) || error == ENOENT) {
        /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
         * set up by some other entity that doesn't have a handle 1:0.  We will
         * assume that it's the system default qdisc. */
        ops = &tc_ops_default;
        error = 0;
    } else {
        /* Who knows?  Maybe the device got deleted. */
        VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        ops = &tc_ops_other;
    }

    /* Instantiate it. */
    load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
    ovs_assert((load_error == 0) == (netdev->tc != NULL));
    ofpbuf_delete(qdisc);

    return error ? error : load_error;
}
6586
6587
/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
6588
   approximate the time to transmit packets of various lengths.  For an MTU of
6589
   256 or less, each entry is exact; for an MTU of 257 through 512, each entry
6590
   represents two possible packet lengths; for a MTU of 513 through 1024, four
6591
   possible lengths; and so on.
6592
6593
   Returns, for the specified 'mtu', the number of bits that packet lengths
6594
   need to be shifted right to fit within such a 256-entry table. */
6595
static int
6596
tc_calc_cell_log(unsigned int mtu)
6597
0
{
6598
0
    int cell_log;
6599
6600
0
    if (!mtu) {
6601
0
        mtu = ETH_PAYLOAD_MAX;
6602
0
    }
6603
0
    mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
6604
6605
0
    for (cell_log = 0; mtu >= 256; cell_log++) {
6606
0
        mtu >>= 1;
6607
0
    }
6608
6609
0
    return cell_log;
6610
0
}
6611
6612
/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
 * of 'mtu'.
 *
 * Rates above UINT32_MAX are clamped; a 64-bit rate can be passed separately
 * where supported (see tc_put_rtab()). */
static void
tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
{
    memset(rate, 0, sizeof *rate);
    rate->cell_log = tc_calc_cell_log(mtu);
    /* rate->overhead = 0; */           /* New in 2.6.24, not yet in some */
    /* rate->cell_align = 0; */         /* distro headers. */
    rate->mpu = ETH_TOTAL_MIN;
    rate->rate = MIN(UINT32_MAX, Bps);
}
6624
6625
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
 * attribute of the specified "type".
 *
 * A 64-bit rate can be provided via 'rate64' in bps.
 * If zero, the rate in 'rate' will be used.
 *
 * See tc_calc_cell_log() above for a description of "rtab"s. */
void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate,
            uint64_t rate64)
{
    uint32_t *rtab;
    unsigned int i;

    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
        /* Entry i covers packets of length (i + 1) << cell_log, but never
         * below the minimum policed unit. */
        unsigned packet_size = (i + 1) << rate->cell_log;
        if (packet_size < rate->mpu) {
            packet_size = rate->mpu;
        }
        rtab[i] = tc_bytes_to_ticks(rate64 ? rate64 : rate->rate, packet_size);
    }
}
6648
6649
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static int
tc_calc_buffer(uint64_t Bps, int mtu, uint64_t burst_bytes)
{
    unsigned int min_burst;

    /* The burst must cover at least one scheduler jiffy plus one packet. */
    min_burst = tc_buffer_per_jiffy(Bps) + mtu;
    if (burst_bytes > min_burst) {
        return tc_bytes_to_ticks(Bps, burst_bytes);
    }
    return tc_bytes_to_ticks(Bps, min_burst);
}
6659

6660
/* Linux-only functions declared in netdev-linux.h  */
6661
6662
/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'.  If
 * 'enable' is true, the bit is set.  Otherwise, it is cleared.
 *
 * Performs a read-modify-write-verify sequence: ETHTOOL_GFLAGS, then
 * ETHTOOL_SFLAGS, then ETHTOOL_GFLAGS again to confirm the driver actually
 * applied the change (some drivers silently ignore unsupported flags).
 *
 * Returns 0 if successful (including when the flag already had the desired
 * value), EOPNOTSUPP if the driver did not apply the change, or another
 * positive errno value on ioctl failure. */
int
netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
                              const char *flag_name, bool enable)
{
    const char *netdev_name = netdev_get_name(netdev);
    struct ethtool_value evalue;
    uint32_t new_flags;
    int error;

    /* Read current flags. */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&evalue, 0, sizeof evalue);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
    if (error) {
        return error;
    }

    COVERAGE_INC(netdev_set_ethtool);
    new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
    /* Nothing to do if the flag already has the desired value. */
    if (new_flags == evalue.data) {
        return 0;
    }
    evalue.data = new_flags;
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
    if (error) {
        return error;
    }

    /* Re-read to verify the driver accepted the change. */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&evalue, 0, sizeof evalue);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
    if (error) {
        return error;
    }

    if (new_flags != evalue.data) {
        VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
                     "device %s failed", enable ? "enable" : "disable",
                     flag_name, netdev_name);
        return EOPNOTSUPP;
    }

    return 0;
}
6713

6714
/* Utility functions. */
6715
6716
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * Widens the kernel's 32-bit rtnl_link_stats counters into OVS's 64-bit
 * netdev_stats fields; only the fields assigned below are touched. */
static void
netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
                                  const struct rtnl_link_stats *src)
{
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
}
6743
6744
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * 64-bit counterpart of netdev_stats_from_rtnl_link_stats(); only the
 * fields assigned below are touched. */
static void
netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
                                    const struct rtnl_link_stats64 *src)
{
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
}
6771
6772
/* Retrieves interface statistics for 'netdev_' into 'stats' via an
 * RTM_GETLINK request, preferring the 64-bit IFLA_STATS64 attribute and
 * falling back to the 32-bit IFLA_STATS.
 *
 * 'stats' is first filled with all-ones so that counters the kernel does not
 * report remain at the "unsupported" sentinel value.
 *
 * Returns 0 if successful, otherwise a positive errno value (EPROTO on a
 * malformed reply). */
int
get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct ofpbuf request;
    struct ofpbuf *reply;
    int error;

    /* Filtering all counters by default */
    memset(stats, 0xFF, sizeof(struct netdev_stats));

    ofpbuf_init(&request, 0);
    nl_msg_put_nlmsghdr(&request,
                        sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
                        RTM_GETLINK, NLM_F_REQUEST);
    ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
    nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
    error = nl_transact(NETLINK_ROUTE, &request, &reply);
    ofpbuf_uninit(&request);
    if (error) {
        return error;
    }

    if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
        const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
        if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
            const struct rtnl_link_stats64 *lstats = nl_attr_get(a);
            struct rtnl_link_stats64 aligned_lstats;

            /* The attribute payload may not be 8-byte aligned; copy it out
             * before reading 64-bit fields to avoid misaligned access. */
            if (!IS_PTR_ALIGNED(lstats)) {
                memcpy(&aligned_lstats, (void *) lstats,
                       sizeof aligned_lstats);
                lstats = &aligned_lstats;
            }
            netdev_stats_from_rtnl_link_stats64(stats, lstats);
            error = 0;
        } else {
            /* No 64-bit stats: fall back to the 32-bit IFLA_STATS. */
            a = nl_attr_find(reply, 0, IFLA_STATS);
            if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
                netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
                error = 0;
            } else {
                VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
                error = EPROTO;
            }
        }
    } else {
        VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
        error = EPROTO;
    }

    ofpbuf_delete(reply);
    return error;
}
6826
6827
static int
6828
get_flags(const struct netdev *dev, unsigned int *flags)
6829
0
{
6830
0
    struct ifreq ifr;
6831
0
    int error;
6832
6833
0
    memset(&ifr, 0, sizeof ifr);
6834
0
    *flags = 0;
6835
0
    error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
6836
0
    if (!error) {
6837
0
        *flags = ifr.ifr_flags;
6838
0
    }
6839
0
    return error;
6840
0
}
6841
6842
static int
6843
set_flags(const char *name, unsigned int flags)
6844
0
{
6845
0
    struct ifreq ifr;
6846
6847
0
    memset(&ifr, 0, sizeof ifr);
6848
0
    ifr.ifr_flags = flags;
6849
0
    return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
6850
0
}
6851
6852
/* Returns the ifindex of the device named 'netdev_name', as reported by the
 * SIOCGIFINDEX ioctl, or a negative errno value on failure. */
int
linux_get_ifindex(const char *netdev_name)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
    COVERAGE_INC(netdev_get_ifindex);

    error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
    if (error) {
        /* ENODEV probably means that a vif disappeared asynchronously and
         * hasn't been removed from the database yet, so reduce the log level
         * to INFO for that case. */
        VLOG_RL(&rl, error == ENODEV ? VLL_INFO : VLL_ERR,
                "ioctl(SIOCGIFINDEX) on %s device failed: %s",
                netdev_name, ovs_strerror(error));
        /* Negated so callers can distinguish errors from valid ifindexes. */
        return -error;
    }
    return ifr.ifr_ifindex;
}
6874
6875
static int
6876
get_ifindex(const struct netdev *netdev_, int *ifindexp)
6877
0
{
6878
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6879
6880
0
    if (!(netdev->cache_valid & VALID_IFINDEX)) {
6881
0
        netdev_linux_update_via_netlink(netdev);
6882
0
    }
6883
6884
0
    if (!(netdev->cache_valid & VALID_IFINDEX)) {
6885
        /* Fall back to ioctl if netlink fails */
6886
0
        int ifindex = linux_get_ifindex(netdev_get_name(netdev_));
6887
6888
0
        if (ifindex < 0) {
6889
0
            netdev->get_ifindex_error = -ifindex;
6890
0
            netdev->ifindex = 0;
6891
0
        } else {
6892
0
            netdev->get_ifindex_error = 0;
6893
0
            netdev->ifindex = ifindex;
6894
0
        }
6895
0
        netdev->cache_valid |= VALID_IFINDEX;
6896
0
    }
6897
6898
0
    *ifindexp = netdev->ifindex;
6899
0
    return netdev->get_ifindex_error;
6900
0
}
6901
6902
/* Refreshes the attributes of 'netdev' that this file caches (flags, MTU,
 * Ethernet address, ifindex, LAG-primary status) with a single RTM_GETLINK
 * request over rtnetlink, and bumps the netdev's change sequence number if
 * anything actually changed.
 *
 * Returns 0 on success, otherwise a positive errno value (EINVAL if the
 * kernel's reply was not a usable RTM_NEWLINK message). */
static int
netdev_linux_update_via_netlink(struct netdev_linux *netdev)
{
    struct ofpbuf request;
    struct ofpbuf *reply;
    struct rtnetlink_change chg;
    struct rtnetlink_change *change = &chg;
    int error;

    ofpbuf_init(&request, 0);
    nl_msg_put_nlmsghdr(&request,
                        sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ) +
                        NL_A_U32_SIZE, RTM_GETLINK, NLM_F_REQUEST);
    ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));

    /* The correct identifiers for a Linux device are netnsid and ifindex,
     * but ifindex changes as the port is moved to another network namespace
     * and the interface name statically stored in ovsdb. */
    nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(&netdev->up));
    if (netdev_linux_netnsid_is_remote(netdev)) {
        nl_msg_put_u32(&request, IFLA_IF_NETNSID, netdev->netnsid);
    }

    /* Link stats are not needed here, so ask the kernel to omit them. */
    nl_msg_put_u32(&request, IFLA_EXT_MASK, RTEXT_FILTER_SKIP_STATS);

    error = nl_transact(NETLINK_ROUTE, &request, &reply);
    ofpbuf_uninit(&request);
    if (error) {
        ofpbuf_delete(reply);
        return error;
    }

    if (rtnetlink_parse(reply, change)
        && !change->irrelevant
        && change->nlmsg_type == RTM_NEWLINK) {
        bool changed = false;
        error = 0;

        /* Update netdev from rtnl msg and increment its seq if needed. */
        if ((change->ifi_flags ^ netdev->ifi_flags) & IFF_RUNNING) {
            /* IFF_RUNNING toggled, i.e. the carrier went up or down. */
            netdev->carrier_resets++;
            changed = true;
        }
        if (change->ifi_flags != netdev->ifi_flags) {
            netdev->ifi_flags = change->ifi_flags;
            changed = true;
        }
        if (change->mtu && change->mtu != netdev->mtu) {
            netdev->mtu = change->mtu;
            netdev->cache_valid |= VALID_MTU;
            netdev->netdev_mtu_error = 0;
            changed = true;
        }
        if (!eth_addr_is_zero(change->mac)
            && !eth_addr_equals(change->mac, netdev->etheraddr)) {
            netdev->etheraddr = change->mac;
            netdev->cache_valid |= VALID_ETHERADDR;
            netdev->ether_addr_error = 0;
            changed = true;
        }
        if (change->if_index != netdev->ifindex) {
            netdev->ifindex = change->if_index;
            netdev->cache_valid |= VALID_IFINDEX;
            netdev->get_ifindex_error = 0;
            changed = true;
        }
        if (change->primary && netdev_linux_kind_is_lag(change->primary)) {
            /* NOTE(review): only ever set to true here, never cleared;
             * presumably reset elsewhere -- confirm against the rest of
             * the file. */
            netdev->is_lag_primary = true;
        }
        if (changed) {
            netdev_change_seq_changed(&netdev->up);
        }
    } else {
        /* Reply was missing, irrelevant, or not RTM_NEWLINK. */
        error = EINVAL;
    }

    ofpbuf_delete(reply);
    return error;
}
6981
6982
static int
6983
get_etheraddr(const char *netdev_name, struct eth_addr *ea)
6984
0
{
6985
0
    struct ifreq ifr;
6986
0
    int hwaddr_family;
6987
0
    int error;
6988
6989
0
    memset(&ifr, 0, sizeof ifr);
6990
0
    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
6991
0
    COVERAGE_INC(netdev_get_hwaddr);
6992
0
    error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
6993
0
    if (error) {
6994
        /* ENODEV probably means that a vif disappeared asynchronously and
6995
         * hasn't been removed from the database yet, so reduce the log level
6996
         * to INFO for that case. */
6997
0
        VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
6998
0
             "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
6999
0
             netdev_name, ovs_strerror(error));
7000
0
        return error;
7001
0
    }
7002
0
    hwaddr_family = ifr.ifr_hwaddr.sa_family;
7003
0
    if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
7004
0
        hwaddr_family != ARPHRD_NONE) {
7005
0
        VLOG_INFO("%s device has unknown hardware address family %d",
7006
0
                  netdev_name, hwaddr_family);
7007
0
        return EINVAL;
7008
0
    }
7009
0
    memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
7010
0
    return 0;
7011
0
}
7012
7013
static int
7014
set_etheraddr(const char *netdev_name, const struct eth_addr mac)
7015
0
{
7016
0
    struct ifreq ifr;
7017
0
    int error;
7018
7019
0
    memset(&ifr, 0, sizeof ifr);
7020
0
    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
7021
0
    ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
7022
0
    memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
7023
0
    COVERAGE_INC(netdev_set_hwaddr);
7024
0
    error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
7025
0
    if (error) {
7026
0
        VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
7027
0
                 netdev_name, ovs_strerror(error));
7028
0
    }
7029
0
    return error;
7030
0
}
7031
7032
static int
7033
netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
7034
                        int cmd, const char *cmd_name)
7035
0
{
7036
0
    struct ifreq ifr;
7037
0
    int error;
7038
7039
0
    memset(&ifr, 0, sizeof ifr);
7040
0
    ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
7041
0
    ifr.ifr_data = (caddr_t) ecmd;
7042
7043
0
    ecmd->cmd = cmd;
7044
0
    error = af_inet_ioctl(SIOCETHTOOL, &ifr);
7045
0
    if (error) {
7046
0
        if (error != EOPNOTSUPP) {
7047
0
            VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
7048
0
                         "failed: %s", cmd_name, name, ovs_strerror(error));
7049
0
        } else {
7050
            /* The device doesn't support this operation.  That's pretty
7051
             * common, so there's no point in logging anything. */
7052
0
        }
7053
0
    }
7054
0
    return error;
7055
0
}
7056
7057
/* Returns an AF_PACKET raw socket or a negative errno value.
 *
 * The socket is created exactly once, on the first call, and then shared by
 * all subsequent callers.  If creation or configuration fails, the negative
 * errno value is cached instead, so every later call reports the same
 * failure without retrying. */
static int
af_packet_sock(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    /* Holds either a valid fd (>= 0) or a cached negative errno value. */
    static int sock;

    if (ovsthread_once_start(&once)) {
        sock = socket(AF_PACKET, SOCK_RAW, 0);
        if (sock >= 0) {
            int error = set_nonblocking(sock);
            if (error) {
                close(sock);
                sock = -error;
            } else if (userspace_tso_enabled()) {
                /* With TSO, packets carry a virtio_net_hdr; tell the kernel
                 * to expect it on this socket. */
                int val = 1;
                error = setsockopt(sock, SOL_PACKET, PACKET_VNET_HDR, &val,
                                   sizeof val);
                if (error) {
                    error = errno;
                    VLOG_ERR("failed to enable vnet hdr in raw socket: %s",
                             ovs_strerror(errno));
                    close(sock);
                    sock = -error;
                }
            }
        } else {
            sock = -errno;
            VLOG_ERR("failed to create packet socket: %s",
                     ovs_strerror(errno));
        }
        ovsthread_once_done(&once);
    }

    return sock;
}
7093
7094
/* Initializes packet 'b' with features enabled in the prepended
 * struct virtio_net_hdr.  Returns 0 if successful, otherwise a
 * positive errno value.
 *
 * Pulls the virtio_net_hdr off the front of 'b'.  If the header requests a
 * partial L4 checksum at a recognized TCP/UDP/SCTP checksum-field offset,
 * 'b' is marked "checksum partial"; for an unrecognized offset the checksum
 * is computed in place instead.  GSO parameters are then copied into 'b';
 * UFO and unknown GSO types are rejected with ENOTSUP. */
static int
netdev_linux_parse_vnet_hdr(struct dp_packet *b)
{
    struct virtio_net_hdr *vnet = dp_packet_pull(b, sizeof *vnet);

    if (OVS_UNLIKELY(!vnet)) {
        /* Packet is too short to even contain the virtio header. */
        return EINVAL;
    }

    if (vnet->flags == 0 && vnet->gso_type == VIRTIO_NET_HDR_GSO_NONE) {
        /* Fast path: no checksum or segmentation offload requested. */
        return 0;
    }

    if (vnet->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
        uint16_t csum_offset = (OVS_FORCE uint16_t) vnet->csum_offset;
        uint16_t csum_start = (OVS_FORCE uint16_t) vnet->csum_start;

        if (csum_start >= dp_packet_size(b)
            || csum_start + csum_offset >= dp_packet_size(b)) {
            /* The requested checksum field lies outside the packet. */
            COVERAGE_INC(netdev_linux_invalid_l4_csum);
            return EINVAL;
        }

        /* Parse the headers so that b->l4_ofs and the
         * dp_packet_l4_proto_*() checks below are meaningful. */
        parse_tcp_flags(b, NULL, NULL, NULL);

        if (csum_start == b->l4_ofs
            && ((csum_offset == offsetof(struct tcp_header, tcp_csum)
                 && dp_packet_l4_proto_tcp(b))
                || (csum_offset == offsetof(struct udp_header, udp_csum)
                    && dp_packet_l4_proto_udp(b))
                || (csum_offset == offsetof(struct sctp_header, sctp_csum)
                    && dp_packet_l4_proto_sctp(b)))) {
            /* The checksum field matches the parsed L4 header, so the
             * offload request can be carried through OVS as "partial". */
            dp_packet_l4_checksum_set_partial(b);
        } else {
            ovs_be16 *csum_l4;
            void *l4;

            COVERAGE_INC(netdev_linux_unknown_l4_csum);

            /* Unrecognized layout: resolve the checksum right here over
             * [csum_start, end-of-packet) and store it at the requested
             * offset. */
            csum_l4 = dp_packet_at(b, csum_start + csum_offset,
                                   sizeof *csum_l4);
            if (!csum_l4) {
                return EINVAL;
            }

            l4 = dp_packet_at(b, csum_start, dp_packet_size(b) - csum_start);
            *csum_l4 = csum(l4, dp_packet_size(b) - csum_start);

            if (dp_packet_l4_proto_tcp(b)
                || dp_packet_l4_proto_udp(b)
                || dp_packet_l4_proto_sctp(b)) {
                dp_packet_l4_checksum_set_good(b);
            }
        }
    }

    int ret = 0;
    switch (vnet->gso_type) {
    case VIRTIO_NET_HDR_GSO_TCPV4:
    case VIRTIO_NET_HDR_GSO_TCPV6:
        dp_packet_set_tso_segsz(b, (OVS_FORCE uint16_t) vnet->gso_size);
        break;

    case VIRTIO_NET_HDR_GSO_UDP:
        /* UFO is not supported. */
        VLOG_WARN_RL(&rl, "Received an unsupported packet with UFO enabled.");
        ret = ENOTSUP;
        break;

    case VIRTIO_NET_HDR_GSO_NONE:
        break;

    default:
        ret = ENOTSUP;
        VLOG_WARN_RL(&rl, "Received an unsupported packet with GSO type: 0x%x",
                     vnet->gso_type);
    }

    return ret;
}
7177
7178
/* Prepends struct virtio_net_hdr to packet 'b'.
7179
 * Returns 0 if successful, otherwise a positive errno value.
7180
 * Returns EMSGSIZE if the packet 'b' cannot be sent over MTU 'mtu'. */
7181
static int
7182
netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu)
7183
0
{
7184
0
    struct virtio_net_hdr v;
7185
0
    struct virtio_net_hdr *vnet = &v;
7186
7187
0
    if (dp_packet_get_tso_segsz(b)) {
7188
0
        uint16_t tso_segsz = dp_packet_get_tso_segsz(b);
7189
0
        const struct tcp_header *tcp;
7190
0
        const struct ip_header *ip;
7191
0
        if (dp_packet_inner_l4(b)) {
7192
0
            tcp = dp_packet_inner_l4(b);
7193
0
            ip = dp_packet_inner_l3(b);
7194
0
        } else {
7195
0
            tcp = dp_packet_l4(b);
7196
0
            ip = dp_packet_l3(b);
7197
0
        }
7198
0
        int tcp_hdr_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
7199
0
        int hdr_len = ((char *) tcp - (char *) dp_packet_eth(b))
7200
0
                      + tcp_hdr_len;
7201
0
        int max_packet_len = mtu + ETH_HEADER_LEN + VLAN_HEADER_LEN;
7202
7203
0
        if (OVS_UNLIKELY((hdr_len + tso_segsz) > max_packet_len)) {
7204
0
            VLOG_WARN_RL(&rl, "Oversized TSO packet. hdr_len: %"PRIu32", "
7205
0
                         "gso: %"PRIu16", max length: %"PRIu32".", hdr_len,
7206
0
                         tso_segsz, max_packet_len);
7207
0
            return EMSGSIZE;
7208
0
        }
7209
7210
0
        vnet->hdr_len = (OVS_FORCE __virtio16)hdr_len;
7211
0
        vnet->gso_size = (OVS_FORCE __virtio16)(tso_segsz);
7212
0
        if (IP_VER(ip->ip_ihl_ver) == 4) {
7213
0
            vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
7214
0
        } else if (IP_VER(ip->ip_ihl_ver) == 6) {
7215
0
            vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
7216
0
        } else {
7217
0
            VLOG_ERR_RL(&rl, "Unknown gso_type for TSO packet. "
7218
0
                        "Offloads: %"PRIu32, b->offloads);
7219
0
            return EINVAL;
7220
0
        }
7221
0
    } else {
7222
0
        vnet->hdr_len = 0;
7223
0
        vnet->gso_size = 0;
7224
0
        vnet->gso_type = VIRTIO_NET_HDR_GSO_NONE;
7225
0
    }
7226
7227
0
    if (dp_packet_l4_checksum_good(b)
7228
0
        && (!dp_packet_tunnel(b)
7229
0
            || dp_packet_inner_l4_checksum_good(b))) {
7230
        /* The packet has good L4 checksum. No need to validate again. */
7231
0
        vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0;
7232
0
        vnet->flags = VIRTIO_NET_HDR_F_DATA_VALID;
7233
0
    } else if (dp_packet_l4_checksum_partial(b)
7234
0
               || dp_packet_inner_l4_checksum_partial(b)) {
7235
0
        const struct ip_header *ip_hdr;
7236
0
        void *l3_off;
7237
0
        void *l4_off;
7238
0
        bool is_sctp;
7239
0
        bool is_tcp;
7240
0
        bool is_udp;
7241
7242
0
        if (dp_packet_inner_l4_checksum_partial(b)) {
7243
0
            l3_off = dp_packet_inner_l3(b);
7244
0
            l4_off = dp_packet_inner_l4(b);
7245
0
            is_tcp = dp_packet_inner_l4_proto_tcp(b);
7246
0
            is_udp = dp_packet_inner_l4_proto_udp(b);
7247
0
            is_sctp = dp_packet_inner_l4_proto_sctp(b);
7248
0
        } else {
7249
0
            l3_off = dp_packet_l3(b);
7250
0
            l4_off = dp_packet_l4(b);
7251
0
            is_tcp = dp_packet_l4_proto_tcp(b);
7252
0
            is_udp = dp_packet_l4_proto_udp(b);
7253
0
            is_sctp = dp_packet_l4_proto_sctp(b);
7254
0
        }
7255
0
        ip_hdr = l3_off;
7256
7257
        /* The csum calculation is offloaded. */
7258
0
        if (is_tcp) {
7259
            /* Virtual I/O Device (VIRTIO) Version 1.1
7260
             * 5.1.6.2 Packet Transmission
7261
             * If the driver negotiated VIRTIO_NET_F_CSUM, it can skip
7262
             * checksumming the packet:
7263
             *  - flags has the VIRTIO_NET_HDR_F_NEEDS_CSUM set,
7264
             *  - csum_start is set to the offset within the packet
7265
             *    to begin checksumming, and
7266
             *  - csum_offset indicates how many bytes after the
7267
             *    csum_start the new (16 bit ones complement) checksum
7268
             *    is placed by the device.
7269
             * The TCP checksum field in the packet is set to the sum of
7270
             * the TCP pseudo header, so that replacing it by the ones
7271
             * complement checksum of the TCP header and body will give
7272
             * the correct result. */
7273
0
            struct tcp_header *tcp_hdr = l4_off;
7274
0
            ovs_be16 csum = 0;
7275
7276
0
            if (IP_VER(ip_hdr->ip_ihl_ver) == 4) {
7277
0
                csum = ~csum_finish(packet_csum_pseudoheader(ip_hdr));
7278
0
            } else if (IP_VER(ip_hdr->ip_ihl_ver) == 6) {
7279
0
                const struct ovs_16aligned_ip6_hdr *ip6_hdr = l3_off;
7280
0
                csum = ~csum_finish(packet_csum_pseudoheader6(ip6_hdr));
7281
0
            }
7282
7283
0
            tcp_hdr->tcp_csum = csum;
7284
0
            vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
7285
0
            vnet->csum_start = (OVS_FORCE __virtio16) ((char *) l4_off -
7286
0
                                    (char *) dp_packet_data(b));
7287
0
            vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
7288
0
                                    struct tcp_header, tcp_csum);
7289
0
        } else if (is_udp) {
7290
0
            struct udp_header *udp_hdr = l4_off;
7291
0
            ovs_be16 csum = 0;
7292
7293
0
            if (IP_VER(ip_hdr->ip_ihl_ver) == 4) {
7294
0
                csum = ~csum_finish(packet_csum_pseudoheader(ip_hdr));
7295
0
            } else if (IP_VER(ip_hdr->ip_ihl_ver) == 6) {
7296
0
                const struct ovs_16aligned_ip6_hdr *ip6_hdr = l3_off;
7297
0
                csum = ~csum_finish(packet_csum_pseudoheader6(ip6_hdr));
7298
0
            }
7299
7300
0
            udp_hdr->udp_csum = csum;
7301
0
            vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
7302
0
            vnet->csum_start = (OVS_FORCE __virtio16) ((char *) udp_hdr -
7303
0
                                    (char *) dp_packet_data(b));;
7304
0
            vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
7305
0
                                    struct udp_header, udp_csum);
7306
0
        } else if (is_sctp) {
7307
            /* The Linux kernel networking stack only supports csum_start
7308
             * and csum_offset when SCTP GSO is enabled.  See kernel's
7309
             * skb_csum_hwoffload_help(). Currently there is no SCTP
7310
             * segmentation offload support in OVS. */
7311
0
            vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0;
7312
0
            vnet->flags = 0;
7313
0
        } else {
7314
            /* This should only happen when a new L4 proto
7315
             * is not covered in above checks. */
7316
0
            VLOG_WARN_RL(&rl, "Unsupported L4 checksum offload. "
7317
0
                         "Offloads: %"PRIu32, b->offloads);
7318
0
            vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0;
7319
0
            vnet->flags = 0;
7320
0
        }
7321
0
    } else {
7322
        /* Packet L4 csum is unknown. */
7323
0
        vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0;
7324
0
        vnet->flags = 0;
7325
0
    }
7326
7327
0
    dp_packet_push(b, vnet, sizeof *vnet);
7328
0
    return 0;
7329
0
}