Coverage Report

Created: 2025-11-11 06:38

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/openvswitch/lib/netdev-linux.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at:
7
 *
8
 *     http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16
17
#include <config.h>
18
19
#include "netdev-linux.h"
20
#include "netdev-linux-private.h"
21
22
#include <errno.h>
23
#include <fcntl.h>
24
#include <sys/types.h>
25
#include <netinet/in.h>
26
#include <arpa/inet.h>
27
#include <inttypes.h>
28
#include <math.h>
29
#include <linux/filter.h>
30
#include <linux/gen_stats.h>
31
#include <linux/if_ether.h>
32
#include <linux/if_packet.h>
33
#include <linux/if_tun.h>
34
#include <linux/types.h>
35
#include <linux/ethtool.h>
36
#include <linux/mii.h>
37
#include <linux/rtnetlink.h>
38
#include <linux/sockios.h>
39
#include <linux/virtio_net.h>
40
#include <sys/ioctl.h>
41
#include <sys/socket.h>
42
#include <sys/uio.h>
43
#include <net/if.h>
44
#include <net/if_arp.h>
45
#include <net/route.h>
46
#include <poll.h>
47
#include <stdlib.h>
48
#include <string.h>
49
#include <unistd.h>
50
51
#include "coverage.h"
52
#include "dp-packet.h"
53
#include "dpif-netlink.h"
54
#include "dpif-netdev.h"
55
#include "openvswitch/dynamic-string.h"
56
#include "fatal-signal.h"
57
#include "hash.h"
58
#include "openvswitch/hmap.h"
59
#include "netdev-afxdp.h"
60
#include "netdev-provider.h"
61
#include "netdev-vport.h"
62
#include "netlink-notifier.h"
63
#include "netlink-socket.h"
64
#include "netlink.h"
65
#include "netnsid.h"
66
#include "openvswitch/ofpbuf.h"
67
#include "openflow/openflow.h"
68
#include "ovs-atomic.h"
69
#include "ovs-numa.h"
70
#include "packets.h"
71
#include "openvswitch/poll-loop.h"
72
#include "rtnetlink.h"
73
#include "openvswitch/shash.h"
74
#include "socket-util.h"
75
#include "sset.h"
76
#include "tc.h"
77
#include "timer.h"
78
#include "unaligned.h"
79
#include "openvswitch/vlog.h"
80
#include "userspace-tso.h"
81
#include "util.h"
82
83
VLOG_DEFINE_THIS_MODULE(netdev_linux);
84
85
COVERAGE_DEFINE(netdev_set_policing);
86
COVERAGE_DEFINE(netdev_arp_lookup);
87
COVERAGE_DEFINE(netdev_get_ifindex);
88
COVERAGE_DEFINE(netdev_get_hwaddr);
89
COVERAGE_DEFINE(netdev_set_hwaddr);
90
COVERAGE_DEFINE(netdev_get_ethtool);
91
COVERAGE_DEFINE(netdev_set_ethtool);
92
COVERAGE_DEFINE(netdev_linux_invalid_l4_csum);
93
COVERAGE_DEFINE(netdev_linux_unknown_l4_csum);
94
95

96
#ifndef IFLA_IF_NETNSID
97
0
#define IFLA_IF_NETNSID 0x45
98
#endif
99
/* These were introduced in Linux 2.6.14, so they might be missing if we have
100
 * old headers. */
101
#ifndef ADVERTISED_Pause
102
#define ADVERTISED_Pause                (1 << 13)
103
#endif
104
#ifndef ADVERTISED_Asym_Pause
105
#define ADVERTISED_Asym_Pause           (1 << 14)
106
#endif
107
108
/* These were introduced in Linux 2.6.24, so they might be missing if we
109
 * have old headers. */
110
#ifndef ETHTOOL_GFLAGS
111
#define ETHTOOL_GFLAGS       0x00000025 /* Get flags bitmap(ethtool_value) */
112
#endif
113
#ifndef ETHTOOL_SFLAGS
114
#define ETHTOOL_SFLAGS       0x00000026 /* Set flags bitmap(ethtool_value) */
115
#endif
116
117
/* This was introduced in Linux 2.6.25, so it might be missing if we have old
118
 * headers. */
119
#ifndef TC_RTAB_SIZE
120
#define TC_RTAB_SIZE 1024
121
#endif
122
123
/* Linux 2.6.21 introduced struct tpacket_auxdata.
124
 * Linux 2.6.27 added the tp_vlan_tci member.
125
 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
126
 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
127
 * TP_STATUS_VLAN_TPID_VALID.
128
 *
129
 * With all this churn it's easiest to unconditionally define a replacement
130
 * structure that has everything we want.
131
 */
132
#ifndef PACKET_AUXDATA
133
#define PACKET_AUXDATA                  8
134
#endif
135
#ifndef TP_STATUS_VLAN_VALID
136
#define TP_STATUS_VLAN_VALID            (1 << 4)
137
#endif
138
#ifndef TP_STATUS_VLAN_TPID_VALID
139
#define TP_STATUS_VLAN_TPID_VALID       (1 << 6)
140
#endif
141
#undef tpacket_auxdata
142
#define tpacket_auxdata rpl_tpacket_auxdata
143
struct tpacket_auxdata {
144
    uint32_t tp_status;
145
    uint32_t tp_len;
146
    uint32_t tp_snaplen;
147
    uint16_t tp_mac;
148
    uint16_t tp_net;
149
    uint16_t tp_vlan_tci;
150
    uint16_t tp_vlan_tpid;
151
};
152
153
/* Linux 2.6.27 introduced ethtool_cmd_speed
154
 *
155
 * To avoid revisiting problems reported with using configure to detect
156
 * compatibility (see report at
157
 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
158
 * unconditionally replace ethtool_cmd_speed. */
159
0
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
160
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
161
0
{
162
0
        return ep->speed | (ep->speed_hi << 16);
163
0
}
164
165
/* Linux 2.6.30 introduced supported and advertised flags for
166
 * 1G base KX, and 10G base KX4, KR and R. */
167
#ifndef SUPPORTED_1000baseKX_Full
168
#define SUPPORTED_1000baseKX_Full      (1 << 17)
169
#define SUPPORTED_10000baseKX4_Full    (1 << 18)
170
#define SUPPORTED_10000baseKR_Full     (1 << 19)
171
#define SUPPORTED_10000baseR_FEC       (1 << 20)
172
#define ADVERTISED_1000baseKX_Full     (1 << 17)
173
#define ADVERTISED_10000baseKX4_Full   (1 << 18)
174
#define ADVERTISED_10000baseKR_Full    (1 << 19)
175
#define ADVERTISED_10000baseR_FEC      (1 << 20)
176
#endif
177
178
/* Linux 3.2 introduced "unknown" speed and duplex. */
179
#ifndef SPEED_UNKNOWN
180
#define SPEED_UNKNOWN -1
181
#endif
182
#ifndef DUPLEX_UNKNOWN
183
#define DUPLEX_UNKNOWN 0xff
184
#endif
185
186
/* Linux 3.5 introduced supported and advertised flags for
187
 * 40G base KR4, CR4, SR4 and LR4. */
188
#ifndef SUPPORTED_40000baseKR4_Full
189
#define SUPPORTED_40000baseKR4_Full    (1 << 23)
190
#define SUPPORTED_40000baseCR4_Full    (1 << 24)
191
#define SUPPORTED_40000baseSR4_Full    (1 << 25)
192
#define SUPPORTED_40000baseLR4_Full    (1 << 26)
193
#define ADVERTISED_40000baseKR4_Full   (1 << 23)
194
#define ADVERTISED_40000baseCR4_Full   (1 << 24)
195
#define ADVERTISED_40000baseSR4_Full   (1 << 25)
196
#define ADVERTISED_40000baseLR4_Full   (1 << 26)
197
#endif
198
199
/* Linux 3.19 introduced speed for 40G. */
200
#ifndef SPEED_40000
201
#define SPEED_40000 40000
202
#endif
203
204
/* Linux 4.2 introduced speed for 100G. */
205
#ifndef SPEED_100000
206
#define SPEED_100000 100000
207
#endif
208
209
/* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
210
 *
211
 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
212
 * 2.6.32-431.29.2.el6.x86_64 (see report at
213
 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
214
 * Maybe if_link.h is not self-contained on those kernels.  It is easiest to
215
 * unconditionally define a replacement. */
216
#ifndef IFLA_STATS64
217
0
#define IFLA_STATS64 23
218
#endif
219
#define rtnl_link_stats64 rpl_rtnl_link_stats64
220
struct rtnl_link_stats64 {
221
    uint64_t rx_packets;
222
    uint64_t tx_packets;
223
    uint64_t rx_bytes;
224
    uint64_t tx_bytes;
225
    uint64_t rx_errors;
226
    uint64_t tx_errors;
227
    uint64_t rx_dropped;
228
    uint64_t tx_dropped;
229
    uint64_t multicast;
230
    uint64_t collisions;
231
232
    uint64_t rx_length_errors;
233
    uint64_t rx_over_errors;
234
    uint64_t rx_crc_errors;
235
    uint64_t rx_frame_errors;
236
    uint64_t rx_fifo_errors;
237
    uint64_t rx_missed_errors;
238
239
    uint64_t tx_aborted_errors;
240
    uint64_t tx_carrier_errors;
241
    uint64_t tx_fifo_errors;
242
    uint64_t tx_heartbeat_errors;
243
    uint64_t tx_window_errors;
244
245
    uint64_t rx_compressed;
246
    uint64_t tx_compressed;
247
};
248
249
/* Linux 3.19 introduced virtio_types.h.  It might be missing
250
 * if we are using old kernel. */
251
#ifndef HAVE_VIRTIO_TYPES
252
typedef __u16 __bitwise__ __virtio16;
253
typedef __u32 __bitwise__ __virtio32;
254
typedef __u64 __bitwise__ __virtio64;
255
#endif
256
257
enum {
258
    VALID_IFINDEX           = 1 << 0,
259
    VALID_ETHERADDR         = 1 << 1,
260
    VALID_IN                = 1 << 2,
261
    VALID_MTU               = 1 << 3,
262
    VALID_POLICING          = 1 << 4,
263
    VALID_VPORT_STAT_ERROR  = 1 << 5,
264
    VALID_DRVINFO           = 1 << 6,
265
    VALID_FEATURES          = 1 << 7,
266
    VALID_NUMA_ID           = 1 << 8,
267
};
268
269
/* Linux 4.4 introduced the ability to skip the internal stats gathering
270
 * that netlink does via an external filter mask that can be passed into
271
 * a netlink request.
272
 */
273
#ifndef RTEXT_FILTER_SKIP_STATS
274
#define RTEXT_FILTER_SKIP_STATS (1 << 3)
275
#endif
276
277
/* Use one for the packet buffer and another for the aux buffer to receive
278
 * TSO packets. */
279
0
#define IOV_STD_SIZE 1
280
0
#define IOV_TSO_SIZE 2
281
282
enum {
283
    IOV_PACKET = 0,
284
    IOV_AUXBUF = 1,
285
};
286

287
struct linux_lag_member {
288
   uint32_t block_id;
289
   struct shash_node *node;
290
};
291
292
/* Protects 'lag_shash' and the mutable members of struct linux_lag_member. */
293
static struct ovs_mutex lag_mutex = OVS_MUTEX_INITIALIZER;
294
295
/* All members whose LAG primary interfaces are OVS network devices. */
296
static struct shash lag_shash OVS_GUARDED_BY(lag_mutex)
297
    = SHASH_INITIALIZER(&lag_shash);
298
299
/* Traffic control. */
300
301
/* An instance of a traffic control class.  Always associated with a particular
302
 * network device.
303
 *
304
 * Each TC implementation subclasses this with whatever additional data it
305
 * needs. */
306
struct tc {
307
    const struct tc_ops *ops;
308
    struct hmap queues;         /* Contains "struct tc_queue"s.
309
                                 * Read by generic TC layer.
310
                                 * Written only by TC implementation. */
311
};
312
313
0
#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
314
315
/* One traffic control queue.
316
 *
317
 * Each TC implementation subclasses this with whatever additional data it
318
 * needs. */
319
struct tc_queue {
320
    struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
321
    unsigned int queue_id;      /* OpenFlow queue ID. */
322
    long long int created;      /* Time queue was created, in msecs. */
323
};
324
325
/* A particular kind of traffic control.  Each implementation generally maps to
326
 * one particular Linux qdisc class.
327
 *
328
 * The functions below return 0 if successful or a positive errno value on
329
 * failure, except where otherwise noted.  All of them must be provided, except
330
 * where otherwise noted. */
331
struct tc_ops {
332
    /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
333
     * This is null for tc_ops_default and tc_ops_other, for which there are no
334
     * appropriate values. */
335
    const char *linux_name;
336
337
    /* Name used in OVS database, e.g. "linux-htb".  Must be nonnull. */
338
    const char *ovs_name;
339
340
    /* Number of supported OpenFlow queues, 0 for qdiscs that have no
341
     * queues.  The queues are numbered 0 through n_queues - 1. */
342
    unsigned int n_queues;
343
344
    /* Called to install this TC class on 'netdev'.  The implementation should
345
     * make the Netlink calls required to set up 'netdev' with the right qdisc
346
     * and configure it according to 'details'.  The implementation may assume
347
     * that the current qdisc is the default; that is, there is no need for it
348
     * to delete the current qdisc before installing itself.
349
     *
350
     * The contents of 'details' should be documented as valid for 'ovs_name'
351
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
352
     * (which is built as ovs-vswitchd.conf.db(8)).
353
     *
354
     * This function must return 0 if and only if it sets 'netdev->tc' to an
355
     * initialized 'struct tc'.
356
     *
357
     * (This function is null for tc_ops_other, which cannot be installed.  For
358
     * other TC classes it should always be nonnull.) */
359
    int (*tc_install)(struct netdev *netdev, const struct smap *details);
360
361
    /* Called when the netdev code determines (through a Netlink query) that
362
     * this TC class's qdisc is installed on 'netdev', but we didn't install
363
     * it ourselves and so don't know any of the details.
364
     *
365
     * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
366
     * 'netdev'.  The TCA_KIND attribute of 'nlmsg' is 'linux_name'.  The
367
     * implementation should parse the other attributes of 'nlmsg' as
368
     * necessary to determine its configuration.  If necessary it should also
369
     * use Netlink queries to determine the configuration of queues on
370
     * 'netdev'.
371
     *
372
     * This function must return 0 if and only if it sets 'netdev->tc' to an
373
     * initialized 'struct tc'. */
374
    int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
375
376
    /* Destroys the data structures allocated by the implementation as part of
377
     * 'tc'.  (This includes destroying 'tc->queues' by calling
378
     * tc_destroy(tc).
379
     *
380
     * The implementation should not need to perform any Netlink calls.  If
381
     * desirable, the caller is responsible for deconfiguring the kernel qdisc.
382
     * (But it may not be desirable.)
383
     *
384
     * This function may be null if 'tc' is trivial. */
385
    void (*tc_destroy)(struct tc *tc);
386
387
    /* Retrieves details of 'netdev->tc' configuration into 'details'.
388
     *
389
     * The implementation should not need to perform any Netlink calls, because
390
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
391
     * cached the configuration.
392
     *
393
     * The contents of 'details' should be documented as valid for 'ovs_name'
394
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
395
     * (which is built as ovs-vswitchd.conf.db(8)).
396
     *
397
     * This function may be null if 'tc' is not configurable.
398
     */
399
    int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
400
401
    /* Reconfigures 'netdev->tc' according to 'details', performing any
402
     * required Netlink calls to complete the reconfiguration.
403
     *
404
     * The contents of 'details' should be documented as valid for 'ovs_name'
405
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
406
     * (which is built as ovs-vswitchd.conf.db(8)).
407
     *
408
     * This function may be null if 'tc' is not configurable.
409
     */
410
    int (*qdisc_set)(struct netdev *, const struct smap *details);
411
412
    /* Retrieves details of 'queue' on 'netdev->tc' into 'details'.  'queue' is
413
     * one of the 'struct tc_queue's within 'netdev->tc->queues'.
414
     *
415
     * The contents of 'details' should be documented as valid for 'ovs_name'
416
     * in the "other_config" column in the "Queue" table in
417
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
418
     *
419
     * The implementation should not need to perform any Netlink calls, because
420
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
421
     * cached the queue configuration.
422
     *
423
     * This function may be null if 'tc' does not have queues ('n_queues' is
424
     * 0). */
425
    int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
426
                     struct smap *details);
427
428
    /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
429
     * 'details', perfoming any required Netlink calls to complete the
430
     * reconfiguration.  The caller ensures that 'queue_id' is less than
431
     * 'n_queues'.
432
     *
433
     * The contents of 'details' should be documented as valid for 'ovs_name'
434
     * in the "other_config" column in the "Queue" table in
435
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
436
     *
437
     * This function may be null if 'tc' does not have queues or its queues are
438
     * not configurable. */
439
    int (*class_set)(struct netdev *, unsigned int queue_id,
440
                     const struct smap *details);
441
442
    /* Deletes 'queue' from 'netdev->tc'.  'queue' is one of the 'struct
443
     * tc_queue's within 'netdev->tc->queues'.
444
     *
445
     * This function may be null if 'tc' does not have queues or its queues
446
     * cannot be deleted. */
447
    int (*class_delete)(struct netdev *, struct tc_queue *queue);
448
449
    /* Obtains stats for 'queue' from 'netdev->tc'.  'queue' is one of the
450
     * 'struct tc_queue's within 'netdev->tc->queues'.
451
     *
452
     * On success, initializes '*stats'.
453
     *
454
     * This function may be null if 'tc' does not have queues or if it cannot
455
     * report queue statistics. */
456
    int (*class_get_stats)(const struct netdev *netdev,
457
                           const struct tc_queue *queue,
458
                           struct netdev_queue_stats *stats);
459
460
    /* Extracts queue stats from 'nlmsg', which is a response to a
461
     * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
462
     *
463
     * This function may be null if 'tc' does not have queues or if it cannot
464
     * report queue statistics. */
465
    int (*class_dump_stats)(const struct netdev *netdev,
466
                            const struct ofpbuf *nlmsg,
467
                            netdev_dump_queue_stats_cb *cb, void *aux);
468
};
469
470
static void
471
tc_init(struct tc *tc, const struct tc_ops *ops)
472
0
{
473
0
    tc->ops = ops;
474
0
    hmap_init(&tc->queues);
475
0
}
476
477
static void
478
tc_destroy(struct tc *tc)
479
0
{
480
0
    hmap_destroy(&tc->queues);
481
0
}
482
483
static const struct tc_ops tc_ops_htb;
484
static const struct tc_ops tc_ops_hfsc;
485
static const struct tc_ops tc_ops_codel;
486
static const struct tc_ops tc_ops_fqcodel;
487
static const struct tc_ops tc_ops_sfq;
488
static const struct tc_ops tc_ops_netem;
489
static const struct tc_ops tc_ops_default;
490
static const struct tc_ops tc_ops_noop;
491
static const struct tc_ops tc_ops_other;
492
493
static const struct tc_ops *const tcs[] = {
494
    &tc_ops_htb,                /* Hierarchy token bucket (see tc-htb(8)). */
495
    &tc_ops_hfsc,               /* Hierarchical fair service curve. */
496
    &tc_ops_codel,              /* Controlled delay */
497
    &tc_ops_fqcodel,            /* Fair queue controlled delay */
498
    &tc_ops_sfq,                /* Stochastic fair queueing */
499
    &tc_ops_netem,              /* Network Emulator */
500
    &tc_ops_noop,               /* Non operating qos type. */
501
    &tc_ops_default,            /* Default qdisc (see tc-pfifo_fast(8)). */
502
    &tc_ops_other,              /* Some other qdisc. */
503
    NULL
504
};
505
506
static unsigned int tc_ticks_to_bytes(uint64_t rate, unsigned int ticks);
507
static unsigned int tc_bytes_to_ticks(uint64_t rate, unsigned int size);
508
static unsigned int tc_buffer_per_jiffy(uint64_t rate);
509
static uint32_t tc_time_to_ticks(uint32_t time);
510
511
static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *,
512
                                                  int type,
513
                                                  unsigned int flags,
514
                                                  struct ofpbuf *);
515
516
static int tc_add_policer(struct netdev *, uint64_t kbits_rate,
517
                          uint32_t kbits_burst, uint32_t kpkts_rate,
518
                          uint32_t kpkts_burst);
519
520
static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
521
                          struct nlattr **options);
522
static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
523
                          struct nlattr **options,
524
                          struct netdev_queue_stats *);
525
static int tc_query_class(const struct netdev *,
526
                          unsigned int handle, unsigned int parent,
527
                          struct ofpbuf **replyp);
528
static int tc_delete_class(const struct netdev *, unsigned int handle);
529
530
static int tc_del_qdisc(struct netdev *netdev);
531
static int tc_query_qdisc(const struct netdev *netdev);
532
static void tc_policer_init(struct tc_police *tc_police, uint64_t kbits_rate,
533
                            uint64_t kbits_burst);
534
535
void
536
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate,
537
            uint64_t rate64);
538
static int tc_calc_cell_log(unsigned int mtu);
539
static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
540
static int tc_calc_buffer(uint64_t Bps, int mtu, uint64_t burst_bytes);
541

542
543
/* This is set pretty low because we probably won't learn anything from the
544
 * additional log messages. */
545
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
546
547
/* Polling miimon status for all ports causes performance degradation when
548
 * handling a large number of ports. If there are no devices using miimon, then
549
 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
550
 *
551
 * Readers do not depend on this variable synchronizing with the related
552
 * changes in the device miimon status, so we can use atomic_count. */
553
static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
554
555
/* Very old kernels from the 2.6 era don't support vnet headers with the tun
556
 * device. We can detect this while constructing a netdev, but need this for
557
 * packet rx/tx. */
558
static bool tap_supports_vnet_hdr = true;
559
560
static int netdev_linux_parse_vnet_hdr(struct dp_packet *b);
561
static int netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu);
562
static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
563
                                   int cmd, const char *cmd_name);
564
static int get_flags(const struct netdev *, unsigned int *flags);
565
static int set_flags(const char *, unsigned int flags);
566
static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
567
                        enum netdev_flags on, enum netdev_flags *old_flagsp)
568
    OVS_REQUIRES(netdev->mutex);
569
static int get_ifindex(const struct netdev *, int *ifindexp);
570
static int do_set_addr(struct netdev *netdev,
571
                       int ioctl_nr, const char *ioctl_name,
572
                       struct in_addr addr);
573
static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
574
static int set_etheraddr(const char *netdev_name, const struct eth_addr);
575
static int af_packet_sock(void);
576
static bool netdev_linux_miimon_enabled(void);
577
static void netdev_linux_miimon_run(void);
578
static void netdev_linux_miimon_wait(void);
579
static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
580
static void netdev_linux_set_ol(struct netdev *netdev);
581
582
static bool
583
is_tap_netdev(const struct netdev *netdev)
584
0
{
585
0
    return netdev_get_class(netdev) == &netdev_tap_class;
586
0
}
587

588
static int
589
netdev_linux_netnsid_update__(struct netdev_linux *netdev)
590
0
{
591
0
    struct dpif_netlink_vport reply;
592
0
    struct ofpbuf *buf;
593
0
    int error;
594
595
0
    error = dpif_netlink_vport_get(netdev_get_name(&netdev->up), &reply, &buf);
596
0
    if (error) {
597
0
        if (error == ENOENT) {
598
            /* Assume it is local if there is no API (e.g. if the openvswitch
599
             * kernel module is not loaded). */
600
0
            netnsid_set_local(&netdev->netnsid);
601
0
        } else {
602
0
            netnsid_unset(&netdev->netnsid);
603
0
        }
604
0
        return error;
605
0
    }
606
607
0
    netnsid_set(&netdev->netnsid, reply.netnsid);
608
0
    ofpbuf_delete(buf);
609
0
    return 0;
610
0
}
611
612
static int
613
netdev_linux_netnsid_update(struct netdev_linux *netdev)
614
0
{
615
0
    if (netnsid_is_unset(netdev->netnsid)) {
616
0
        if (netdev_get_class(&netdev->up) == &netdev_tap_class) {
617
0
            netnsid_set_local(&netdev->netnsid);
618
0
        } else {
619
0
            return netdev_linux_netnsid_update__(netdev);
620
0
        }
621
0
    }
622
623
0
    return 0;
624
0
}
625
626
static bool
627
netdev_linux_netnsid_is_eq(struct netdev_linux *netdev, int nsid)
628
0
{
629
0
    netdev_linux_netnsid_update(netdev);
630
0
    return netnsid_eq(netdev->netnsid, nsid);
631
0
}
632
633
static bool
634
netdev_linux_netnsid_is_remote(struct netdev_linux *netdev)
635
0
{
636
0
    netdev_linux_netnsid_update(netdev);
637
0
    return netnsid_is_remote(netdev->netnsid);
638
0
}
639
640
static int netdev_linux_update_via_netlink(struct netdev_linux *);
641
static void netdev_linux_update(struct netdev_linux *netdev, int,
642
                                const struct rtnetlink_change *)
643
    OVS_REQUIRES(netdev->mutex);
644
static void netdev_linux_changed(struct netdev_linux *netdev,
645
                                 unsigned int ifi_flags, unsigned int mask)
646
    OVS_REQUIRES(netdev->mutex);
647
648
/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
649
 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
650
 * if no such socket could be created. */
651
static struct nl_sock *
652
netdev_linux_notify_sock(void)
653
0
{
654
0
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
655
0
    static struct nl_sock *sock;
656
0
    unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
657
0
                                RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};
658
659
0
    if (ovsthread_once_start(&once)) {
660
0
        int error;
661
662
0
        error = nl_sock_create(NETLINK_ROUTE, &sock);
663
0
        if (!error) {
664
0
            size_t i;
665
666
0
            nl_sock_listen_all_nsid(sock, true);
667
0
            for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
668
0
                error = nl_sock_join_mcgroup(sock, mcgroups[i]);
669
0
                if (error) {
670
0
                    nl_sock_destroy(sock);
671
0
                    sock = NULL;
672
0
                    break;
673
0
                }
674
0
            }
675
0
        }
676
0
        ovsthread_once_done(&once);
677
0
    }
678
679
0
    return sock;
680
0
}
681
682
static bool
683
netdev_linux_miimon_enabled(void)
684
0
{
685
0
    return atomic_count_get(&miimon_cnt) > 0;
686
0
}
687
688
static bool
689
netdev_linux_kind_is_lag(const char *kind)
690
0
{
691
0
    if (!strcmp(kind, "bond") || !strcmp(kind, "team")) {
692
0
        return true;
693
0
    }
694
695
0
    return false;
696
0
}
697
698
static void
699
netdev_linux_update_lag(struct rtnetlink_change *change)
700
    OVS_REQUIRES(lag_mutex)
701
0
{
702
0
    struct linux_lag_member *lag;
703
704
0
    if (change->sub && netdev_linux_kind_is_lag(change->sub)) {
705
0
        lag = shash_find_data(&lag_shash, change->ifname);
706
707
0
        if (!lag) {
708
0
            struct netdev *primary_netdev;
709
0
            char primary_name[IFNAMSIZ];
710
0
            uint32_t block_id;
711
0
            int error = 0;
712
713
0
            if (!if_indextoname(change->master_ifindex, primary_name)) {
714
0
                return;
715
0
            }
716
0
            primary_netdev = netdev_from_name(primary_name);
717
0
            if (!primary_netdev) {
718
0
                return;
719
0
            }
720
721
            /* If LAG primary member is not attached to ovs,
722
             * ingress block on LAG members should not be updated. */
723
0
            if (!primary_netdev->auto_classified &&
724
0
                is_netdev_linux_class(primary_netdev->netdev_class)) {
725
0
                block_id = netdev_get_block_id(primary_netdev);
726
0
                if (!block_id) {
727
0
                    netdev_close(primary_netdev);
728
0
                    return;
729
0
                }
730
731
0
                lag = xmalloc(sizeof *lag);
732
0
                lag->block_id = block_id;
733
0
                lag->node = shash_add(&lag_shash, change->ifname, lag);
734
735
                /* delete ingress block in case it exists */
736
0
                tc_add_del_qdisc(change->if_index, false, 0, TC_INGRESS);
737
                /* LAG primary is linux netdev so add member to same block. */
738
0
                error = tc_add_del_qdisc(change->if_index, true, block_id,
739
0
                                         TC_INGRESS);
740
0
                if (error) {
741
0
                    VLOG_WARN("failed to bind LAG member %s to "
742
0
                              "primary's block", change->ifname);
743
0
                    shash_delete(&lag_shash, lag->node);
744
0
                    free(lag);
745
0
                }
746
0
            }
747
748
0
            netdev_close(primary_netdev);
749
0
        }
750
0
    } else if (change->master_ifindex == 0) {
751
        /* Check if this was a lag member that has been removed. */
752
0
        lag = shash_find_data(&lag_shash, change->ifname);
753
754
0
        if (lag) {
755
0
            tc_add_del_qdisc(change->if_index, false, lag->block_id,
756
0
                             TC_INGRESS);
757
0
            shash_delete(&lag_shash, lag->node);
758
0
            free(lag);
759
0
        }
760
0
    }
761
0
}
762
763
void
764
netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
765
0
{
766
0
    struct nl_sock *sock;
767
0
    int error;
768
769
0
    if (netdev_linux_miimon_enabled()) {
770
0
        netdev_linux_miimon_run();
771
0
    }
772
773
0
    sock = netdev_linux_notify_sock();
774
0
    if (!sock) {
775
0
        return;
776
0
    }
777
778
0
    do {
779
0
        uint64_t buf_stub[4096 / 8];
780
0
        int nsid;
781
0
        struct ofpbuf buf;
782
783
0
        ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
784
0
        error = nl_sock_recv(sock, &buf, &nsid, false);
785
0
        if (!error) {
786
0
            struct rtnetlink_change change;
787
788
0
            if (rtnetlink_parse(&buf, &change) && !change.irrelevant) {
789
0
                struct netdev *netdev_ = NULL;
790
0
                char dev_name[IFNAMSIZ];
791
792
0
                if (!change.ifname) {
793
0
                     change.ifname = if_indextoname(change.if_index, dev_name);
794
0
                }
795
796
0
                if (change.ifname) {
797
0
                    netdev_ = netdev_from_name(change.ifname);
798
0
                }
799
0
                if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
800
0
                    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
801
802
0
                    ovs_mutex_lock(&netdev->mutex);
803
0
                    netdev_linux_update(netdev, nsid, &change);
804
0
                    ovs_mutex_unlock(&netdev->mutex);
805
0
                }
806
807
0
                if (change.ifname &&
808
0
                    rtnetlink_type_is_rtnlgrp_link(change.nlmsg_type)) {
809
810
                    /* Need to try updating the LAG information. */
811
0
                    ovs_mutex_lock(&lag_mutex);
812
0
                    netdev_linux_update_lag(&change);
813
0
                    ovs_mutex_unlock(&lag_mutex);
814
0
                }
815
0
                netdev_close(netdev_);
816
0
            }
817
0
        } else if (error == ENOBUFS) {
818
0
            struct shash device_shash;
819
0
            struct shash_node *node;
820
821
0
            nl_sock_drain(sock);
822
823
0
            shash_init(&device_shash);
824
0
            netdev_get_devices(&netdev_linux_class, &device_shash);
825
0
            SHASH_FOR_EACH (node, &device_shash) {
826
0
                struct netdev *netdev_ = node->data;
827
0
                struct netdev_linux *netdev = netdev_linux_cast(netdev_);
828
0
                unsigned int flags;
829
830
0
                ovs_mutex_lock(&netdev->mutex);
831
0
                get_flags(netdev_, &flags);
832
0
                netdev_linux_changed(netdev, flags, 0);
833
0
                ovs_mutex_unlock(&netdev->mutex);
834
835
0
                netdev_close(netdev_);
836
0
            }
837
0
            shash_destroy(&device_shash);
838
0
        } else if (error != EAGAIN) {
839
0
            static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
840
0
            VLOG_WARN_RL(&rll, "error reading or parsing netlink (%s)",
841
0
                         ovs_strerror(error));
842
0
        }
843
0
        ofpbuf_uninit(&buf);
844
0
    } while (!error);
845
0
}
846
847
static void
848
netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
849
0
{
850
0
    struct nl_sock *sock;
851
852
0
    if (netdev_linux_miimon_enabled()) {
853
0
        netdev_linux_miimon_wait();
854
0
    }
855
0
    sock = netdev_linux_notify_sock();
856
0
    if (sock) {
857
0
        nl_sock_wait(sock, POLLIN);
858
0
    }
859
0
}
860
861
static void
862
netdev_linux_changed(struct netdev_linux *dev,
863
                     unsigned int ifi_flags, unsigned int mask)
864
    OVS_REQUIRES(dev->mutex)
865
0
{
866
0
    netdev_change_seq_changed(&dev->up);
867
868
0
    if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
869
0
        dev->carrier_resets++;
870
0
    }
871
0
    dev->ifi_flags = ifi_flags;
872
873
0
    dev->cache_valid &= mask;
874
0
    if (!(mask & VALID_IN)) {
875
0
        netdev_get_addrs_list_flush();
876
0
    }
877
0
}
878
879
/* Applies rtnetlink 'change' to 'dev', refreshing or invalidating cached
 * state.  Link messages (RTM_NEWLINK/RTM_DELLINK) update flags, MTU, MAC,
 * LAG role, and ifindex; address-group messages only invalidate the cached
 * in4/in6 addresses.  Any other message type is a caller bug.
 * Caller holds dev->mutex. */
static void
netdev_linux_update__(struct netdev_linux *dev,
                      const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
        if (change->nlmsg_type == RTM_NEWLINK) {
            /* Keep drv-info, ip addresses, and NUMA id. */
            netdev_linux_changed(dev, change->ifi_flags,
                                 VALID_DRVINFO | VALID_IN | VALID_NUMA_ID);

            /* Update netdev from rtnl-change msg. */
            if (change->mtu) {
                dev->mtu = change->mtu;
                dev->cache_valid |= VALID_MTU;
                dev->netdev_mtu_error = 0;
            }

            if (!eth_addr_is_zero(change->mac)) {
                dev->etheraddr = change->mac;
                dev->cache_valid |= VALID_ETHERADDR;
                dev->ether_addr_error = 0;

                /* The mac addr has been changed, report it now. */
                rtnetlink_report_link();
            }

            /* Remember when this device is the primary member of a LAG
             * (bond/team) device. */
            if (change->primary && netdev_linux_kind_is_lag(change->primary)) {
                dev->is_lag_primary = true;
            }

            dev->ifindex = change->if_index;
            dev->cache_valid |= VALID_IFINDEX;
            dev->get_ifindex_error = 0;
            dev->present = true;
        } else {
            /* FIXME */
            /* Link was deleted: invalidate every cached field and forget the
             * network namespace id. */
            netdev_linux_changed(dev, change->ifi_flags, 0);
            dev->present = false;
            netnsid_unset(&dev->netnsid);
        }
    } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
        /* Invalidates in4, in6. */
        netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
    } else {
        OVS_NOT_REACHED();
    }
}
927
928
static void
929
netdev_linux_update(struct netdev_linux *dev, int nsid,
930
                    const struct rtnetlink_change *change)
931
    OVS_REQUIRES(dev->mutex)
932
0
{
933
0
    if (netdev_linux_netnsid_is_eq(dev, nsid)) {
934
0
        netdev_linux_update__(dev, change);
935
0
    }
936
0
}
937
938
static struct netdev *
939
netdev_linux_alloc(void)
940
0
{
941
0
    struct netdev_linux *netdev = xzalloc(sizeof *netdev);
942
0
    return &netdev->up;
943
0
}
944
945
static int
946
netdev_linux_common_construct(struct netdev *netdev_)
947
0
{
948
    /* Prevent any attempt to create (or open) a network device named "default"
949
     * or "all".  These device names are effectively reserved on Linux because
950
     * /proc/sys/net/ipv4/conf/ always contains directories by these names.  By
951
     * itself this wouldn't call for any special treatment, but in practice if
952
     * a program tries to create devices with these names, it causes the kernel
953
     * to fire a "new device" notification event even though creation failed,
954
     * and in turn that causes OVS to wake up and try to create them again,
955
     * which ends up as a 100% CPU loop. */
956
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
957
0
    const char *name = netdev_->name;
958
0
    if (!strcmp(name, "default") || !strcmp(name, "all")) {
959
0
        static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 1);
960
0
        VLOG_WARN_RL(&rll, "%s: Linux forbids network device with this name",
961
0
                     name);
962
0
        return EINVAL;
963
0
    }
964
965
    /* The device could be in the same network namespace or in another one. */
966
0
    netnsid_unset(&netdev->netnsid);
967
0
    ovs_mutex_init(&netdev->mutex);
968
969
0
    return 0;
970
0
}
971
972
/* Creates system and internal devices.
 *
 * netdev_class 'construct' callback shared by the "system" and "internal"
 * classes.  Returns 0 on success or a positive errno value. */
int
netdev_linux_construct(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    /* Name validation plus mutex/netnsid initialization. */
    int error = netdev_linux_common_construct(netdev_);
    if (error) {
        return error;
    }

    if (userspace_tso_enabled()) {
        /* The AF_PACKET socket interface uses the same option to facilitate
         * both csum and segmentation offloading. However, these features can
         * be toggled off or on individually at the interface level. The netdev
         * flags are set based on the features indicated by ethtool. */
        netdev_linux_set_ol(netdev_);
    }

    error = get_flags(&netdev->up, &netdev->ifi_flags);
    if (error == ENODEV) {
        if (netdev->up.netdev_class != &netdev_internal_class) {
            /* The device does not exist, so don't allow it to be opened. */
            return ENODEV;
        } else {
            /* "Internal" netdevs have to be created as netdev objects before
             * they exist in the kernel, because creating them in the kernel
             * happens by passing a netdev object to dpif_port_add().
             * Therefore, ignore the error. */
        }
    }
    /* NOTE(review): get_flags() errors other than ENODEV are deliberately
     * ignored here; presumably the flags are refreshed later via the
     * rtnetlink notification path — confirm. */

    return 0;
}
1005
1006
/* For most types of netdevs we open the device for each call of
1007
 * netdev_open().  However, this is not the case with tap devices,
1008
 * since it is only possible to open the device once.  In this
1009
 * situation we share a single file descriptor, and consequently
1010
 * buffers, across all readers.  Therefore once data is read it will
1011
 * be unavailable to other reads for tap devices. */
1012
static int
1013
netdev_linux_construct_tap(struct netdev *netdev_)
1014
0
{
1015
0
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
1016
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1017
0
    static const char tap_dev[] = "/dev/net/tun";
1018
0
    const char *name = netdev_->name;
1019
0
    unsigned long oflags;
1020
0
    unsigned int up;
1021
0
    struct ifreq ifr;
1022
1023
0
    int error = netdev_linux_common_construct(netdev_);
1024
0
    if (error) {
1025
0
        return error;
1026
0
    }
1027
1028
    /* Open tap device. */
1029
0
    netdev->tap_fd = open(tap_dev, O_RDWR);
1030
0
    if (netdev->tap_fd < 0) {
1031
0
        error = errno;
1032
0
        VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
1033
0
        return error;
1034
0
    }
1035
1036
    /* Create tap device. */
1037
0
    get_flags(&netdev->up, &netdev->ifi_flags);
1038
1039
0
    if (ovsthread_once_start(&once)) {
1040
0
        if (ioctl(netdev->tap_fd, TUNGETFEATURES, &up) == -1) {
1041
0
            VLOG_WARN("%s: querying tap features failed: %s", name,
1042
0
                      ovs_strerror(errno));
1043
0
            tap_supports_vnet_hdr = false;
1044
0
        } else if (!(up & IFF_VNET_HDR)) {
1045
0
            VLOG_WARN("TAP interfaces do not support virtio-net headers");
1046
0
            tap_supports_vnet_hdr = false;
1047
0
        }
1048
0
        ovsthread_once_done(&once);
1049
0
    }
1050
1051
0
    memset(&ifr, 0, sizeof ifr);
1052
1053
0
    ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
1054
0
    if (tap_supports_vnet_hdr) {
1055
0
        ifr.ifr_flags |= IFF_VNET_HDR;
1056
0
    }
1057
1058
0
    ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
1059
0
    if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
1060
0
        VLOG_WARN("%s: creating tap device failed: %s", name,
1061
0
                  ovs_strerror(errno));
1062
0
        error = errno;
1063
0
        goto error_close;
1064
0
    }
1065
1066
    /* Make non-blocking. */
1067
0
    error = set_nonblocking(netdev->tap_fd);
1068
0
    if (error) {
1069
0
        goto error_close;
1070
0
    }
1071
1072
0
    if (ioctl(netdev->tap_fd, TUNSETPERSIST, 1)) {
1073
0
        VLOG_WARN("%s: creating tap device failed (persist): %s", name,
1074
0
                  ovs_strerror(errno));
1075
0
        error = errno;
1076
0
        goto error_close;
1077
0
    }
1078
1079
0
    oflags = TUN_F_CSUM;
1080
0
    if (userspace_tso_enabled()) {
1081
0
        oflags |= (TUN_F_TSO4 | TUN_F_TSO6);
1082
0
    }
1083
1084
0
    if (tap_supports_vnet_hdr
1085
0
        && ioctl(netdev->tap_fd, TUNSETOFFLOAD, oflags) == 0) {
1086
0
        netdev_->ol_flags |= (NETDEV_TX_OFFLOAD_TCP_CKSUM
1087
0
                              | NETDEV_TX_OFFLOAD_UDP_CKSUM);
1088
1089
0
        if (userspace_tso_enabled()) {
1090
0
            netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
1091
0
        }
1092
0
    } else {
1093
0
       VLOG_INFO("%s: Disabling checksum and segment offloading due to "
1094
0
                 "missing kernel support", name);
1095
0
    }
1096
1097
0
    netdev->present = true;
1098
0
    return 0;
1099
1100
0
error_close:
1101
0
    close(netdev->tap_fd);
1102
0
    return error;
1103
0
}
1104
1105
/* netdev_class 'destruct' callback: tears down state built by the construct
 * functions. */
static void
netdev_linux_destruct(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    /* Release any attached traffic-control (qdisc) state. */
    if (netdev->tc && netdev->tc->ops->tc_destroy) {
        netdev->tc->ops->tc_destroy(netdev->tc);
    }

    /* Tap devices: drop the persist flag set at construction, then close the
     * shared tap fd. */
    if (netdev_get_class(netdev_) == &netdev_tap_class
        && netdev->tap_fd >= 0)
    {
        ioctl(netdev->tap_fd, TUNSETPERSIST, 0);
        close(netdev->tap_fd);
    }

    /* Keep the global count of miimon-enabled devices in sync. */
    if (netdev->miimon_interval > 0) {
        atomic_count_dec(&miimon_cnt);
    }

    ovs_mutex_destroy(&netdev->mutex);
}
1127
1128
/* netdev_class 'dealloc' callback: releases the memory obtained by
 * netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
1134
1135
static struct netdev_rxq *
1136
netdev_linux_rxq_alloc(void)
1137
0
{
1138
0
    struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
1139
0
    return &rx->up;
1140
0
}
1141
1142
/* netdev_class 'rxq_construct' callback.
 *
 * For tap devices the queue simply reuses the netdev's shared tap fd.  For
 * everything else it opens a dedicated AF_PACKET raw socket, enables
 * auxdata (for VLAN reconstruction) and optionally vnet headers (for TSO),
 * binds it to the interface, and attaches a BPF filter that accepts only
 * inbound packets.  Returns 0 on success or a positive errno value. */
static int
netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    struct netdev *netdev_ = rx->up.netdev;
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    rx->is_tap = is_tap_netdev(netdev_);
    if (rx->is_tap) {
        /* Tap rxqs share the single fd opened at construct time. */
        rx->fd = netdev->tap_fd;
    } else {
        struct sockaddr_ll sll;
        int ifindex, val;
        /* Result of tcpdump -dd inbound */
        static const struct sock_filter filt[] = {
            { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
            { 0x15, 0, 1, 0x00000004 }, /* jeq #4     jt 2  jf 3 */
            { 0x6, 0, 0, 0x00000000 },  /* ret #0 */
            { 0x6, 0, 0, 0x0000ffff }   /* ret #65535 */
        };
        static const struct sock_fprog fprog = {
            ARRAY_SIZE(filt), (struct sock_filter *) filt
        };

        /* Create file descriptor. */
        rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
        if (rx->fd < 0) {
            error = errno;
            VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
            goto error;
        }

        /* Request per-packet auxdata (VLAN TCI/TPID) via cmsg. */
        val = 1;
        if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
            error = errno;
            VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* With userspace TSO, packets arrive with a virtio-net header. */
        if (userspace_tso_enabled()
            && setsockopt(rx->fd, SOL_PACKET, PACKET_VNET_HDR, &val,
                          sizeof val)) {
            error = errno;
            VLOG_ERR("%s: failed to enable vnet hdr in txq raw socket: %s",
                     netdev_get_name(netdev_), ovs_strerror(errno));
            goto error;
        }

        /* Set non-blocking mode. */
        error = set_nonblocking(rx->fd);
        if (error) {
            goto error;
        }

        /* Get ethernet device index. */
        error = get_ifindex(&netdev->up, &ifindex);
        if (error) {
            goto error;
        }

        /* Bind to specific ethernet device. */
        memset(&sll, 0, sizeof sll);
        sll.sll_family = AF_PACKET;
        sll.sll_ifindex = ifindex;
        sll.sll_protocol = htons(ETH_P_ALL);
        if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
            error = errno;
            VLOG_ERR("%s: failed to bind raw socket (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Filter for only inbound packets. */
        error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
                           sizeof fprog);
        if (error) {
            error = errno;
            VLOG_ERR("%s: failed to attach filter (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;

error:
    if (rx->fd >= 0) {
        close(rx->fd);
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1238
1239
static void
1240
netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
1241
0
{
1242
0
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1243
0
    int i;
1244
1245
0
    if (!rx->is_tap) {
1246
0
        close(rx->fd);
1247
0
    }
1248
1249
0
    for (i = 0; i < NETDEV_MAX_BURST; i++) {
1250
0
        dp_packet_delete(rx->aux_bufs[i]);
1251
0
    }
1252
0
}
1253
1254
/* netdev_class 'rxq_dealloc' callback: releases the memory obtained by
 * netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
1261
1262
static ovs_be16
1263
auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
1264
0
{
1265
0
    if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1266
0
        return htons(aux->tp_vlan_tpid);
1267
0
    } else if (double_tagged) {
1268
0
        return htons(ETH_TYPE_VLAN_8021AD);
1269
0
    } else {
1270
0
        return htons(ETH_TYPE_VLAN_8021Q);
1271
0
    }
1272
0
}
1273
1274
/* Returns true if packet auxdata 'aux' carries a VLAN TCI: either the TCI
 * itself is nonzero, or the kernel explicitly flagged it as valid (which is
 * how a legitimate all-zero TCI is distinguished from "no VLAN"). */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    if (aux->tp_vlan_tci) {
        return true;
    }

    return (aux->tp_status & TP_STATUS_VLAN_VALID) != 0;
}
1279
1280
/*
 * Receive packets from raw socket in batch process for better performance,
 * it can receive NETDEV_MAX_BURST packets at most once, the received
 * packets are added into *batch. The return value is 0 or errno.
 *
 * It also used recvmmsg to reduce multiple syscalls overhead;
 *
 * Buffer ownership: packets added to 'batch' are owned by the caller; any
 * aux_buf consumed for a TSO packet is replaced later by
 * netdev_linux_rxq_recv(); all other buffers are freed here.
 */
static int
netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu,
                                 struct dp_packet_batch *batch)
{
    int iovlen;
    size_t std_len;
    ssize_t retval;
    int virtio_net_hdr_size;
    struct iovec iovs[NETDEV_MAX_BURST][IOV_TSO_SIZE];
    struct cmsghdr *cmsg;
    union {
        struct cmsghdr cmsg;
        char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
    } cmsg_buffers[NETDEV_MAX_BURST];
    struct mmsghdr mmsgs[NETDEV_MAX_BURST];
    struct dp_packet *buffers[NETDEV_MAX_BURST];
    int i;

    if (userspace_tso_enabled()) {
        /* Use the buffer from the allocated packet below to receive MTU
         * sized packets and an aux_buf for extra TSO data. */
        iovlen = IOV_TSO_SIZE;
        virtio_net_hdr_size = sizeof(struct virtio_net_hdr);
    } else {
        /* Use only the buffer from the allocated packet. */
        iovlen = IOV_STD_SIZE;
        virtio_net_hdr_size = 0;
    }

    /* The length here needs to be accounted in the same way when the
     * aux_buf is allocated so that it can be prepended to TSO buffer. */
    std_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN + mtu;
    for (i = 0; i < NETDEV_MAX_BURST; i++) {
        buffers[i] = dp_packet_new_with_headroom(std_len, DP_NETDEV_HEADROOM);
        iovs[i][IOV_PACKET].iov_base = dp_packet_data(buffers[i]);
        iovs[i][IOV_PACKET].iov_len = std_len;
        if (iovlen == IOV_TSO_SIZE) {
            iovs[i][IOV_AUXBUF].iov_base = dp_packet_data(rx->aux_bufs[i]);
            iovs[i][IOV_AUXBUF].iov_len = dp_packet_tailroom(rx->aux_bufs[i]);
        }

        mmsgs[i].msg_hdr.msg_name = NULL;
        mmsgs[i].msg_hdr.msg_namelen = 0;
        mmsgs[i].msg_hdr.msg_iov = iovs[i];
        mmsgs[i].msg_hdr.msg_iovlen = iovlen;
        mmsgs[i].msg_hdr.msg_control = &cmsg_buffers[i];
        mmsgs[i].msg_hdr.msg_controllen = sizeof cmsg_buffers[i];
        mmsgs[i].msg_hdr.msg_flags = 0;
    }

    /* MSG_TRUNC makes the kernel report the real packet length even when it
     * did not fit in our iovecs, so oversized packets can be detected. */
    do {
        retval = recvmmsg(rx->fd, mmsgs, NETDEV_MAX_BURST, MSG_TRUNC, NULL);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        retval = errno;
        for (i = 0; i < NETDEV_MAX_BURST; i++) {
            dp_packet_delete(buffers[i]);
        }

        return retval;
    }

    for (i = 0; i < retval; i++) {
        struct dp_packet *pkt;

        if (mmsgs[i].msg_hdr.msg_flags & MSG_TRUNC
            || mmsgs[i].msg_len < ETH_HEADER_LEN) {
            struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            /* The rx->aux_bufs[i] will be re-used next time. */
            dp_packet_delete(buffers[i]);
            netdev->rx_dropped += 1;
            if (mmsgs[i].msg_hdr.msg_flags & MSG_TRUNC) {
                /* Data is truncated, so the packet is corrupted, and needs
                 * to be dropped. This can happen if TSO/GRO is enabled in
                 * the kernel, but not in userspace, i.e. there is no dp
                 * buffer to store the full packet. */
                VLOG_WARN_RL(&rl,
                             "%s: Dropped packet: Too big. GRO/TSO enabled?",
                             netdev_get_name(netdev_));
            } else {
                VLOG_WARN_RL(&rl,
                             "%s: Dropped packet: less than ether hdr size",
                             netdev_get_name(netdev_));
            }

            continue;
        }

        if (mmsgs[i].msg_len > std_len) {
            /* Build a single linear TSO packet by prepending the data from
             * std_len buffer to the aux_buf. */
            pkt = rx->aux_bufs[i];
            dp_packet_set_size(pkt, mmsgs[i].msg_len - std_len);
            dp_packet_push(pkt, dp_packet_data(buffers[i]), std_len);
            /* The headroom should be the same in buffers[i], pkt and
             * DP_NETDEV_HEADROOM. */
            dp_packet_resize(pkt, DP_NETDEV_HEADROOM, 0);
            dp_packet_delete(buffers[i]);
            /* Mark the aux_buf as consumed; the caller reallocates it. */
            rx->aux_bufs[i] = NULL;
         } else {
            dp_packet_set_size(buffers[i], mmsgs[i].msg_len);
            pkt = buffers[i];
         }

        if (virtio_net_hdr_size) {
            int ret = netdev_linux_parse_vnet_hdr(pkt);
            if (OVS_UNLIKELY(ret)) {
                struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
                struct netdev_linux *netdev = netdev_linux_cast(netdev_);

                /* Unexpected error situation: the virtio header is not
                 * present or corrupted or contains unsupported features.
                 * Drop the packet but continue in case next ones are
                 * correct. */
                dp_packet_delete(pkt);
                netdev->rx_dropped += 1;
                VLOG_WARN_RL(&rl, "%s: Dropped packet: vnet header is missing "
                             "or corrupt: %s", netdev_get_name(netdev_),
                             ovs_strerror(ret));
                continue;
            }
        }

        /* Re-insert the VLAN tag that AF_PACKET stripped into auxdata. */
        for (cmsg = CMSG_FIRSTHDR(&mmsgs[i].msg_hdr); cmsg;
                 cmsg = CMSG_NXTHDR(&mmsgs[i].msg_hdr, cmsg)) {
            const struct tpacket_auxdata *aux;

            if (cmsg->cmsg_level != SOL_PACKET
                || cmsg->cmsg_type != PACKET_AUXDATA
                || cmsg->cmsg_len <
                       CMSG_LEN(sizeof(struct tpacket_auxdata))) {
                continue;
            }

            aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
            if (auxdata_has_vlan_tci(aux)) {
                struct eth_header *eth;
                bool double_tagged;

                eth = dp_packet_data(pkt);
                double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);

                eth_push_vlan(pkt,
                              auxdata_to_vlan_tpid(aux, double_tagged),
                              htons(aux->tp_vlan_tci));
                break;
            }
        }
        dp_packet_batch_add(batch, pkt);
    }

    /* Delete unused buffers. */
    for (; i < NETDEV_MAX_BURST; i++) {
        dp_packet_delete(buffers[i]);
    }

    return 0;
}
1448
1449
/*
 * Receive packets from tap by batch process for better performance,
 * it can receive NETDEV_MAX_BURST packets at most once, the received
 * packets are added into *batch. The return value is 0 or errno.
 *
 * Unlike the raw-socket path there is no recvmmsg() for tap fds, so this
 * issues one readv() per packet.
 */
static int
netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu,
                                struct dp_packet_batch *batch)
{
    int virtio_net_hdr_size;
    ssize_t retval;
    size_t std_len;
    int iovlen;
    int i;

    if (userspace_tso_enabled()) {
        /* Use the buffer from the allocated packet below to receive MTU
         * sized packets and an aux_buf for extra TSO data. */
        iovlen = IOV_TSO_SIZE;
    } else {
        /* Use only the buffer from the allocated packet. */
        iovlen = IOV_STD_SIZE;
    }
    /* Tap devices only deliver a virtio-net header when the kernel supports
     * IFF_VNET_HDR (probed once in netdev_linux_construct_tap()). */
    if (OVS_LIKELY(tap_supports_vnet_hdr)) {
        virtio_net_hdr_size = sizeof(struct virtio_net_hdr);
    } else {
        virtio_net_hdr_size = 0;
    }

    /* The length here needs to be accounted in the same way when the
     * aux_buf is allocated so that it can be prepended to TSO buffer. */
    std_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN + mtu;
    for (i = 0; i < NETDEV_MAX_BURST; i++) {
        struct dp_packet *buffer;
        struct dp_packet *pkt;
        struct iovec iov[IOV_TSO_SIZE];

        /* Assume Ethernet port. No need to set packet_type. */
        buffer = dp_packet_new_with_headroom(std_len, DP_NETDEV_HEADROOM);
        iov[IOV_PACKET].iov_base = dp_packet_data(buffer);
        iov[IOV_PACKET].iov_len = std_len;
        if (iovlen == IOV_TSO_SIZE) {
            iov[IOV_AUXBUF].iov_base = dp_packet_data(rx->aux_bufs[i]);
            iov[IOV_AUXBUF].iov_len = dp_packet_tailroom(rx->aux_bufs[i]);
        }

        do {
            retval = readv(rx->fd, iov, iovlen);
        } while (retval < 0 && errno == EINTR);

        if (retval < 0) {
            /* EAGAIN (no more packets) or a real error; either way stop. */
            dp_packet_delete(buffer);
            break;
        }

        if (retval > std_len) {
            /* Build a single linear TSO packet by prepending the data from
             * std_len buffer to the aux_buf. */
            pkt = rx->aux_bufs[i];
            dp_packet_set_size(pkt, retval - std_len);
            dp_packet_push(pkt, dp_packet_data(buffer), std_len);
            /* The headroom should be the same in buffers[i], pkt and
             * DP_NETDEV_HEADROOM. */
            dp_packet_resize(pkt, DP_NETDEV_HEADROOM, 0);
            dp_packet_delete(buffer);
            /* Mark the aux_buf as consumed; the caller reallocates it. */
            rx->aux_bufs[i] = NULL;
        } else {
            dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
            pkt = buffer;
        }

        if (OVS_LIKELY(virtio_net_hdr_size) &&
            netdev_linux_parse_vnet_hdr(pkt)) {
            struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            /* Unexpected error situation: the virtio header is not present
             * or corrupted. Drop the packet but continue in case next ones
             * are correct. */
            dp_packet_delete(pkt);
            netdev->rx_dropped += 1;
            VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net header",
                         netdev_get_name(netdev_));
            continue;
        }

        dp_packet_batch_add(batch, pkt);
    }

    /* Report an error only when nothing at all was received. */
    if ((i == 0) && (retval < 0)) {
        return errno;
    }

    return 0;
}
1544
1545
static int
1546
netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
1547
                      int *qfill)
1548
0
{
1549
0
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1550
0
    struct netdev *netdev = rx->up.netdev;
1551
0
    ssize_t retval;
1552
0
    int mtu;
1553
1554
0
    if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1555
0
        mtu = ETH_PAYLOAD_MAX;
1556
0
    }
1557
1558
0
    if (userspace_tso_enabled()) {
1559
        /* Allocate TSO packets. The packet has enough headroom to store
1560
         * a full non-TSO packet. When a TSO packet is received, the data
1561
         * from non-TSO buffer (std_len) is prepended to the TSO packet
1562
         * (aux_buf). */
1563
0
        size_t std_len = sizeof(struct virtio_net_hdr) + VLAN_ETH_HEADER_LEN
1564
0
                         + DP_NETDEV_HEADROOM + mtu;
1565
0
        size_t data_len = LINUX_RXQ_TSO_MAX_LEN - std_len;
1566
0
        for (int i = 0; i < NETDEV_MAX_BURST; i++) {
1567
0
            if (rx->aux_bufs[i]) {
1568
0
                continue;
1569
0
            }
1570
1571
0
            rx->aux_bufs[i] = dp_packet_new_with_headroom(data_len, std_len);
1572
0
        }
1573
0
    }
1574
1575
0
    dp_packet_batch_init(batch);
1576
0
    retval = (rx->is_tap
1577
0
              ? netdev_linux_batch_rxq_recv_tap(rx, mtu, batch)
1578
0
              : netdev_linux_batch_rxq_recv_sock(rx, mtu, batch));
1579
1580
0
    if (retval) {
1581
0
        if (retval != EAGAIN && retval != EMSGSIZE) {
1582
0
            VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1583
0
                         netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1584
0
        }
1585
0
    }
1586
1587
0
    if (qfill) {
1588
0
        *qfill = -ENOTSUP;
1589
0
    }
1590
1591
0
    return retval;
1592
0
}
1593
1594
static void
1595
netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1596
0
{
1597
0
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1598
0
    poll_fd_wait(rx->fd, POLLIN);
1599
0
}
1600
1601
static int
1602
netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1603
0
{
1604
0
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1605
0
    if (rx->is_tap) {
1606
0
        struct ifreq ifr;
1607
0
        int error;
1608
1609
0
        memset(&ifr, 0, sizeof ifr);
1610
0
        error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1611
0
                                    SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1612
0
        if (error) {
1613
0
            return error;
1614
0
        }
1615
0
        drain_fd(rx->fd, ifr.ifr_qlen);
1616
0
        return 0;
1617
0
    } else {
1618
0
        return drain_rcvbuf(rx->fd);
1619
0
    }
1620
0
}
1621
1622
/* Sends 'batch' on raw socket 'sock' bound to 'ifindex' using a single
 * sendmmsg() call (retried for partial sends).  With 'tso' enabled each
 * packet first gets a virtio-net header prepended; packets for which that
 * fails are counted as tx drops and skipped.  Returns 0 on success or a
 * positive errno value; on error, remaining packets are not sent. */
static int
netdev_linux_sock_batch_send(struct netdev *netdev_, int sock, int ifindex,
                             bool tso, int mtu, struct dp_packet_batch *batch)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const size_t size = dp_packet_batch_size(batch);
    /* We don't bother setting most fields in sockaddr_ll because the
     * kernel ignores them for SOCK_RAW. */
    struct sockaddr_ll sll = { .sll_family = AF_PACKET,
                               .sll_ifindex = ifindex };

    struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * size);
    struct iovec *iov = xmalloc(sizeof(*iov) * size);
    struct dp_packet *packet;
    int cnt = 0;

    /* Build one msghdr per packet; 'cnt' may end up smaller than 'size' if
     * vnet-header prepending drops packets. */
    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        if (tso) {
            int ret = netdev_linux_prepend_vnet_hdr(packet, mtu);

            if (OVS_UNLIKELY(ret)) {
                netdev->tx_dropped += 1;
                VLOG_WARN_RL(&rl, "%s: Prepend vnet hdr failed, packet "
                                  "dropped. %s", netdev_get_name(netdev_),
                             ovs_strerror(ret));
                continue;
            }
         }

        iov[cnt].iov_base = dp_packet_data(packet);
        iov[cnt].iov_len = dp_packet_size(packet);
        mmsg[cnt].msg_hdr = (struct msghdr) { .msg_name = &sll,
                                              .msg_namelen = sizeof sll,
                                              .msg_iov = &iov[cnt],
                                              .msg_iovlen = 1 };
        cnt++;
    }

    /* sendmmsg() may transmit only a prefix of the messages; keep resending
     * from the first unsent message ('ofs') until done or a real error. */
    int error = 0;
    for (uint32_t ofs = 0; ofs < cnt;) {
        ssize_t retval;
        do {
            retval = sendmmsg(sock, mmsg + ofs, cnt - ofs, 0);
            error = retval < 0 ? errno : 0;
        } while (error == EINTR);
        if (error) {
            break;
        }
        ofs += retval;
    }

    free(mmsg);
    free(iov);
    return error;
}
1677
1678
/* Use the tap fd to send 'batch' to tap device 'netdev'.  Using the tap fd is
 * essential, because packets sent to a tap device with an AF_PACKET socket
 * will loop back to be *received* again on the tap device.  This doesn't occur
 * on other interface types because we attach a socket filter to the rx
 * socket.
 *
 * Returns 0 on success (dropped packets are counted, not reported as errors),
 * otherwise a positive errno value from write() or from prepending the vnet
 * header. */
static int
netdev_linux_tap_batch_send(struct netdev *netdev_, int mtu,
                            struct dp_packet_batch *batch)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct dp_packet *packet;

    /* The Linux tap driver returns EIO if the device is not up,
     * so if the device is not up, don't waste time sending it.
     * However, if the device is in another network namespace
     * then OVS can't retrieve the state. In that case, send the
     * packets anyway. */
    if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
        netdev->tx_dropped += dp_packet_batch_size(batch);
        return 0;
    }

    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        size_t size;
        ssize_t retval;
        int error;

        if (OVS_LIKELY(tap_supports_vnet_hdr)) {
            /* The tap fd was configured with IFF_VNET_HDR, so every packet
             * must carry a virtio-net header. */
            error = netdev_linux_prepend_vnet_hdr(packet, mtu);
            if (OVS_UNLIKELY(error)) {
                netdev->tx_dropped++;
                VLOG_WARN_RL(&rl, "%s: Prepend vnet hdr failed, packet "
                             "dropped. %s", netdev_get_name(netdev_),
                             ovs_strerror(error));
                continue;
            }
        }

        /* Size is read after the (possible) vnet header prepend so it
         * includes the header. */
        size = dp_packet_size(packet);
        do {
            retval = write(netdev->tap_fd, dp_packet_data(packet), size);
            error = retval < 0 ? errno : 0;
        } while (error == EINTR);

        if (error) {
            /* The Linux tap driver returns EIO if the device is not up.  From
             * the OVS side this is not an error, so we ignore it; otherwise,
             * return the error. */
            if (error != EIO) {
                return error;
            }
        } else if (retval != size) {
            /* Short write: the device accepted only part of the frame. */
            VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" "
                         "bytes of %"PRIuSIZE") on %s",
                         retval, size, netdev_get_name(netdev_));
            return EMSGSIZE;
        }
    }
    return 0;
}
1738
1739
static int
1740
netdev_linux_get_numa_id__(struct netdev_linux *netdev)
1741
    OVS_REQUIRES(netdev->mutex)
1742
0
{
1743
0
    char *numa_node_path;
1744
0
    const char *name;
1745
0
    int node_id;
1746
0
    FILE *stream;
1747
0
1748
0
    if (netdev->cache_valid & VALID_NUMA_ID) {
1749
0
        return netdev->numa_id;
1750
0
    }
1751
0
1752
0
    netdev->numa_id = 0;
1753
0
    netdev->cache_valid |= VALID_NUMA_ID;
1754
0
1755
0
    if (ovs_numa_get_n_numas() < 2) {
1756
0
        /* No need to check on system with a single NUMA node. */
1757
0
        return 0;
1758
0
    }
1759
0
1760
0
    name = netdev_get_name(&netdev->up);
1761
0
    if (strpbrk(name, "/\\")) {
1762
0
        VLOG_ERR_RL(&rl, "\"%s\" is not a valid name for a port. "
1763
0
                    "A valid name must not include '/' or '\\'."
1764
0
                    "Using numa_id 0", name);
1765
0
        return 0;
1766
0
    }
1767
0
1768
0
    numa_node_path = xasprintf("/sys/class/net/%s/device/numa_node", name);
1769
0
1770
0
    stream = fopen(numa_node_path, "r");
1771
0
    if (!stream) {
1772
0
        /* Virtual device does not have this info. */
1773
0
        VLOG_INFO_RL(&rl, "%s: Can't open '%s': %s, using numa_id 0",
1774
0
                     name, numa_node_path, ovs_strerror(errno));
1775
0
        free(numa_node_path);
1776
0
        return 0;
1777
0
    }
1778
0
1779
0
    if (fscanf(stream, "%d", &node_id) != 1
1780
0
        || !ovs_numa_numa_id_is_valid(node_id))  {
1781
0
        VLOG_WARN_RL(&rl, "%s: Can't detect NUMA node, using numa_id 0", name);
1782
0
        node_id = 0;
1783
0
    }
1784
0
1785
0
    netdev->numa_id = node_id;
1786
0
    fclose(stream);
1787
0
    free(numa_node_path);
1788
0
    return node_id;
1789
0
}
1790
1791
static int OVS_UNUSED
1792
netdev_linux_get_numa_id(const struct netdev *netdev_)
1793
0
{
1794
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1795
0
    int numa_id;
1796
0
1797
0
    ovs_mutex_lock(&netdev->mutex);
1798
0
    numa_id = netdev_linux_get_numa_id__(netdev);
1799
0
    ovs_mutex_unlock(&netdev->mutex);
1800
0
1801
0
    return numa_id;
1802
0
}
1803
1804
/* Sends 'batch' on 'netdev'.  Returns 0 if successful, otherwise a positive
 * errno value.  Returns EAGAIN without blocking if the packet cannot be queued
 * immediately.  Returns EMSGSIZE if a partial packet was transmitted or if
 * the packet is too big or too small to transmit on the device.
 *
 * The kernel maintains a packet transmission queue, so the caller is not
 * expected to do additional queuing of packets. */
static int
netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
                  struct dp_packet_batch *batch,
                  bool concurrent_txq OVS_UNUSED)
{
    bool tso = userspace_tso_enabled();
    int mtu = ETH_PAYLOAD_MAX;
    int error = 0;
    int sock = 0;

    if (tso) {
        /* With TSO the real device MTU is needed to build vnet headers. */
        netdev_linux_get_mtu__(netdev_linux_cast(netdev_), &mtu);
    }

    if (!is_tap_netdev(netdev_)) {
        /* Non-tap devices transmit through a shared AF_PACKET socket. */
        if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_))) {
            /* Cannot send into another network namespace. */
            error = EOPNOTSUPP;
            goto free_batch;
        }

        sock = af_packet_sock();
        if (sock < 0) {
            /* af_packet_sock() returns a negative errno on failure. */
            error = -sock;
            goto free_batch;
        }

        int ifindex = netdev_get_ifindex(netdev_);
        if (ifindex < 0) {
            /* netdev_get_ifindex() returns a negative errno on failure. */
            error = -ifindex;
            goto free_batch;
        }

        error = netdev_linux_sock_batch_send(netdev_, sock, ifindex, tso, mtu,
                                             batch);
    } else {
        /* Tap devices must be written through their fd; see
         * netdev_linux_tap_batch_send(). */
        error = netdev_linux_tap_batch_send(netdev_, mtu, batch);
    }
    if (error) {
        if (error == ENOBUFS) {
            /* The Linux AF_PACKET implementation never blocks waiting
             * for room for packets, instead returning ENOBUFS.
             * Translate this into EAGAIN for the caller. */
            error = EAGAIN;
        } else {
            VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
    }

free_batch:
    /* The batch is consumed whether transmission succeeded or not. */
    dp_packet_delete_batch(batch, true);
    return error;
}
1864
1865
/* Registers with the poll loop to wake up from the next call to poll_block()
1866
 * when the packet transmission queue has sufficient room to transmit a packet
1867
 * with netdev_send().
1868
 *
1869
 * The kernel maintains a packet transmission queue, so the client is not
1870
 * expected to do additional queuing of packets.  Thus, this function is
1871
 * unlikely to ever be used.  It is included for completeness. */
1872
static void
1873
netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1874
0
{
1875
0
    if (is_tap_netdev(netdev)) {
1876
        /* TAP device always accepts packets.*/
1877
0
        poll_immediate_wake();
1878
0
    }
1879
0
}
1880
1881
/* Attempts to set 'netdev''s MAC address to 'mac'.  Returns 0 if successful,
 * otherwise a positive errno value.
 *
 * Refuses to operate on a device in another network namespace (EOPNOTSUPP).
 * The cached address is used to short-circuit no-op changes.  Only success
 * and ENODEV update the cache, so transient failures are retried later. */
static int
netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    enum netdev_flags old_flags = 0;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    if (netdev->cache_valid & VALID_ETHERADDR) {
        error = netdev->ether_addr_error;
        if (error || eth_addr_equals(netdev->etheraddr, mac)) {
            /* Either the last attempt failed permanently or the address is
             * already what the caller wants. */
            goto exit;
        }
        netdev->cache_valid &= ~VALID_ETHERADDR;
    }

    /* Tap devices must be brought down before setting the address. */
    if (is_tap_netdev(netdev_)) {
        update_flags(netdev, NETDEV_UP, 0, &old_flags);
    }
    error = set_etheraddr(netdev_get_name(netdev_), mac);
    if (!error || error == ENODEV) {
        netdev->ether_addr_error = error;
        netdev->cache_valid |= VALID_ETHERADDR;
        if (!error) {
            netdev->etheraddr = mac;
        }
    }

    /* Restore a tap device that was up before we brought it down above. */
    if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
        update_flags(netdev, 0, NETDEV_UP, &old_flags);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1925
1926
/* Copies 'netdev''s MAC address to 'mac' which is passed as param. */
1927
static int
1928
netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
1929
0
{
1930
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1931
0
    int error;
1932
1933
0
    ovs_mutex_lock(&netdev->mutex);
1934
0
    if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1935
0
        netdev_linux_update_via_netlink(netdev);
1936
0
    }
1937
1938
0
    if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1939
        /* Fall back to ioctl if netlink fails */
1940
0
        netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
1941
0
                                                 &netdev->etheraddr);
1942
0
        netdev->cache_valid |= VALID_ETHERADDR;
1943
0
    }
1944
1945
0
    error = netdev->ether_addr_error;
1946
0
    if (!error) {
1947
0
        *mac = netdev->etheraddr;
1948
0
    }
1949
0
    ovs_mutex_unlock(&netdev->mutex);
1950
1951
0
    return error;
1952
0
}
1953
1954
/* Retrieves the MTU of 'netdev' into '*mtup', caching the result.  Tries a
 * netlink refresh first and falls back to the SIOCGIFMTU ioctl if netlink
 * does not populate the cache.  Returns 0 on success, otherwise a positive
 * errno value.  Caller holds netdev->mutex. */
static int
netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
{
    int error;

    if (!(netdev->cache_valid & VALID_MTU)) {
        netdev_linux_update_via_netlink(netdev);
    }

    if (!(netdev->cache_valid & VALID_MTU)) {
        /* Fall back to ioctl if netlink fails */
        struct ifreq ifr;

        memset(&ifr, 0, sizeof ifr);
        netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
            netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
        netdev->mtu = ifr.ifr_mtu;
        netdev->cache_valid |= VALID_MTU;
    }

    error = netdev->netdev_mtu_error;
    if (!error) {
        *mtup = netdev->mtu;
    }

    return error;
}
1981
1982
/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1983
 * in bytes, not including the hardware header; thus, this is typically 1500
1984
 * bytes for Ethernet devices. */
1985
static int
1986
netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1987
0
{
1988
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1989
0
    int error;
1990
1991
0
    ovs_mutex_lock(&netdev->mutex);
1992
0
    error = netdev_linux_get_mtu__(netdev, mtup);
1993
0
    ovs_mutex_unlock(&netdev->mutex);
1994
1995
0
    return error;
1996
0
}
1997
1998
/* Sets the maximum size of transmitted (MTU) for given device using linux
 * networking ioctl interface.
 *
 * Returns 0 on success, otherwise a positive errno value.  Refuses to
 * operate on devices in another network namespace (EOPNOTSUPP).  The cached
 * MTU is used to skip redundant ioctls; only success and ENODEV are cached. */
static int
netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ifreq ifr;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

#ifdef HAVE_AF_XDP
    /* AF_XDP imposes its own upper bound on the MTU. */
    if (netdev_get_class(netdev_) == &netdev_afxdp_class) {
        error = netdev_afxdp_verify_mtu_size(netdev_, mtu);
        if (error) {
            goto exit;
        }
    }
#endif

    if (netdev->cache_valid & VALID_MTU) {
        error = netdev->netdev_mtu_error;
        if (error || netdev->mtu == mtu) {
            /* Previous failure is sticky, or nothing to change. */
            goto exit;
        }
        netdev->cache_valid &= ~VALID_MTU;
    }

    memset(&ifr, 0, sizeof ifr);
    ifr.ifr_mtu = mtu;

    error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
                                SIOCSIFMTU, "SIOCSIFMTU");
    if (!error || error == ENODEV) {
        netdev->netdev_mtu_error = error;
        netdev->mtu = ifr.ifr_mtu;
        netdev->cache_valid |= VALID_MTU;
    }
exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2045
2046
/* Returns the ifindex of 'netdev', if successful, as a positive number.
2047
 * On failure, returns a negative errno value. */
2048
static int
2049
netdev_linux_get_ifindex(const struct netdev *netdev_)
2050
0
{
2051
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2052
0
    int ifindex, error;
2053
2054
0
    ovs_mutex_lock(&netdev->mutex);
2055
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
2056
0
        error = EOPNOTSUPP;
2057
0
        goto exit;
2058
0
    }
2059
0
    error = get_ifindex(netdev_, &ifindex);
2060
2061
0
exit:
2062
0
    ovs_mutex_unlock(&netdev->mutex);
2063
0
    return error ? -error : ifindex;
2064
0
}
2065
2066
static int
2067
netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
2068
0
{
2069
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2070
2071
0
    ovs_mutex_lock(&netdev->mutex);
2072
0
    if (netdev->miimon_interval > 0) {
2073
0
        *carrier = netdev->miimon;
2074
0
    } else {
2075
0
        *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
2076
0
    }
2077
0
    ovs_mutex_unlock(&netdev->mutex);
2078
2079
0
    return 0;
2080
0
}
2081
2082
static long long int
2083
netdev_linux_get_carrier_resets(const struct netdev *netdev_)
2084
0
{
2085
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2086
0
    long long int carrier_resets;
2087
2088
0
    ovs_mutex_lock(&netdev->mutex);
2089
0
    carrier_resets = netdev->carrier_resets;
2090
0
    ovs_mutex_unlock(&netdev->mutex);
2091
2092
0
    return carrier_resets;
2093
0
}
2094
2095
/* Issues MII ioctl 'cmd' (with 'cmd_name' used for logging) against device
 * 'name', copying '*data' in before the call and the kernel's result back
 * out afterwards.  Returns 0 on success, otherwise a positive errno value. */
static int
netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
                       struct mii_ioctl_data *data)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    /* The mii_ioctl_data struct is stored inline in the ifreq union (at the
     * position of the ifr_data member), not pointed to by ifr_data. */
    memcpy(&ifr.ifr_data, data, sizeof *data);
    error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
    memcpy(data, &ifr.ifr_data, sizeof *data);

    return error;
}
2109
2110
/* Queries link status for device 'name' into '*miimon'.  First tries the MII
 * registers (SIOCGMIIPHY + SIOCGMIIREG reading BMSR); if that fails, falls
 * back to ethtool ETHTOOL_GLINK.  Returns 0 on success, otherwise a positive
 * errno value; '*miimon' defaults to false. */
static int
netdev_linux_get_miimon(const char *name, bool *miimon)
{
    struct mii_ioctl_data data;
    int error;

    *miimon = false;

    memset(&data, 0, sizeof data);
    error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
    if (!error) {
        /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
        data.reg_num = MII_BMSR;
        error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
                                       &data);

        if (!error) {
            /* BMSR_LSTATUS bit indicates link up. */
            *miimon = !!(data.val_out & BMSR_LSTATUS);
        }
    }
    if (error) {
        struct ethtool_cmd ecmd;

        VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
                    name);

        COVERAGE_INC(netdev_get_ethtool);
        memset(&ecmd, 0, sizeof ecmd);
        error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
                                        "ETHTOOL_GLINK");
        if (!error) {
            struct ethtool_value eval;

            /* ETHTOOL_GLINK replies with an ethtool_value overlaid on the
             * same buffer. */
            memcpy(&eval, &ecmd, sizeof eval);
            *miimon = !!eval.data;
        } else {
            VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
        }
    }

    return error;
}
2152
2153
static int
2154
netdev_linux_set_miimon_interval(struct netdev *netdev_,
2155
                                 long long int interval)
2156
0
{
2157
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2158
2159
0
    ovs_mutex_lock(&netdev->mutex);
2160
0
    interval = interval > 0 ? MAX(interval, 100) : 0;
2161
0
    if (netdev->miimon_interval != interval) {
2162
0
        if (interval && !netdev->miimon_interval) {
2163
0
            atomic_count_inc(&miimon_cnt);
2164
0
        } else if (!interval && netdev->miimon_interval) {
2165
0
            atomic_count_dec(&miimon_cnt);
2166
0
        }
2167
2168
0
        netdev->miimon_interval = interval;
2169
0
        timer_set_expired(&netdev->miimon_timer);
2170
0
    }
2171
0
    ovs_mutex_unlock(&netdev->mutex);
2172
2173
0
    return 0;
2174
0
}
2175
2176
/* Polls MII link status for every netdev-linux device whose miimon timer has
 * expired, firing a change notification when the link state flips, and
 * re-arms each timer with the device's configured interval. */
static void
netdev_linux_miimon_run(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);
        bool miimon;

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
            /* On query failure 'miimon' is left false by
             * netdev_linux_get_miimon(). */
            netdev_linux_get_miimon(dev->up.name, &miimon);
            if (miimon != dev->miimon) {
                dev->miimon = miimon;
                netdev_linux_changed(dev, dev->ifi_flags, 0);
            }

            timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* netdev_get_devices() took a reference; drop it. */
        netdev_close(netdev);
    }

    shash_destroy(&device_shash);
}
2205
2206
static void
2207
netdev_linux_miimon_wait(void)
2208
0
{
2209
0
    struct shash device_shash;
2210
0
    struct shash_node *node;
2211
2212
0
    shash_init(&device_shash);
2213
0
    netdev_get_devices(&netdev_linux_class, &device_shash);
2214
0
    SHASH_FOR_EACH (node, &device_shash) {
2215
0
        struct netdev *netdev = node->data;
2216
0
        struct netdev_linux *dev = netdev_linux_cast(netdev);
2217
2218
0
        ovs_mutex_lock(&dev->mutex);
2219
0
        if (dev->miimon_interval > 0) {
2220
0
            timer_wait(&dev->miimon_timer);
2221
0
        }
2222
0
        ovs_mutex_unlock(&dev->mutex);
2223
0
        netdev_close(netdev);
2224
0
    }
2225
0
    shash_destroy(&device_shash);
2226
0
}
2227
2228
/* Exchanges the values stored at '*a' and '*b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t saved = *b;

    *b = *a;
    *a = saved;
}
2235
2236
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * 'src' is allowed to be misaligned.
 *
 * Only the counters the vport layer actually tracks are copied; the
 * remaining error counters are explicitly zeroed (not left untouched) so
 * 'dst' is fully defined afterwards. */
static void
netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
                                  const struct dpif_netlink_vport *vport)
{
    /* get_32aligned_u64() handles the potentially misaligned 64-bit reads. */
    dst->rx_packets = get_32aligned_u64(&vport->stats->rx_packets);
    dst->tx_packets = get_32aligned_u64(&vport->stats->tx_packets);
    dst->rx_bytes = get_32aligned_u64(&vport->stats->rx_bytes);
    dst->tx_bytes = get_32aligned_u64(&vport->stats->tx_bytes);
    dst->rx_errors = get_32aligned_u64(&vport->stats->rx_errors);
    dst->tx_errors = get_32aligned_u64(&vport->stats->tx_errors);
    dst->rx_dropped = get_32aligned_u64(&vport->stats->rx_dropped);
    dst->tx_dropped = get_32aligned_u64(&vport->stats->tx_dropped);
    dst->multicast = 0;
    dst->collisions = 0;
    dst->rx_length_errors = 0;
    dst->rx_over_errors = 0;
    dst->rx_crc_errors = 0;
    dst->rx_frame_errors = 0;
    dst->rx_fifo_errors = 0;
    dst->rx_missed_errors = 0;
    dst->tx_aborted_errors = 0;
    dst->tx_carrier_errors = 0;
    dst->tx_fifo_errors = 0;
    dst->tx_heartbeat_errors = 0;
    dst->tx_window_errors = 0;
    dst->upcall_packets = vport->upcall_success;
    dst->upcall_errors = vport->upcall_fail;
}
2267
2268
static int
2269
get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
2270
0
{
2271
0
    struct dpif_netlink_vport reply;
2272
0
    struct ofpbuf *buf;
2273
0
    int error;
2274
2275
0
    error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
2276
0
    if (error) {
2277
0
        return error;
2278
0
    } else if (!reply.stats) {
2279
0
        ofpbuf_delete(buf);
2280
0
        return EOPNOTSUPP;
2281
0
    }
2282
2283
0
    netdev_stats_from_ovs_vport_stats(stats, &reply);
2284
2285
0
    ofpbuf_delete(buf);
2286
2287
0
    return 0;
2288
0
}
2289
2290
/* Attempts to fill 'stats' from the OVS datapath vport layer.  The outcome is
 * cached in netdev->vport_stats_error: once a query has failed, it is not
 * retried until the cache is invalidated.  ENOENT/ENODEV are expected for
 * devices that are not OVS vports and are not logged. */
static void
get_stats_via_vport(const struct netdev *netdev_,
                    struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    if (!netdev->vport_stats_error ||
        !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
        int error;

        error = get_stats_via_vport__(netdev_, stats);
        if (error && error != ENOENT && error != ENODEV) {
            VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
                         "(%s)",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
        netdev->vport_stats_error = error;
        netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
    }
}
2310
2311
/* Retrieves current device stats for 'netdev-linux'.
 *
 * Combines two sources: OVS vport-layer stats (the base counters) and kernel
 * netlink stats (used alone as a fallback, or merged in for the error
 * counters the vport layer does not track).  Returns 0 when at least one
 * source succeeded, otherwise the netlink error. */
static int
netdev_linux_get_stats(const struct netdev *netdev_,
                       struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; the vport stats (if any) are already in 'stats'. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* stats not available from OVS then use netdev stats. */
        *stats = dev_stats;
    } else {
        /* Both sources succeeded: keep the vport base counters and add the
         * error/detail counters only the kernel tracks. */
        stats->multicast           += dev_stats.multicast;
        stats->collisions          += dev_stats.collisions;
        stats->rx_length_errors    += dev_stats.rx_length_errors;
        stats->rx_over_errors      += dev_stats.rx_over_errors;
        stats->rx_crc_errors       += dev_stats.rx_crc_errors;
        stats->rx_frame_errors     += dev_stats.rx_frame_errors;
        stats->rx_fifo_errors      += dev_stats.rx_fifo_errors;
        stats->rx_missed_errors    += dev_stats.rx_missed_errors;
        stats->tx_aborted_errors   += dev_stats.tx_aborted_errors;
        stats->tx_carrier_errors   += dev_stats.tx_carrier_errors;
        stats->tx_fifo_errors      += dev_stats.tx_fifo_errors;
        stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
        stats->tx_window_errors    += dev_stats.tx_window_errors;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2349
2350
/* Retrieves current device stats for 'netdev-tap' netdev or
 * netdev-internal.
 *
 * Like netdev_linux_get_stats() but with rx/tx directions swapped where the
 * kernel counters are used, since for tap/internal ports the host's transmit
 * direction is the switch's receive direction.  Locally counted tx/rx drops
 * are added at the end.  Returns 0 when at least one stats source succeeded. */
static int
netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink failed; fall back to vport stats if those succeeded. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* Transmit and receive stats will appear to be swapped relative to the
         * other ports since we are the one sending the data, not a remote
         * computer.  For consistency, we swap them back here. This does not
         * apply if we are getting stats from the vport layer because it always
         * tracks stats from the perspective of the switch. */

        *stats = dev_stats;
        swap_uint64(&stats->rx_packets, &stats->tx_packets);
        swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
        swap_uint64(&stats->rx_errors, &stats->tx_errors);
        swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
        stats->rx_length_errors = 0;
        stats->rx_over_errors = 0;
        stats->rx_crc_errors = 0;
        stats->rx_frame_errors = 0;
        stats->rx_fifo_errors = 0;
        stats->rx_missed_errors = 0;
        stats->tx_aborted_errors = 0;
        stats->tx_carrier_errors = 0;
        stats->tx_fifo_errors = 0;
        stats->tx_heartbeat_errors = 0;
        stats->tx_window_errors = 0;
    } else {
        /* Use kernel netdev's packet and byte counts since vport counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO
         * are enabled. */
        stats->rx_packets = dev_stats.tx_packets;
        stats->rx_bytes = dev_stats.tx_bytes;
        stats->tx_packets = dev_stats.rx_packets;
        stats->tx_bytes = dev_stats.rx_bytes;

        stats->rx_dropped          += dev_stats.tx_dropped;
        stats->tx_dropped          += dev_stats.rx_dropped;

        stats->rx_errors           += dev_stats.tx_errors;
        stats->tx_errors           += dev_stats.rx_errors;

        stats->multicast           += dev_stats.multicast;
        stats->collisions          += dev_stats.collisions;
    }
    /* Include drops counted by this module itself (e.g. vnet-hdr failures,
     * device-down drops). */
    stats->tx_dropped += netdev->tx_dropped;
    stats->rx_dropped += netdev->rx_dropped;
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2413
2414
static int
2415
netdev_internal_get_stats(const struct netdev *netdev_,
2416
                          struct netdev_stats *stats)
2417
0
{
2418
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2419
0
    int error;
2420
2421
0
    ovs_mutex_lock(&netdev->mutex);
2422
0
    get_stats_via_vport(netdev_, stats);
2423
0
    error = netdev->vport_stats_error;
2424
0
    ovs_mutex_unlock(&netdev->mutex);
2425
2426
0
    return error;
2427
0
}
2428
2429
/* Queries how many strings the device's ETH_SS_FEATURES ethtool string set
 * contains, storing the count in '*len'.  Returns 0 on success, a positive
 * errno value from the ethtool call, or -EOPNOTSUPP when the kernel does not
 * report ETH_SS_FEATURES.
 *
 * NOTE(review): the -EOPNOTSUPP return is negative, unlike the positive-errno
 * convention used elsewhere in this file; the only caller just tests for
 * non-zero, so this works, but confirm the sign before relying on it. */
static int
netdev_linux_read_stringset_info(struct netdev_linux *netdev, uint32_t *len)
{
    /* ETHTOOL_GSSET_INFO has a variable-length reply: a header followed by
     * one u32 count per bit set in sset_mask.  The anonymous struct overlays
     * space for exactly one count after the two 64-bit header words. */
    union {
        struct ethtool_cmd ecmd;
        struct ethtool_sset_info hdr;
        struct {
            uint64_t pad[2];
            uint32_t sset_len[1];
        };
    } sset_info;
    int error;

    sset_info.hdr.cmd = ETHTOOL_GSSET_INFO;
    sset_info.hdr.reserved = 0;
    sset_info.hdr.sset_mask = 1ULL << ETH_SS_FEATURES;

    error = netdev_linux_do_ethtool(netdev_get_name(&netdev->up),
                                    (struct ethtool_cmd *) &sset_info,
                                    ETHTOOL_GSSET_INFO, "ETHTOOL_GSSET_INFO");
    if (error) {
        return error;
    }
    if (sset_info.hdr.sset_mask & (1ULL << ETH_SS_FEATURES)) {
        *len = sset_info.sset_len[0];
        return 0;
    } else {
        /* ETH_SS_FEATURES is not supported. */
        return -EOPNOTSUPP;
    }
}
2460
2461
2462
/* Reads the device's ETH_SS_FEATURES string set via ETHTOOL_GSTRINGS,
 * returning a freshly allocated ethtool_gstrings in '*pstrings' (caller
 * frees).  On failure '*pstrings' is set to NULL and an error is returned
 * (positive errno from ethtool, or -EOPNOTSUPP when the set is empty or
 * unsupported — see netdev_linux_read_stringset_info()). */
static int
netdev_linux_read_definitions(struct netdev_linux *netdev,
                              struct ethtool_gstrings **pstrings)
{
    struct ethtool_gstrings *strings = NULL;
    uint32_t len = 0;
    int error = 0;

    error = netdev_linux_read_stringset_info(netdev, &len);
    if (error) {
        return error;
    } else if (!len) {
        return -EOPNOTSUPP;
    }

    /* Flexible trailing buffer: one fixed-width slot per string. */
    strings = xzalloc(sizeof *strings + len * ETH_GSTRING_LEN);

    strings->cmd = ETHTOOL_GSTRINGS;
    strings->string_set = ETH_SS_FEATURES;
    strings->len = len;
    error = netdev_linux_do_ethtool(netdev_get_name(&netdev->up),
                                    (struct ethtool_cmd *) strings,
                                    ETHTOOL_GSTRINGS, "ETHTOOL_GSTRINGS");
    if (error) {
        goto out;
    }

    /* Force NUL termination of each slot; the kernel does not guarantee it. */
    for (int i = 0; i < len; i++) {
        strings->data[(i + 1) * ETH_GSTRING_LEN - 1] = 0;
    }

    *pstrings = strings;

    return 0;
out:
    *pstrings = NULL;
    free(strings);
    return error;
}
2501
2502
/* Probes 'netdev_''s active ethtool offload features (checksum offload and
 * TCP segmentation) and records the corresponding NETDEV_TX_OFFLOAD_* bits
 * in netdev->up.ol_flags.  Silently leaves the flags untouched when the
 * feature names cannot be read; clears and rebuilds them otherwise. */
static void
netdev_linux_set_ol(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ethtool_gfeatures *features = NULL;
    struct ethtool_gstrings *names = NULL;
    int error;

    COVERAGE_INC(netdev_get_ethtool);

    /* Feature bit positions are defined by the device's string set. */
    error = netdev_linux_read_definitions(netdev, &names);
    if (error) {
        return;
    }

    /* One 32-bit feature block per 32 named features. */
    features = xzalloc(sizeof *features +
                       DIV_ROUND_UP(names->len, 32) *
                       sizeof features->features[0]);

    features->cmd = ETHTOOL_GFEATURES;
    features->size = DIV_ROUND_UP(names->len, 32);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_),
                                    (struct ethtool_cmd *) features,
                                    ETHTOOL_GFEATURES, "ETHTOOL_GFEATURES");

    if (error) {
        goto out;
    }

/* Helpers to test feature bit 'index' inside the array of 32-bit blocks. */
#define FEATURE_WORD(blocks, index, field)  ((blocks)[(index) / 32U].field)
#define FEATURE_FIELD_FLAG(index)       (1U << (index) % 32U)
#define FEATURE_BIT_IS_SET(blocks, index, field)        \
    (FEATURE_WORD(blocks, index, field) & FEATURE_FIELD_FLAG(index))

    netdev->up.ol_flags = 0;
    /* Map of ethtool feature-string names to OVS tx-offload flags. */
    static const struct {
        char *string;
        uint32_t value;
    } t_list[] = {
        {"tx-checksum-ipv4", NETDEV_TX_OFFLOAD_TCP_CKSUM |
                             NETDEV_TX_OFFLOAD_UDP_CKSUM},
        {"tx-checksum-ipv6", NETDEV_TX_OFFLOAD_TCP_CKSUM |
                             NETDEV_TX_OFFLOAD_UDP_CKSUM},
        {"tx-checksum-ip-generic", NETDEV_TX_OFFLOAD_TCP_CKSUM |
                                   NETDEV_TX_OFFLOAD_UDP_CKSUM},
        {"tx-checksum-sctp", NETDEV_TX_OFFLOAD_SCTP_CKSUM},
        {"tx-tcp-segmentation", NETDEV_TX_OFFLOAD_TCP_TSO},
    };

    for (int j = 0; j < ARRAY_SIZE(t_list); j++) {
        for (int i = 0; i < names->len; i++) {
            char *name = (char *) names->data + i * ETH_GSTRING_LEN;
            if (strcmp(t_list[j].string, name) == 0) {
                if (FEATURE_BIT_IS_SET(features->features, i, active)) {
                    netdev_->ol_flags |= t_list[j].value;
                }
                break;
            }
        }
    }

out:
    free(names);
    free(features);
}
2567
2568
/* Queries the device's link settings via ETHTOOL_GSET and caches the
 * supported/advertised/current feature bitmaps in 'netdev'.  The result
 * (including any error) is cached under VALID_FEATURES; callers read
 * netdev->get_features_error to learn the outcome. */
static void
netdev_linux_read_features(struct netdev_linux *netdev)
{
    struct ethtool_cmd ecmd;
    int error;

    /* Already cached; nothing to do. */
    if (netdev->cache_valid & VALID_FEATURES) {
        return;
    }

    COVERAGE_INC(netdev_get_ethtool);
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto out;
    }

    /* Supported features. */
    netdev->supported = 0;
    if (ecmd.supported & SUPPORTED_10baseT_Half) {
        netdev->supported |= NETDEV_F_10MB_HD;
    }
    if (ecmd.supported & SUPPORTED_10baseT_Full) {
        netdev->supported |= NETDEV_F_10MB_FD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Half)  {
        netdev->supported |= NETDEV_F_100MB_HD;
    }
    if (ecmd.supported & SUPPORTED_100baseT_Full) {
        netdev->supported |= NETDEV_F_100MB_FD;
    }
    if (ecmd.supported & SUPPORTED_1000baseT_Half) {
        netdev->supported |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
        (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
        netdev->supported |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
        (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
        (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
        netdev->supported |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
        (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
        netdev->supported |= NETDEV_F_40GB_FD;
    }
    if (ecmd.supported & SUPPORTED_TP) {
        netdev->supported |= NETDEV_F_COPPER;
    }
    if (ecmd.supported & SUPPORTED_FIBRE) {
        netdev->supported |= NETDEV_F_FIBER;
    }
    if (ecmd.supported & SUPPORTED_Autoneg) {
        netdev->supported |= NETDEV_F_AUTONEG;
    }
    if (ecmd.supported & SUPPORTED_Pause) {
        netdev->supported |= NETDEV_F_PAUSE;
    }
    if (ecmd.supported & SUPPORTED_Asym_Pause) {
        netdev->supported |= NETDEV_F_PAUSE_ASYM;
    }

    /* Advertised features. */
    netdev->advertised = 0;
    if (ecmd.advertising & ADVERTISED_10baseT_Half) {
        netdev->advertised |= NETDEV_F_10MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_10baseT_Full) {
        netdev->advertised |= NETDEV_F_10MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Half) {
        netdev->advertised |= NETDEV_F_100MB_HD;
    }
    if (ecmd.advertising & ADVERTISED_100baseT_Full) {
        netdev->advertised |= NETDEV_F_100MB_FD;
    }
    if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
        netdev->advertised |= NETDEV_F_1GB_HD;
    }
    if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
        netdev->advertised |= NETDEV_F_1GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
        (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
        netdev->advertised |= NETDEV_F_10GB_FD;
    }
    if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
        (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
        netdev->advertised |= NETDEV_F_40GB_FD;
    }
    if (ecmd.advertising & ADVERTISED_TP) {
        netdev->advertised |= NETDEV_F_COPPER;
    }
    if (ecmd.advertising & ADVERTISED_FIBRE) {
        netdev->advertised |= NETDEV_F_FIBER;
    }
    if (ecmd.advertising & ADVERTISED_Autoneg) {
        netdev->advertised |= NETDEV_F_AUTONEG;
    }
    if (ecmd.advertising & ADVERTISED_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE;
    }
    if (ecmd.advertising & ADVERTISED_Asym_Pause) {
        netdev->advertised |= NETDEV_F_PAUSE_ASYM;
    }

    /* Current settings. */
    netdev->current_speed = ethtool_cmd_speed(&ecmd);
    if (netdev->current_speed == SPEED_10) {
        netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
    } else if (netdev->current_speed == SPEED_100) {
        netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
    } else if (netdev->current_speed == SPEED_1000) {
        netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
    } else if (netdev->current_speed == SPEED_10000) {
        netdev->current = NETDEV_F_10GB_FD;
    } else if (netdev->current_speed == SPEED_40000) {
        netdev->current = NETDEV_F_40GB_FD;
    } else if (netdev->current_speed == SPEED_100000) {
        netdev->current = NETDEV_F_100GB_FD;
    } else if (netdev->current_speed == 1000000) {
        /* 1,000,000 Mb/s, i.e. 1 Tb/s; presumably a literal because no
         * SPEED_1000000 constant is available -- verify against headers. */
        netdev->current = NETDEV_F_1TB_FD;
    } else if (netdev->current_speed
               && netdev->current_speed != SPEED_UNKNOWN) {
        netdev->current = NETDEV_F_OTHER;
    } else {
        netdev->current = 0;
    }
    netdev->current_duplex = ecmd.duplex;

    if (ecmd.port == PORT_TP) {
        netdev->current |= NETDEV_F_COPPER;
    } else if (ecmd.port == PORT_FIBRE) {
        netdev->current |= NETDEV_F_FIBER;
    }

    if (ecmd.autoneg) {
        netdev->current |= NETDEV_F_AUTONEG;
    }

out:
    /* Cache both success and failure; the error is replayed to callers. */
    netdev->cache_valid |= VALID_FEATURES;
    netdev->get_features_error = error;
}
2722
2723
/* Stores the features supported by 'netdev' into of '*current', '*advertised',
2724
 * '*supported', and '*peer'.  Each value is a bitmap of NETDEV_* bits.
2725
 * Returns 0 if successful, otherwise a positive errno value. */
2726
static int
2727
netdev_linux_get_features(const struct netdev *netdev_,
2728
                          enum netdev_features *current,
2729
                          enum netdev_features *advertised,
2730
                          enum netdev_features *supported,
2731
                          enum netdev_features *peer)
2732
0
{
2733
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2734
0
    int error;
2735
2736
0
    ovs_mutex_lock(&netdev->mutex);
2737
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
2738
0
        error = EOPNOTSUPP;
2739
0
        goto exit;
2740
0
    }
2741
2742
0
    netdev_linux_read_features(netdev);
2743
0
    if (!netdev->get_features_error) {
2744
0
        *current = netdev->current;
2745
0
        *advertised = netdev->advertised;
2746
0
        *supported = netdev->supported;
2747
0
        *peer = 0;              /* XXX */
2748
0
    }
2749
0
    error = netdev->get_features_error;
2750
2751
0
exit:
2752
0
    ovs_mutex_unlock(&netdev->mutex);
2753
0
    return error;
2754
0
}
2755
2756
/* Reports the current link speed and the maximum supported speed of
 * 'netdev', both in Mbps, into '*current' and '*max'.  On failure both are
 * zeroed and a positive errno value is returned.  Caller must hold
 * netdev->mutex. */
static int
netdev_linux_get_speed_locked(struct netdev_linux *netdev,
                              uint32_t *current, uint32_t *max)
{
    if (netdev_linux_netnsid_is_remote(netdev)) {
        /* Cannot run ethtool against a device in another netns. */
        *current = *max = 0;
        return EOPNOTSUPP;
    }

    netdev_linux_read_features(netdev);
    if (!netdev->get_features_error) {
        /* SPEED_UNKNOWN is mapped to 0 rather than leaked to callers. */
        *current = netdev->current_speed == SPEED_UNKNOWN
                   ? 0 : netdev->current_speed;
        /* Max speed is derived from the supported-features bitmap, converted
         * from bps to Mbps and clamped to 32 bits. */
        *max = MIN(UINT32_MAX,
                   netdev_features_to_bps(netdev->supported, 0) / 1000000ULL);
    } else {
        *current = *max = 0;
    }
    return netdev->get_features_error;
}
2776
2777
static int
2778
netdev_linux_get_speed(const struct netdev *netdev_, uint32_t *current,
2779
                       uint32_t *max)
2780
0
{
2781
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2782
0
    int error;
2783
2784
0
    ovs_mutex_lock(&netdev->mutex);
2785
0
    error = netdev_linux_get_speed_locked(netdev, current, max);
2786
0
    ovs_mutex_unlock(&netdev->mutex);
2787
0
    return error;
2788
0
}
2789
2790
/* Stores true in '*full_duplex' if 'netdev_' is running full duplex, false
 * for half duplex.  Returns EOPNOTSUPP when the duplex mode is unknown or
 * the device is in a remote network namespace, otherwise the cached ethtool
 * error (0 on success). */
static int
netdev_linux_get_duplex(const struct netdev *netdev_, bool *full_duplex)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int err;

    ovs_mutex_lock(&netdev->mutex);

    if (netdev_linux_netnsid_is_remote(netdev)) {
        err = EOPNOTSUPP;
        goto exit;
    }

    netdev_linux_read_features(netdev);
    err = netdev->get_features_error;
    if (!err && netdev->current_duplex == DUPLEX_UNKNOWN) {
        /* ethtool succeeded but could not determine the duplex mode. */
        err = EOPNOTSUPP;
        goto exit;
    }
    *full_duplex = netdev->current_duplex == DUPLEX_FULL;

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return err;
}
2815
2816
/* Set the features advertised by 'netdev' to 'advertise'.  Reads the current
 * link settings with ETHTOOL_GSET, rewrites only the advertising mask from
 * the NETDEV_F_* bits in 'advertise', and writes them back with
 * ETHTOOL_SSET.  Returns 0 on success or a positive errno value. */
static int
netdev_linux_set_advertisements(struct netdev *netdev_,
                                enum netdev_features advertise)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ethtool_cmd ecmd;
    int error;

    ovs_mutex_lock(&netdev->mutex);

    COVERAGE_INC(netdev_get_ethtool);

    if (netdev_linux_netnsid_is_remote(netdev)) {
        /* ethtool cannot reach a device in another netns. */
        error = EOPNOTSUPP;
        goto exit;
    }

    /* Start from the device's current settings so that everything except
     * the advertising mask is preserved by the SSET below. */
    memset(&ecmd, 0, sizeof ecmd);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
    if (error) {
        goto exit;
    }

    /* Translate NETDEV_F_* bits to the kernel's ADVERTISED_* bits. */
    ecmd.advertising = 0;
    if (advertise & NETDEV_F_10MB_HD) {
        ecmd.advertising |= ADVERTISED_10baseT_Half;
    }
    if (advertise & NETDEV_F_10MB_FD) {
        ecmd.advertising |= ADVERTISED_10baseT_Full;
    }
    if (advertise & NETDEV_F_100MB_HD) {
        ecmd.advertising |= ADVERTISED_100baseT_Half;
    }
    if (advertise & NETDEV_F_100MB_FD) {
        ecmd.advertising |= ADVERTISED_100baseT_Full;
    }
    if (advertise & NETDEV_F_1GB_HD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Half;
    }
    if (advertise & NETDEV_F_1GB_FD) {
        ecmd.advertising |= ADVERTISED_1000baseT_Full;
    }
    if (advertise & NETDEV_F_10GB_FD) {
        ecmd.advertising |= ADVERTISED_10000baseT_Full;
    }
    if (advertise & NETDEV_F_COPPER) {
        ecmd.advertising |= ADVERTISED_TP;
    }
    if (advertise & NETDEV_F_FIBER) {
        ecmd.advertising |= ADVERTISED_FIBRE;
    }
    if (advertise & NETDEV_F_AUTONEG) {
        ecmd.advertising |= ADVERTISED_Autoneg;
    }
    if (advertise & NETDEV_F_PAUSE) {
        ecmd.advertising |= ADVERTISED_Pause;
    }
    if (advertise & NETDEV_F_PAUSE_ASYM) {
        ecmd.advertising |= ADVERTISED_Asym_Pause;
    }
    COVERAGE_INC(netdev_set_ethtool);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
                                    ETHTOOL_SSET, "ETHTOOL_SSET");

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2886
2887
/* Opens the netlink nesting for a "police" TC action at priority 'prio' in
 * 'request'.  '*act_offset' receives the action-nest offset and '*offset'
 * the TCA_ACT_OPTIONS nest offset; both must later be closed with
 * nl_msg_act_police_end_nest(). */
static void
nl_msg_act_police_start_nest(struct ofpbuf *request, uint32_t prio,
                             size_t *offset, size_t *act_offset,
                             bool single_action)
{
    *act_offset = nl_msg_start_nested(request, prio);
    nl_msg_put_string(request, TCA_ACT_KIND, "police");

    /* If police action is added independently from filter, we need to
     * add action flag according to tc-policy. */
    if (single_action) {
        nl_msg_put_act_tc_policy_flag(request);
    }
    *offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
}
2902
2903
/* Closes the nests opened by nl_msg_act_police_start_nest(), first recording
 * 'notexceed_act' as the action to take for conforming packets.  Nests must
 * be closed innermost-first (options, then action). */
static void
nl_msg_act_police_end_nest(struct ofpbuf *request, size_t offset,
                           size_t act_offset, uint32_t notexceed_act)
{
    nl_msg_put_u32(request, TCA_POLICE_RESULT, notexceed_act);
    nl_msg_end_nested(request, offset);
    nl_msg_end_nested(request, act_offset);
}
2911
2912
/* Appends a TC "police" action to 'request' limiting traffic to 'kbits_rate'
 * kbits/s (burst 'kbits_burst' kbits) and/or 'pkts_rate' packets/s (burst
 * 'pkts_burst' packets).  'notexceed_act' is the verdict for conforming
 * packets.  Does nothing if both rates are zero. */
static void
nl_msg_put_act_police(struct ofpbuf *request, uint32_t index,
                      uint64_t kbits_rate, uint64_t kbits_burst,
                      uint64_t pkts_rate, uint64_t pkts_burst,
                      uint32_t notexceed_act, bool single_action)
{
    /* kbits/s -> bytes/s. */
    uint64_t bytes_rate = kbits_rate / 8 * 1000;
    size_t offset, act_offset;
    struct tc_police police;
    uint32_t prio = 0;

    if (!kbits_rate && !pkts_rate) {
        return;
    }

    tc_policer_init(&police, kbits_rate, kbits_burst);
    police.index = index;

    nl_msg_act_police_start_nest(request, ++prio, &offset, &act_offset,
                                 single_action);
    if (police.rate.rate) {
        tc_put_rtab(request, TCA_POLICE_RATE, &police.rate, bytes_rate);
    }
#ifdef HAVE_TCA_POLICE_PKTRATE64
    /* The 32-bit rate in the rtab saturates; pass the full 64-bit rate when
     * the kernel headers support it. */
    if (bytes_rate > UINT32_MAX) {
        nl_msg_put_u64(request, TCA_POLICE_RATE64, bytes_rate);
    }
#endif
    if (pkts_rate) {
        uint64_t pkt_burst_ticks;
        /* Here tc_bytes_to_ticks is used to convert packets rather than bytes
           to ticks. */
        pkt_burst_ticks = tc_bytes_to_ticks(pkts_rate, pkts_burst);
        nl_msg_put_u64(request, TCA_POLICE_PKTRATE64, pkts_rate);
        nl_msg_put_u64(request, TCA_POLICE_PKTBURST64, pkt_burst_ticks);
    }
    nl_msg_put_unspec(request, TCA_POLICE_TBF, &police, sizeof police);
    nl_msg_act_police_end_nest(request, offset, act_offset, notexceed_act);
}
2951
2952
/* Installs an ingress matchall classifier on 'netdev' carrying a police
 * action with the given bit and packet rates ('kpkts_*' are in units of
 * 1000 packets).  Returns 0 on success or a positive errno value. */
static int
tc_add_matchall_policer(struct netdev *netdev, uint64_t kbits_rate,
                        uint32_t kbits_burst, uint32_t kpkts_rate,
                        uint32_t kpkts_burst)
{
    uint16_t eth_type = (OVS_FORCE uint16_t) htons(ETH_P_ALL);
    size_t basic_offset, action_offset;
    uint16_t prio = TC_RESERVED_PRIORITY_POLICE;
    int ifindex, err = 0;
    struct ofpbuf request;
    struct ofpbuf *reply;
    struct tcmsg *tcmsg;
    uint32_t handle = 1;

    err = get_ifindex(netdev, &ifindex);
    if (err) {
        return err;
    }

    /* NLM_F_ECHO makes the kernel echo the created filter back so the reply
     * can be sanity-checked below. */
    tcmsg = tc_make_request(ifindex, RTM_NEWTFILTER, NLM_F_CREATE | NLM_F_ECHO,
                            &request);
    tcmsg->tcm_parent = TC_INGRESS_PARENT;
    tcmsg->tcm_info = tc_make_handle(prio, eth_type);
    tcmsg->tcm_handle = handle;

    /* matchall filter wrapping a single police action. */
    nl_msg_put_string(&request, TCA_KIND, "matchall");
    basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    action_offset = nl_msg_start_nested(&request, TCA_MATCHALL_ACT);
    nl_msg_put_act_police(&request, 0, kbits_rate, kbits_burst,
                          kpkts_rate * 1000ULL, kpkts_burst * 1000ULL,
                          TC_ACT_UNSPEC, false);
    nl_msg_end_nested(&request, action_offset);
    nl_msg_end_nested(&request, basic_offset);

    err = tc_transact(&request, &reply);
    if (!err) {
        /* Validate that the echoed reply at least contains the netlink and
         * tc headers; a shorter reply indicates a protocol problem. */
        struct ofpbuf b = ofpbuf_const_initializer(reply->data, reply->size);
        struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg);
        struct tcmsg *tc = ofpbuf_try_pull(&b, sizeof *tc);

        if (!nlmsg || !tc) {
            VLOG_ERR_RL(&rl,
                        "Failed to add match all policer, malformed reply");
            ofpbuf_delete(reply);
            return EPROTO;
        }
        ofpbuf_delete(reply);
    }

    return err;
}
3003
3004
static int
3005
tc_del_matchall_policer(struct netdev *netdev)
3006
0
{
3007
0
    int prio = TC_RESERVED_PRIORITY_POLICE;
3008
0
    uint32_t block_id = 0;
3009
0
    struct tcf_id id;
3010
0
    int ifindex;
3011
0
    int err;
3012
3013
0
    err = get_ifindex(netdev, &ifindex);
3014
0
    if (err) {
3015
0
        return err;
3016
0
    }
3017
3018
0
    id = tc_make_tcf_id(ifindex, block_id, prio, TC_INGRESS);
3019
0
    err = tc_del_filter(&id, "matchall");
3020
0
    if (err) {
3021
0
        return err;
3022
0
    }
3023
3024
0
    return 0;
3025
0
}
3026
3027
/* Attempts to set input rate limiting (policing) policy.  Returns 0 if
 * successful, otherwise a positive errno value.  Results are cached under
 * VALID_POLICING so that repeated calls with identical parameters are
 * no-ops. */
static int
netdev_linux_set_policing(struct netdev *netdev_, uint32_t kbits_rate,
                          uint32_t kbits_burst, uint32_t kpkts_rate,
                          uint32_t kpkts_burst)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *netdev_name = netdev_get_name(netdev_);
    int ifindex;
    int error;

    kbits_burst = (!kbits_rate ? 0       /* Force to 0 if no rate specified. */
                   : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
                   : kbits_burst);       /* Stick with user-specified value. */

    kpkts_burst = (!kpkts_rate ? 0       /* Force to 0 if no rate specified. */
                   : !kpkts_burst ? 16   /* Default to 16 kpkts if 0. */
                   : kpkts_burst);       /* Stick with user-specified value. */

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto out;
    }

    if (netdev->cache_valid & VALID_POLICING) {
        /* Replay a cached error, or skip the kernel round-trip entirely if
         * nothing changed since the last successful configuration. */
        error = netdev->netdev_policing_error;
        if (error || (netdev->kbits_rate == kbits_rate &&
                      netdev->kpkts_rate == kpkts_rate &&
                      netdev->kbits_burst == kbits_burst &&
                      netdev->kpkts_burst == kpkts_burst)) {
            /* Assume that settings haven't changed since we last set them. */
            goto out;
        }
        netdev->cache_valid &= ~VALID_POLICING;
    }

    COVERAGE_INC(netdev_set_policing);

    /* Use matchall for policing when offloadling ovs with tc-flower. */
    if (netdev_is_flow_api_enabled()) {
        /* Delete any old policer first; its result only matters if no new
         * policer is installed afterwards. */
        error = tc_del_matchall_policer(netdev_);
        if (kbits_rate || kpkts_rate) {
            error = tc_add_matchall_policer(netdev_, kbits_rate, kbits_burst,
                                            kpkts_rate, kpkts_burst);
        }
        goto out;
    }

    error = get_ifindex(netdev_, &ifindex);
    if (error) {
        goto out;
    }

    /* Remove any existing ingress qdisc. */
    error = tc_add_del_qdisc(ifindex, false, 0, TC_INGRESS);
    if (error) {
        VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
                     netdev_name, ovs_strerror(error));
        goto out;
    }

    if (kbits_rate || kpkts_rate) {
        const char *cls_name = "matchall";

        /* Recreate the ingress qdisc and attach a policing classifier. */
        error = tc_add_del_qdisc(ifindex, true, 0, TC_INGRESS);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }

        error = tc_add_matchall_policer(netdev_, kbits_rate, kbits_burst,
                                        kpkts_rate, kpkts_burst);
        if (error == ENOENT) {
            cls_name = "basic";
            /* This error is returned when the matchall classifier is missing.
             * Fall back to the basic classifier.  */
            error = tc_add_policer(netdev_, kbits_rate, kbits_burst,
                                   kpkts_rate, kpkts_burst);
        }
        if (error){
            VLOG_WARN_RL(&rl, "%s: adding cls_%s policing action failed: %s",
                         netdev_name, cls_name, ovs_strerror(error));
            goto out;
        }
    }

out:
    if (!error) {
        netdev->kbits_rate = kbits_rate;
        netdev->kbits_burst = kbits_burst;
        netdev->kpkts_rate = kpkts_rate;
        netdev->kpkts_burst = kpkts_burst;
    }

    /* Cache success, and also ENODEV (device gone) so we do not keep
     * retrying against a missing device. */
    if (!error || error == ENODEV) {
        netdev->netdev_policing_error = error;
        netdev->cache_valid |= VALID_POLICING;
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3131
3132
static int
3133
netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
3134
                           struct sset *types)
3135
0
{
3136
0
    const struct tc_ops *const *opsp;
3137
0
    for (opsp = tcs; *opsp != NULL; opsp++) {
3138
0
        const struct tc_ops *ops = *opsp;
3139
0
        if (ops->tc_install && ops->ovs_name[0] != '\0') {
3140
0
            sset_add(types, ops->ovs_name);
3141
0
        }
3142
0
    }
3143
0
    return 0;
3144
0
}
3145
3146
static const struct tc_ops *
3147
tc_lookup_ovs_name(const char *name)
3148
0
{
3149
0
    const struct tc_ops *const *opsp;
3150
3151
0
    for (opsp = tcs; *opsp != NULL; opsp++) {
3152
0
        const struct tc_ops *ops = *opsp;
3153
0
        if (!strcmp(name, ops->ovs_name)) {
3154
0
            return ops;
3155
0
        }
3156
0
    }
3157
0
    return NULL;
3158
0
}
3159
3160
static const struct tc_ops *
3161
tc_lookup_linux_name(const char *name)
3162
0
{
3163
0
    const struct tc_ops *const *opsp;
3164
3165
0
    for (opsp = tcs; *opsp != NULL; opsp++) {
3166
0
        const struct tc_ops *ops = *opsp;
3167
0
        if (ops->linux_name && !strcmp(name, ops->linux_name)) {
3168
0
            return ops;
3169
0
        }
3170
0
    }
3171
0
    return NULL;
3172
0
}
3173
3174
/* Looks up queue 'queue_id' in 'netdev_'s queue hash map using the
 * precomputed 'hash', returning the queue or NULL if absent.  Caller must
 * ensure netdev->tc is valid (i.e. tc_query_qdisc() succeeded). */
static struct tc_queue *
tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
                size_t hash)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct tc_queue *queue;

    HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
        if (queue->queue_id == queue_id) {
            return queue;
        }
    }
    return NULL;
}
3188
3189
static struct tc_queue *
3190
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
3191
0
{
3192
0
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
3193
0
}
3194
3195
static int
3196
netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
3197
                                  const char *type,
3198
                                  struct netdev_qos_capabilities *caps)
3199
0
{
3200
0
    const struct tc_ops *ops = tc_lookup_ovs_name(type);
3201
0
    if (!ops) {
3202
0
        return EOPNOTSUPP;
3203
0
    }
3204
0
    caps->n_queues = ops->n_queues;
3205
0
    return 0;
3206
0
}
3207
3208
/* Queries the QoS configuration of 'netdev_': '*typep' receives the OVS name
 * of the installed qdisc and 'details' its parameters (if the qdisc type
 * implements qdisc_get).  Returns 0 on success or a positive errno value. */
static int
netdev_linux_get_qos(const struct netdev *netdev_,
                     const char **typep, struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    /* Make sure netdev->tc reflects the kernel's actual qdisc. */
    error = tc_query_qdisc(netdev_);
    if (!error) {
        *typep = netdev->tc->ops->ovs_name;
        /* qdisc_get is optional; types without parameters omit it. */
        error = (netdev->tc->ops->qdisc_get
                 ? netdev->tc->ops->qdisc_get(netdev_, details)
                 : 0);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3233
3234
/* Configures QoS type 'type' with parameters 'details' on 'netdev_'.  If the
 * requested type matches the installed qdisc, the parameters are updated in
 * place; otherwise the old qdisc is deleted and a new one installed.
 * Returns 0 on success or a positive errno value. */
static int
netdev_linux_set_qos(struct netdev *netdev_,
                     const char *type, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const struct tc_ops *new_ops;
    int error;

    new_ops = tc_lookup_ovs_name(type);
    if (!new_ops || !new_ops->tc_install) {
        return EOPNOTSUPP;
    }

    /* The noop type needs no kernel interaction and no locking. */
    if (new_ops == &tc_ops_noop) {
        return new_ops->tc_install(netdev_, details);
    }

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (error) {
        goto exit;
    }

    if (new_ops == netdev->tc->ops) {
        /* Same qdisc type: just update its parameters if supported. */
        error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
    } else {
        /* Delete existing qdisc. */
        error = tc_del_qdisc(netdev_);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: Failed to delete existing qdisc: %s",
                         netdev_get_name(netdev_), ovs_strerror(error));
            goto exit;
        }
        ovs_assert(netdev->tc == NULL);

        /* Install new qdisc. */
        error = new_ops->tc_install(netdev_, details);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: Failed to install new qdisc: %s",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
        /* Invariant: tc state exists exactly when installation succeeded. */
        ovs_assert((error == 0) == (netdev->tc != NULL));
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3287
3288
/* Retrieves the configuration of queue 'queue_id' on 'netdev_' into
 * 'details'.  Returns ENOENT if the queue does not exist, EOPNOTSUPP for a
 * remote-netns device, 0 on success, or another positive errno value. */
static int
netdev_linux_get_queue(const struct netdev *netdev_,
                       unsigned int queue_id, struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
        error = (queue
                ? netdev->tc->ops->class_get(netdev_, queue, details)
                : ENOENT);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3313
3314
/* Configures queue 'queue_id' on 'netdev_' with parameters 'details'.
 * Returns EINVAL if the id is out of range for the installed qdisc or the
 * qdisc has no class_set, EOPNOTSUPP for a remote-netns device, 0 on
 * success, or another positive errno value. */
static int
netdev_linux_set_queue(struct netdev *netdev_,
                       unsigned int queue_id, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        error = (queue_id < netdev->tc->ops->n_queues
                 && netdev->tc->ops->class_set
                 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
                 : EINVAL);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3339
3340
/* Deletes queue 'queue_id' from 'netdev_'.  Returns ENOENT if the queue does
 * not exist, EINVAL if the installed qdisc does not support class deletion,
 * EOPNOTSUPP for a remote-netns device, 0 on success, or another positive
 * errno value. */
static int
netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_delete) {
            struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
            error = (queue
                     ? netdev->tc->ops->class_delete(netdev_, queue)
                     : ENOENT);
        } else {
            error = EINVAL;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3368
3369
/* Retrieves statistics for queue 'queue_id' of 'netdev_' into 'stats' using
 * the current qdisc's class_get_stats hook.  Returns 0 on success, ENOENT if
 * the queue does not exist, EOPNOTSUPP if the qdisc cannot report per-class
 * statistics or the device is in a remote network namespace. */
static int
netdev_linux_get_queue_stats(const struct netdev *netdev_,
                             unsigned int queue_id,
                             struct netdev_queue_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get_stats) {
            const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
            if (queue) {
                /* Creation time is tracked locally, not by the kernel. */
                stats->created = queue->created;
                error = netdev->tc->ops->class_get_stats(netdev_, queue,
                                                         stats);
            } else {
                error = ENOENT;
            }
        } else {
            error = EOPNOTSUPP;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3403
3404
/* State carried across start_queue_dump() .. finish_queue_dump(). */
struct queue_dump_state {
    struct nl_dump dump;        /* In-progress rtnetlink class dump. */
    struct ofpbuf buf;          /* Reply buffer reused across nl_dump_next(). */
};
3408
3409
/* Begins an RTM_GETTCLASS rtnetlink dump of 'netdev''s traffic classes,
 * initializing 'state'.  Returns true on success, false if the dump request
 * could not be constructed (e.g. the device has no ifindex).  On success the
 * caller must eventually call finish_queue_dump(). */
static bool
start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
{
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
    if (!tcmsg) {
        return false;
    }
    tcmsg->tcm_parent = 0;
    nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
    ofpbuf_uninit(&request);

    ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
    return true;
}
3426
3427
static int
3428
finish_queue_dump(struct queue_dump_state *state)
3429
0
{
3430
0
    ofpbuf_uninit(&state->buf);
3431
0
    return nl_dump_done(&state->dump);
3432
0
}
3433
3434
/* Iterator state for netdev_linux_queue_dump_{start,next,done}(). */
struct netdev_linux_queue_state {
    unsigned int *queues;       /* Snapshot of queue ids, malloc()'d. */
    size_t cur_queue;           /* Index of next queue id to visit. */
    size_t n_queues;            /* Number of entries in 'queues'. */
};
3439
3440
/* Begins iterating over 'netdev_''s queues, storing iterator state in
 * '*statep'.  Takes a snapshot of the currently known queue ids so that
 * iteration is unaffected by concurrent queue changes.  Returns 0 on
 * success, EOPNOTSUPP if the qdisc cannot report per-class configuration or
 * the device is in a remote network namespace.  On success the caller must
 * finish with netdev_linux_queue_dump_done(). */
static int
netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get) {
            struct netdev_linux_queue_state *state;
            struct tc_queue *queue;
            size_t i;

            *statep = state = xmalloc(sizeof *state);
            state->n_queues = hmap_count(&netdev->tc->queues);
            state->cur_queue = 0;
            state->queues = xmalloc(state->n_queues * sizeof *state->queues);

            i = 0;
            HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
                state->queues[i++] = queue->queue_id;
            }
        } else {
            error = EOPNOTSUPP;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3477
3478
/* Advances the queue iteration in 'state_', storing the next queue's id in
 * '*queue_idp' and its configuration in 'details'.  Queues that disappeared
 * since the snapshot was taken are silently skipped.  Returns 0 on success,
 * EOF when iteration is complete, or EOPNOTSUPP for remote-netns devices. */
static int
netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
                             unsigned int *queue_idp, struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_linux_queue_state *state = state_;
    int error = EOF;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    while (state->cur_queue < state->n_queues) {
        unsigned int queue_id = state->queues[state->cur_queue++];
        struct tc_queue *queue = tc_find_queue(netdev_, queue_id);

        if (queue) {
            *queue_idp = queue_id;
            error = netdev->tc->ops->class_get(netdev_, queue, details);
            break;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3507
3508
static int
3509
netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
3510
                             void *state_)
3511
0
{
3512
0
    struct netdev_linux_queue_state *state = state_;
3513
3514
0
    free(state->queues);
3515
0
    free(state);
3516
0
    return 0;
3517
0
}
3518
3519
/* Dumps per-queue statistics for 'netdev_' by walking an RTM_GETTCLASS
 * netlink dump and invoking 'cb' with 'aux' for each class via the qdisc's
 * class_dump_stats hook.  Returns 0 on success; on failure returns the last
 * error encountered (EOPNOTSUPP if the qdisc cannot dump stats or the device
 * is in a remote netns, ENODEV if the dump could not be started). */
static int
netdev_linux_dump_queue_stats(const struct netdev *netdev_,
                              netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        struct queue_dump_state state;

        if (!netdev->tc->ops->class_dump_stats) {
            error = EOPNOTSUPP;
        } else if (!start_queue_dump(netdev_, &state)) {
            error = ENODEV;
        } else {
            struct ofpbuf msg;
            int retval;

            while (nl_dump_next(&state.dump, &msg, &state.buf)) {
                retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
                                                           cb, aux);
                if (retval) {
                    /* Remember the failure but keep dumping the rest. */
                    error = retval;
                }
            }

            retval = finish_queue_dump(&state);
            if (retval) {
                error = retval;
            }
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3563
3564
static int
3565
netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
3566
                     struct in_addr netmask)
3567
0
{
3568
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3569
0
    int error;
3570
3571
0
    ovs_mutex_lock(&netdev->mutex);
3572
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
3573
0
        error = EOPNOTSUPP;
3574
0
        goto exit;
3575
0
    }
3576
3577
0
    error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
3578
0
    if (!error) {
3579
0
        if (address.s_addr != INADDR_ANY) {
3580
0
            error = do_set_addr(netdev_, SIOCSIFNETMASK,
3581
0
                                "SIOCSIFNETMASK", netmask);
3582
0
        }
3583
0
    }
3584
3585
0
exit:
3586
0
    ovs_mutex_unlock(&netdev->mutex);
3587
0
    return error;
3588
0
}
3589
3590
/* Retrieves the addresses assigned to 'netdev_' by delegating to
 * netdev_get_addrs(), storing the results through 'addr', 'mask' and
 * 'n_cnt'.  Returns 0 on success or a positive errno value (EOPNOTSUPP if
 * the device is in a remote network namespace).
 *
 * NOTE(review): the previous comment here described a single-IPv6-address
 * lookup ("sets '*in6' ..."), which did not match this function. */
static int
netdev_linux_get_addr_list(const struct netdev *netdev_,
                          struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3612
3613
/* Fills '*sa' with an AF_INET sockaddr for IPv4 address 'addr' (port 0),
 * zeroing any trailing bytes of '*sa' beyond the sockaddr_in. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin = {
        .sin_family = AF_INET,
        .sin_addr = addr,
        .sin_port = 0,
    };

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
3625
3626
/* Issues the address-setting ioctl 'ioctl_nr' (named 'ioctl_name' for
 * logging) on 'netdev' with IPv4 address 'addr'.  Returns 0 on success or a
 * positive errno value. */
static int
do_set_addr(struct netdev *netdev,
            int ioctl_nr, const char *ioctl_name, struct in_addr addr)
{
    struct ifreq ifr;

    memset(&ifr, 0, sizeof ifr);
    make_in4_sockaddr(&ifr.ifr_addr, addr);
    return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
                               ioctl_name);
}
3637
3638
/* Adds 'router' as a default IP gateway via the SIOCADDRT ioctl (route
 * 0.0.0.0/0).  Returns 0 on success or a positive errno value; failures are
 * also logged at warning level. */
static int
netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
{
    struct in_addr any = { INADDR_ANY };
    struct rtentry rt;
    int error;

    memset(&rt, 0, sizeof rt);
    make_in4_sockaddr(&rt.rt_dst, any);
    make_in4_sockaddr(&rt.rt_gateway, router);
    make_in4_sockaddr(&rt.rt_genmask, any);
    rt.rt_flags = RTF_UP | RTF_GATEWAY;
    error = af_inet_ioctl(SIOCADDRT, &rt);
    if (error) {
        VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
    }
    return error;
}
3657
3658
/* Looks up the next hop for 'host' by scanning /proc/net/route.  On success
 * stores the gateway (or 0 if the host is directly reachable) in '*next_hop'
 * and a malloc()'d device name in '*netdev_name', and returns 0.  Returns
 * ENXIO if no matching route is found, or another positive errno value on
 * error; '*netdev_name' is NULL on failure. */
static int
netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
                          char **netdev_name)
{
    static const char fn[] = "/proc/net/route";
    FILE *stream;
    char line[256];
    int ln;

    *netdev_name = NULL;
    stream = fopen(fn, "r");
    if (stream == NULL) {
        VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
        return errno;
    }

    ln = 0;
    while (fgets(line, sizeof line, stream)) {
        if (++ln >= 2) {
            /* Lines 2..n are routes; line 1 is the column header. */
            char iface[17];
            ovs_be32 dest, gateway, mask;
            int refcnt, metric, mtu;
            unsigned int flags, use, window, irtt;

            if (!ovs_scan(line,
                          "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
                          " %d %u %u\n",
                          iface, &dest, &gateway, &flags, &refcnt,
                          &use, &metric, &mask, &mtu, &window, &irtt)) {
                VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
                        fn, ln, line);
                continue;
            }
            if (!(flags & RTF_UP)) {
                /* Skip routes that aren't up. */
                continue;
            }

            /* The output of 'dest', 'mask', and 'gateway' were given in
             * network byte order, so we don't need need any endian
             * conversions here. */
            if ((dest & mask) == (host->s_addr & mask)) {
                if (!gateway) {
                    /* The host is directly reachable. */
                    next_hop->s_addr = 0;
                } else {
                    /* To reach the host, we must go through a gateway. */
                    next_hop->s_addr = gateway;
                }
                *netdev_name = xstrdup(iface);
                fclose(stream);
                return 0;
            }
        }
    }

    fclose(stream);
    return ENXIO;
}
3717
3718
/* Populates 'smap' with driver name/version and firmware version for
 * 'netdev_', obtained (and then cached) via the ETHTOOL_GDRVINFO ioctl.
 * Returns 0 on success or a positive errno value from the ethtool query. */
int
netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_DRVINFO)) {
        /* drvinfo is fetched through the generic ethtool_cmd interface. */
        struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;

        COVERAGE_INC(netdev_get_ethtool);
        memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
        error = netdev_linux_do_ethtool(netdev->up.name,
                                        cmd,
                                        ETHTOOL_GDRVINFO,
                                        "ETHTOOL_GDRVINFO");
        if (!error) {
            netdev->cache_valid |= VALID_DRVINFO;
        }
    }

    if (!error) {
        smap_add(smap, "driver_name", netdev->drvinfo.driver);
        smap_add(smap, "driver_version", netdev->drvinfo.version);
        smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
3748
3749
static int
3750
netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
3751
                           struct smap *smap)
3752
0
{
3753
0
    smap_add(smap, "driver_name", "openvswitch");
3754
0
    return 0;
3755
0
}
3756
3757
/* Returns the TC block id for 'netdev_': its ifindex if the device is a LAG
 * primary member, otherwise 0 (no block). */
static uint32_t
netdev_linux_get_block_id(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    uint32_t block_id = 0;

    ovs_mutex_lock(&netdev->mutex);
    /* Ensure the linux netdev has had its fields populated. */
    if (!(netdev->cache_valid & VALID_IFINDEX)) {
        netdev_linux_update_via_netlink(netdev);
    }

    /* Only assigning block ids to linux netdevs that are
     * LAG primary members. */
    if (netdev->is_lag_primary) {
        block_id = netdev->ifindex;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return block_id;
}
3778
3779
/* Looks up the ARP table entry for 'ip' on 'netdev' using the SIOCGARP
 * ioctl.  If one exists and can be successfully retrieved, it stores the
 * corresponding MAC address in 'mac' and returns 0.  Otherwise, it returns a
 * positive errno value; in particular, ENXIO indicates that there is no ARP
 * table entry for 'ip' on 'netdev'. */
static int
netdev_linux_arp_lookup(const struct netdev *netdev,
                        ovs_be32 ip, struct eth_addr *mac)
{
    struct arpreq r;
    struct sockaddr_in sin;
    int retval;

    memset(&r, 0, sizeof r);
    memset(&sin, 0, sizeof sin);
    sin.sin_family = AF_INET;
    sin.sin_addr.s_addr = ip;
    sin.sin_port = 0;
    memcpy(&r.arp_pa, &sin, sizeof sin);
    r.arp_ha.sa_family = ARPHRD_ETHER;
    r.arp_flags = 0;
    ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
    COVERAGE_INC(netdev_arp_lookup);
    retval = af_inet_ioctl(SIOCGARP, &r);
    if (!retval) {
        memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
    } else if (retval != ENXIO) {
        /* ENXIO just means "no entry"; anything else is worth logging. */
        VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
                     netdev_get_name(netdev), IP_ARGS(ip),
                     ovs_strerror(retval));
    }
    return retval;
}
3811
3812
static unsigned int
3813
nd_to_iff_flags(enum netdev_flags nd)
3814
0
{
3815
0
    unsigned int iff = 0;
3816
0
    if (nd & NETDEV_UP) {
3817
0
        iff |= IFF_UP;
3818
0
    }
3819
0
    if (nd & NETDEV_PROMISC) {
3820
0
        iff |= IFF_PROMISC;
3821
0
    }
3822
0
    if (nd & NETDEV_LOOPBACK) {
3823
0
        iff |= IFF_LOOPBACK;
3824
0
    }
3825
0
    return iff;
3826
0
}
3827
3828
static int
3829
iff_to_nd_flags(unsigned int iff)
3830
0
{
3831
0
    enum netdev_flags nd = 0;
3832
0
    if (iff & IFF_UP) {
3833
0
        nd |= NETDEV_UP;
3834
0
    }
3835
0
    if (iff & IFF_PROMISC) {
3836
0
        nd |= NETDEV_PROMISC;
3837
0
    }
3838
0
    if (iff & IFF_LOOPBACK) {
3839
0
        nd |= NETDEV_LOOPBACK;
3840
0
    }
3841
0
    return nd;
3842
0
}
3843
3844
/* Clears the flags in 'off' and sets those in 'on' on 'netdev' via ioctl,
 * storing the previous flags in '*old_flagsp'.  No ioctl is issued when the
 * resulting flag word is unchanged.  Returns 0 on success or a positive
 * errno value from set_flags(). */
static int
update_flags(struct netdev_linux *netdev, enum netdev_flags off,
             enum netdev_flags on, enum netdev_flags *old_flagsp)
    OVS_REQUIRES(netdev->mutex)
{
    unsigned int old_flags, new_flags;
    int error = 0;

    old_flags = netdev->ifi_flags;
    *old_flagsp = iff_to_nd_flags(old_flags);
    new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
    if (new_flags != old_flags) {
        error = set_flags(netdev_get_name(&netdev->up), new_flags);
        /* Re-read so the cache reflects what the kernel actually applied. */
        get_flags(&netdev->up, &netdev->ifi_flags);
    }

    return error;
}
3862
3863
/* netdev-provider update_flags hook: updates or (when 'on' and 'off' are
 * both zero) just reads 'netdev_''s flags, storing the prior flags in
 * '*old_flagsp'.  Returns 0 on success, EOPNOTSUPP when a flag change is
 * requested for a device in a remote network namespace. */
static int
netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
                          enum netdev_flags on, enum netdev_flags *old_flagsp)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = 0;

    ovs_mutex_lock(&netdev->mutex);
    if (on || off) {
        /* Changing flags over netlink isn't supported yet. */
        if (netdev_linux_netnsid_is_remote(netdev)) {
            error = EOPNOTSUPP;
            goto exit;
        }
        error = update_flags(netdev, off, on, old_flagsp);
    } else {
        /* Try reading flags over netlink, or fall back to ioctl. */
        if (!netdev_linux_update_via_netlink(netdev)) {
            *old_flagsp = iff_to_nd_flags(netdev->ifi_flags);
        } else {
            error = update_flags(netdev, off, on, old_flagsp);
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3891
3892
/* netdev_class members shared by every netdev class implemented in this
 * file (system, tap, internal and, when built, af_xdp). */
#define NETDEV_LINUX_CLASS_COMMON                               \
    .run = netdev_linux_run,                                    \
    .wait = netdev_linux_wait,                                  \
    .alloc = netdev_linux_alloc,                                \
    .dealloc = netdev_linux_dealloc,                            \
    .send_wait = netdev_linux_send_wait,                        \
    .set_etheraddr = netdev_linux_set_etheraddr,                \
    .get_etheraddr = netdev_linux_get_etheraddr,                \
    .get_mtu = netdev_linux_get_mtu,                            \
    .set_mtu = netdev_linux_set_mtu,                            \
    .get_ifindex = netdev_linux_get_ifindex,                    \
    .get_carrier = netdev_linux_get_carrier,                    \
    .get_carrier_resets = netdev_linux_get_carrier_resets,      \
    .set_miimon_interval = netdev_linux_set_miimon_interval,    \
    .set_advertisements = netdev_linux_set_advertisements,      \
    .set_policing = netdev_linux_set_policing,                  \
    .get_qos_types = netdev_linux_get_qos_types,                \
    .get_qos_capabilities = netdev_linux_get_qos_capabilities,  \
    .get_qos = netdev_linux_get_qos,                            \
    .set_qos = netdev_linux_set_qos,                            \
    .get_queue = netdev_linux_get_queue,                        \
    .set_queue = netdev_linux_set_queue,                        \
    .delete_queue = netdev_linux_delete_queue,                  \
    .get_queue_stats = netdev_linux_get_queue_stats,            \
    .queue_dump_start = netdev_linux_queue_dump_start,          \
    .queue_dump_next = netdev_linux_queue_dump_next,            \
    .queue_dump_done = netdev_linux_queue_dump_done,            \
    .dump_queue_stats = netdev_linux_dump_queue_stats,          \
    .set_in4 = netdev_linux_set_in4,                            \
    .get_addr_list = netdev_linux_get_addr_list,                \
    .add_router = netdev_linux_add_router,                      \
    .get_next_hop = netdev_linux_get_next_hop,                  \
    .arp_lookup = netdev_linux_arp_lookup,                      \
    .update_flags = netdev_linux_update_flags,                  \
    .rxq_alloc = netdev_linux_rxq_alloc,                        \
    .rxq_dealloc = netdev_linux_rxq_dealloc,                    \
    .rxq_wait = netdev_linux_rxq_wait,                          \
    .rxq_drain = netdev_linux_rxq_drain
3930
3931
/* "system" devices: ordinary kernel network devices. */
const struct netdev_class netdev_linux_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "system",
    .is_pmd = false,
    .construct = netdev_linux_construct,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_linux_get_stats,
    .get_features = netdev_linux_get_features,
    .get_speed = netdev_linux_get_speed,
    .get_duplex = netdev_linux_get_duplex,
    .get_status = netdev_linux_get_status,
    .get_block_id = netdev_linux_get_block_id,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};
3948
3949
/* "tap" devices: userspace-created TAP devices; stats come from the tap
 * side rather than the generic device stats. */
const struct netdev_class netdev_tap_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "tap",
    .is_pmd = false,
    .construct = netdev_linux_construct_tap,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_tap_get_stats,
    .get_features = netdev_linux_get_features,
    .get_speed = netdev_linux_get_speed,
    .get_duplex = netdev_linux_get_duplex,
    .get_status = netdev_linux_get_status,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};
3965
3966
/* "internal" devices: devices implemented by Open vSwitch itself. */
const struct netdev_class netdev_internal_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "internal",
    .is_pmd = false,
    .construct = netdev_linux_construct,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_internal_get_stats,
    .get_status = netdev_internal_get_status,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};
3979
3980
#ifdef HAVE_AF_XDP
3981
#define NETDEV_AFXDP_CLASS_COMMON                               \
3982
    .construct = netdev_afxdp_construct,                        \
3983
    .destruct = netdev_afxdp_destruct,                          \
3984
    .get_stats = netdev_afxdp_get_stats,                        \
3985
    .get_custom_stats = netdev_afxdp_get_custom_stats,          \
3986
    .get_status = netdev_afxdp_get_status,                      \
3987
    .set_config = netdev_afxdp_set_config,                      \
3988
    .get_config = netdev_afxdp_get_config,                      \
3989
    .reconfigure = netdev_afxdp_reconfigure,                    \
3990
    .get_numa_id = netdev_linux_get_numa_id,                    \
3991
    .send = netdev_afxdp_batch_send,                            \
3992
    .rxq_construct = netdev_afxdp_rxq_construct,                \
3993
    .rxq_destruct = netdev_afxdp_rxq_destruct,                  \
3994
    .rxq_recv = netdev_afxdp_rxq_recv
3995
3996
const struct netdev_class netdev_afxdp_class = {
3997
    NETDEV_LINUX_CLASS_COMMON,
3998
    NETDEV_AFXDP_CLASS_COMMON,
3999
    .type = "afxdp",
4000
    .is_pmd = true,
4001
};
4002
4003
const struct netdev_class netdev_afxdp_nonpmd_class = {
4004
    NETDEV_LINUX_CLASS_COMMON,
4005
    NETDEV_AFXDP_CLASS_COMMON,
4006
    .type = "afxdp-nonpmd",
4007
    .is_pmd = false,
4008
};
4009
#endif
4010

4011
4012
/* CoDel traffic control class: a classless qdisc, so it exposes no queues. */
#define CODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_CODEL_TARGET   1
#define TCA_CODEL_LIMIT    2
#define TCA_CODEL_INTERVAL 3

/* In-memory representation of a configured CoDel qdisc. */
struct codel {
    struct tc tc;
    uint32_t target;            /* Target delay, in microseconds. */
    uint32_t limit;             /* Hard queue limit, in packets. */
    uint32_t interval;          /* Sliding window width, in microseconds. */
};
4028
4029
/* Returns the struct codel embedded in 'netdev_''s current tc; only valid
 * when the installed qdisc is CoDel. */
static struct codel *
codel_get__(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct codel, tc);
}
4035
4036
/* Records a CoDel qdisc with the given parameters as 'netdev_''s current tc
 * state.  Allocates a new struct codel and points netdev->tc at it; any
 * previous tc is assumed to have been torn down by the caller. */
static void
codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
                uint32_t interval)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct codel *codel;

    codel = xmalloc(sizeof *codel);
    tc_init(&codel->tc, &tc_ops_codel);
    codel->target = target;
    codel->limit = limit;
    codel->interval = interval;

    netdev->tc = &codel->tc;
}
4051
4052
/* Replaces 'netdev''s root qdisc with a CoDel qdisc via RTM_NEWQDISC.
 * Zero-valued parameters are replaced by the defaults also used by
 * codel_parse_qdisc_details__(): target 5000 us, limit 10240 packets,
 * interval 100000 us.  Returns 0 on success, ENODEV if the request could
 * not be built, or another positive errno value from the netlink
 * transaction. */
static int
codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                    uint32_t interval)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval;
    int error;

    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;

    nl_msg_put_string(&request, TCA_KIND, "codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
        "target %u, limit %u, interval %u error %d(%s)",
        netdev_get_name(netdev),
        otarget, olimit, ointerval,
        error, ovs_strerror(error));
    }
    return error;
}
4093
4094
/* Parses CoDel parameters from 'details' into 'codel', applying defaults
 * (target 5000 us, limit 10240 packets, interval 100000 us) for any key
 * that is missing or explicitly zero. */
static void
codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
                            const struct smap *details, struct codel *codel)
{
    codel->target = smap_get_ullong(details, "target", 0);
    codel->limit = smap_get_ullong(details, "limit", 0);
    codel->interval = smap_get_ullong(details, "interval", 0);

    if (!codel->target) {
        codel->target = 5000;
    }
    if (!codel->limit) {
        codel->limit = 10240;
    }
    if (!codel->interval) {
        codel->interval = 100000;
    }
}
4112
4113
/* tc_ops tc_install hook: installs a CoDel root qdisc on 'netdev' configured
 * from 'details', and on success records it as the device's tc state.
 * Returns 0 on success or a positive errno value. */
static int
codel_tc_install(struct netdev *netdev, const struct smap *details)
{
    int error;
    struct codel codel;

    codel_parse_qdisc_details__(netdev, details, &codel);
    error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
                                codel.interval);
    if (!error) {
        codel_install__(netdev, codel.target, codel.limit, codel.interval);
    }
    return error;
}
4127
4128
/* Parses the nested TCA_OPTIONS attribute 'nl_options' of a CoDel qdisc
 * into 'codel'.  Returns 0 on success or EPROTO if the attributes do not
 * match the expected policy. */
static int
codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
{
    static const struct nl_policy tca_codel_policy[] = {
        [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
        [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
        [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];

    if (!nl_parse_nested(nl_options, tca_codel_policy,
                         attrs, ARRAY_SIZE(tca_codel_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
        return EPROTO;
    }

    codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
    codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
    codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
    return 0;
}
4150
4151
/* tc_ops tc_load hook: parses an existing kernel CoDel qdisc from netlink
 * message 'nlmsg' and records it as 'netdev''s tc state.  Returns 0 on
 * success or a positive errno value if parsing fails. */
static int
codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
{
    struct nlattr *nlattr;
    const char * kind;
    int error;
    struct codel codel;

    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
    if (error != 0) {
        return error;
    }

    error = codel_parse_tca_options__(nlattr, &codel);
    if (error != 0) {
        return error;
    }

    codel_install__(netdev, codel.target, codel.limit, codel.interval);
    return 0;
}
4172
4173
4174
/* tc_ops tc_destroy hook: releases the struct codel that embeds 'tc'. */
static void
codel_tc_destroy(struct tc *tc)
{
    struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
    tc_destroy(tc);
    free(codel);
}
4181
4182
/* tc_ops qdisc_get hook: reports the cached CoDel parameters of 'netdev'
 * into 'details'.  Always succeeds. */
static int
codel_qdisc_get(const struct netdev *netdev, struct smap *details)
{
    const struct codel *codel = codel_get__(netdev);
    smap_add_format(details, "target", "%u", codel->target);
    smap_add_format(details, "limit", "%u", codel->limit);
    smap_add_format(details, "interval", "%u", codel->interval);
    return 0;
}
4191
4192
/* tc_ops qdisc_set hook: updates the cached CoDel parameters of 'netdev'
 * from 'details'.  Always succeeds.
 *
 * NOTE(review): codel_install__() already stores the parsed values in the
 * freshly allocated struct codel, so the three assignments that follow are
 * redundant (but harmless). */
static int
codel_qdisc_set(struct netdev *netdev, const struct smap *details)
{
    struct codel codel;

    codel_parse_qdisc_details__(netdev, details, &codel);
    codel_install__(netdev, codel.target, codel.limit, codel.interval);
    codel_get__(netdev)->target = codel.target;
    codel_get__(netdev)->limit = codel.limit;
    codel_get__(netdev)->interval = codel.interval;
    return 0;
}
4204
4205
/* tc_ops vtable for the CoDel ("linux-codel") QoS type.  No per-queue hooks:
 * CoDel is classless (CODEL_N_QUEUES is 0). */
static const struct tc_ops tc_ops_codel = {
    .linux_name = "codel",
    .ovs_name = "linux-codel",
    .n_queues = CODEL_N_QUEUES,
    .tc_install = codel_tc_install,
    .tc_load = codel_tc_load,
    .tc_destroy = codel_tc_destroy,
    .qdisc_get = codel_qdisc_get,
    .qdisc_set = codel_qdisc_set,
};
4215

4216
/* FQ-CoDel traffic control class: classless, so it exposes no queues. */

#define FQCODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_FQ_CODEL_TARGET     1
#define TCA_FQ_CODEL_LIMIT      2
#define TCA_FQ_CODEL_INTERVAL   3
#define TCA_FQ_CODEL_ECN        4
#define TCA_FQ_CODEL_FLOWS      5
#define TCA_FQ_CODEL_QUANTUM    6

/* In-memory representation of a configured FQ-CoDel qdisc. */
struct fqcodel {
    struct tc tc;
    uint32_t target;            /* Target delay, in microseconds. */
    uint32_t limit;             /* Hard queue limit, in packets. */
    uint32_t interval;          /* Sliding window width, in microseconds. */
    uint32_t flows;             /* Number of flow queues. */
    uint32_t quantum;           /* Per-round deficit, in bytes. */
};
4239
4240
/* Returns the struct fqcodel embedded in 'netdev_''s current tc; only valid
 * when the installed qdisc is FQ-CoDel. */
static struct fqcodel *
fqcodel_get__(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
}
4246
4247
static void
4248
fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
4249
                  uint32_t interval, uint32_t flows, uint32_t quantum)
4250
0
{
4251
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4252
0
    struct fqcodel *fqcodel;
4253
4254
0
    fqcodel = xmalloc(sizeof *fqcodel);
4255
0
    tc_init(&fqcodel->tc, &tc_ops_fqcodel);
4256
0
    fqcodel->target = target;
4257
0
    fqcodel->limit = limit;
4258
0
    fqcodel->interval = interval;
4259
0
    fqcodel->flows = flows;
4260
0
    fqcodel->quantum = quantum;
4261
4262
0
    netdev->tc = &fqcodel->tc;
4263
0
}
4264
4265
/* Replaces the root qdisc on 'netdev' with an "fq_codel" qdisc configured
 * with the given parameters, substituting a default for any zero value.
 * Returns 0 on success, otherwise a positive errno value. */
static int
fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                      uint32_t interval, uint32_t flows, uint32_t quantum)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval, oflows,  oquantum;
    int error;

    /* Remove any existing root qdisc before installing the new one. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Zero means "unset": fall back to the defaults below. */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;
    oflows = flows ? flows : 1024;
    oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
                                            not mtu */

    nl_msg_put_string(&request, TCA_KIND, "fq_codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
        "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
        netdev_get_name(netdev),
        otarget, olimit, ointerval, oflows, oquantum,
        error, ovs_strerror(error));
    }
    return error;
}
4311
4312
static void
4313
fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
4314
                          const struct smap *details, struct fqcodel *fqcodel)
4315
0
{
4316
0
    fqcodel->target = smap_get_ullong(details, "target", 0);
4317
0
    fqcodel->limit = smap_get_ullong(details, "limit", 0);
4318
0
    fqcodel->interval = smap_get_ullong(details, "interval", 0);
4319
0
    fqcodel->flows = smap_get_ullong(details, "flows", 0);
4320
0
    fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
4321
4322
0
    if (!fqcodel->target) {
4323
0
        fqcodel->target = 5000;
4324
0
    }
4325
0
    if (!fqcodel->limit) {
4326
0
        fqcodel->limit = 10240;
4327
0
    }
4328
0
    if (!fqcodel->interval) {
4329
0
        fqcodel->interval = 1000000;
4330
0
    }
4331
0
    if (!fqcodel->flows) {
4332
0
        fqcodel->flows = 1024;
4333
0
    }
4334
0
    if (!fqcodel->quantum) {
4335
0
        fqcodel->quantum = 1514;
4336
0
    }
4337
0
}
4338
4339
static int
4340
fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
4341
0
{
4342
0
    int error;
4343
0
    struct fqcodel fqcodel;
4344
4345
0
    fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
4346
0
    error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
4347
0
                                  fqcodel.interval, fqcodel.flows,
4348
0
                                  fqcodel.quantum);
4349
0
    if (!error) {
4350
0
        fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
4351
0
                          fqcodel.interval, fqcodel.flows, fqcodel.quantum);
4352
0
    }
4353
0
    return error;
4354
0
}
4355
4356
static int
4357
fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
4358
0
{
4359
0
    static const struct nl_policy tca_fqcodel_policy[] = {
4360
0
        [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
4361
0
        [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
4362
0
        [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
4363
0
        [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
4364
0
        [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
4365
0
    };
4366
4367
0
    struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
4368
4369
0
    if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
4370
0
                         attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
4371
0
        VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
4372
0
        return EPROTO;
4373
0
    }
4374
4375
0
    fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
4376
0
    fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
4377
0
    fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
4378
0
    fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
4379
0
    fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
4380
0
    return 0;
4381
0
}
4382
4383
static int
4384
fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4385
0
{
4386
0
    struct nlattr *nlattr;
4387
0
    const char * kind;
4388
0
    int error;
4389
0
    struct fqcodel fqcodel;
4390
4391
0
    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4392
0
    if (error != 0) {
4393
0
        return error;
4394
0
    }
4395
4396
0
    error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
4397
0
    if (error != 0) {
4398
0
        return error;
4399
0
    }
4400
4401
0
    fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
4402
0
                      fqcodel.flows, fqcodel.quantum);
4403
0
    return 0;
4404
0
}
4405
4406
static void
4407
fqcodel_tc_destroy(struct tc *tc)
4408
0
{
4409
0
    struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
4410
0
    tc_destroy(tc);
4411
0
    free(fqcodel);
4412
0
}
4413
4414
static int
4415
fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
4416
0
{
4417
0
    const struct fqcodel *fqcodel = fqcodel_get__(netdev);
4418
0
    smap_add_format(details, "target", "%u", fqcodel->target);
4419
0
    smap_add_format(details, "limit", "%u", fqcodel->limit);
4420
0
    smap_add_format(details, "interval", "%u", fqcodel->interval);
4421
0
    smap_add_format(details, "flows", "%u", fqcodel->flows);
4422
0
    smap_add_format(details, "quantum", "%u", fqcodel->quantum);
4423
0
    return 0;
4424
0
}
4425
4426
static int
4427
fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
4428
0
{
4429
0
    struct fqcodel fqcodel;
4430
4431
0
    fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
4432
0
    fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
4433
0
                      fqcodel.flows, fqcodel.quantum);
4434
0
    fqcodel_get__(netdev)->target = fqcodel.target;
4435
0
    fqcodel_get__(netdev)->limit = fqcodel.limit;
4436
0
    fqcodel_get__(netdev)->interval = fqcodel.interval;
4437
0
    fqcodel_get__(netdev)->flows = fqcodel.flows;
4438
0
    fqcodel_get__(netdev)->quantum = fqcodel.quantum;
4439
0
    return 0;
4440
0
}
4441
4442
/* tc operations vtable for the kernel "fq_codel" qdisc, exposed to the OVS
 * database as "linux-fq_codel". */
static const struct tc_ops tc_ops_fqcodel = {
    .linux_name = "fq_codel",
    .ovs_name = "linux-fq_codel",
    .n_queues = FQCODEL_N_QUEUES,
    .tc_install = fqcodel_tc_install,
    .tc_load = fqcodel_tc_load,
    .tc_destroy = fqcodel_tc_destroy,
    .qdisc_get = fqcodel_qdisc_get,
    .qdisc_set = fqcodel_qdisc_set,
};
4452

4453
/* SFQ traffic control class. */

#define SFQ_N_QUEUES 0x0000

/* Cached configuration of an installed sfq qdisc. */
struct sfq {
    struct tc tc;
    uint32_t quantum;           /* Defaults to the device MTU when unset. */
    uint32_t perturb;           /* Hash perturbation period; defaults to 10.
                                 * Presumably in seconds (tc_sfq_qopt
                                 * perturb_period) -- confirm. */
};
4462
4463
static struct sfq *
4464
sfq_get__(const struct netdev *netdev_)
4465
0
{
4466
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4467
0
    return CONTAINER_OF(netdev->tc, struct sfq, tc);
4468
0
}
4469
4470
static void
4471
sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
4472
0
{
4473
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4474
0
    struct sfq *sfq;
4475
4476
0
    sfq = xmalloc(sizeof *sfq);
4477
0
    tc_init(&sfq->tc, &tc_ops_sfq);
4478
0
    sfq->perturb = perturb;
4479
0
    sfq->quantum = quantum;
4480
4481
0
    netdev->tc = &sfq->tc;
4482
0
}
4483
4484
/* Replaces the root qdisc on 'netdev' with an "sfq" qdisc using 'quantum'
 * and 'perturb', defaulting to the MTU and 10 respectively when zero.
 * Returns 0 on success, otherwise a positive errno value. */
static int
sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
{
    struct tc_sfq_qopt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int mtu;
    int mtu_error, error;
    /* Fetch the MTU up front; the result is only consulted when no
     * explicit quantum was given. */
    mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);

    /* Remove any existing root qdisc before installing the new one. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    if (!quantum) {
        if (!mtu_error) {
            opt.quantum = mtu; /* if we cannot find mtu, use default */
        }
    } else {
        opt.quantum = quantum;
    }

    if (!perturb) {
        opt.perturb_period = 10;
    } else {
        opt.perturb_period = perturb;
    }

    nl_msg_put_string(&request, TCA_KIND, "sfq");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "quantum %u, perturb %u error %d(%s)",
                     netdev_get_name(netdev),
                     opt.quantum, opt.perturb_period,
                     error, ovs_strerror(error));
    }
    return error;
}
4532
4533
static void
4534
sfq_parse_qdisc_details__(struct netdev *netdev,
4535
                          const struct smap *details, struct sfq *sfq)
4536
0
{
4537
0
    sfq->perturb = smap_get_ullong(details, "perturb", 0);
4538
0
    sfq->quantum = smap_get_ullong(details, "quantum", 0);
4539
4540
0
    if (!sfq->perturb) {
4541
0
        sfq->perturb = 10;
4542
0
    }
4543
4544
0
    if (!sfq->quantum) {
4545
0
        int mtu;
4546
0
        if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
4547
0
            sfq->quantum = mtu;
4548
0
        } else {
4549
0
            VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
4550
0
                         "device without mtu");
4551
0
        }
4552
0
    }
4553
0
}
4554
4555
static int
4556
sfq_tc_install(struct netdev *netdev, const struct smap *details)
4557
0
{
4558
0
    int error;
4559
0
    struct sfq sfq;
4560
4561
0
    sfq_parse_qdisc_details__(netdev, details, &sfq);
4562
0
    error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
4563
0
    if (!error) {
4564
0
        sfq_install__(netdev, sfq.quantum, sfq.perturb);
4565
0
    }
4566
0
    return error;
4567
0
}
4568
4569
static int
4570
sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4571
0
{
4572
0
    const struct tc_sfq_qopt *sfq;
4573
0
    struct nlattr *nlattr;
4574
0
    const char * kind;
4575
0
    int error;
4576
4577
0
    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4578
0
    if (error == 0) {
4579
0
        sfq = nl_attr_get(nlattr);
4580
0
        sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
4581
0
        return 0;
4582
0
    }
4583
4584
0
    return error;
4585
0
}
4586
4587
static void
4588
sfq_tc_destroy(struct tc *tc)
4589
0
{
4590
0
    struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
4591
0
    tc_destroy(tc);
4592
0
    free(sfq);
4593
0
}
4594
4595
static int
4596
sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
4597
0
{
4598
0
    const struct sfq *sfq = sfq_get__(netdev);
4599
0
    smap_add_format(details, "quantum", "%u", sfq->quantum);
4600
0
    smap_add_format(details, "perturb", "%u", sfq->perturb);
4601
0
    return 0;
4602
0
}
4603
4604
static int
4605
sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
4606
0
{
4607
0
    struct sfq sfq;
4608
4609
0
    sfq_parse_qdisc_details__(netdev, details, &sfq);
4610
0
    sfq_install__(netdev, sfq.quantum, sfq.perturb);
4611
0
    sfq_get__(netdev)->quantum = sfq.quantum;
4612
0
    sfq_get__(netdev)->perturb = sfq.perturb;
4613
0
    return 0;
4614
0
}
4615
4616
/* tc operations vtable for the kernel "sfq" qdisc, exposed to the OVS
 * database as "linux-sfq". */
static const struct tc_ops tc_ops_sfq = {
    .linux_name = "sfq",
    .ovs_name = "linux-sfq",
    .n_queues = SFQ_N_QUEUES,
    .tc_install = sfq_tc_install,
    .tc_load = sfq_tc_load,
    .tc_destroy = sfq_tc_destroy,
    .qdisc_get = sfq_qdisc_get,
    .qdisc_set = sfq_qdisc_set,
};
4626

4627
/* netem traffic control class. */

/* Cached configuration of an installed netem qdisc. */
struct netem {
    struct tc tc;
    uint32_t latency;           /* Converted with tc_time_to_ticks() when
                                 * programming the kernel. */
    uint32_t limit;             /* Queue limit; defaults to 1000. */
    uint32_t loss;              /* Loss percentage, 0..100. */
    uint32_t jitter;            /* Converted with tc_time_to_ticks(). */
};
4636
4637
static struct netem *
4638
netem_get__(const struct netdev *netdev_)
4639
0
{
4640
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4641
0
    return CONTAINER_OF(netdev->tc, struct netem, tc);
4642
0
}
4643
4644
static void
4645
netem_install__(struct netdev *netdev_, uint32_t latency,
4646
                uint32_t limit, uint32_t loss, uint32_t jitter)
4647
0
{
4648
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4649
0
    struct netem *netem;
4650
4651
0
    netem = xmalloc(sizeof *netem);
4652
0
    tc_init(&netem->tc, &tc_ops_netem);
4653
0
    netem->latency = latency;
4654
0
    netem->limit = limit;
4655
0
    netem->loss = loss;
4656
0
    netem->jitter = jitter;
4657
4658
0
    netdev->tc = &netem->tc;
4659
0
}
4660
4661
/* Replaces the root qdisc on 'netdev' with a "netem" qdisc.  A zero
 * 'limit' defaults to 1000; 'loss' is a percentage and values above 100
 * are rejected with EINVAL; 'latency' and 'jitter' are converted to
 * kernel ticks.  Returns 0 on success, otherwise a positive errno. */
static int
netem_setup_qdisc__(struct netdev *netdev, uint32_t latency,
                    uint32_t limit, uint32_t loss, uint32_t jitter)
{
    struct tc_netem_qopt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    /* Remove any existing root qdisc before installing the new one. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);

    if (!limit) {
        opt.limit = 1000;
    } else {
        opt.limit = limit;
    }

    if (loss) {
        if (loss > 100) {
            VLOG_WARN_RL(&rl,
                         "loss should be a percentage value between 0 to 100, "
                         "loss was %u", loss);
            return EINVAL;
        }
        /* The kernel expresses loss probability as a fraction of
         * UINT32_MAX. */
        opt.loss = floor(UINT32_MAX * (loss / 100.0));
    }

    opt.latency = tc_time_to_ticks(latency);
    opt.jitter = tc_time_to_ticks(jitter);

    nl_msg_put_string(&request, TCA_KIND, "netem");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                          "latency %u, limit %u, loss %u, jitter %u "
                          "error %d(%s)",
                     netdev_get_name(netdev),
                     opt.latency, opt.limit, opt.loss, opt.jitter,
                     error, ovs_strerror(error));
    }
    return error;
}
4715
4716
static void
4717
netem_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
4718
                          const struct smap *details, struct netem *netem)
4719
0
{
4720
0
    netem->latency = smap_get_ullong(details, "latency", 0);
4721
0
    netem->limit = smap_get_ullong(details, "limit", 0);
4722
0
    netem->loss = smap_get_ullong(details, "loss", 0);
4723
0
    netem->jitter = smap_get_ullong(details, "jitter", 0);
4724
4725
0
    if (!netem->limit) {
4726
0
        netem->limit = 1000;
4727
0
    }
4728
0
}
4729
4730
static int
4731
netem_tc_install(struct netdev *netdev, const struct smap *details)
4732
0
{
4733
0
    int error;
4734
0
    struct netem netem;
4735
4736
0
    netem_parse_qdisc_details__(netdev, details, &netem);
4737
0
    error = netem_setup_qdisc__(netdev, netem.latency,
4738
0
                                netem.limit, netem.loss, netem.jitter);
4739
0
    if (!error) {
4740
0
        netem_install__(netdev, netem.latency,
4741
0
                        netem.limit, netem.loss, netem.jitter);
4742
0
    }
4743
0
    return error;
4744
0
}
4745
4746
static int
4747
netem_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4748
0
{
4749
0
    const struct tc_netem_qopt *netem;
4750
0
    struct nlattr *nlattr;
4751
0
    const char *kind;
4752
0
    int error;
4753
4754
0
    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4755
0
    if (error == 0) {
4756
0
        netem = nl_attr_get(nlattr);
4757
0
        netem_install__(netdev, netem->latency,
4758
0
                        netem->limit, netem->loss, netem->jitter);
4759
0
        return 0;
4760
0
    }
4761
4762
0
    return error;
4763
0
}
4764
4765
static void
4766
netem_tc_destroy(struct tc *tc)
4767
0
{
4768
0
    struct netem *netem = CONTAINER_OF(tc, struct netem, tc);
4769
0
    tc_destroy(tc);
4770
0
    free(netem);
4771
0
}
4772
4773
static int
4774
netem_qdisc_get(const struct netdev *netdev, struct smap *details)
4775
0
{
4776
0
    const struct netem *netem = netem_get__(netdev);
4777
0
    smap_add_format(details, "latency", "%u", netem->latency);
4778
0
    smap_add_format(details, "limit", "%u", netem->limit);
4779
0
    smap_add_format(details, "loss", "%u", netem->loss);
4780
0
    smap_add_format(details, "jitter", "%u", netem->jitter);
4781
0
    return 0;
4782
0
}
4783
4784
static int
4785
netem_qdisc_set(struct netdev *netdev, const struct smap *details)
4786
0
{
4787
0
    struct netem netem;
4788
4789
0
    netem_parse_qdisc_details__(netdev, details, &netem);
4790
0
    netem_install__(netdev, netem.latency,
4791
0
                    netem.limit, netem.loss, netem.jitter);
4792
0
    netem_get__(netdev)->latency = netem.latency;
4793
0
    netem_get__(netdev)->limit = netem.limit;
4794
0
    netem_get__(netdev)->loss = netem.loss;
4795
0
    netem_get__(netdev)->jitter = netem.jitter;
4796
0
    return 0;
4797
0
}
4798
4799
/* tc operations vtable for the kernel "netem" qdisc, exposed to the OVS
 * database as "linux-netem". */
static const struct tc_ops tc_ops_netem = {
    .linux_name = "netem",
    .ovs_name = "linux-netem",
    .n_queues = 0,
    .tc_install = netem_tc_install,
    .tc_load = netem_tc_load,
    .tc_destroy = netem_tc_destroy,
    .qdisc_get = netem_qdisc_get,
    .qdisc_set = netem_qdisc_set,
};
4809

4810
/* HTB traffic control class. */

#define HTB_N_QUEUES 0xf000     /* Upper bound on queue (class minor)
                                 * numbers accepted in htb_parse_tcmsg__(). */
#define HTB_RATE2QUANTUM 10     /* r2q divisor programmed into the qdisc. */

/* Cached qdisc-level HTB configuration. */
struct htb {
    struct tc tc;
    uint64_t max_rate;          /* In bytes/s. */
};

/* Per-queue (HTB class) configuration. */
struct htb_class {
    struct tc_queue tc_queue;
    uint64_t min_rate;          /* In bytes/s. */
    uint64_t max_rate;          /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
4827
4828
static struct htb *
4829
htb_get__(const struct netdev *netdev_)
4830
0
{
4831
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4832
0
    return CONTAINER_OF(netdev->tc, struct htb, tc);
4833
0
}
4834
4835
static void
4836
htb_install__(struct netdev *netdev_, uint64_t max_rate)
4837
0
{
4838
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4839
0
    struct htb *htb;
4840
4841
0
    htb = xmalloc(sizeof *htb);
4842
0
    tc_init(&htb->tc, &tc_ops_htb);
4843
0
    htb->max_rate = max_rate;
4844
4845
0
    netdev->tc = &htb->tc;
4846
0
}
4847
4848
/* Create an HTB qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1".
 * Returns 0 on success, otherwise a positive errno value. */
static int
htb_setup_qdisc__(struct netdev *netdev)
{
    size_t opt_offset;
    struct tc_htb_glob opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    /* Remove any existing root qdisc before installing the new one. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    nl_msg_put_string(&request, TCA_KIND, "htb");

    memset(&opt, 0, sizeof opt);
    opt.rate2quantum = HTB_RATE2QUANTUM;
    opt.version = 3;
    opt.defcls = 1;             /* Default class for unclassified traffic. */

    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
    nl_msg_end_nested(&request, opt_offset);

    return tc_transact(&request, NULL);
}
4882
4883
/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>".
 *
 * Requires the device MTU to size the rate tables; returns a positive
 * errno when it is unavailable or the netlink transaction fails. */
static int
htb_setup_class__(struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *class)
{
    size_t opt_offset;
    struct tc_htb_opt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;
    int mtu;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    memset(&opt, 0, sizeof opt);
    tc_fill_rate(&opt.rate, class->min_rate, mtu);
    tc_fill_rate(&opt.ceil, class->max_rate, mtu);
    /* Makes sure the quantum is at least MTU.  Setting quantum will
     * make htb ignore the r2q for this class. */
    if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
        opt.quantum = mtu;
    }
    opt.buffer = tc_calc_buffer(class->min_rate, mtu, class->burst);
    opt.cbuffer = tc_calc_buffer(class->max_rate, mtu, class->burst);
    opt.prio = class->priority;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    nl_msg_put_string(&request, TCA_KIND, "htb");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);

#ifdef HAVE_TCA_HTB_RATE64
    /* The 32-bit fields in tc_htb_opt cannot represent rates above
     * UINT32_MAX bytes/s; send the full value in the 64-bit attributes. */
    if (class->min_rate > UINT32_MAX) {
        nl_msg_put_u64(&request, TCA_HTB_RATE64, class->min_rate);
    }
    if (class->max_rate > UINT32_MAX) {
        nl_msg_put_u64(&request, TCA_HTB_CEIL64, class->max_rate);
    }
#endif
    nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);

    tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate, class->min_rate);
    tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil, class->max_rate);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min_rate=%"PRIu64" max_rate=%"PRIu64" burst=%u prio=%u "
                     "(%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate,
                     class->burst, class->priority, ovs_strerror(error));
    }
    return error;
}
4953
4954
/* Parses Netlink attributes in 'options' for HTB parameters and stores a
 * description of them into 'details'.  The description complies with the
 * specification given in the vswitch database documentation for linux-htb
 * queue details. */
static int
htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
{
    static const struct nl_policy tca_htb_policy[] = {
        [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
                            .min_len = sizeof(struct tc_htb_opt) },
#ifdef HAVE_TCA_HTB_RATE64
        [TCA_HTB_RATE64] = { .type = NL_A_U64, .optional = true },
        [TCA_HTB_CEIL64] = { .type = NL_A_U64, .optional = true },
#endif
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
    const struct tc_htb_opt *htb;

    if (!nl_parse_nested(nl_options, tca_htb_policy,
                         attrs, ARRAY_SIZE(tca_htb_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HTB class options");
        return EPROTO;
    }

    htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
    class->min_rate = htb->rate.rate;
    class->max_rate = htb->ceil.rate;
#ifdef HAVE_TCA_HTB_RATE64
    /* The optional 64-bit attributes, when present, supersede the 32-bit
     * rates embedded in tc_htb_opt. */
    if (attrs[TCA_HTB_RATE64]) {
        class->min_rate = nl_attr_get_u64(attrs[TCA_HTB_RATE64]);
    }
    if (attrs[TCA_HTB_CEIL64]) {
        class->max_rate = nl_attr_get_u64(attrs[TCA_HTB_CEIL64]);
    }
#endif
    /* Convert the kernel's buffer value back into a burst in bytes. */
    class->burst = tc_ticks_to_bytes(class->min_rate, htb->buffer);
    class->priority = htb->prio;
    return 0;
}
4994
4995
static int
4996
htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4997
                  struct htb_class *options,
4998
                  struct netdev_queue_stats *stats)
4999
0
{
5000
0
    struct nlattr *nl_options;
5001
0
    unsigned int handle;
5002
0
    int error;
5003
5004
0
    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
5005
0
    if (!error && queue_id) {
5006
0
        unsigned int major = tc_get_major(handle);
5007
0
        unsigned int minor = tc_get_minor(handle);
5008
0
        if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
5009
0
            *queue_id = minor - 1;
5010
0
        } else {
5011
0
            error = EPROTO;
5012
0
        }
5013
0
    }
5014
0
    if (!error && options) {
5015
0
        error = htb_parse_tca_options__(nl_options, options);
5016
0
    }
5017
0
    return error;
5018
0
}
5019
5020
static void
5021
htb_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
5022
                          struct htb_class *hc)
5023
0
{
5024
0
    hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
5025
0
    if (!hc->max_rate) {
5026
0
        uint32_t current_speed;
5027
0
        uint32_t max_speed OVS_UNUSED;
5028
5029
0
        netdev_linux_get_speed_locked(netdev_linux_cast(netdev),
5030
0
                                      &current_speed, &max_speed);
5031
0
        hc->max_rate = current_speed ? current_speed / 8 * 1000000ULL
5032
0
                                     : NETDEV_DEFAULT_BPS / 8;
5033
0
    }
5034
0
    hc->min_rate = hc->max_rate;
5035
0
    hc->burst = 0;
5036
0
    hc->priority = 0;
5037
0
}
5038
5039
/* Fills 'hc' from the "min-rate", "max-rate", "burst" and "priority" keys
 * in 'details', clamping the rates against the qdisc-wide maximum and the
 * device MTU.  Returns 0 on success, otherwise a positive errno value. */
static int
htb_parse_class_details__(struct netdev *netdev,
                          const struct smap *details, struct htb_class *hc)
{
    const struct htb *htb = htb_get__(netdev);
    int mtu, error;
    unsigned long long int max_rate_bit;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    /* HTB requires at least an mtu sized min-rate to send any traffic even
     * on uncongested links. */
    hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
    hc->min_rate = MAX(hc->min_rate, mtu);
    hc->min_rate = MIN(hc->min_rate, htb->max_rate);

    /* max-rate */
    max_rate_bit = smap_get_ullong(details, "max-rate", 0);
    hc->max_rate = max_rate_bit ? max_rate_bit / 8 : htb->max_rate;
    hc->max_rate = MAX(hc->max_rate, hc->min_rate);
    hc->max_rate = MIN(hc->max_rate, htb->max_rate);

    /* burst
     *
     * According to hints in the documentation that I've read, it is important
     * that 'burst' be at least as big as the largest frame that might be
     * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
     * but having it a bit too small is a problem.  Since netdev_get_mtu()
     * doesn't include the Ethernet header, we need to add at least 14 (18?) to
     * the MTU.  We actually add 64, instead of 14, as a guard against
     * additional headers get tacked on somewhere that we're not aware of. */
    hc->burst = smap_get_ullong(details, "burst", 0) / 8;
    hc->burst = MAX(hc->burst, mtu + 64);

    /* priority */
    hc->priority = smap_get_ullong(details, "priority", 0);

    return 0;
}
5083
5084
static int
5085
htb_query_class__(const struct netdev *netdev, unsigned int handle,
5086
                  unsigned int parent, struct htb_class *options,
5087
                  struct netdev_queue_stats *stats)
5088
0
{
5089
0
    struct ofpbuf *reply;
5090
0
    int error;
5091
5092
0
    error = tc_query_class(netdev, handle, parent, &reply);
5093
0
    if (!error) {
5094
0
        error = htb_parse_tcmsg__(reply, NULL, options, stats);
5095
0
        ofpbuf_delete(reply);
5096
0
    }
5097
0
    return error;
5098
0
}
5099
5100
static int
5101
htb_tc_install(struct netdev *netdev, const struct smap *details)
5102
0
{
5103
0
    int error;
5104
5105
0
    error = htb_setup_qdisc__(netdev);
5106
0
    if (!error) {
5107
0
        struct htb_class hc;
5108
5109
0
        htb_parse_qdisc_details__(netdev, details, &hc);
5110
0
        error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5111
0
                                  tc_make_handle(1, 0), &hc);
5112
0
        if (!error) {
5113
0
            htb_install__(netdev, hc.max_rate);
5114
0
        }
5115
0
    }
5116
0
    return error;
5117
0
}
5118
5119
/* Recovers the 'struct htb_class' containing the embedded 'queue'. */
static struct htb_class *
htb_class_cast__(const struct tc_queue *queue)
{
    return CONTAINER_OF(queue, struct htb_class, tc_queue);
}
5124
5125
/* Creates or updates the cached 'htb_class' for 'queue_id' on 'netdev',
 * copying the configuration from 'hc'.  A queue not yet in the qdisc's
 * queue map is allocated and inserted. */
static void
htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
                   const struct htb_class *hc)
{
    struct htb *htb = htb_get__(netdev);
    size_t hash = hash_int(queue_id, 0);
    struct tc_queue *queue;
    struct htb_class *hcp;

    queue = tc_find_queue__(netdev, queue_id, hash);
    if (queue) {
        hcp = htb_class_cast__(queue);
    } else {
        /* First time we see this queue: allocate and index it. */
        hcp = xmalloc(sizeof *hcp);
        queue = &hcp->tc_queue;
        queue->queue_id = queue_id;
        queue->created = time_msec();
        hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
    }

    hcp->min_rate = hc->min_rate;
    hcp->max_rate = hc->max_rate;
    hcp->burst = hc->burst;
    hcp->priority = hc->priority;
}
5150
5151
/* Loads an existing kernel HTB configuration into OVS state: queries the
 * root class for the qdisc-wide max-rate, then dumps all classes to
 * populate the per-queue cache.  Returns 0 on success or ENODEV if the
 * class dump cannot be started. */
static int
htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct htb_class hc;

    /* Get qdisc options. */
    hc.max_rate = 0;
    htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    htb_install__(netdev, hc.max_rate);

    /* Get queues. */
    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }
    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        /* Classes we cannot parse are silently skipped. */
        if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            htb_update_queue__(netdev, queue_id, &hc);
        }
    }
    finish_queue_dump(&state);

    return 0;
}
5178
5179
/* Frees all cached HTB queue state and the 'htb' object itself. */
static void
htb_tc_destroy(struct tc *tc)
{
    struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
    struct htb_class *hc;

    /* HMAP_FOR_EACH_POP removes each node from the map as it iterates,
     * so no explicit hmap_remove() is needed. */
    HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
        free(hc);
    }
    tc_destroy(tc);
    free(htb);
}
5191
5192
static int
5193
htb_qdisc_get(const struct netdev *netdev, struct smap *details)
5194
0
{
5195
0
    const struct htb *htb = htb_get__(netdev);
5196
0
    smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
5197
0
    return 0;
5198
0
}
5199
5200
static int
5201
htb_qdisc_set(struct netdev *netdev, const struct smap *details)
5202
0
{
5203
0
    struct htb_class hc;
5204
0
    int error;
5205
5206
0
    htb_parse_qdisc_details__(netdev, details, &hc);
5207
0
    error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5208
0
                              tc_make_handle(1, 0), &hc);
5209
0
    if (!error) {
5210
0
        htb_get__(netdev)->max_rate = hc.max_rate;
5211
0
    }
5212
0
    return error;
5213
0
}
5214
5215
static int
5216
htb_class_get(const struct netdev *netdev OVS_UNUSED,
5217
              const struct tc_queue *queue, struct smap *details)
5218
0
{
5219
0
    const struct htb_class *hc = htb_class_cast__(queue);
5220
5221
0
    smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
5222
0
    if (hc->min_rate != hc->max_rate) {
5223
0
        smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
5224
0
    }
5225
0
    smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
5226
0
    if (hc->priority) {
5227
0
        smap_add_format(details, "priority", "%u", hc->priority);
5228
0
    }
5229
0
    return 0;
5230
0
}
5231
5232
static int
5233
htb_class_set(struct netdev *netdev, unsigned int queue_id,
5234
              const struct smap *details)
5235
0
{
5236
0
    struct htb_class hc;
5237
0
    int error;
5238
5239
0
    error = htb_parse_class_details__(netdev, details, &hc);
5240
0
    if (error) {
5241
0
        return error;
5242
0
    }
5243
5244
0
    error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
5245
0
                              tc_make_handle(1, 0xfffe), &hc);
5246
0
    if (error) {
5247
0
        return error;
5248
0
    }
5249
5250
0
    htb_update_queue__(netdev, queue_id, &hc);
5251
0
    return 0;
5252
0
}
5253
5254
static int
5255
htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
5256
0
{
5257
0
    struct htb_class *hc = htb_class_cast__(queue);
5258
0
    struct htb *htb = htb_get__(netdev);
5259
0
    int error;
5260
5261
0
    error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
5262
0
    if (!error) {
5263
0
        hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
5264
0
        free(hc);
5265
0
    }
5266
0
    return error;
5267
0
}
5268
5269
static int
5270
htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
5271
                    struct netdev_queue_stats *stats)
5272
0
{
5273
0
    return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
5274
0
                             tc_make_handle(1, 0xfffe), NULL, stats);
5275
0
}
5276
5277
/* Parses one class from a queue-stats dump message and, if it is an HTB
 * leaf class in range, invokes 'cb' with its queue id and statistics.
 * Returns 0 on success (including classes that are simply skipped),
 * otherwise a positive errno value from parsing. */
static int
htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
                     const struct ofpbuf *nlmsg,
                     netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_queue_stats stats;
    unsigned int handle, major, minor;
    int error;

    error = tc_parse_class(nlmsg, &handle, NULL, &stats);
    if (error) {
        return error;
    }

    /* OVS queue ids are class minor numbers offset by one. */
    major = tc_get_major(handle);
    minor = tc_get_minor(handle);
    if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
        (*cb)(minor - 1, &stats, aux);
    }
    return 0;
}
5298
5299
/* Operations for the "linux-htb" QoS type (Linux HTB qdisc). */
static const struct tc_ops tc_ops_htb = {
    .linux_name = "htb",
    .ovs_name = "linux-htb",
    .n_queues = HTB_N_QUEUES,
    .tc_install = htb_tc_install,
    .tc_load = htb_tc_load,
    .tc_destroy = htb_tc_destroy,
    .qdisc_get = htb_qdisc_get,
    .qdisc_set = htb_qdisc_set,
    .class_get = htb_class_get,
    .class_set = htb_class_set,
    .class_delete = htb_class_delete,
    .class_get_stats = htb_class_get_stats,
    .class_dump_stats = htb_class_dump_stats
};
5314

5315
/* "linux-hfsc" traffic control class. */
5316
5317
0
#define HFSC_N_QUEUES 0xf000    /* Maximum number of HFSC queues. */

/* State for a "linux-hfsc" qdisc; embeds the generic 'tc'. */
struct hfsc {
    struct tc tc;
    uint32_t max_rate;          /* In bytes/s. */
};

/* One HFSC leaf class, i.e. one OVS queue. */
struct hfsc_class {
    struct tc_queue tc_queue;
    uint32_t min_rate;          /* In bytes/s. */
    uint32_t max_rate;          /* In bytes/s. */
};
5329
5330
/* Returns the 'hfsc' qdisc state for 'netdev_', whose current tc must be
 * a "linux-hfsc" one. */
static struct hfsc *
hfsc_get__(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct hfsc, tc);
}
5336
5337
/* Returns the 'hfsc_class' that contains 'queue'. */
static struct hfsc_class *
hfsc_class_cast__(const struct tc_queue *queue)
{
    return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
}
5342
5343
/* Allocates fresh HFSC qdisc state with the given 'max_rate' (bytes/s)
 * and makes it 'netdev_''s current tc. */
static void
hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct hfsc *hfsc;

    hfsc = xmalloc(sizeof *hfsc);
    tc_init(&hfsc->tc, &tc_ops_hfsc);
    hfsc->max_rate = max_rate;
    netdev->tc = &hfsc->tc;
}
5354
5355
/* Creates or updates the cached 'hfsc_class' for 'queue_id' on 'netdev',
 * copying the rates from 'hc'.  A queue not yet in the qdisc's queue map
 * is allocated and inserted. */
static void
hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
                    const struct hfsc_class *hc)
{
    size_t hash;
    struct hfsc *hfsc;
    struct hfsc_class *hcp;
    struct tc_queue *queue;

    hfsc = hfsc_get__(netdev);
    hash = hash_int(queue_id, 0);

    queue = tc_find_queue__(netdev, queue_id, hash);
    if (queue) {
        hcp = hfsc_class_cast__(queue);
    } else {
        /* First time we see this queue: allocate and index it. */
        hcp             = xmalloc(sizeof *hcp);
        queue           = &hcp->tc_queue;
        queue->queue_id = queue_id;
        queue->created  = time_msec();
        hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
    }

    hcp->min_rate = hc->min_rate;
    hcp->max_rate = hc->max_rate;
}
5381
5382
/* Extracts 'class''s min_rate and max_rate from the nested TCA_OPTIONS
 * attribute 'nl_options' of an HFSC class netlink message.  Only the
 * linear service curves that OVS itself configures are accepted; anything
 * else is rejected with EPROTO. */
static int
hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
{
    const struct tc_service_curve *rsc, *fsc, *usc;
    static const struct nl_policy tca_hfsc_policy[] = {
        [TCA_HFSC_RSC] = {
            .type      = NL_A_UNSPEC,
            .optional  = false,
            .min_len   = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_FSC] = {
            .type      = NL_A_UNSPEC,
            .optional  = false,
            .min_len   = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_USC] = {
            .type      = NL_A_UNSPEC,
            .optional  = false,
            .min_len   = sizeof(struct tc_service_curve),
        },
    };
    struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];

    if (!nl_parse_nested(nl_options, tca_hfsc_policy,
                         attrs, ARRAY_SIZE(tca_hfsc_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
        return EPROTO;
    }

    rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
    fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
    usc = nl_attr_get(attrs[TCA_HFSC_USC]);

    /* OVS only creates linear curves (m1 == 0, d == 0). */
    if (rsc->m1 != 0 || rsc->d != 0 ||
        fsc->m1 != 0 || fsc->d != 0 ||
        usc->m1 != 0 || usc->d != 0) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Non-linear service curves are not supported.");
        return EPROTO;
    }

    /* OVS sets RSC and FSC to the same curve (see hfsc_setup_class__()). */
    if (rsc->m2 != fsc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Real-time service curves are not supported ");
        return EPROTO;
    }

    if (rsc->m2 > usc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Min-rate service curve is greater than "
                     "the max-rate service curve.");
        return EPROTO;
    }

    /* m2 is the steady-state rate in bytes/s. */
    class->min_rate = fsc->m2;
    class->max_rate = usc->m2;
    return 0;
}
5440
5441
/* Parses HFSC class netlink message 'tcmsg'.  On success stores the OVS
 * queue id (class minor number minus one) in '*queue_id', the rates in
 * '*options', and statistics in '*stats'; each output pointer may be NULL
 * to skip it.  Returns 0 on success, otherwise a positive errno value
 * (EPROTO for classes OVS does not manage). */
static int
hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
                   struct hfsc_class *options,
                   struct netdev_queue_stats *stats)
{
    int error;
    unsigned int handle;
    struct nlattr *nl_options;

    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
    if (error) {
        return error;
    }

    if (queue_id) {
        unsigned int major, minor;

        major = tc_get_major(handle);
        minor = tc_get_minor(handle);
        if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
            *queue_id = minor - 1;
        } else {
            /* Not one of the classes OVS manages. */
            return EPROTO;
        }
    }

    if (options) {
        error = hfsc_parse_tca_options__(nl_options, options);
    }

    return error;
}
5473
5474
static int
5475
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
5476
                   unsigned int parent, struct hfsc_class *options,
5477
                   struct netdev_queue_stats *stats)
5478
0
{
5479
0
    int error;
5480
0
    struct ofpbuf *reply;
5481
5482
0
    error = tc_query_class(netdev, handle, parent, &reply);
5483
0
    if (error) {
5484
0
        return error;
5485
0
    }
5486
5487
0
    error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
5488
0
    ofpbuf_delete(reply);
5489
0
    return error;
5490
0
}
5491
5492
static void
5493
hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
5494
                           struct hfsc_class *class)
5495
0
{
5496
0
    uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
5497
0
    if (!max_rate) {
5498
0
        uint32_t current_speed;
5499
0
        uint32_t max_speed OVS_UNUSED;
5500
5501
0
        netdev_linux_get_speed_locked(netdev_linux_cast(netdev),
5502
0
                                      &current_speed, &max_speed);
5503
0
        max_rate = current_speed ? current_speed / 8 * 1000000ULL
5504
0
                                 : NETDEV_DEFAULT_BPS / 8;
5505
0
    }
5506
5507
0
    class->min_rate = max_rate;
5508
0
    class->max_rate = max_rate;
5509
0
}
5510
5511
static int
5512
hfsc_parse_class_details__(struct netdev *netdev,
5513
                           const struct smap *details,
5514
                           struct hfsc_class * class)
5515
0
{
5516
0
    const struct hfsc *hfsc;
5517
0
    uint32_t min_rate, max_rate;
5518
5519
0
    hfsc       = hfsc_get__(netdev);
5520
5521
0
    min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
5522
0
    min_rate = MAX(min_rate, 1);
5523
0
    min_rate = MIN(min_rate, hfsc->max_rate);
5524
5525
0
    max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
5526
0
    max_rate = MAX(max_rate, min_rate);
5527
0
    max_rate = MIN(max_rate, hfsc->max_rate);
5528
5529
0
    class->min_rate = min_rate;
5530
0
    class->max_rate = max_rate;
5531
5532
0
    return 0;
5533
0
}
5534
5535
/* Create an HFSC qdisc.
 *
 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1".
 *
 * Returns 0 on success, otherwise a positive errno value. */
static int
hfsc_setup_qdisc__(struct netdev * netdev)
{
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_hfsc_qopt opt;

    /* Discard any existing root qdisc first. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);
    opt.defcls = 1;             /* Unclassified traffic goes to class 1:1. */

    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    return tc_transact(&request, NULL);
}
5565
5566
/* Create an HFSC class.
 *
 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
 * sc rate <min_rate> ul rate <max_rate>"
 *
 * Returns 0 on success, otherwise a positive errno value (logged). */
static int
hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
                   unsigned int parent, struct hfsc_class *class)
{
    int error;
    size_t opt_offset;
    struct tcmsg *tcmsg;
    struct ofpbuf request;
    struct tc_service_curve min, max;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);

    if (!tcmsg) {
        return ENODEV;
    }

    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    /* Linear service curves: only the steady-state rate m2 is nonzero. */
    min.m1 = 0;
    min.d  = 0;
    min.m2 = class->min_rate;

    max.m1 = 0;
    max.d  = 0;
    max.m2 = class->max_rate;

    /* RSC and FSC both carry the min-rate curve; USC the upper limit.
     * hfsc_parse_tca_options__() relies on exactly this layout. */
    nl_msg_put_string(&request, TCA_KIND, "hfsc");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
    nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min-rate %ubps, max-rate %ubps (%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate, ovs_strerror(error));
    }

    return error;
}
5617
5618
static int
5619
hfsc_tc_install(struct netdev *netdev, const struct smap *details)
5620
0
{
5621
0
    int error;
5622
0
    struct hfsc_class class;
5623
5624
0
    error = hfsc_setup_qdisc__(netdev);
5625
5626
0
    if (error) {
5627
0
        return error;
5628
0
    }
5629
5630
0
    hfsc_parse_qdisc_details__(netdev, details, &class);
5631
0
    error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5632
0
                               tc_make_handle(1, 0), &class);
5633
5634
0
    if (error) {
5635
0
        return error;
5636
0
    }
5637
5638
0
    hfsc_install__(netdev, class.max_rate);
5639
0
    return 0;
5640
0
}
5641
5642
/* Loads an existing kernel HFSC configuration into OVS state: queries the
 * root class for the qdisc-wide max-rate, then dumps all classes to
 * populate the per-queue cache.  Returns 0 on success or ENODEV if the
 * class dump cannot be started. */
static int
hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct ofpbuf msg;
    struct queue_dump_state state;
    struct hfsc_class hc;

    /* Get qdisc-wide rate from the root class. */
    hc.max_rate = 0;
    hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
    hfsc_install__(netdev, hc.max_rate);

    if (!start_queue_dump(netdev, &state)) {
        return ENODEV;
    }

    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
        unsigned int queue_id;

        /* Classes we cannot parse are silently skipped. */
        if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
            hfsc_update_queue__(netdev, queue_id, &hc);
        }
    }

    finish_queue_dump(&state);
    return 0;
}
5668
5669
static void
5670
hfsc_tc_destroy(struct tc *tc)
5671
0
{
5672
0
    struct hfsc *hfsc;
5673
0
    struct hfsc_class *hc;
5674
5675
0
    hfsc = CONTAINER_OF(tc, struct hfsc, tc);
5676
5677
0
    HMAP_FOR_EACH_SAFE (hc, tc_queue.hmap_node, &hfsc->tc.queues) {
5678
0
        hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
5679
0
        free(hc);
5680
0
    }
5681
5682
0
    tc_destroy(tc);
5683
0
    free(hfsc);
5684
0
}
5685
5686
static int
5687
hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
5688
0
{
5689
0
    const struct hfsc *hfsc;
5690
0
    hfsc = hfsc_get__(netdev);
5691
0
    smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
5692
0
    return 0;
5693
0
}
5694
5695
static int
5696
hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
5697
0
{
5698
0
    int error;
5699
0
    struct hfsc_class class;
5700
5701
0
    hfsc_parse_qdisc_details__(netdev, details, &class);
5702
0
    error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5703
0
                               tc_make_handle(1, 0), &class);
5704
5705
0
    if (!error) {
5706
0
        hfsc_get__(netdev)->max_rate = class.max_rate;
5707
0
    }
5708
5709
0
    return error;
5710
0
}
5711
5712
static int
5713
hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
5714
              const struct tc_queue *queue, struct smap *details)
5715
0
{
5716
0
    const struct hfsc_class *hc;
5717
5718
0
    hc = hfsc_class_cast__(queue);
5719
0
    smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
5720
0
    if (hc->min_rate != hc->max_rate) {
5721
0
        smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
5722
0
    }
5723
0
    return 0;
5724
0
}
5725
5726
static int
5727
hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
5728
               const struct smap *details)
5729
0
{
5730
0
    int error;
5731
0
    struct hfsc_class class;
5732
5733
0
    error = hfsc_parse_class_details__(netdev, details, &class);
5734
0
    if (error) {
5735
0
        return error;
5736
0
    }
5737
5738
0
    error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
5739
0
                               tc_make_handle(1, 0xfffe), &class);
5740
0
    if (error) {
5741
0
        return error;
5742
0
    }
5743
5744
0
    hfsc_update_queue__(netdev, queue_id, &class);
5745
0
    return 0;
5746
0
}
5747
5748
static int
5749
hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
5750
0
{
5751
0
    int error;
5752
0
    struct hfsc *hfsc;
5753
0
    struct hfsc_class *hc;
5754
5755
0
    hc   = hfsc_class_cast__(queue);
5756
0
    hfsc = hfsc_get__(netdev);
5757
5758
0
    error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
5759
0
    if (!error) {
5760
0
        hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
5761
0
        free(hc);
5762
0
    }
5763
0
    return error;
5764
0
}
5765
5766
static int
5767
hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
5768
                     struct netdev_queue_stats *stats)
5769
0
{
5770
0
    return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
5771
0
                             tc_make_handle(1, 0xfffe), NULL, stats);
5772
0
}
5773
5774
/* Parses one class from a queue-stats dump message and, if it is an HFSC
 * leaf class in range, invokes 'cb' with its queue id and statistics.
 * Returns 0 on success (including classes that are simply skipped),
 * otherwise a positive errno value from parsing. */
static int
hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
                      const struct ofpbuf *nlmsg,
                      netdev_dump_queue_stats_cb *cb, void *aux)
{
    struct netdev_queue_stats stats;
    unsigned int handle, major, minor;
    int error;

    error = tc_parse_class(nlmsg, &handle, NULL, &stats);
    if (error) {
        return error;
    }

    /* OVS queue ids are class minor numbers offset by one. */
    major = tc_get_major(handle);
    minor = tc_get_minor(handle);
    if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
        (*cb)(minor - 1, &stats, aux);
    }
    return 0;
}
5795
5796
/* Operations for the "linux-hfsc" QoS type (Linux HFSC qdisc). */
static const struct tc_ops tc_ops_hfsc = {
    .linux_name = "hfsc",
    .ovs_name = "linux-hfsc",
    .n_queues = HFSC_N_QUEUES,
    .tc_install = hfsc_tc_install,
    .tc_load = hfsc_tc_load,
    .tc_destroy = hfsc_tc_destroy,
    .qdisc_get = hfsc_qdisc_get,
    .qdisc_set = hfsc_qdisc_set,
    .class_get = hfsc_class_get,
    .class_set = hfsc_class_set,
    .class_delete = hfsc_class_delete,
    .class_get_stats = hfsc_class_get_stats,
    .class_dump_stats = hfsc_class_dump_stats,
};
5811

5812
/* "linux-noop" traffic control class. */
5813
5814
/* Points 'netdev_' at a shared, immutable tc for the "linux-noop" type.
 *
 * NOTE(review): the static tc is initialized with 'tc_ops_default' rather
 * than 'tc_ops_noop' -- possibly intentional, but worth confirming, since
 * it mirrors default_install__() exactly. */
static void
noop_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    /* The tc is never written to, so a const object is safe to share. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
}
5822
5823
/* tc_install callback for "linux-noop".  Always succeeds. */
static int
noop_tc_install(struct netdev *netdev,
                   const struct smap *details OVS_UNUSED)
{
    noop_install__(netdev);
    return 0;
}
5830
5831
/* tc_load callback for "linux-noop".  Always succeeds. */
static int
noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    noop_install__(netdev);
    return 0;
}
5837
5838
/* Operations for the "linux-noop" QoS type: only install/load callbacks,
 * no qdisc or class management. */
static const struct tc_ops tc_ops_noop = {
    .ovs_name = "linux-noop",
    .tc_install = noop_tc_install,
    .tc_load = noop_tc_load,
};
5843

5844
/* "linux-default" traffic control class.
5845
 *
5846
 * This class represents the default, unnamed Linux qdisc.  It corresponds to
5847
 * the "" (empty string) QoS type in the OVS database. */
5848
5849
/* Points 'netdev_' at a shared, immutable tc representing the default,
 * unnamed Linux qdisc. */
static void
default_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
}
5859
5860
/* tc_install callback for the default qdisc.  Always succeeds. */
static int
default_tc_install(struct netdev *netdev,
                   const struct smap *details OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
5867
5868
/* tc_load callback for the default qdisc.  Always succeeds. */
static int
default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
{
    default_install__(netdev);
    return 0;
}
5874
5875
/* Operations for the "" (default) QoS type: the unnamed Linux qdisc. */
static const struct tc_ops tc_ops_default = {
    .ovs_name = "",
    .tc_install = default_tc_install,
    .tc_load = default_tc_load,
};
5880

5881
/* "linux-other" traffic control class.
5882
 *
5883
 * */
5884
5885
/* tc_load callback for "linux-other": used when the device's qdisc is one
 * OVS does not recognize (only tc_load is implemented in tc_ops_other).
 * Always succeeds. */
static int
other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
    return 0;
}
5896
5897
/* Operations for the "linux-other" QoS type: load-only, no management. */
static const struct tc_ops tc_ops_other = {
    .ovs_name = "linux-other",
    .tc_load = other_tc_load,
};
5901

5902
/* Traffic control. */
5903
5904
/* Number of kernel "tc" ticks per second. */
5905
static double ticks_per_s;
5906
5907
/* Number of kernel "jiffies" per second.  This is used for the purpose of
5908
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
5909
 * one jiffy's worth of data.
5910
 *
5911
 * There are two possibilities here:
5912
 *
5913
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
5914
 *      approximate range of 100 to 1024.  That means that we really need to
5915
 *      make sure that the qdisc can buffer that much data.
5916
 *
5917
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
5918
 *      has finely granular timers and there's no need to fudge additional room
5919
 *      for buffers.  (There's no extra effort needed to implement that: the
5920
 *      large 'buffer_hz' is used as a divisor, so practically any number will
5921
 *      come out as 0 in the division.  Small integer results in the case of
5922
 *      really high dividends won't have any real effect anyhow.)
5923
 */
5924
static unsigned int buffer_hz;
5925
5926
static struct tcmsg *
5927
netdev_linux_tc_make_request(const struct netdev *netdev, int type,
5928
                             unsigned int flags, struct ofpbuf *request)
5929
0
{
5930
0
    int ifindex;
5931
0
    int error;
5932
5933
0
    error = get_ifindex(netdev, &ifindex);
5934
0
    if (error) {
5935
0
        return NULL;
5936
0
    }
5937
5938
0
    return tc_make_request(ifindex, type, flags, request);
5939
0
}
5940
5941
/* Initializes 'tc_police' as a drop-over-rate policer for the given
 * 'kbits_rate' and 'kbits_burst'. */
static void
tc_policer_init(struct tc_police *tc_police, uint64_t kbits_rate,
                uint64_t kbits_burst)
{
    int mtu = 65535;

    memset(tc_police, 0, sizeof *tc_police);

    tc_police->action = TC_POLICE_SHOT;     /* Drop packets over the rate. */
    tc_police->mtu = mtu;
    /* kbit/s -> bytes/s. */
    tc_fill_rate(&tc_police->rate, kbits_rate * 1000 / 8, mtu);

    /* The following appears wrong in one way: In networking a kilobit is
     * usually 1000 bits but this uses 1024 bits.
     *
     * However if you "fix" those problems then "tc filter show ..." shows
     * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
     * 1,000,000 bits, whereas this actually ends up doing the right thing from
     * tc's point of view.  Whatever. */
    tc_police->burst = tc_bytes_to_ticks(
        tc_police->rate.rate, kbits_burst * 1024 / 8);
}
5963
5964
/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
 * of 'kbits_burst', with a rate of 'kpkts_rate' and a burst size of
 * 'kpkts_burst'.
 *
 * This function is equivalent to running:
 *     /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
 *              basic police rate <kbits_rate>kbit burst <kbits_burst>k
 *              mtu 65535 drop
 *
 * The configuration and stats may be seen with the following command:
 *     /sbin/tc -s filter show dev <devname> parent ffff:
 *
 * Returns 0 if successful, otherwise a positive errno value.
 */
static int
tc_add_policer(struct netdev *netdev, uint64_t kbits_rate,
               uint32_t kbits_burst, uint32_t kpkts_rate, uint32_t kpkts_burst)
{
    size_t basic_offset, police_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    /* Attach the filter to the ffff: (ingress) parent, priority 49,
     * matching all protocols. */
    tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
    tcmsg->tcm_info = tc_make_handle(49,
                                     (OVS_FORCE uint16_t) htons(ETH_P_ALL));
    nl_msg_put_string(&request, TCA_KIND, "basic");

    /* Nest the police action inside the basic classifier's options. */
    basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    police_offset = nl_msg_start_nested(&request, TCA_BASIC_ACT);
    nl_msg_put_act_police(&request, 0, kbits_rate, kbits_burst,
                          kpkts_rate * 1000ULL, kpkts_burst * 1000ULL,
                          TC_ACT_UNSPEC, false);
    nl_msg_end_nested(&request, police_offset);
    nl_msg_end_nested(&request, basic_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        return error;
    }

    return 0;
}
6012
6013
/* Adds (or, if 'update' is true, replaces) a kernel police action with the
 * given 'index' and rate/burst limits.  Returns 0 on success, otherwise a
 * positive errno value (also logged). */
int
tc_add_policer_action(uint32_t index, uint64_t kbits_rate,
                      uint32_t kbits_burst, uint32_t pkts_rate,
                      uint32_t pkts_burst, bool update)
{
    struct ofpbuf request;
    struct tcamsg *tcamsg;
    size_t offset;
    int flags;
    int error;

    /* NLM_F_REPLACE updates an existing action in place; NLM_F_EXCL makes
     * creation fail if the index is already in use. */
    flags = (update ? NLM_F_REPLACE : NLM_F_EXCL) | NLM_F_CREATE;
    tcamsg = tc_make_action_request(RTM_NEWACTION, flags, &request);
    if (!tcamsg) {
        return ENODEV;
    }

    offset = nl_msg_start_nested(&request, TCA_ACT_TAB);
    nl_msg_put_act_police(&request, index, kbits_rate, kbits_burst, pkts_rate,
                          pkts_burst, TC_ACT_PIPE, true);
    nl_msg_end_nested(&request, offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_ERR_RL(&rl, "Failed to %s police action, err=%d",
                    update ? "update" : "add", error);
    }

    return error;
}
6043
6044
/* Parses the netlink reply 'msg' from an RTM_GETACTION/RTM_DELACTION request
 * and folds the police action counters it carries into 'stats' (which may be
 * null, in which case the reply is simply discarded).
 *
 * Takes ownership of 'msg' and frees it before returning.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_update_policer_action_stats(struct ofpbuf *msg,
                               struct ofputil_meter_stats *stats)
{
    struct ofpbuf b = ofpbuf_const_initializer(msg->data, msg->size);
    struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg);
    struct tcamsg *tca = ofpbuf_try_pull(&b, sizeof *tca);
    struct ovs_flow_stats stats_dropped;
    struct ovs_flow_stats stats_hw;
    struct ovs_flow_stats stats_sw;
    const struct nlattr *act;
    struct nlattr *prio;
    int error = 0;

    if (!stats) {
        /* Caller does not want the counters; just free the reply. */
        goto exit;
    }

    if (!nlmsg || !tca) {
        /* Reply too short to contain the fixed headers. */
        VLOG_ERR_RL(&rl, "Failed to get action stats, size error");
        error = EPROTO;
        goto exit;
    }

    act = nl_attr_find(&b, 0, TCA_ACT_TAB);
    if (!act) {
        VLOG_ERR_RL(&rl, "Failed to get action stats, can't find attribute");
        error = EPROTO;
        goto exit;
    }

    /* Step over the TCA_ACT_TAB attribute header itself; the first nested
     * (priority) attribute follows immediately. */
    prio = (struct nlattr *) act + 1;
    memset(&stats_sw, 0, sizeof stats_sw);
    memset(&stats_hw, 0, sizeof stats_hw);
    memset(&stats_dropped, 0, sizeof stats_dropped);
    error = tc_parse_action_stats(prio, &stats_sw, &stats_hw, &stats_dropped);
    if (!error) {
        /* Total traffic seen by the meter is the sum of the software and
         * hardware counters; dropped packets are credited to band 0. */
        stats->packet_in_count +=
            get_32aligned_u64(&stats_sw.n_packets);
        stats->byte_in_count += get_32aligned_u64(&stats_sw.n_bytes);
        stats->packet_in_count +=
            get_32aligned_u64(&stats_hw.n_packets);
        stats->byte_in_count += get_32aligned_u64(&stats_hw.n_bytes);
        if (stats->n_bands >= 1) {
            stats->bands[0].packet_count +=
                get_32aligned_u64(&stats_dropped.n_packets);
        }
    }

exit:
    ofpbuf_delete(msg);
    return error;
}
6097
6098
int
6099
tc_get_policer_action(uint32_t index, struct ofputil_meter_stats *stats)
6100
0
{
6101
0
    struct ofpbuf *replyp = NULL;
6102
0
    struct ofpbuf request;
6103
0
    struct tcamsg *tcamsg;
6104
0
    size_t root_offset;
6105
0
    size_t prio_offset;
6106
0
    int error;
6107
6108
0
    tcamsg = tc_make_action_request(RTM_GETACTION, 0, &request);
6109
0
    if (!tcamsg) {
6110
0
        return ENODEV;
6111
0
    }
6112
6113
0
    root_offset = nl_msg_start_nested(&request, TCA_ACT_TAB);
6114
0
    prio_offset = nl_msg_start_nested(&request, 1);
6115
0
    nl_msg_put_string(&request, TCA_ACT_KIND, "police");
6116
0
    nl_msg_put_u32(&request, TCA_ACT_INDEX, index);
6117
0
    nl_msg_end_nested(&request, prio_offset);
6118
0
    nl_msg_end_nested(&request, root_offset);
6119
6120
0
    error = tc_transact(&request, &replyp);
6121
0
    if (error) {
6122
0
        VLOG_ERR_RL(&rl, "Failed to dump police action (index: %u), err=%d",
6123
0
                    index, error);
6124
0
        return error;
6125
0
    }
6126
6127
0
    return tc_update_policer_action_stats(replyp, stats);
6128
0
}
6129
6130
int
6131
tc_del_policer_action(uint32_t index, struct ofputil_meter_stats *stats)
6132
0
{
6133
0
    struct ofpbuf *replyp = NULL;
6134
0
    struct ofpbuf request;
6135
0
    struct tcamsg *tcamsg;
6136
0
    size_t root_offset;
6137
0
    size_t prio_offset;
6138
0
    int error;
6139
6140
0
    tcamsg = tc_make_action_request(RTM_DELACTION, NLM_F_ACK, &request);
6141
0
    if (!tcamsg) {
6142
0
        return ENODEV;
6143
0
    }
6144
6145
0
    root_offset = nl_msg_start_nested(&request, TCA_ACT_TAB);
6146
0
    prio_offset = nl_msg_start_nested(&request, 1);
6147
0
    nl_msg_put_string(&request, TCA_ACT_KIND, "police");
6148
0
    nl_msg_put_u32(&request, TCA_ACT_INDEX, index);
6149
0
    nl_msg_end_nested(&request, prio_offset);
6150
0
    nl_msg_end_nested(&request, root_offset);
6151
6152
0
    error = tc_transact(&request, &replyp);
6153
0
    if (error) {
6154
0
        VLOG_ERR_RL(&rl, "Failed to delete police action (index: %u), err=%d",
6155
0
                    index, error);
6156
0
        return error;
6157
0
    }
6158
6159
0
    return tc_update_policer_action_stats(replyp, stats);
6160
0
}
6161
6162
/* Reads /proc/net/psched once and derives the global 'ticks_per_s' and
 * 'buffer_hz' conversion factors used by the tc_* helpers below.  Safe to
 * call repeatedly and from multiple threads; only the first call does any
 * work. */
static void
read_psched(void)
{
    /* The values in psched are not individually very meaningful, but they are
     * important.  The tables below show some values seen in the wild.
     *
     * Some notes:
     *
     *   - "c" has always been a constant 1000000 since at least Linux 2.4.14.
     *     (Before that, there are hints that it was 1000000000.)
     *
     *   - "d" can be unrealistically large, see the comment on 'buffer_hz'
     *     above.
     *
     *                        /proc/net/psched
     *     -----------------------------------
     * [1] 000c8000 000f4240 000f4240 00000064
     * [2] 000003e8 00000400 000f4240 3b9aca00
     * [3] 000003e8 00000400 000f4240 3b9aca00
     * [4] 000003e8 00000400 000f4240 00000064
     * [5] 000003e8 00000040 000f4240 3b9aca00
     * [6] 000003e8 00000040 000f4240 000000f9
     *
     *           a         b          c             d ticks_per_s     buffer_hz
     *     ------- --------- ---------- ------------- ----------- -------------
     * [1] 819,200 1,000,000  1,000,000           100     819,200           100
     * [2]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [3]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [4]   1,000     1,024  1,000,000           100     976,562           100
     * [5]   1,000        64  1,000,000 1,000,000,000  15,625,000 1,000,000,000
     * [6]   1,000        64  1,000,000           249  15,625,000           249
     *
     * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
     * [2] 2.6.26-1-686-bigmem from Debian lenny
     * [3] 2.6.26-2-sparc64 from Debian lenny
     * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
     * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
     * [6] 2.6.34 from kernel.org on KVM
     */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static const char fn[] = "/proc/net/psched";
    unsigned int a, b, c, d;
    FILE *stream;

    if (!ovsthread_once_start(&once)) {
        return;
    }

    /* Conservative defaults, kept if the file is missing or malformed. */
    ticks_per_s = 1.0;
    buffer_hz = 100;

    stream = fopen(fn, "r");
    if (!stream) {
        VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
        goto exit;
    }

    if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
        VLOG_WARN("%s: read failed", fn);
        fclose(stream);
        goto exit;
    }
    VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
    fclose(stream);

    if (!a || !b || !c) {
        VLOG_WARN("%s: invalid scheduler parameters", fn);
        goto exit;
    }

    ticks_per_s = (double) a * c / b;
    if (c == 1000000) {
        buffer_hz = d;
    } else {
        VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
                  fn, a, b, c, d);
    }
    VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);

exit:
    ovsthread_once_done(&once);
}
6244
6245
/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
6246
 * rate of 'rate' bytes per second. */
6247
static unsigned int
tc_ticks_to_bytes(uint64_t rate, unsigned int ticks)
{
    read_psched();              /* Ensure 'ticks_per_s' is initialized. */
    /* The division is performed in double precision because 'ticks_per_s'
     * is a double; the quotient converts back to unsigned int on return. */
    return (rate * ticks) / ticks_per_s;
}
6253
6254
/* Returns the number of ticks that it would take to transmit 'size' bytes at a
6255
 * rate of 'rate' bytes per second. */
6256
static unsigned int
6257
tc_bytes_to_ticks(uint64_t rate, unsigned int size)
6258
0
{
6259
0
    read_psched();
6260
0
    return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
6261
0
}
6262
6263
/* Returns the number of bytes that need to be reserved for qdisc buffering at
6264
 * a transmission rate of 'rate' bytes per second. */
6265
static unsigned int
tc_buffer_per_jiffy(uint64_t rate)
{
    read_psched();              /* Ensure 'buffer_hz' is initialized. */
    return rate / buffer_hz;
}
6271
6272
static uint32_t
6273
0
tc_time_to_ticks(uint32_t time) {
6274
0
    read_psched();
6275
0
    return time * (ticks_per_s / 1000000);
6276
0
}
6277
6278
/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
6279
 * e.g. "htb", into '*kind' (if it is nonnull).  If 'options' is nonnull,
6280
 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
6281
 * stores NULL into it if it is absent.
6282
 *
6283
 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
6284
 * 'msg'.
6285
 *
6286
 * Returns 0 if successful, otherwise a positive errno value. */
6287
static int
tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
               struct nlattr **options)
{
    static const struct nl_policy tca_policy[] = {
        [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    /* The attributes follow the netlink header and the fixed tcmsg. */
    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
                         tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse qdisc message");
        goto error;
    }

    if (kind) {
        *kind = nl_attr_get_string(ta[TCA_KIND]);
    }

    if (options) {
        /* TCA_OPTIONS is optional, so this may legitimately store NULL. */
        *options = ta[TCA_OPTIONS];
    }

    return 0;

error:
    /* Null out the output arguments so callers never see stale pointers. */
    if (kind) {
        *kind = NULL;
    }
    if (options) {
        *options = NULL;
    }
    return EPROTO;
}
6322
6323
/* Given Netlink 'msg' that describes a class, extracts the queue ID (e.g. the
6324
 * minor number of its class ID) into '*queue_id', its TCA_OPTIONS attribute
6325
 * into '*options', and its queue statistics into '*stats'.  Any of the output
6326
 * arguments may be null.
6327
 *
6328
 * Returns 0 if successful, otherwise a positive errno value. */
6329
static int
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
               struct nlattr **options, struct netdev_queue_stats *stats)
{
    struct ofpbuf b = ofpbuf_const_initializer(msg->data, msg->size);
    struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg);
    struct tcmsg *tc = ofpbuf_try_pull(&b, sizeof *tc);
    static const struct nl_policy tca_policy[] = {
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
        [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    if (!nlmsg || !tc) {
        /* Reply too short to contain the fixed headers. */
        VLOG_ERR_RL(&rl, "failed to parse class message, malformed reply");
        goto error;
    }

    if (!nl_policy_parse(&b, 0, tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse class message");
        goto error;
    }

    if (handlep) {
        *handlep = tc->tcm_handle;
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    if (stats) {
        const struct gnet_stats_queue *gsq;
        struct gnet_stats_basic gsb;

        static const struct nl_policy stats_policy[] = {
            [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof gsb },
            [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof *gsq },
        };
        struct nlattr *sa[ARRAY_SIZE(stats_policy)];

        if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
                             sa, ARRAY_SIZE(sa))) {
            VLOG_WARN_RL(&rl, "failed to parse class stats");
            goto error;
        }

        /* Alignment issues screw up the length of struct gnet_stats_basic on
         * some arch/bitsize combinations.  Newer versions of Linux have a
         * struct gnet_stats_basic_packed, but we can't depend on that.  The
         * easiest thing to do is just to make a copy. */
        memset(&gsb, 0, sizeof gsb);
        memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
               MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
        stats->tx_bytes = gsb.bytes;
        stats->tx_packets = gsb.packets;

        gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
        stats->tx_errors = gsq->drops;
    }

    return 0;

error:
    /* Null/zero the outputs so callers never see partial results.  Note
     * that '*handlep' is deliberately left untouched on this path. */
    if (options) {
        *options = NULL;
    }
    if (stats) {
        memset(stats, 0, sizeof *stats);
    }
    return EPROTO;
}
6403
6404
/* Queries the kernel for class with identifier 'handle' and parent 'parent'
6405
 * on 'netdev'. */
6406
static int
6407
tc_query_class(const struct netdev *netdev,
6408
               unsigned int handle, unsigned int parent,
6409
               struct ofpbuf **replyp)
6410
0
{
6411
0
    struct ofpbuf request;
6412
0
    struct tcmsg *tcmsg;
6413
0
    int error;
6414
6415
0
    tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO,
6416
0
                                         &request);
6417
0
    if (!tcmsg) {
6418
0
        return ENODEV;
6419
0
    }
6420
0
    tcmsg->tcm_handle = handle;
6421
0
    tcmsg->tcm_parent = parent;
6422
6423
0
    error = tc_transact(&request, replyp);
6424
0
    if (error) {
6425
0
        VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
6426
0
                     netdev_get_name(netdev),
6427
0
                     tc_get_major(handle), tc_get_minor(handle),
6428
0
                     tc_get_major(parent), tc_get_minor(parent),
6429
0
                     ovs_strerror(error));
6430
0
    }
6431
0
    return error;
6432
0
}
6433
6434
/* Equivalent to "tc class del dev <name> handle <handle>". */
6435
static int
6436
tc_delete_class(const struct netdev *netdev, unsigned int handle)
6437
0
{
6438
0
    struct ofpbuf request;
6439
0
    struct tcmsg *tcmsg;
6440
0
    int error;
6441
6442
0
    tcmsg = netdev_linux_tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
6443
0
    if (!tcmsg) {
6444
0
        return ENODEV;
6445
0
    }
6446
0
    tcmsg->tcm_handle = handle;
6447
0
    tcmsg->tcm_parent = 0;
6448
6449
0
    error = tc_transact(&request, NULL);
6450
0
    if (error) {
6451
0
        VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
6452
0
                     netdev_get_name(netdev),
6453
0
                     tc_get_major(handle), tc_get_minor(handle),
6454
0
                     ovs_strerror(error));
6455
0
    }
6456
0
    return error;
6457
0
}
6458
6459
/* Equivalent to "tc qdisc del dev <name> root". */
6460
static int
6461
tc_del_qdisc(struct netdev *netdev_)
6462
0
{
6463
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6464
0
    struct ofpbuf request;
6465
0
    struct tcmsg *tcmsg;
6466
0
    int error;
6467
6468
0
    tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
6469
0
    if (!tcmsg) {
6470
0
        return ENODEV;
6471
0
    }
6472
0
    tcmsg->tcm_parent = TC_H_ROOT;
6473
6474
0
    error = tc_transact(&request, NULL);
6475
0
    if (error == EINVAL || error == ENOENT) {
6476
        /* EINVAL or ENOENT probably means that the default qdisc was in use,
6477
         * in which case we've accomplished our purpose. */
6478
0
        error = 0;
6479
0
    }
6480
0
    if (!error && netdev->tc) {
6481
0
        if (netdev->tc->ops->tc_destroy) {
6482
0
            netdev->tc->ops->tc_destroy(netdev->tc);
6483
0
        }
6484
0
        netdev->tc = NULL;
6485
0
    }
6486
0
    return error;
6487
0
}
6488
6489
static bool
6490
getqdisc_is_safe(void)
6491
0
{
6492
0
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
6493
0
    static bool safe = false;
6494
6495
0
    if (ovsthread_once_start(&once)) {
6496
0
        if (ovs_kernel_is_version_or_newer(2, 35)) {
6497
0
            safe = true;
6498
0
        } else {
6499
0
            VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel");
6500
0
        }
6501
0
        ovsthread_once_done(&once);
6502
0
    }
6503
0
    return safe;
6504
0
}
6505
6506
/* If 'netdev''s qdisc type and parameters are not yet known, queries the
6507
 * kernel to determine what they are.  Returns 0 if successful, otherwise a
6508
 * positive errno value. */
6509
static int
tc_query_qdisc(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request, *qdisc;
    const struct tc_ops *ops;
    struct tcmsg *tcmsg;
    int load_error;
    int error;

    /* Already cached from a previous query. */
    if (netdev->tc) {
        return 0;
    }

    /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
     * 2.6.35 without that fix backported to it.
     *
     * To avoid the OOPS, we must not make a request that would attempt to dump
     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
     * few others.  There are a few ways that I can see to do this, but most of
     * them seem to be racy (and if you lose the race the kernel OOPSes).  The
     * technique chosen here is to assume that any non-default qdisc that we
     * create will have a class with handle 1:0.  The built-in qdiscs only have
     * a class with handle 0:0.
     *
     * On Linux 2.6.35+ we use the straightforward method because it allows us
     * to handle non-builtin qdiscs without handle 1:0 (e.g. codel).  However,
     * in such a case we get no response at all from the kernel (!) if a
     * builtin qdisc is in use (which is later caught by "!error &&
     * !qdisc->size"). */
    tcmsg = netdev_linux_tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
    tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;

    /* Figure out what tc class to instantiate. */
    error = tc_transact(&request, &qdisc);
    if (!error && qdisc->size) {
        const char *kind;

        error = tc_parse_qdisc(qdisc, &kind, NULL);
        if (error) {
            ops = &tc_ops_other;
        } else {
            ops = tc_lookup_linux_name(kind);
            if (!ops) {
                static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
                VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);

                ops = &tc_ops_other;
            }
        }
    } else if ((!error && !qdisc->size) || error == ENOENT) {
        /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
         * set up by some other entity that doesn't have a handle 1:0.  We will
         * assume that it's the system default qdisc. */
        ops = &tc_ops_default;
        error = 0;
    } else {
        /* Who knows?  Maybe the device got deleted. */
        VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        ops = &tc_ops_other;
    }

    /* Instantiate it.  tc_load() populates netdev->tc on success. */
    load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
    ovs_assert((load_error == 0) == (netdev->tc != NULL));
    ofpbuf_delete(qdisc);

    return error ? error : load_error;
}
6585
6586
/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
6587
   approximate the time to transmit packets of various lengths.  For an MTU of
6588
   256 or less, each entry is exact; for an MTU of 257 through 512, each entry
6589
   represents two possible packet lengths; for a MTU of 513 through 1024, four
6590
   possible lengths; and so on.
6591
6592
   Returns, for the specified 'mtu', the number of bits that packet lengths
6593
   need to be shifted right to fit within such a 256-entry table. */
6594
static int
6595
tc_calc_cell_log(unsigned int mtu)
6596
0
{
6597
0
    int cell_log;
6598
6599
0
    if (!mtu) {
6600
0
        mtu = ETH_PAYLOAD_MAX;
6601
0
    }
6602
0
    mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
6603
6604
0
    for (cell_log = 0; mtu >= 256; cell_log++) {
6605
0
        mtu >>= 1;
6606
0
    }
6607
6608
0
    return cell_log;
6609
0
}
6610
6611
/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
6612
 * of 'mtu'. */
6613
static void
6614
tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
6615
0
{
6616
0
    memset(rate, 0, sizeof *rate);
6617
0
    rate->cell_log = tc_calc_cell_log(mtu);
6618
    /* rate->overhead = 0; */           /* New in 2.6.24, not yet in some */
6619
    /* rate->cell_align = 0; */         /* distro headers. */
6620
0
    rate->mpu = ETH_TOTAL_MIN;
6621
0
    rate->rate = MIN(UINT32_MAX, Bps);
6622
0
}
6623
6624
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
6625
 * attribute of the specified "type".
6626
 *
6627
 * A 64-bit rate can be provided via 'rate64' in bps.
6628
 * If zero, the rate in 'rate' will be used.
6629
 *
6630
 * See tc_calc_cell_log() above for a description of "rtab"s. */
6631
void
6632
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate,
6633
            uint64_t rate64)
6634
0
{
6635
0
    uint32_t *rtab;
6636
0
    unsigned int i;
6637
6638
0
    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
6639
0
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
6640
0
        unsigned packet_size = (i + 1) << rate->cell_log;
6641
0
        if (packet_size < rate->mpu) {
6642
0
            packet_size = rate->mpu;
6643
0
        }
6644
0
        rtab[i] = tc_bytes_to_ticks(rate64 ? rate64 : rate->rate, packet_size);
6645
0
    }
6646
0
}
6647
6648
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
6649
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
6650
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
6651
 * 0 is fine.) */
6652
static int
tc_calc_buffer(uint64_t Bps, int mtu, uint64_t burst_bytes)
{
    /* Never allow less than one jiffy's worth of traffic plus one MTU,
     * regardless of what the user requested. */
    unsigned int min_bytes = tc_buffer_per_jiffy(Bps) + mtu;

    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_bytes));
}
6658

6659
/* Linux-only functions declared in netdev-linux.h  */
6660
6661
/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'.  If
6662
 * 'enable' is true, the bit is set.  Otherwise, it is cleared. */
6663
int
netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
                              const char *flag_name, bool enable)
{
    const char *netdev_name = netdev_get_name(netdev);
    struct ethtool_value evalue;
    uint32_t new_flags;
    int error;

    /* Read the current flags so that only 'flag' is modified. */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&evalue, 0, sizeof evalue);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
    if (error) {
        return error;
    }

    COVERAGE_INC(netdev_set_ethtool);
    new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
    if (new_flags == evalue.data) {
        /* The bit already has the requested value; nothing to do. */
        return 0;
    }
    evalue.data = new_flags;
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
    if (error) {
        return error;
    }

    /* Read the flags back: the write can succeed without taking effect. */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&evalue, 0, sizeof evalue);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
    if (error) {
        return error;
    }

    if (new_flags != evalue.data) {
        VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
                     "device %s failed", enable ? "enable" : "disable",
                     flag_name, netdev_name);
        return EOPNOTSUPP;
    }

    return 0;
}
6712

6713
/* Utility functions. */
6714
6715
/* Copies 'src' into 'dst', performing format conversion in the process. */
6716
static void
netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
                                  const struct rtnl_link_stats *src)
{
    /* 'src' carries the kernel's 32-bit counters; each assignment widens
     * implicitly into the corresponding field of struct netdev_stats. */
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
}
6742
6743
/* Copies 'src' into 'dst', performing format conversion in the process. */
6744
static void
netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
                                    const struct rtnl_link_stats64 *src)
{
    /* Same field mapping as netdev_stats_from_rtnl_link_stats(), but from
     * the kernel's 64-bit counter structure. */
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
}
6770
6771
/* Retrieves interface statistics for 'netdev_' into 'stats' using an
 * RTM_GETLINK request.  Returns 0 if successful, otherwise a positive errno
 * value. */
int
get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct ofpbuf request;
    struct ofpbuf *reply;
    int error;

    /* Filtering all counters by default */
    memset(stats, 0xFF, sizeof(struct netdev_stats));

    /* Build an RTM_GETLINK request identifying the device by name. */
    ofpbuf_init(&request, 0);
    nl_msg_put_nlmsghdr(&request,
                        sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
                        RTM_GETLINK, NLM_F_REQUEST);
    ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
    nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
    error = nl_transact(NETLINK_ROUTE, &request, &reply);
    ofpbuf_uninit(&request);
    if (error) {
        return error;
    }

    if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
        /* Prefer the 64-bit counters; fall back to the legacy 32-bit ones. */
        const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
        if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
            const struct rtnl_link_stats64 *lstats = nl_attr_get(a);
            struct rtnl_link_stats64 aligned_lstats;

            /* The attribute payload may not be suitably aligned for direct
             * access; copy it into an aligned buffer first if needed. */
            if (!IS_PTR_ALIGNED(lstats)) {
                memcpy(&aligned_lstats, (void *) lstats,
                       sizeof aligned_lstats);
                lstats = &aligned_lstats;
            }
            netdev_stats_from_rtnl_link_stats64(stats, lstats);
            error = 0;
        } else {
            a = nl_attr_find(reply, 0, IFLA_STATS);
            if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
                netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
                error = 0;
            } else {
                VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
                error = EPROTO;
            }
        }
    } else {
        VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
        error = EPROTO;
    }

    ofpbuf_delete(reply);
    return error;
}
6825
6826
static int
6827
get_flags(const struct netdev *dev, unsigned int *flags)
6828
0
{
6829
0
    struct ifreq ifr;
6830
0
    int error;
6831
6832
0
    memset(&ifr, 0, sizeof ifr);
6833
0
    *flags = 0;
6834
0
    error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
6835
0
    if (!error) {
6836
0
        *flags = ifr.ifr_flags;
6837
0
    }
6838
0
    return error;
6839
0
}
6840
6841
static int
6842
set_flags(const char *name, unsigned int flags)
6843
0
{
6844
0
    struct ifreq ifr;
6845
6846
0
    memset(&ifr, 0, sizeof ifr);
6847
0
    ifr.ifr_flags = flags;
6848
0
    return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
6849
0
}
6850
6851
int
6852
linux_get_ifindex(const char *netdev_name)
6853
0
{
6854
0
    struct ifreq ifr;
6855
0
    int error;
6856
6857
0
    memset(&ifr, 0, sizeof ifr);
6858
0
    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
6859
0
    COVERAGE_INC(netdev_get_ifindex);
6860
6861
0
    error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
6862
0
    if (error) {
6863
        /* ENODEV probably means that a vif disappeared asynchronously and
6864
         * hasn't been removed from the database yet, so reduce the log level
6865
         * to INFO for that case. */
6866
0
        VLOG_RL(&rl, error == ENODEV ? VLL_INFO : VLL_ERR,
6867
0
                "ioctl(SIOCGIFINDEX) on %s device failed: %s",
6868
0
                netdev_name, ovs_strerror(error));
6869
0
        return -error;
6870
0
    }
6871
0
    return ifr.ifr_ifindex;
6872
0
}
6873
6874
static int
6875
get_ifindex(const struct netdev *netdev_, int *ifindexp)
6876
0
{
6877
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6878
6879
0
    if (!(netdev->cache_valid & VALID_IFINDEX)) {
6880
0
        netdev_linux_update_via_netlink(netdev);
6881
0
    }
6882
6883
0
    if (!(netdev->cache_valid & VALID_IFINDEX)) {
6884
        /* Fall back to ioctl if netlink fails */
6885
0
        int ifindex = linux_get_ifindex(netdev_get_name(netdev_));
6886
6887
0
        if (ifindex < 0) {
6888
0
            netdev->get_ifindex_error = -ifindex;
6889
0
            netdev->ifindex = 0;
6890
0
        } else {
6891
0
            netdev->get_ifindex_error = 0;
6892
0
            netdev->ifindex = ifindex;
6893
0
        }
6894
0
        netdev->cache_valid |= VALID_IFINDEX;
6895
0
    }
6896
6897
0
    *ifindexp = netdev->ifindex;
6898
0
    return netdev->get_ifindex_error;
6899
0
}
6900
6901
/* Sends an RTM_GETLINK request for 'netdev' and refreshes the cached state
 * (IFF_* flags, MTU, Ethernet address, ifindex, LAG-primary status) from the
 * kernel's RTM_NEWLINK reply, bumping the netdev change sequence if anything
 * changed.  Returns 0 on success, a positive errno value if the netlink
 * transaction fails, or EINVAL if the reply cannot be parsed as a relevant
 * RTM_NEWLINK message. */
static int
netdev_linux_update_via_netlink(struct netdev_linux *netdev)
{
    struct ofpbuf request;
    struct ofpbuf *reply;
    struct rtnetlink_change chg;
    struct rtnetlink_change *change = &chg;
    int error;

    /* Build RTM_GETLINK request: ifinfomsg header plus room for the
     * interface-name and netnsid attributes added below. */
    ofpbuf_init(&request, 0);
    nl_msg_put_nlmsghdr(&request,
                        sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ) +
                        NL_A_U32_SIZE, RTM_GETLINK, NLM_F_REQUEST);
    ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));

    /* The correct identifiers for a Linux device are netnsid and ifindex,
     * but ifindex changes as the port is moved to another network namespace
     * and the interface name statically stored in ovsdb. */
    nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(&netdev->up));
    if (netdev_linux_netnsid_is_remote(netdev)) {
        nl_msg_put_u32(&request, IFLA_IF_NETNSID, netdev->netnsid);
    }

    /* Statistics are not needed here; ask the kernel to omit them so the
     * reply stays small. */
    nl_msg_put_u32(&request, IFLA_EXT_MASK, RTEXT_FILTER_SKIP_STATS);

    error = nl_transact(NETLINK_ROUTE, &request, &reply);
    ofpbuf_uninit(&request);
    if (error) {
        ofpbuf_delete(reply);
        return error;
    }

    if (rtnetlink_parse(reply, change)
        && !change->irrelevant
        && change->nlmsg_type == RTM_NEWLINK) {
        bool changed = false;
        error = 0;

        /* Update netdev from rtnl msg and increment its seq if needed. */

        /* A transition of IFF_RUNNING in either direction counts as a
         * carrier reset. */
        if ((change->ifi_flags ^ netdev->ifi_flags) & IFF_RUNNING) {
            netdev->carrier_resets++;
            changed = true;
        }
        if (change->ifi_flags != netdev->ifi_flags) {
            netdev->ifi_flags = change->ifi_flags;
            changed = true;
        }
        /* MTU of 0 in the reply means "not reported"; keep the cache. */
        if (change->mtu && change->mtu != netdev->mtu) {
            netdev->mtu = change->mtu;
            netdev->cache_valid |= VALID_MTU;
            netdev->netdev_mtu_error = 0;
            changed = true;
        }
        /* All-zero MAC in the reply means "not reported". */
        if (!eth_addr_is_zero(change->mac)
            && !eth_addr_equals(change->mac, netdev->etheraddr)) {
            netdev->etheraddr = change->mac;
            netdev->cache_valid |= VALID_ETHERADDR;
            netdev->ether_addr_error = 0;
            changed = true;
        }
        if (change->if_index != netdev->ifindex) {
            netdev->ifindex = change->if_index;
            netdev->cache_valid |= VALID_IFINDEX;
            netdev->get_ifindex_error = 0;
            changed = true;
        }
        /* NOTE(review): this flag is only ever set here, never cleared —
         * presumably intentional (sticky once a LAG primary is seen). */
        if (change->primary && netdev_linux_kind_is_lag(change->primary)) {
            netdev->is_lag_primary = true;
        }
        if (changed) {
            netdev_change_seq_changed(&netdev->up);
        }
    } else {
        error = EINVAL;
    }

    ofpbuf_delete(reply);
    return error;
}
6980
6981
static int
6982
get_etheraddr(const char *netdev_name, struct eth_addr *ea)
6983
0
{
6984
0
    struct ifreq ifr;
6985
0
    int hwaddr_family;
6986
0
    int error;
6987
6988
0
    memset(&ifr, 0, sizeof ifr);
6989
0
    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
6990
0
    COVERAGE_INC(netdev_get_hwaddr);
6991
0
    error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
6992
0
    if (error) {
6993
        /* ENODEV probably means that a vif disappeared asynchronously and
6994
         * hasn't been removed from the database yet, so reduce the log level
6995
         * to INFO for that case. */
6996
0
        VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
6997
0
             "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
6998
0
             netdev_name, ovs_strerror(error));
6999
0
        return error;
7000
0
    }
7001
0
    hwaddr_family = ifr.ifr_hwaddr.sa_family;
7002
0
    if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
7003
0
        hwaddr_family != ARPHRD_NONE) {
7004
0
        VLOG_INFO("%s device has unknown hardware address family %d",
7005
0
                  netdev_name, hwaddr_family);
7006
0
        return EINVAL;
7007
0
    }
7008
0
    memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
7009
0
    return 0;
7010
0
}
7011
7012
static int
7013
set_etheraddr(const char *netdev_name, const struct eth_addr mac)
7014
0
{
7015
0
    struct ifreq ifr;
7016
0
    int error;
7017
7018
0
    memset(&ifr, 0, sizeof ifr);
7019
0
    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
7020
0
    ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
7021
0
    memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
7022
0
    COVERAGE_INC(netdev_set_hwaddr);
7023
0
    error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
7024
0
    if (error) {
7025
0
        VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
7026
0
                 netdev_name, ovs_strerror(error));
7027
0
    }
7028
0
    return error;
7029
0
}
7030
7031
static int
7032
netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
7033
                        int cmd, const char *cmd_name)
7034
0
{
7035
0
    struct ifreq ifr;
7036
0
    int error;
7037
7038
0
    memset(&ifr, 0, sizeof ifr);
7039
0
    ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
7040
0
    ifr.ifr_data = (caddr_t) ecmd;
7041
7042
0
    ecmd->cmd = cmd;
7043
0
    error = af_inet_ioctl(SIOCETHTOOL, &ifr);
7044
0
    if (error) {
7045
0
        if (error != EOPNOTSUPP) {
7046
0
            VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
7047
0
                         "failed: %s", cmd_name, name, ovs_strerror(error));
7048
0
        } else {
7049
            /* The device doesn't support this operation.  That's pretty
7050
             * common, so there's no point in logging anything. */
7051
0
        }
7052
0
    }
7053
0
    return error;
7054
0
}
7055
7056
/* Returns an AF_PACKET raw socket or a negative errno value. */
7057
static int
7058
af_packet_sock(void)
7059
0
{
7060
0
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
7061
0
    static int sock;
7062
7063
0
    if (ovsthread_once_start(&once)) {
7064
0
        sock = socket(AF_PACKET, SOCK_RAW, 0);
7065
0
        if (sock >= 0) {
7066
0
            int error = set_nonblocking(sock);
7067
0
            if (error) {
7068
0
                close(sock);
7069
0
                sock = -error;
7070
0
            } else if (userspace_tso_enabled()) {
7071
0
                int val = 1;
7072
0
                error = setsockopt(sock, SOL_PACKET, PACKET_VNET_HDR, &val,
7073
0
                                   sizeof val);
7074
0
                if (error) {
7075
0
                    error = errno;
7076
0
                    VLOG_ERR("failed to enable vnet hdr in raw socket: %s",
7077
0
                             ovs_strerror(errno));
7078
0
                    close(sock);
7079
0
                    sock = -error;
7080
0
                }
7081
0
            }
7082
0
        } else {
7083
0
            sock = -errno;
7084
0
            VLOG_ERR("failed to create packet socket: %s",
7085
0
                     ovs_strerror(errno));
7086
0
        }
7087
0
        ovsthread_once_done(&once);
7088
0
    }
7089
7090
0
    return sock;
7091
0
}
7092
7093
/* Initializes packet 'b' with features enabled in the prepended
 * struct virtio_net_hdr, pulling the header off the front of the packet.
 * Returns 0 if successful, EINVAL if the header or its checksum offsets are
 * malformed, or ENOTSUP for GSO types OVS cannot handle (UFO, unknown). */
static int
netdev_linux_parse_vnet_hdr(struct dp_packet *b)
{
    /* Remove the virtio header from the front of the packet; fails if the
     * packet is too short to contain one. */
    struct virtio_net_hdr *vnet = dp_packet_pull(b, sizeof *vnet);

    if (OVS_UNLIKELY(!vnet)) {
        return EINVAL;
    }

    /* Fast path: no checksum or segmentation metadata at all. */
    if (vnet->flags == 0 && vnet->gso_type == VIRTIO_NET_HDR_GSO_NONE) {
        return 0;
    }

    if (vnet->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
        uint16_t csum_offset = (OVS_FORCE uint16_t) vnet->csum_offset;
        uint16_t csum_start = (OVS_FORCE uint16_t) vnet->csum_start;

        /* Both the checksum start and the checksum field itself must lie
         * within the packet. */
        if (csum_start >= dp_packet_size(b)
            || csum_start + csum_offset >= dp_packet_size(b)) {
            COVERAGE_INC(netdev_linux_invalid_l4_csum);
            return EINVAL;
        }

        /* Populates b's L3/L4 offsets and protocol markers as a side
         * effect; the parsed flags themselves are not needed here. */
        parse_tcp_flags(b, NULL, NULL, NULL);

        /* If csum_start/csum_offset point exactly at the checksum field of
         * the recognized TCP/UDP/SCTP header, just mark the checksum as
         * "partial" and let later stages complete it. */
        if (csum_start == b->l4_ofs
            && ((csum_offset == offsetof(struct tcp_header, tcp_csum)
                 && dp_packet_l4_proto_tcp(b))
                || (csum_offset == offsetof(struct udp_header, udp_csum)
                    && dp_packet_l4_proto_udp(b))
                || (csum_offset == offsetof(struct sctp_header, sctp_csum)
                    && dp_packet_l4_proto_sctp(b)))) {
            dp_packet_l4_checksum_set_partial(b);
        } else {
            /* Offsets don't match a recognized L4 header: resolve the
             * checksum in software right here. */
            ovs_be16 *csum_l4;
            void *l4;

            COVERAGE_INC(netdev_linux_unknown_l4_csum);

            csum_l4 = dp_packet_at(b, csum_start + csum_offset,
                                   sizeof *csum_l4);
            if (!csum_l4) {
                return EINVAL;
            }

            /* Checksum everything from csum_start to the end of the packet
             * and store the result in the field the header pointed at. */
            l4 = dp_packet_at(b, csum_start, dp_packet_size(b) - csum_start);
            *csum_l4 = csum(l4, dp_packet_size(b) - csum_start);

            if (dp_packet_l4_proto_tcp(b)
                || dp_packet_l4_proto_udp(b)
                || dp_packet_l4_proto_sctp(b)) {
                dp_packet_l4_checksum_set_good(b);
            }
        }
    }

    int ret = 0;
    switch (vnet->gso_type) {
    case VIRTIO_NET_HDR_GSO_TCPV4:
    case VIRTIO_NET_HDR_GSO_TCPV6:
        /* TCP segmentation offload: record the segment size on the packet. */
        dp_packet_set_tso_segsz(b, (OVS_FORCE uint16_t) vnet->gso_size);
        break;

    case VIRTIO_NET_HDR_GSO_UDP:
        /* UFO is not supported. */
        VLOG_WARN_RL(&rl, "Received an unsupported packet with UFO enabled.");
        ret = ENOTSUP;
        break;

    case VIRTIO_NET_HDR_GSO_NONE:
        break;

    default:
        ret = ENOTSUP;
        VLOG_WARN_RL(&rl, "Received an unsupported packet with GSO type: 0x%x",
                     vnet->gso_type);
    }

    return ret;
}
7176
7177
/* Prepends struct virtio_net_hdr to packet 'b', encoding the packet's TSO
 * segment size and L4 checksum-offload state into the header fields.
 * Returns 0 if successful, otherwise a positive errno value.
 * Returns EMSGSIZE if the packet 'b' cannot be sent over MTU 'mtu', and
 * EINVAL for a TSO packet whose L3 header is neither IPv4 nor IPv6. */
static int
netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu)
{
    struct virtio_net_hdr v;
    struct virtio_net_hdr *vnet = &v;

    if (dp_packet_get_tso_segsz(b)) {
        /* TSO requested: describe the TCP header so the kernel can
         * segment the packet.  For tunneled packets the inner headers
         * are the ones being segmented. */
        uint16_t tso_segsz = dp_packet_get_tso_segsz(b);
        const struct tcp_header *tcp;
        const struct ip_header *ip;
        if (dp_packet_inner_l4(b)) {
            tcp = dp_packet_inner_l4(b);
            ip = dp_packet_inner_l3(b);
        } else {
            tcp = dp_packet_l4(b);
            ip = dp_packet_l3(b);
        }
        /* hdr_len = everything from the Ethernet header up to and
         * including the TCP header (data offset is in 32-bit words). */
        int tcp_hdr_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
        int hdr_len = ((char *) tcp - (char *) dp_packet_eth(b))
                      + tcp_hdr_len;
        int max_packet_len = mtu + ETH_HEADER_LEN + VLAN_HEADER_LEN;

        /* Each resegmented frame (headers + one segment) must fit in
         * the MTU plus L2 framing. */
        if (OVS_UNLIKELY((hdr_len + tso_segsz) > max_packet_len)) {
            VLOG_WARN_RL(&rl, "Oversized TSO packet. hdr_len: %"PRIu32", "
                         "gso: %"PRIu16", max length: %"PRIu32".", hdr_len,
                         tso_segsz, max_packet_len);
            return EMSGSIZE;
        }

        vnet->hdr_len = (OVS_FORCE __virtio16)hdr_len;
        vnet->gso_size = (OVS_FORCE __virtio16)(tso_segsz);
        if (IP_VER(ip->ip_ihl_ver) == 4) {
            vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
        } else if (IP_VER(ip->ip_ihl_ver) == 6) {
            vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
        } else {
            VLOG_ERR_RL(&rl, "Unknown gso_type for TSO packet. "
                        "Offloads: %"PRIu32, b->offloads);
            return EINVAL;
        }
    } else {
        /* No segmentation offload requested. */
        vnet->hdr_len = 0;
        vnet->gso_size = 0;
        vnet->gso_type = VIRTIO_NET_HDR_GSO_NONE;
    }

    if (dp_packet_l4_checksum_good(b)
        && (!dp_packet_tunnel(b)
            || dp_packet_inner_l4_checksum_good(b))) {
        /* The packet has good L4 checksum. No need to validate again. */
        vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0;
        vnet->flags = VIRTIO_NET_HDR_F_DATA_VALID;
    } else if (dp_packet_l4_checksum_partial(b)
               || dp_packet_inner_l4_checksum_partial(b)) {
        const struct ip_header *ip_hdr;
        void *l3_off;
        void *l4_off;
        bool is_sctp;
        bool is_tcp;
        bool is_udp;

        /* Work on the inner headers when the partial checksum belongs to
         * an encapsulated packet, otherwise on the outer ones. */
        if (dp_packet_inner_l4_checksum_partial(b)) {
            l3_off = dp_packet_inner_l3(b);
            l4_off = dp_packet_inner_l4(b);
            is_tcp = dp_packet_inner_l4_proto_tcp(b);
            is_udp = dp_packet_inner_l4_proto_udp(b);
            is_sctp = dp_packet_inner_l4_proto_sctp(b);
        } else {
            l3_off = dp_packet_l3(b);
            l4_off = dp_packet_l4(b);
            is_tcp = dp_packet_l4_proto_tcp(b);
            is_udp = dp_packet_l4_proto_udp(b);
            is_sctp = dp_packet_l4_proto_sctp(b);
        }
        ip_hdr = l3_off;

        /* The csum calculation is offloaded. */
        if (is_tcp) {
            /* Virtual I/O Device (VIRTIO) Version 1.1
             * 5.1.6.2 Packet Transmission
             * If the driver negotiated VIRTIO_NET_F_CSUM, it can skip
             * checksumming the packet:
             *  - flags has the VIRTIO_NET_HDR_F_NEEDS_CSUM set,
             *  - csum_start is set to the offset within the packet
             *    to begin checksumming, and
             *  - csum_offset indicates how many bytes after the
             *    csum_start the new (16 bit ones complement) checksum
             *    is placed by the device.
             * The TCP checksum field in the packet is set to the sum of
             * the TCP pseudo header, so that replacing it by the ones
             * complement checksum of the TCP header and body will give
             * the correct result. */
            struct tcp_header *tcp_hdr = l4_off;
            ovs_be16 csum = 0;

            if (IP_VER(ip_hdr->ip_ihl_ver) == 4) {
                csum = ~csum_finish(packet_csum_pseudoheader(ip_hdr));
            } else if (IP_VER(ip_hdr->ip_ihl_ver) == 6) {
                const struct ovs_16aligned_ip6_hdr *ip6_hdr = l3_off;
                csum = ~csum_finish(packet_csum_pseudoheader6(ip6_hdr));
            }

            tcp_hdr->tcp_csum = csum;
            vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
            vnet->csum_start = (OVS_FORCE __virtio16) ((char *) l4_off -
                                    (char *) dp_packet_data(b));
            vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
                                    struct tcp_header, tcp_csum);
        } else if (is_udp) {
            /* Same scheme as TCP: seed the checksum field with the
             * pseudo-header sum and point csum_start/csum_offset at it. */
            struct udp_header *udp_hdr = l4_off;
            ovs_be16 csum = 0;

            if (IP_VER(ip_hdr->ip_ihl_ver) == 4) {
                csum = ~csum_finish(packet_csum_pseudoheader(ip_hdr));
            } else if (IP_VER(ip_hdr->ip_ihl_ver) == 6) {
                const struct ovs_16aligned_ip6_hdr *ip6_hdr = l3_off;
                csum = ~csum_finish(packet_csum_pseudoheader6(ip6_hdr));
            }

            udp_hdr->udp_csum = csum;
            vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
            vnet->csum_start = (OVS_FORCE __virtio16) ((char *) udp_hdr -
                                    (char *) dp_packet_data(b));;
            vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
                                    struct udp_header, udp_csum);
        } else if (is_sctp) {
            /* The Linux kernel networking stack only supports csum_start
             * and csum_offset when SCTP GSO is enabled.  See kernel's
             * skb_csum_hwoffload_help(). Currently there is no SCTP
             * segmentation offload support in OVS. */
            vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0;
            vnet->flags = 0;
        } else {
            /* This should only happen when a new L4 proto
             * is not covered in above checks. */
            VLOG_WARN_RL(&rl, "Unsupported L4 checksum offload. "
                         "Offloads: %"PRIu32, b->offloads);
            vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0;
            vnet->flags = 0;
        }
    } else {
        /* Packet L4 csum is unknown. */
        vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0;
        vnet->flags = 0;
    }

    dp_packet_push(b, vnet, sizeof *vnet);
    return 0;
}