Coverage Report

Created: 2025-07-01 06:50

/src/openvswitch/lib/netdev-linux.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at:
7
 *
8
 *     http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16
17
#include <config.h>
18
19
#include "netdev-linux.h"
20
#include "netdev-linux-private.h"
21
22
#include <errno.h>
23
#include <fcntl.h>
24
#include <sys/types.h>
25
#include <netinet/in.h>
26
#include <arpa/inet.h>
27
#include <inttypes.h>
28
#include <math.h>
29
#include <linux/filter.h>
30
#include <linux/gen_stats.h>
31
#include <linux/if_ether.h>
32
#include <linux/if_packet.h>
33
#include <linux/if_tun.h>
34
#include <linux/types.h>
35
#include <linux/ethtool.h>
36
#include <linux/mii.h>
37
#include <linux/rtnetlink.h>
38
#include <linux/sockios.h>
39
#include <linux/virtio_net.h>
40
#include <sys/ioctl.h>
41
#include <sys/socket.h>
42
#include <sys/uio.h>
43
#include <net/if.h>
44
#include <net/if_arp.h>
45
#include <net/route.h>
46
#include <poll.h>
47
#include <stdlib.h>
48
#include <string.h>
49
#include <unistd.h>
50
51
#include "coverage.h"
52
#include "dp-packet.h"
53
#include "dpif-netlink.h"
54
#include "dpif-netdev.h"
55
#include "openvswitch/dynamic-string.h"
56
#include "fatal-signal.h"
57
#include "hash.h"
58
#include "openvswitch/hmap.h"
59
#include "netdev-afxdp.h"
60
#include "netdev-provider.h"
61
#include "netdev-vport.h"
62
#include "netlink-notifier.h"
63
#include "netlink-socket.h"
64
#include "netlink.h"
65
#include "netnsid.h"
66
#include "openvswitch/ofpbuf.h"
67
#include "openflow/openflow.h"
68
#include "ovs-atomic.h"
69
#include "ovs-numa.h"
70
#include "packets.h"
71
#include "openvswitch/poll-loop.h"
72
#include "rtnetlink.h"
73
#include "openvswitch/shash.h"
74
#include "socket-util.h"
75
#include "sset.h"
76
#include "tc.h"
77
#include "timer.h"
78
#include "unaligned.h"
79
#include "openvswitch/vlog.h"
80
#include "userspace-tso.h"
81
#include "util.h"
82
83
VLOG_DEFINE_THIS_MODULE(netdev_linux);
84
85
COVERAGE_DEFINE(netdev_set_policing);
86
COVERAGE_DEFINE(netdev_arp_lookup);
87
COVERAGE_DEFINE(netdev_get_ifindex);
88
COVERAGE_DEFINE(netdev_get_hwaddr);
89
COVERAGE_DEFINE(netdev_set_hwaddr);
90
COVERAGE_DEFINE(netdev_get_ethtool);
91
COVERAGE_DEFINE(netdev_set_ethtool);
92
COVERAGE_DEFINE(netdev_linux_invalid_l4_csum);
93
COVERAGE_DEFINE(netdev_linux_unknown_l4_csum);
94
95

96
#ifndef IFLA_IF_NETNSID
97
0
#define IFLA_IF_NETNSID 0x45
98
#endif
99
/* These were introduced in Linux 2.6.14, so they might be missing if we have
100
 * old headers. */
101
#ifndef ADVERTISED_Pause
102
#define ADVERTISED_Pause                (1 << 13)
103
#endif
104
#ifndef ADVERTISED_Asym_Pause
105
#define ADVERTISED_Asym_Pause           (1 << 14)
106
#endif
107
108
/* These were introduced in Linux 2.6.24, so they might be missing if we
109
 * have old headers. */
110
#ifndef ETHTOOL_GFLAGS
111
#define ETHTOOL_GFLAGS       0x00000025 /* Get flags bitmap(ethtool_value) */
112
#endif
113
#ifndef ETHTOOL_SFLAGS
114
#define ETHTOOL_SFLAGS       0x00000026 /* Set flags bitmap(ethtool_value) */
115
#endif
116
117
/* This was introduced in Linux 2.6.25, so it might be missing if we have old
118
 * headers. */
119
#ifndef TC_RTAB_SIZE
120
#define TC_RTAB_SIZE 1024
121
#endif
122
123
/* Linux 2.6.21 introduced struct tpacket_auxdata.
124
 * Linux 2.6.27 added the tp_vlan_tci member.
125
 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
126
 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
127
 * TP_STATUS_VLAN_TPID_VALID.
128
 *
129
 * With all this churn it's easiest to unconditionally define a replacement
130
 * structure that has everything we want.
131
 */
132
#ifndef PACKET_AUXDATA
133
#define PACKET_AUXDATA                  8
134
#endif
135
#ifndef TP_STATUS_VLAN_VALID
136
#define TP_STATUS_VLAN_VALID            (1 << 4)
137
#endif
138
#ifndef TP_STATUS_VLAN_TPID_VALID
139
#define TP_STATUS_VLAN_TPID_VALID       (1 << 6)
140
#endif
141
#undef tpacket_auxdata
142
#define tpacket_auxdata rpl_tpacket_auxdata
143
struct tpacket_auxdata {
144
    uint32_t tp_status;
145
    uint32_t tp_len;
146
    uint32_t tp_snaplen;
147
    uint16_t tp_mac;
148
    uint16_t tp_net;
149
    uint16_t tp_vlan_tci;
150
    uint16_t tp_vlan_tpid;
151
};
152
153
/* Linux 2.6.27 introduced ethtool_cmd_speed
154
 *
155
 * To avoid revisiting problems reported with using configure to detect
156
 * compatibility (see report at
157
 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
158
 * unconditionally replace ethtool_cmd_speed. */
159
0
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
160
/* Replacement for ethtool_cmd_speed() from Linux 2.6.27+: reassembles the
 * full 32-bit link speed from the low 16 bits in 'speed' and the high 16
 * bits in 'speed_hi'. */
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    uint32_t hi = ep->speed_hi;

    return (hi << 16) | ep->speed;
}
164
165
/* Linux 2.6.30 introduced supported and advertised flags for
166
 * 1G base KX, and 10G base KX4, KR and R. */
167
#ifndef SUPPORTED_1000baseKX_Full
168
#define SUPPORTED_1000baseKX_Full      (1 << 17)
169
#define SUPPORTED_10000baseKX4_Full    (1 << 18)
170
#define SUPPORTED_10000baseKR_Full     (1 << 19)
171
#define SUPPORTED_10000baseR_FEC       (1 << 20)
172
#define ADVERTISED_1000baseKX_Full     (1 << 17)
173
#define ADVERTISED_10000baseKX4_Full   (1 << 18)
174
#define ADVERTISED_10000baseKR_Full    (1 << 19)
175
#define ADVERTISED_10000baseR_FEC      (1 << 20)
176
#endif
177
178
/* Linux 3.5 introduced supported and advertised flags for
179
 * 40G base KR4, CR4, SR4 and LR4. */
180
#ifndef SUPPORTED_40000baseKR4_Full
181
#define SUPPORTED_40000baseKR4_Full    (1 << 23)
182
#define SUPPORTED_40000baseCR4_Full    (1 << 24)
183
#define SUPPORTED_40000baseSR4_Full    (1 << 25)
184
#define SUPPORTED_40000baseLR4_Full    (1 << 26)
185
#define ADVERTISED_40000baseKR4_Full   (1 << 23)
186
#define ADVERTISED_40000baseCR4_Full   (1 << 24)
187
#define ADVERTISED_40000baseSR4_Full   (1 << 25)
188
#define ADVERTISED_40000baseLR4_Full   (1 << 26)
189
#endif
190
191
/* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
192
 *
193
 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
194
 * 2.6.32-431.29.2.el6.x86_64 (see report at
195
 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
196
 * Maybe if_link.h is not self-contained on those kernels.  It is easiest to
197
 * unconditionally define a replacement. */
198
#ifndef IFLA_STATS64
199
0
#define IFLA_STATS64 23
200
#endif
201
#define rtnl_link_stats64 rpl_rtnl_link_stats64
202
struct rtnl_link_stats64 {
203
    uint64_t rx_packets;
204
    uint64_t tx_packets;
205
    uint64_t rx_bytes;
206
    uint64_t tx_bytes;
207
    uint64_t rx_errors;
208
    uint64_t tx_errors;
209
    uint64_t rx_dropped;
210
    uint64_t tx_dropped;
211
    uint64_t multicast;
212
    uint64_t collisions;
213
214
    uint64_t rx_length_errors;
215
    uint64_t rx_over_errors;
216
    uint64_t rx_crc_errors;
217
    uint64_t rx_frame_errors;
218
    uint64_t rx_fifo_errors;
219
    uint64_t rx_missed_errors;
220
221
    uint64_t tx_aborted_errors;
222
    uint64_t tx_carrier_errors;
223
    uint64_t tx_fifo_errors;
224
    uint64_t tx_heartbeat_errors;
225
    uint64_t tx_window_errors;
226
227
    uint64_t rx_compressed;
228
    uint64_t tx_compressed;
229
};
230
231
/* Linux 3.19 introduced virtio_types.h.  It might be missing
232
 * if we are using old kernel. */
233
#ifndef HAVE_VIRTIO_TYPES
234
typedef __u16 __bitwise__ __virtio16;
235
typedef __u32 __bitwise__ __virtio32;
236
typedef __u64 __bitwise__ __virtio64;
237
#endif
238
239
enum {
240
    VALID_IFINDEX           = 1 << 0,
241
    VALID_ETHERADDR         = 1 << 1,
242
    VALID_IN                = 1 << 2,
243
    VALID_MTU               = 1 << 3,
244
    VALID_POLICING          = 1 << 4,
245
    VALID_VPORT_STAT_ERROR  = 1 << 5,
246
    VALID_DRVINFO           = 1 << 6,
247
    VALID_FEATURES          = 1 << 7,
248
    VALID_NUMA_ID           = 1 << 8,
249
};
250
251
/* Linux 4.4 introduced the ability to skip the internal stats gathering
252
 * that netlink does via an external filter mask that can be passed into
253
 * a netlink request.
254
 */
255
#ifndef RTEXT_FILTER_SKIP_STATS
256
#define RTEXT_FILTER_SKIP_STATS (1 << 3)
257
#endif
258
259
/* Use one for the packet buffer and another for the aux buffer to receive
260
 * TSO packets. */
261
0
#define IOV_STD_SIZE 1
262
0
#define IOV_TSO_SIZE 2
263
264
enum {
265
    IOV_PACKET = 0,
266
    IOV_AUXBUF = 1,
267
};
268

269
struct linux_lag_member {
270
   uint32_t block_id;
271
   struct shash_node *node;
272
};
273
274
/* Protects 'lag_shash' and the mutable members of struct linux_lag_member. */
275
static struct ovs_mutex lag_mutex = OVS_MUTEX_INITIALIZER;
276
277
/* All members whose LAG primary interfaces are OVS network devices. */
278
static struct shash lag_shash OVS_GUARDED_BY(lag_mutex)
279
    = SHASH_INITIALIZER(&lag_shash);
280
281
/* Traffic control. */
282
283
/* An instance of a traffic control class.  Always associated with a particular
284
 * network device.
285
 *
286
 * Each TC implementation subclasses this with whatever additional data it
287
 * needs. */
288
struct tc {
289
    const struct tc_ops *ops;
290
    struct hmap queues;         /* Contains "struct tc_queue"s.
291
                                 * Read by generic TC layer.
292
                                 * Written only by TC implementation. */
293
};
294
295
0
#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }
296
297
/* One traffic control queue.
298
 *
299
 * Each TC implementation subclasses this with whatever additional data it
300
 * needs. */
301
struct tc_queue {
302
    struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
303
    unsigned int queue_id;      /* OpenFlow queue ID. */
304
    long long int created;      /* Time queue was created, in msecs. */
305
};
306
307
/* A particular kind of traffic control.  Each implementation generally maps to
308
 * one particular Linux qdisc class.
309
 *
310
 * The functions below return 0 if successful or a positive errno value on
311
 * failure, except where otherwise noted.  All of them must be provided, except
312
 * where otherwise noted. */
313
struct tc_ops {
314
    /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
315
     * This is null for tc_ops_default and tc_ops_other, for which there are no
316
     * appropriate values. */
317
    const char *linux_name;
318
319
    /* Name used in OVS database, e.g. "linux-htb".  Must be nonnull. */
320
    const char *ovs_name;
321
322
    /* Number of supported OpenFlow queues, 0 for qdiscs that have no
323
     * queues.  The queues are numbered 0 through n_queues - 1. */
324
    unsigned int n_queues;
325
326
    /* Called to install this TC class on 'netdev'.  The implementation should
327
     * make the Netlink calls required to set up 'netdev' with the right qdisc
328
     * and configure it according to 'details'.  The implementation may assume
329
     * that the current qdisc is the default; that is, there is no need for it
330
     * to delete the current qdisc before installing itself.
331
     *
332
     * The contents of 'details' should be documented as valid for 'ovs_name'
333
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
334
     * (which is built as ovs-vswitchd.conf.db(8)).
335
     *
336
     * This function must return 0 if and only if it sets 'netdev->tc' to an
337
     * initialized 'struct tc'.
338
     *
339
     * (This function is null for tc_ops_other, which cannot be installed.  For
340
     * other TC classes it should always be nonnull.) */
341
    int (*tc_install)(struct netdev *netdev, const struct smap *details);
342
343
    /* Called when the netdev code determines (through a Netlink query) that
344
     * this TC class's qdisc is installed on 'netdev', but we didn't install
345
     * it ourselves and so don't know any of the details.
346
     *
347
     * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
348
     * 'netdev'.  The TCA_KIND attribute of 'nlmsg' is 'linux_name'.  The
349
     * implementation should parse the other attributes of 'nlmsg' as
350
     * necessary to determine its configuration.  If necessary it should also
351
     * use Netlink queries to determine the configuration of queues on
352
     * 'netdev'.
353
     *
354
     * This function must return 0 if and only if it sets 'netdev->tc' to an
355
     * initialized 'struct tc'. */
356
    int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);
357
358
    /* Destroys the data structures allocated by the implementation as part of
359
     * 'tc'.  (This includes destroying 'tc->queues' by calling
360
     * tc_destroy(tc).
361
     *
362
     * The implementation should not need to perform any Netlink calls.  If
363
     * desirable, the caller is responsible for deconfiguring the kernel qdisc.
364
     * (But it may not be desirable.)
365
     *
366
     * This function may be null if 'tc' is trivial. */
367
    void (*tc_destroy)(struct tc *tc);
368
369
    /* Retrieves details of 'netdev->tc' configuration into 'details'.
370
     *
371
     * The implementation should not need to perform any Netlink calls, because
372
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
373
     * cached the configuration.
374
     *
375
     * The contents of 'details' should be documented as valid for 'ovs_name'
376
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
377
     * (which is built as ovs-vswitchd.conf.db(8)).
378
     *
379
     * This function may be null if 'tc' is not configurable.
380
     */
381
    int (*qdisc_get)(const struct netdev *netdev, struct smap *details);
382
383
    /* Reconfigures 'netdev->tc' according to 'details', performing any
384
     * required Netlink calls to complete the reconfiguration.
385
     *
386
     * The contents of 'details' should be documented as valid for 'ovs_name'
387
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
388
     * (which is built as ovs-vswitchd.conf.db(8)).
389
     *
390
     * This function may be null if 'tc' is not configurable.
391
     */
392
    int (*qdisc_set)(struct netdev *, const struct smap *details);
393
394
    /* Retrieves details of 'queue' on 'netdev->tc' into 'details'.  'queue' is
395
     * one of the 'struct tc_queue's within 'netdev->tc->queues'.
396
     *
397
     * The contents of 'details' should be documented as valid for 'ovs_name'
398
     * in the "other_config" column in the "Queue" table in
399
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
400
     *
401
     * The implementation should not need to perform any Netlink calls, because
402
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
403
     * cached the queue configuration.
404
     *
405
     * This function may be null if 'tc' does not have queues ('n_queues' is
406
     * 0). */
407
    int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
408
                     struct smap *details);
409
410
    /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
411
     * 'details', perfoming any required Netlink calls to complete the
412
     * reconfiguration.  The caller ensures that 'queue_id' is less than
413
     * 'n_queues'.
414
     *
415
     * The contents of 'details' should be documented as valid for 'ovs_name'
416
     * in the "other_config" column in the "Queue" table in
417
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
418
     *
419
     * This function may be null if 'tc' does not have queues or its queues are
420
     * not configurable. */
421
    int (*class_set)(struct netdev *, unsigned int queue_id,
422
                     const struct smap *details);
423
424
    /* Deletes 'queue' from 'netdev->tc'.  'queue' is one of the 'struct
425
     * tc_queue's within 'netdev->tc->queues'.
426
     *
427
     * This function may be null if 'tc' does not have queues or its queues
428
     * cannot be deleted. */
429
    int (*class_delete)(struct netdev *, struct tc_queue *queue);
430
431
    /* Obtains stats for 'queue' from 'netdev->tc'.  'queue' is one of the
432
     * 'struct tc_queue's within 'netdev->tc->queues'.
433
     *
434
     * On success, initializes '*stats'.
435
     *
436
     * This function may be null if 'tc' does not have queues or if it cannot
437
     * report queue statistics. */
438
    int (*class_get_stats)(const struct netdev *netdev,
439
                           const struct tc_queue *queue,
440
                           struct netdev_queue_stats *stats);
441
442
    /* Extracts queue stats from 'nlmsg', which is a response to a
443
     * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
444
     *
445
     * This function may be null if 'tc' does not have queues or if it cannot
446
     * report queue statistics. */
447
    int (*class_dump_stats)(const struct netdev *netdev,
448
                            const struct ofpbuf *nlmsg,
449
                            netdev_dump_queue_stats_cb *cb, void *aux);
450
};
451
452
static void
453
tc_init(struct tc *tc, const struct tc_ops *ops)
454
0
{
455
0
    tc->ops = ops;
456
0
    hmap_init(&tc->queues);
457
0
}
458
459
/* Frees the generic parts of 'tc', i.e. its queue map.  TC implementations
 * call this after releasing whatever implementation-specific data they
 * allocated (see the 'tc_destroy' callback in struct tc_ops). */
static void
tc_destroy(struct tc *tc)
{
    hmap_destroy(&tc->queues);
}
464
465
static const struct tc_ops tc_ops_htb;
466
static const struct tc_ops tc_ops_hfsc;
467
static const struct tc_ops tc_ops_codel;
468
static const struct tc_ops tc_ops_fqcodel;
469
static const struct tc_ops tc_ops_sfq;
470
static const struct tc_ops tc_ops_netem;
471
static const struct tc_ops tc_ops_default;
472
static const struct tc_ops tc_ops_noop;
473
static const struct tc_ops tc_ops_other;
474
475
static const struct tc_ops *const tcs[] = {
476
    &tc_ops_htb,                /* Hierarchy token bucket (see tc-htb(8)). */
477
    &tc_ops_hfsc,               /* Hierarchical fair service curve. */
478
    &tc_ops_codel,              /* Controlled delay */
479
    &tc_ops_fqcodel,            /* Fair queue controlled delay */
480
    &tc_ops_sfq,                /* Stochastic fair queueing */
481
    &tc_ops_netem,              /* Network Emulator */
482
    &tc_ops_noop,               /* Non operating qos type. */
483
    &tc_ops_default,            /* Default qdisc (see tc-pfifo_fast(8)). */
484
    &tc_ops_other,              /* Some other qdisc. */
485
    NULL
486
};
487
488
static unsigned int tc_ticks_to_bytes(uint64_t rate, unsigned int ticks);
489
static unsigned int tc_bytes_to_ticks(uint64_t rate, unsigned int size);
490
static unsigned int tc_buffer_per_jiffy(uint64_t rate);
491
static uint32_t tc_time_to_ticks(uint32_t time);
492
493
static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *,
494
                                                  int type,
495
                                                  unsigned int flags,
496
                                                  struct ofpbuf *);
497
498
static int tc_add_policer(struct netdev *, uint64_t kbits_rate,
499
                          uint32_t kbits_burst, uint32_t kpkts_rate,
500
                          uint32_t kpkts_burst);
501
502
static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
503
                          struct nlattr **options);
504
static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
505
                          struct nlattr **options,
506
                          struct netdev_queue_stats *);
507
static int tc_query_class(const struct netdev *,
508
                          unsigned int handle, unsigned int parent,
509
                          struct ofpbuf **replyp);
510
static int tc_delete_class(const struct netdev *, unsigned int handle);
511
512
static int tc_del_qdisc(struct netdev *netdev);
513
static int tc_query_qdisc(const struct netdev *netdev);
514
static void tc_policer_init(struct tc_police *tc_police, uint64_t kbits_rate,
515
                            uint64_t kbits_burst);
516
517
void
518
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate,
519
            uint64_t rate64);
520
static int tc_calc_cell_log(unsigned int mtu);
521
static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
522
static int tc_calc_buffer(uint64_t Bps, int mtu, uint64_t burst_bytes);
523

524
525
/* This is set pretty low because we probably won't learn anything from the
526
 * additional log messages. */
527
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
528
529
/* Polling miimon status for all ports causes performance degradation when
530
 * handling a large number of ports. If there are no devices using miimon, then
531
 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
532
 *
533
 * Readers do not depend on this variable synchronizing with the related
534
 * changes in the device miimon status, so we can use atomic_count. */
535
static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
536
537
/* Very old kernels from the 2.6 era don't support vnet headers with the tun
538
 * device. We can detect this while constructing a netdev, but need this for
539
 * packet rx/tx. */
540
static bool tap_supports_vnet_hdr = true;
541
542
static int netdev_linux_parse_vnet_hdr(struct dp_packet *b);
543
static int netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu);
544
static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
545
                                   int cmd, const char *cmd_name);
546
static int get_flags(const struct netdev *, unsigned int *flags);
547
static int set_flags(const char *, unsigned int flags);
548
static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
549
                        enum netdev_flags on, enum netdev_flags *old_flagsp)
550
    OVS_REQUIRES(netdev->mutex);
551
static int get_ifindex(const struct netdev *, int *ifindexp);
552
static int do_set_addr(struct netdev *netdev,
553
                       int ioctl_nr, const char *ioctl_name,
554
                       struct in_addr addr);
555
static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
556
static int set_etheraddr(const char *netdev_name, const struct eth_addr);
557
static int af_packet_sock(void);
558
static bool netdev_linux_miimon_enabled(void);
559
static void netdev_linux_miimon_run(void);
560
static void netdev_linux_miimon_wait(void);
561
static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
562
static void netdev_linux_set_ol(struct netdev *netdev);
563
564
/* Returns true if 'netdev' is a tap device (its class is
 * 'netdev_tap_class'), false for any other netdev class. */
static bool
is_tap_netdev(const struct netdev *netdev)
{
    return netdev_get_class(netdev) == &netdev_tap_class;
}
569

570
/* Queries the datapath for the network namespace id of 'netdev' and caches
 * the answer in 'netdev->netnsid'.
 *
 * Returns 0 on success or a positive errno value from the vport query.  On
 * ENOENT the namespace is assumed local; on any other error the cached
 * netnsid is left unset. */
static int
netdev_linux_netnsid_update__(struct netdev_linux *netdev)
{
    struct dpif_netlink_vport reply;
    struct ofpbuf *buf;
    int error;

    error = dpif_netlink_vport_get(netdev_get_name(&netdev->up), &reply, &buf);
    if (error) {
        if (error == ENOENT) {
            /* Assume it is local if there is no API (e.g. if the openvswitch
             * kernel module is not loaded). */
            netnsid_set_local(&netdev->netnsid);
        } else {
            netnsid_unset(&netdev->netnsid);
        }
        return error;
    }

    netnsid_set(&netdev->netnsid, reply.netnsid);
    ofpbuf_delete(buf);
    return 0;
}
593
594
static int
595
netdev_linux_netnsid_update(struct netdev_linux *netdev)
596
0
{
597
0
    if (netnsid_is_unset(netdev->netnsid)) {
598
0
        if (netdev_get_class(&netdev->up) == &netdev_tap_class) {
599
0
            netnsid_set_local(&netdev->netnsid);
600
0
        } else {
601
0
            return netdev_linux_netnsid_update__(netdev);
602
0
        }
603
0
    }
604
605
0
    return 0;
606
0
}
607
608
/* Returns true if 'netdev' is in the network namespace identified by
 * 'nsid', refreshing the cached namespace id first if it is unset. */
static bool
netdev_linux_netnsid_is_eq(struct netdev_linux *netdev, int nsid)
{
    netdev_linux_netnsid_update(netdev);
    return netnsid_eq(netdev->netnsid, nsid);
}
614
615
/* Returns true if 'netdev' lives in a network namespace other than the
 * local one, refreshing the cached namespace id first if it is unset. */
static bool
netdev_linux_netnsid_is_remote(struct netdev_linux *netdev)
{
    netdev_linux_netnsid_update(netdev);
    return netnsid_is_remote(netdev->netnsid);
}
621
622
static int netdev_linux_update_via_netlink(struct netdev_linux *);
623
static void netdev_linux_update(struct netdev_linux *netdev, int,
624
                                const struct rtnetlink_change *)
625
    OVS_REQUIRES(netdev->mutex);
626
static void netdev_linux_changed(struct netdev_linux *netdev,
627
                                 unsigned int ifi_flags, unsigned int mask)
628
    OVS_REQUIRES(netdev->mutex);
629
630
/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
631
 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
632
 * if no such socket could be created. */
633
static struct nl_sock *
netdev_linux_notify_sock(void)
{
    /* The socket is created once, on first call, and shared thereafter. */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static struct nl_sock *sock;
    unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
                                RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};

    if (ovsthread_once_start(&once)) {
        int error;

        error = nl_sock_create(NETLINK_ROUTE, &sock);
        if (!error) {
            size_t i;

            /* Receive notifications from all network namespaces, not just
             * the local one. */
            nl_sock_listen_all_nsid(sock, true);
            for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
                error = nl_sock_join_mcgroup(sock, mcgroups[i]);
                if (error) {
                    /* If any group cannot be joined, give up on the socket
                     * entirely so callers consistently see NULL. */
                    nl_sock_destroy(sock);
                    sock = NULL;
                    break;
                }
            }
        }
        ovsthread_once_done(&once);
    }

    return sock;
}
663
664
/* Returns true if at least one device currently uses miimon link
 * monitoring, so netdev_linux_run()/netdev_linux_wait() can skip the
 * miimon work entirely when no device needs it. */
static bool
netdev_linux_miimon_enabled(void)
{
    return atomic_count_get(&miimon_cnt) > 0;
}
669
670
/* Returns true if 'kind' names a Linux link-aggregation device type
 * ("bond" or "team"), false otherwise. */
static bool
netdev_linux_kind_is_lag(const char *kind)
{
    return !strcmp(kind, "bond") || !strcmp(kind, "team");
}
679
680
/* Keeps the TC ingress-block binding of LAG (bond/team) member interfaces
 * in sync with their primary interface.
 *
 * If 'change' reports an interface joining a LAG whose primary is an OVS
 * Linux netdev with an ingress block, the member is bound to that block
 * and remembered in 'lag_shash'.  If 'change' reports that a tracked
 * member left its LAG ('master_ifindex' == 0), the binding is removed. */
static void
netdev_linux_update_lag(struct rtnetlink_change *change)
    OVS_REQUIRES(lag_mutex)
{
    struct linux_lag_member *lag;

    if (change->sub && netdev_linux_kind_is_lag(change->sub)) {
        lag = shash_find_data(&lag_shash, change->ifname);

        if (!lag) {
            struct netdev *primary_netdev;
            char primary_name[IFNAMSIZ];
            uint32_t block_id;
            int error = 0;

            /* Resolve the LAG primary's name from its ifindex; give up
             * silently if it no longer exists. */
            if (!if_indextoname(change->master_ifindex, primary_name)) {
                return;
            }
            primary_netdev = netdev_from_name(primary_name);
            if (!primary_netdev) {
                return;
            }

            /* If LAG primary member is not attached to ovs,
             * ingress block on LAG members should not be updated. */
            if (!primary_netdev->auto_classified &&
                is_netdev_linux_class(primary_netdev->netdev_class)) {
                block_id = netdev_get_block_id(primary_netdev);
                if (!block_id) {
                    netdev_close(primary_netdev);
                    return;
                }

                lag = xmalloc(sizeof *lag);
                lag->block_id = block_id;
                lag->node = shash_add(&lag_shash, change->ifname, lag);

                /* delete ingress block in case it exists */
                tc_add_del_qdisc(change->if_index, false, 0, TC_INGRESS);
                /* LAG primary is linux netdev so add member to same block. */
                error = tc_add_del_qdisc(change->if_index, true, block_id,
                                         TC_INGRESS);
                if (error) {
                    VLOG_WARN("failed to bind LAG member %s to "
                              "primary's block", change->ifname);
                    shash_delete(&lag_shash, lag->node);
                    free(lag);
                }
            }

            netdev_close(primary_netdev);
        }
    } else if (change->master_ifindex == 0) {
        /* Check if this was a lag member that has been removed. */
        lag = shash_find_data(&lag_shash, change->ifname);

        if (lag) {
            tc_add_del_qdisc(change->if_index, false, lag->block_id,
                             TC_INGRESS);
            shash_delete(&lag_shash, lag->node);
            free(lag);
        }
    }
}
744
745
/* Periodic work for the Linux netdev classes: runs miimon polling when
 * enabled, then drains the shared rtnetlink notification socket, applying
 * each link/address change to the matching netdev and updating LAG
 * membership state.
 *
 * On receive-queue overrun (ENOBUFS) the individual notifications are
 * lost, so every known Linux netdev is refreshed from its current kernel
 * flags instead. */
void
netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
{
    struct nl_sock *sock;
    int error;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_run();
    }

    sock = netdev_linux_notify_sock();
    if (!sock) {
        /* No notification socket could be created; nothing to process. */
        return;
    }

    /* Loop until the socket would block (EAGAIN) or fails. */
    do {
        uint64_t buf_stub[4096 / 8];
        int nsid;
        struct ofpbuf buf;

        ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
        error = nl_sock_recv(sock, &buf, &nsid, false);
        if (!error) {
            struct rtnetlink_change change;

            if (rtnetlink_parse(&buf, &change) && !change.irrelevant) {
                struct netdev *netdev_ = NULL;
                char dev_name[IFNAMSIZ];

                /* Some notifications omit the name; recover it from the
                 * ifindex so we can look the device up. */
                if (!change.ifname) {
                     change.ifname = if_indextoname(change.if_index, dev_name);
                }

                if (change.ifname) {
                    netdev_ = netdev_from_name(change.ifname);
                }
                if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
                    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

                    ovs_mutex_lock(&netdev->mutex);
                    netdev_linux_update(netdev, nsid, &change);
                    ovs_mutex_unlock(&netdev->mutex);
                }

                if (change.ifname &&
                    rtnetlink_type_is_rtnlgrp_link(change.nlmsg_type)) {

                    /* Need to try updating the LAG information. */
                    ovs_mutex_lock(&lag_mutex);
                    netdev_linux_update_lag(&change);
                    ovs_mutex_unlock(&lag_mutex);
                }
                netdev_close(netdev_);
            }
        } else if (error == ENOBUFS) {
            /* The kernel dropped some notifications: drain the socket and
             * resynchronize every Linux netdev from scratch. */
            struct shash device_shash;
            struct shash_node *node;

            nl_sock_drain(sock);

            shash_init(&device_shash);
            netdev_get_devices(&netdev_linux_class, &device_shash);
            SHASH_FOR_EACH (node, &device_shash) {
                struct netdev *netdev_ = node->data;
                struct netdev_linux *netdev = netdev_linux_cast(netdev_);
                unsigned int flags;

                ovs_mutex_lock(&netdev->mutex);
                get_flags(netdev_, &flags);
                /* mask == 0 invalidates every cached field. */
                netdev_linux_changed(netdev, flags, 0);
                ovs_mutex_unlock(&netdev->mutex);

                netdev_close(netdev_);
            }
            shash_destroy(&device_shash);
        } else if (error != EAGAIN) {
            static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
            VLOG_WARN_RL(&rll, "error reading or parsing netlink (%s)",
                         ovs_strerror(error));
        }
        ofpbuf_uninit(&buf);
    } while (!error);
}
828
829
static void
830
netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
831
0
{
832
0
    struct nl_sock *sock;
833
834
0
    if (netdev_linux_miimon_enabled()) {
835
0
        netdev_linux_miimon_wait();
836
0
    }
837
0
    sock = netdev_linux_notify_sock();
838
0
    if (sock) {
839
0
        nl_sock_wait(sock, POLLIN);
840
0
    }
841
0
}
842
843
static void
844
netdev_linux_changed(struct netdev_linux *dev,
845
                     unsigned int ifi_flags, unsigned int mask)
846
    OVS_REQUIRES(dev->mutex)
847
0
{
848
0
    netdev_change_seq_changed(&dev->up);
849
850
0
    if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
851
0
        dev->carrier_resets++;
852
0
    }
853
0
    dev->ifi_flags = ifi_flags;
854
855
0
    dev->cache_valid &= mask;
856
0
    if (!(mask & VALID_IN)) {
857
0
        netdev_get_addrs_list_flush();
858
0
    }
859
0
}
860
861
/* Applies an rtnetlink 'change' message to 'dev'.  Link messages
 * (RTM_NEWLINK/RTM_DELLINK) refresh or tear down cached link state; address
 * messages invalidate the cached in4/in6 addresses.  Any other message type
 * is a caller bug (OVS_NOT_REACHED). */
static void
netdev_linux_update__(struct netdev_linux *dev,
                      const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
        if (change->nlmsg_type == RTM_NEWLINK) {
            /* Keep drv-info, ip addresses, and NUMA id. */
            netdev_linux_changed(dev, change->ifi_flags,
                                 VALID_DRVINFO | VALID_IN | VALID_NUMA_ID);

            /* Update netdev from rtnl-change msg.  Fields the message
             * carries become authoritative, so mark them cached-valid and
             * clear any sticky error from a previous failed query. */
            if (change->mtu) {
                dev->mtu = change->mtu;
                dev->cache_valid |= VALID_MTU;
                dev->netdev_mtu_error = 0;
            }

            if (!eth_addr_is_zero(change->mac)) {
                dev->etheraddr = change->mac;
                dev->cache_valid |= VALID_ETHERADDR;
                dev->ether_addr_error = 0;

                /* The mac addr has been changed, report it now. */
                rtnetlink_report_link();
            }

            /* 'primary' names the bond/team primary device, if any. */
            if (change->primary && netdev_linux_kind_is_lag(change->primary)) {
                dev->is_lag_primary = true;
            }

            dev->ifindex = change->if_index;
            dev->cache_valid |= VALID_IFINDEX;
            dev->get_ifindex_error = 0;
            dev->present = true;
        } else {
            /* RTM_DELLINK: the device left our view.  FIXME */
            netdev_linux_changed(dev, change->ifi_flags, 0);
            dev->present = false;
            netnsid_unset(&dev->netnsid);
        }
    } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
        /* Invalidates in4, in6. */
        netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
    } else {
        OVS_NOT_REACHED();
    }
}
909
910
static void
911
netdev_linux_update(struct netdev_linux *dev, int nsid,
912
                    const struct rtnetlink_change *change)
913
    OVS_REQUIRES(dev->mutex)
914
0
{
915
0
    if (netdev_linux_netnsid_is_eq(dev, nsid)) {
916
0
        netdev_linux_update__(dev, change);
917
0
    }
918
0
}
919
920
static struct netdev *
921
netdev_linux_alloc(void)
922
0
{
923
0
    struct netdev_linux *netdev = xzalloc(sizeof *netdev);
924
0
    return &netdev->up;
925
0
}
926
927
static int
928
netdev_linux_common_construct(struct netdev *netdev_)
929
0
{
930
    /* Prevent any attempt to create (or open) a network device named "default"
931
     * or "all".  These device names are effectively reserved on Linux because
932
     * /proc/sys/net/ipv4/conf/ always contains directories by these names.  By
933
     * itself this wouldn't call for any special treatment, but in practice if
934
     * a program tries to create devices with these names, it causes the kernel
935
     * to fire a "new device" notification event even though creation failed,
936
     * and in turn that causes OVS to wake up and try to create them again,
937
     * which ends up as a 100% CPU loop. */
938
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
939
0
    const char *name = netdev_->name;
940
0
    if (!strcmp(name, "default") || !strcmp(name, "all")) {
941
0
        static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 1);
942
0
        VLOG_WARN_RL(&rll, "%s: Linux forbids network device with this name",
943
0
                     name);
944
0
        return EINVAL;
945
0
    }
946
947
    /* The device could be in the same network namespace or in another one. */
948
0
    netnsid_unset(&netdev->netnsid);
949
0
    ovs_mutex_init(&netdev->mutex);
950
951
0
    return 0;
952
0
}
953
954
/* Creates system and internal devices.  Returns 0 on success or a positive
 * errno value.  For "internal" devices a missing kernel device is tolerated
 * (see below); for system devices it is fatal. */
int
netdev_linux_construct(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = netdev_linux_common_construct(netdev_);
    if (error) {
        return error;
    }

    if (userspace_tso_enabled()) {
        /* The AF_PACKET socket interface uses the same option to facilitate
         * both csum and segmentation offloading. However, these features can
         * be toggled off or on individually at the interface level. The netdev
         * flags are set based on the features indicated by ethtool. */
        netdev_linux_set_ol(netdev_);
    }

    /* Cache the current interface flags; ENODEV tells us whether the
     * underlying kernel device exists yet. */
    error = get_flags(&netdev->up, &netdev->ifi_flags);
    if (error == ENODEV) {
        if (netdev->up.netdev_class != &netdev_internal_class) {
            /* The device does not exist, so don't allow it to be opened. */
            return ENODEV;
        } else {
            /* "Internal" netdevs have to be created as netdev objects before
             * they exist in the kernel, because creating them in the kernel
             * happens by passing a netdev object to dpif_port_add().
             * Therefore, ignore the error. */
        }
    }

    return 0;
}
987
988
/* For most types of netdevs we open the device for each call of
989
 * netdev_open().  However, this is not the case with tap devices,
990
 * since it is only possible to open the device once.  In this
991
 * situation we share a single file descriptor, and consequently
992
 * buffers, across all readers.  Therefore once data is read it will
993
 * be unavailable to other reads for tap devices. */
994
static int
995
netdev_linux_construct_tap(struct netdev *netdev_)
996
0
{
997
0
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
998
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
999
0
    static const char tap_dev[] = "/dev/net/tun";
1000
0
    const char *name = netdev_->name;
1001
0
    unsigned long oflags;
1002
0
    unsigned int up;
1003
0
    struct ifreq ifr;
1004
1005
0
    int error = netdev_linux_common_construct(netdev_);
1006
0
    if (error) {
1007
0
        return error;
1008
0
    }
1009
1010
    /* Open tap device. */
1011
0
    netdev->tap_fd = open(tap_dev, O_RDWR);
1012
0
    if (netdev->tap_fd < 0) {
1013
0
        error = errno;
1014
0
        VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
1015
0
        return error;
1016
0
    }
1017
1018
    /* Create tap device. */
1019
0
    get_flags(&netdev->up, &netdev->ifi_flags);
1020
1021
0
    if (ovsthread_once_start(&once)) {
1022
0
        if (ioctl(netdev->tap_fd, TUNGETFEATURES, &up) == -1) {
1023
0
            VLOG_WARN("%s: querying tap features failed: %s", name,
1024
0
                      ovs_strerror(errno));
1025
0
            tap_supports_vnet_hdr = false;
1026
0
        } else if (!(up & IFF_VNET_HDR)) {
1027
0
            VLOG_WARN("TAP interfaces do not support virtio-net headers");
1028
0
            tap_supports_vnet_hdr = false;
1029
0
        }
1030
0
        ovsthread_once_done(&once);
1031
0
    }
1032
1033
0
    memset(&ifr, 0, sizeof ifr);
1034
1035
0
    ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
1036
0
    if (tap_supports_vnet_hdr) {
1037
0
        ifr.ifr_flags |= IFF_VNET_HDR;
1038
0
    }
1039
1040
0
    ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
1041
0
    if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
1042
0
        VLOG_WARN("%s: creating tap device failed: %s", name,
1043
0
                  ovs_strerror(errno));
1044
0
        error = errno;
1045
0
        goto error_close;
1046
0
    }
1047
1048
    /* Make non-blocking. */
1049
0
    error = set_nonblocking(netdev->tap_fd);
1050
0
    if (error) {
1051
0
        goto error_close;
1052
0
    }
1053
1054
0
    if (ioctl(netdev->tap_fd, TUNSETPERSIST, 1)) {
1055
0
        VLOG_WARN("%s: creating tap device failed (persist): %s", name,
1056
0
                  ovs_strerror(errno));
1057
0
        error = errno;
1058
0
        goto error_close;
1059
0
    }
1060
1061
0
    oflags = TUN_F_CSUM;
1062
0
    if (userspace_tso_enabled()) {
1063
0
        oflags |= (TUN_F_TSO4 | TUN_F_TSO6);
1064
0
    }
1065
1066
0
    if (tap_supports_vnet_hdr
1067
0
        && ioctl(netdev->tap_fd, TUNSETOFFLOAD, oflags) == 0) {
1068
0
        netdev_->ol_flags |= (NETDEV_TX_OFFLOAD_TCP_CKSUM
1069
0
                              | NETDEV_TX_OFFLOAD_UDP_CKSUM);
1070
1071
0
        if (userspace_tso_enabled()) {
1072
0
            netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
1073
0
        }
1074
0
    } else {
1075
0
       VLOG_INFO("%s: Disabling checksum and segment offloading due to "
1076
0
                 "missing kernel support", name);
1077
0
    }
1078
1079
0
    netdev->present = true;
1080
0
    return 0;
1081
1082
0
error_close:
1083
0
    close(netdev->tap_fd);
1084
0
    return error;
1085
0
}
1086
1087
static void
1088
netdev_linux_destruct(struct netdev *netdev_)
1089
0
{
1090
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1091
1092
0
    if (netdev->tc && netdev->tc->ops->tc_destroy) {
1093
0
        netdev->tc->ops->tc_destroy(netdev->tc);
1094
0
    }
1095
1096
0
    if (netdev_get_class(netdev_) == &netdev_tap_class
1097
0
        && netdev->tap_fd >= 0)
1098
0
    {
1099
0
        ioctl(netdev->tap_fd, TUNSETPERSIST, 0);
1100
0
        close(netdev->tap_fd);
1101
0
    }
1102
1103
0
    if (netdev->miimon_interval > 0) {
1104
0
        atomic_count_dec(&miimon_cnt);
1105
0
    }
1106
1107
0
    ovs_mutex_destroy(&netdev->mutex);
1108
0
}
1109
1110
/* Releases the storage obtained in netdev_linux_alloc(). */
static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    free(netdev_linux_cast(netdev_));
}
1116
1117
static struct netdev_rxq *
1118
netdev_linux_rxq_alloc(void)
1119
0
{
1120
0
    struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
1121
0
    return &rx->up;
1122
0
}
1123
1124
static int
1125
netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
1126
0
{
1127
0
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1128
0
    struct netdev *netdev_ = rx->up.netdev;
1129
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1130
0
    int error;
1131
1132
0
    ovs_mutex_lock(&netdev->mutex);
1133
0
    rx->is_tap = is_tap_netdev(netdev_);
1134
0
    if (rx->is_tap) {
1135
0
        rx->fd = netdev->tap_fd;
1136
0
    } else {
1137
0
        struct sockaddr_ll sll;
1138
0
        int ifindex, val;
1139
        /* Result of tcpdump -dd inbound */
1140
0
        static const struct sock_filter filt[] = {
1141
0
            { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
1142
0
            { 0x15, 0, 1, 0x00000004 }, /* jeq #4     jt 2  jf 3 */
1143
0
            { 0x6, 0, 0, 0x00000000 },  /* ret #0 */
1144
0
            { 0x6, 0, 0, 0x0000ffff }   /* ret #65535 */
1145
0
        };
1146
0
        static const struct sock_fprog fprog = {
1147
0
            ARRAY_SIZE(filt), (struct sock_filter *) filt
1148
0
        };
1149
1150
        /* Create file descriptor. */
1151
0
        rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
1152
0
        if (rx->fd < 0) {
1153
0
            error = errno;
1154
0
            VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
1155
0
            goto error;
1156
0
        }
1157
1158
0
        val = 1;
1159
0
        if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
1160
0
            error = errno;
1161
0
            VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
1162
0
                     netdev_get_name(netdev_), ovs_strerror(error));
1163
0
            goto error;
1164
0
        }
1165
1166
0
        if (userspace_tso_enabled()
1167
0
            && setsockopt(rx->fd, SOL_PACKET, PACKET_VNET_HDR, &val,
1168
0
                          sizeof val)) {
1169
0
            error = errno;
1170
0
            VLOG_ERR("%s: failed to enable vnet hdr in txq raw socket: %s",
1171
0
                     netdev_get_name(netdev_), ovs_strerror(errno));
1172
0
            goto error;
1173
0
        }
1174
1175
        /* Set non-blocking mode. */
1176
0
        error = set_nonblocking(rx->fd);
1177
0
        if (error) {
1178
0
            goto error;
1179
0
        }
1180
1181
        /* Get ethernet device index. */
1182
0
        error = get_ifindex(&netdev->up, &ifindex);
1183
0
        if (error) {
1184
0
            goto error;
1185
0
        }
1186
1187
        /* Bind to specific ethernet device. */
1188
0
        memset(&sll, 0, sizeof sll);
1189
0
        sll.sll_family = AF_PACKET;
1190
0
        sll.sll_ifindex = ifindex;
1191
0
        sll.sll_protocol = htons(ETH_P_ALL);
1192
0
        if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
1193
0
            error = errno;
1194
0
            VLOG_ERR("%s: failed to bind raw socket (%s)",
1195
0
                     netdev_get_name(netdev_), ovs_strerror(error));
1196
0
            goto error;
1197
0
        }
1198
1199
        /* Filter for only inbound packets. */
1200
0
        error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
1201
0
                           sizeof fprog);
1202
0
        if (error) {
1203
0
            error = errno;
1204
0
            VLOG_ERR("%s: failed to attach filter (%s)",
1205
0
                     netdev_get_name(netdev_), ovs_strerror(error));
1206
0
            goto error;
1207
0
        }
1208
0
    }
1209
0
    ovs_mutex_unlock(&netdev->mutex);
1210
1211
0
    return 0;
1212
1213
0
error:
1214
0
    if (rx->fd >= 0) {
1215
0
        close(rx->fd);
1216
0
    }
1217
0
    ovs_mutex_unlock(&netdev->mutex);
1218
0
    return error;
1219
0
}
1220
1221
static void
1222
netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
1223
0
{
1224
0
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1225
0
    int i;
1226
1227
0
    if (!rx->is_tap) {
1228
0
        close(rx->fd);
1229
0
    }
1230
1231
0
    for (i = 0; i < NETDEV_MAX_BURST; i++) {
1232
0
        dp_packet_delete(rx->aux_bufs[i]);
1233
0
    }
1234
0
}
1235
1236
/* Releases the storage obtained in netdev_linux_rxq_alloc(). */
static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    free(netdev_rxq_linux_cast(rxq_));
}
1243
1244
static ovs_be16
1245
auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
1246
0
{
1247
0
    if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
1248
0
        return htons(aux->tp_vlan_tpid);
1249
0
    } else if (double_tagged) {
1250
0
        return htons(ETH_TYPE_VLAN_8021AD);
1251
0
    } else {
1252
0
        return htons(ETH_TYPE_VLAN_8021Q);
1253
0
    }
1254
0
}
1255
1256
/* Returns true if 'aux' carries VLAN information: either the status bit is
 * set or the TCI itself is nonzero. */
static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    return (aux->tp_status & TP_STATUS_VLAN_VALID) || aux->tp_vlan_tci;
}
1261
1262
/*
1263
 * Receive packets from raw socket in batch process for better performance,
1264
 * it can receive NETDEV_MAX_BURST packets at most once, the received
1265
 * packets are added into *batch. The return value is 0 or errno.
1266
 *
1267
 * It also used recvmmsg to reduce multiple syscalls overhead;
1268
 */
1269
static int
netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu,
                                 struct dp_packet_batch *batch)
{
    int iovlen;
    size_t std_len;
    ssize_t retval;
    int virtio_net_hdr_size;
    struct iovec iovs[NETDEV_MAX_BURST][IOV_TSO_SIZE];
    struct cmsghdr *cmsg;
    union {
        struct cmsghdr cmsg;
        char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
    } cmsg_buffers[NETDEV_MAX_BURST];
    struct mmsghdr mmsgs[NETDEV_MAX_BURST];
    struct dp_packet *buffers[NETDEV_MAX_BURST];
    int i;

    if (userspace_tso_enabled()) {
        /* Use the buffer from the allocated packet below to receive MTU
         * sized packets and an aux_buf for extra TSO data. */
        iovlen = IOV_TSO_SIZE;
        virtio_net_hdr_size = sizeof(struct virtio_net_hdr);
    } else {
        /* Use only the buffer from the allocated packet. */
        iovlen = IOV_STD_SIZE;
        virtio_net_hdr_size = 0;
    }

    /* The length here needs to be accounted in the same way when the
     * aux_buf is allocated so that it can be prepended to TSO buffer. */
    std_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN + mtu;
    for (i = 0; i < NETDEV_MAX_BURST; i++) {
        buffers[i] = dp_packet_new_with_headroom(std_len, DP_NETDEV_HEADROOM);
        iovs[i][IOV_PACKET].iov_base = dp_packet_data(buffers[i]);
        iovs[i][IOV_PACKET].iov_len = std_len;
        if (iovlen == IOV_TSO_SIZE) {
            iovs[i][IOV_AUXBUF].iov_base = dp_packet_data(rx->aux_bufs[i]);
            iovs[i][IOV_AUXBUF].iov_len = dp_packet_tailroom(rx->aux_bufs[i]);
        }

        mmsgs[i].msg_hdr.msg_name = NULL;
        mmsgs[i].msg_hdr.msg_namelen = 0;
        mmsgs[i].msg_hdr.msg_iov = iovs[i];
        mmsgs[i].msg_hdr.msg_iovlen = iovlen;
        mmsgs[i].msg_hdr.msg_control = &cmsg_buffers[i];
        mmsgs[i].msg_hdr.msg_controllen = sizeof cmsg_buffers[i];
        mmsgs[i].msg_hdr.msg_flags = 0;
    }

    /* One syscall for the whole burst; retry if interrupted by a signal. */
    do {
        retval = recvmmsg(rx->fd, mmsgs, NETDEV_MAX_BURST, MSG_TRUNC, NULL);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        retval = errno;
        for (i = 0; i < NETDEV_MAX_BURST; i++) {
            dp_packet_delete(buffers[i]);
        }

        return retval;
    }

    /* 'retval' is now the number of messages received. */
    for (i = 0; i < retval; i++) {
        struct dp_packet *pkt;

        if (mmsgs[i].msg_hdr.msg_flags & MSG_TRUNC
            || mmsgs[i].msg_len < ETH_HEADER_LEN) {
            struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            /* The rx->aux_bufs[i] will be re-used next time. */
            dp_packet_delete(buffers[i]);
            netdev->rx_dropped += 1;
            if (mmsgs[i].msg_hdr.msg_flags & MSG_TRUNC) {
                /* Data is truncated, so the packet is corrupted, and needs
                 * to be dropped. This can happen if TSO/GRO is enabled in
                 * the kernel, but not in userspace, i.e. there is no dp
                 * buffer to store the full packet. */
                VLOG_WARN_RL(&rl,
                             "%s: Dropped packet: Too big. GRO/TSO enabled?",
                             netdev_get_name(netdev_));
            } else {
                VLOG_WARN_RL(&rl,
                             "%s: Dropped packet: less than ether hdr size",
                             netdev_get_name(netdev_));
            }

            continue;
        }

        if (mmsgs[i].msg_len > std_len) {
            /* Build a single linear TSO packet by prepending the data from
             * std_len buffer to the aux_buf. */
            pkt = rx->aux_bufs[i];
            dp_packet_set_size(pkt, mmsgs[i].msg_len - std_len);
            dp_packet_push(pkt, dp_packet_data(buffers[i]), std_len);
            /* The headroom should be the same in buffers[i], pkt and
             * DP_NETDEV_HEADROOM. */
            dp_packet_resize(pkt, DP_NETDEV_HEADROOM, 0);
            dp_packet_delete(buffers[i]);
            /* Ownership of the aux_buf moved into the batch; a fresh one is
             * allocated by the caller on the next receive. */
            rx->aux_bufs[i] = NULL;
         } else {
            dp_packet_set_size(buffers[i], mmsgs[i].msg_len);
            pkt = buffers[i];
         }

        if (virtio_net_hdr_size) {
            int ret = netdev_linux_parse_vnet_hdr(pkt);
            if (OVS_UNLIKELY(ret)) {
                struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
                struct netdev_linux *netdev = netdev_linux_cast(netdev_);

                /* Unexpected error situation: the virtio header is not
                 * present or corrupted or contains unsupported features.
                 * Drop the packet but continue in case next ones are
                 * correct. */
                dp_packet_delete(pkt);
                netdev->rx_dropped += 1;
                VLOG_WARN_RL(&rl, "%s: Dropped packet: vnet header is missing "
                             "or corrupt: %s", netdev_get_name(netdev_),
                             ovs_strerror(ret));
                continue;
            }
        }

        /* Walk the control messages for PACKET_AUXDATA and, if the kernel
         * stripped a VLAN tag, push it back into the packet data. */
        for (cmsg = CMSG_FIRSTHDR(&mmsgs[i].msg_hdr); cmsg;
                 cmsg = CMSG_NXTHDR(&mmsgs[i].msg_hdr, cmsg)) {
            const struct tpacket_auxdata *aux;

            if (cmsg->cmsg_level != SOL_PACKET
                || cmsg->cmsg_type != PACKET_AUXDATA
                || cmsg->cmsg_len <
                       CMSG_LEN(sizeof(struct tpacket_auxdata))) {
                continue;
            }

            aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
            if (auxdata_has_vlan_tci(aux)) {
                struct eth_header *eth;
                bool double_tagged;

                eth = dp_packet_data(pkt);
                double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);

                eth_push_vlan(pkt,
                              auxdata_to_vlan_tpid(aux, double_tagged),
                              htons(aux->tp_vlan_tci));
                break;
            }
        }
        dp_packet_batch_add(batch, pkt);
    }

    /* Delete unused buffers. */
    for (; i < NETDEV_MAX_BURST; i++) {
        dp_packet_delete(buffers[i]);
    }

    return 0;
}
1430
1431
/*
1432
 * Receive packets from tap by batch process for better performance,
1433
 * it can receive NETDEV_MAX_BURST packets at most once, the received
1434
 * packets are added into *batch. The return value is 0 or errno.
1435
 */
1436
static int
netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu,
                                struct dp_packet_batch *batch)
{
    int virtio_net_hdr_size;
    ssize_t retval;
    size_t std_len;
    int iovlen;
    int i;

    if (userspace_tso_enabled()) {
        /* Use the buffer from the allocated packet below to receive MTU
         * sized packets and an aux_buf for extra TSO data. */
        iovlen = IOV_TSO_SIZE;
    } else {
        /* Use only the buffer from the allocated packet. */
        iovlen = IOV_STD_SIZE;
    }
    if (OVS_LIKELY(tap_supports_vnet_hdr)) {
        virtio_net_hdr_size = sizeof(struct virtio_net_hdr);
    } else {
        virtio_net_hdr_size = 0;
    }

    /* The length here needs to be accounted in the same way when the
     * aux_buf is allocated so that it can be prepended to TSO buffer. */
    std_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN + mtu;
    for (i = 0; i < NETDEV_MAX_BURST; i++) {
        struct dp_packet *buffer;
        struct dp_packet *pkt;
        struct iovec iov[IOV_TSO_SIZE];

        /* Assume Ethernet port. No need to set packet_type. */
        buffer = dp_packet_new_with_headroom(std_len, DP_NETDEV_HEADROOM);
        iov[IOV_PACKET].iov_base = dp_packet_data(buffer);
        iov[IOV_PACKET].iov_len = std_len;
        if (iovlen == IOV_TSO_SIZE) {
            iov[IOV_AUXBUF].iov_base = dp_packet_data(rx->aux_bufs[i]);
            iov[IOV_AUXBUF].iov_len = dp_packet_tailroom(rx->aux_bufs[i]);
        }

        /* One readv() per packet (the tap fd yields one frame per read);
         * retry if interrupted by a signal. */
        do {
            retval = readv(rx->fd, iov, iovlen);
        } while (retval < 0 && errno == EINTR);

        if (retval < 0) {
            dp_packet_delete(buffer);
            break;
        }

        if (retval > std_len) {
            /* Build a single linear TSO packet by prepending the data from
             * std_len buffer to the aux_buf. */
            pkt = rx->aux_bufs[i];
            dp_packet_set_size(pkt, retval - std_len);
            dp_packet_push(pkt, dp_packet_data(buffer), std_len);
            /* The headroom should be the same in buffers[i], pkt and
             * DP_NETDEV_HEADROOM. */
            dp_packet_resize(pkt, DP_NETDEV_HEADROOM, 0);
            dp_packet_delete(buffer);
            /* Ownership of the aux_buf moved into the batch; the caller
             * allocates a replacement on the next receive. */
            rx->aux_bufs[i] = NULL;
        } else {
            dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
            pkt = buffer;
        }

        if (OVS_LIKELY(virtio_net_hdr_size) &&
            netdev_linux_parse_vnet_hdr(pkt)) {
            struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            /* Unexpected error situation: the virtio header is not present
             * or corrupted. Drop the packet but continue in case next ones
             * are correct. */
            dp_packet_delete(pkt);
            netdev->rx_dropped += 1;
            VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net header",
                         netdev_get_name(netdev_));
            continue;
        }

        dp_packet_batch_add(batch, pkt);
    }

    /* Report an error only if nothing at all was received; otherwise return
     * what we have.  NOTE(review): 'errno' here is assumed to still hold the
     * readv() failure code -- dp_packet_delete() in between is presumed not
     * to clobber it; confirm. */
    if ((i == 0) && (retval < 0)) {
        return errno;
    }

    return 0;
}
1526
1527
static int
1528
netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
1529
                      int *qfill)
1530
0
{
1531
0
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1532
0
    struct netdev *netdev = rx->up.netdev;
1533
0
    ssize_t retval;
1534
0
    int mtu;
1535
1536
0
    if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1537
0
        mtu = ETH_PAYLOAD_MAX;
1538
0
    }
1539
1540
0
    if (userspace_tso_enabled()) {
1541
        /* Allocate TSO packets. The packet has enough headroom to store
1542
         * a full non-TSO packet. When a TSO packet is received, the data
1543
         * from non-TSO buffer (std_len) is prepended to the TSO packet
1544
         * (aux_buf). */
1545
0
        size_t std_len = sizeof(struct virtio_net_hdr) + VLAN_ETH_HEADER_LEN
1546
0
                         + DP_NETDEV_HEADROOM + mtu;
1547
0
        size_t data_len = LINUX_RXQ_TSO_MAX_LEN - std_len;
1548
0
        for (int i = 0; i < NETDEV_MAX_BURST; i++) {
1549
0
            if (rx->aux_bufs[i]) {
1550
0
                continue;
1551
0
            }
1552
1553
0
            rx->aux_bufs[i] = dp_packet_new_with_headroom(data_len, std_len);
1554
0
        }
1555
0
    }
1556
1557
0
    dp_packet_batch_init(batch);
1558
0
    retval = (rx->is_tap
1559
0
              ? netdev_linux_batch_rxq_recv_tap(rx, mtu, batch)
1560
0
              : netdev_linux_batch_rxq_recv_sock(rx, mtu, batch));
1561
1562
0
    if (retval) {
1563
0
        if (retval != EAGAIN && retval != EMSGSIZE) {
1564
0
            VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
1565
0
                         netdev_rxq_get_name(rxq_), ovs_strerror(errno));
1566
0
        }
1567
0
    }
1568
1569
0
    if (qfill) {
1570
0
        *qfill = -ENOTSUP;
1571
0
    }
1572
1573
0
    return retval;
1574
0
}
1575
1576
static void
1577
netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
1578
0
{
1579
0
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1580
0
    poll_fd_wait(rx->fd, POLLIN);
1581
0
}
1582
1583
static int
1584
netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
1585
0
{
1586
0
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
1587
0
    if (rx->is_tap) {
1588
0
        struct ifreq ifr;
1589
0
        int error;
1590
1591
0
        memset(&ifr, 0, sizeof ifr);
1592
0
        error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
1593
0
                                    SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
1594
0
        if (error) {
1595
0
            return error;
1596
0
        }
1597
0
        drain_fd(rx->fd, ifr.ifr_qlen);
1598
0
        return 0;
1599
0
    } else {
1600
0
        return drain_rcvbuf(rx->fd);
1601
0
    }
1602
0
}
1603
1604
/* Sends 'batch' on the AF_PACKET socket 'sock' bound via 'ifindex'.  When
 * 'tso' is set, a virtio-net header is prepended to each packet first
 * (packets for which that fails are dropped and counted, not fatal).
 * Returns 0 on success or a positive errno value from sendmmsg(). */
static int
netdev_linux_sock_batch_send(struct netdev *netdev_, int sock, int ifindex,
                             bool tso, int mtu, struct dp_packet_batch *batch)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const size_t size = dp_packet_batch_size(batch);
    /* We don't bother setting most fields in sockaddr_ll because the
     * kernel ignores them for SOCK_RAW. */
    struct sockaddr_ll sll = { .sll_family = AF_PACKET,
                               .sll_ifindex = ifindex };

    struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * size);
    struct iovec *iov = xmalloc(sizeof(*iov) * size);
    struct dp_packet *packet;
    int cnt = 0;

    /* Build one single-iov message per packet; 'cnt' may end up smaller
     * than 'size' if some packets are dropped during vnet-hdr prepend. */
    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        if (tso) {
            int ret = netdev_linux_prepend_vnet_hdr(packet, mtu);

            if (OVS_UNLIKELY(ret)) {
                netdev->tx_dropped += 1;
                VLOG_WARN_RL(&rl, "%s: Prepend vnet hdr failed, packet "
                                  "dropped. %s", netdev_get_name(netdev_),
                             ovs_strerror(ret));
                continue;
            }
         }

        iov[cnt].iov_base = dp_packet_data(packet);
        iov[cnt].iov_len = dp_packet_size(packet);
        mmsg[cnt].msg_hdr = (struct msghdr) { .msg_name = &sll,
                                              .msg_namelen = sizeof sll,
                                              .msg_iov = &iov[cnt],
                                              .msg_iovlen = 1 };
        cnt++;
    }

    /* sendmmsg() may send only a prefix of the messages; resume from the
     * first unsent one ('ofs') until all are sent or an error occurs.
     * EINTR is retried at the same offset. */
    int error = 0;
    for (uint32_t ofs = 0; ofs < cnt;) {
        ssize_t retval;
        do {
            retval = sendmmsg(sock, mmsg + ofs, cnt - ofs, 0);
            error = retval < 0 ? errno : 0;
        } while (error == EINTR);
        if (error) {
            break;
        }
        ofs += retval;
    }

    free(mmsg);
    free(iov);
    return error;
}
1659
1660
/* Use the tap fd to send 'batch' to tap device 'netdev'.  Using the tap fd is
 * essential, because packets sent to a tap device with an AF_PACKET socket
 * will loop back to be *received* again on the tap device.  This doesn't occur
 * on other interface types because we attach a socket filter to the rx
 * socket.
 *
 * Returns 0 on success (including packets dropped because the device is
 * down), otherwise a positive errno value.  Dropped packets are counted in
 * 'netdev->tx_dropped'. */
static int
netdev_linux_tap_batch_send(struct netdev *netdev_, int mtu,
                            struct dp_packet_batch *batch)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct dp_packet *packet;

    /* The Linux tap driver returns EIO if the device is not up,
     * so if the device is not up, don't waste time sending it.
     * However, if the device is in another network namespace
     * then OVS can't retrieve the state. In that case, send the
     * packets anyway. */
    if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
        netdev->tx_dropped += dp_packet_batch_size(batch);
        return 0;
    }

    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        size_t size;
        ssize_t retval;
        int error;

        if (OVS_LIKELY(tap_supports_vnet_hdr)) {
            /* The tap fd expects a virtio_net header in front of each
             * packet; on failure the packet is dropped, not the batch. */
            error = netdev_linux_prepend_vnet_hdr(packet, mtu);
            if (OVS_UNLIKELY(error)) {
                netdev->tx_dropped++;
                VLOG_WARN_RL(&rl, "%s: Prepend vnet hdr failed, packet "
                             "dropped. %s", netdev_get_name(netdev_),
                             ovs_strerror(error));
                continue;
            }
        }

        size = dp_packet_size(packet);
        do {
            /* Restart the write if it is interrupted by a signal. */
            retval = write(netdev->tap_fd, dp_packet_data(packet), size);
            error = retval < 0 ? errno : 0;
        } while (error == EINTR);

        if (error) {
            /* The Linux tap driver returns EIO if the device is not up.  From
             * the OVS side this is not an error, so we ignore it; otherwise,
             * return the error. */
            if (error != EIO) {
                return error;
            }
        } else if (retval != size) {
            /* A short write to the tap fd means the packet was truncated. */
            VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" "
                         "bytes of %"PRIuSIZE") on %s",
                         retval, size, netdev_get_name(netdev_));
            return EMSGSIZE;
        }
    }
    return 0;
}
1720
1721
/* Determines and caches the NUMA node id for 'netdev' by reading
 * /sys/class/net/<name>/device/numa_node.  Returns the node id, or 0 when it
 * cannot be determined (single-NUMA system, virtual device, invalid port
 * name, or unparsable sysfs contents). */
static int
netdev_linux_get_numa_id__(struct netdev_linux *netdev)
    OVS_REQUIRES(netdev->mutex)
{
    char *numa_node_path;
    const char *name;
    int node_id;
    FILE *stream;

    if (netdev->cache_valid & VALID_NUMA_ID) {
        return netdev->numa_id;
    }

    /* Cache the fallback answer (node 0) up front; it is overwritten below
     * if sysfs yields a valid node id. */
    netdev->numa_id = 0;
    netdev->cache_valid |= VALID_NUMA_ID;

    if (ovs_numa_get_n_numas() < 2) {
        /* No need to check on system with a single NUMA node. */
        return 0;
    }

    name = netdev_get_name(&netdev->up);
    if (strpbrk(name, "/\\")) {
        /* Reject names that could escape the sysfs directory. */
        VLOG_ERR_RL(&rl, "\"%s\" is not a valid name for a port. "
                    "A valid name must not include '/' or '\\'."
                    "Using numa_id 0", name);
        return 0;
    }

    numa_node_path = xasprintf("/sys/class/net/%s/device/numa_node", name);

    stream = fopen(numa_node_path, "r");
    if (!stream) {
        /* Virtual device does not have this info. */
        VLOG_INFO_RL(&rl, "%s: Can't open '%s': %s, using numa_id 0",
                     name, numa_node_path, ovs_strerror(errno));
        free(numa_node_path);
        return 0;
    }

    if (fscanf(stream, "%d", &node_id) != 1
        || !ovs_numa_numa_id_is_valid(node_id))  {
        VLOG_WARN_RL(&rl, "%s: Can't detect NUMA node, using numa_id 0", name);
        node_id = 0;
    }

    netdev->numa_id = node_id;
    fclose(stream);
    free(numa_node_path);
    return node_id;
}
1772
1773
static int OVS_UNUSED
1774
netdev_linux_get_numa_id(const struct netdev *netdev_)
1775
0
{
1776
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1777
0
    int numa_id;
1778
0
1779
0
    ovs_mutex_lock(&netdev->mutex);
1780
0
    numa_id = netdev_linux_get_numa_id__(netdev);
1781
0
    ovs_mutex_unlock(&netdev->mutex);
1782
0
1783
0
    return numa_id;
1784
0
}
1785
1786
/* Sends 'batch' on 'netdev'.  Returns 0 if successful, otherwise a positive
 * errno value.  Returns EAGAIN without blocking if the packet cannot be queued
 * immediately.  Returns EMSGSIZE if a partial packet was transmitted or if
 * the packet is too big or too small to transmit on the device.
 *
 * The kernel maintains a packet transmission queue, so the caller is not
 * expected to do additional queuing of packets. */
static int
netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
                  struct dp_packet_batch *batch,
                  bool concurrent_txq OVS_UNUSED)
{
    bool tso = userspace_tso_enabled();
    int mtu = ETH_PAYLOAD_MAX;
    int error = 0;
    int sock = 0;

    if (tso) {
        /* TSO needs the device's real MTU for the vnet header. */
        netdev_linux_get_mtu__(netdev_linux_cast(netdev_), &mtu);
    }

    if (!is_tap_netdev(netdev_)) {
        /* Non-tap devices transmit through a shared AF_PACKET socket. */
        if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_))) {
            /* Cannot send through a device in another network namespace. */
            error = EOPNOTSUPP;
            goto free_batch;
        }

        sock = af_packet_sock();
        if (sock < 0) {
            /* af_packet_sock() returns a negative errno on failure. */
            error = -sock;
            goto free_batch;
        }

        int ifindex = netdev_get_ifindex(netdev_);
        if (ifindex < 0) {
            error = -ifindex;
            goto free_batch;
        }

        error = netdev_linux_sock_batch_send(netdev_, sock, ifindex, tso, mtu,
                                             batch);
    } else {
        /* Tap devices must be written through their own fd; see
         * netdev_linux_tap_batch_send(). */
        error = netdev_linux_tap_batch_send(netdev_, mtu, batch);
    }
    if (error) {
        if (error == ENOBUFS) {
            /* The Linux AF_PACKET implementation never blocks waiting
             * for room for packets, instead returning ENOBUFS.
             * Translate this into EAGAIN for the caller. */
            error = EAGAIN;
        } else {
            VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
    }

free_batch:
    /* This function owns 'batch'; the packets are freed on every path. */
    dp_packet_delete_batch(batch, true);
    return error;
}
1846
1847
/* Registers with the poll loop to wake up from the next call to poll_block()
1848
 * when the packet transmission queue has sufficient room to transmit a packet
1849
 * with netdev_send().
1850
 *
1851
 * The kernel maintains a packet transmission queue, so the client is not
1852
 * expected to do additional queuing of packets.  Thus, this function is
1853
 * unlikely to ever be used.  It is included for completeness. */
1854
static void
1855
netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
1856
0
{
1857
0
    if (is_tap_netdev(netdev)) {
1858
        /* TAP device always accepts packets.*/
1859
0
        poll_immediate_wake();
1860
0
    }
1861
0
}
1862
1863
/* Attempts to set 'netdev''s MAC address to 'mac'.  Returns 0 if successful,
 * otherwise a positive errno value. */
static int
netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    enum netdev_flags old_flags = 0;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        /* Devices in other network namespaces cannot be configured here. */
        error = EOPNOTSUPP;
        goto exit;
    }

    if (netdev->cache_valid & VALID_ETHERADDR) {
        error = netdev->ether_addr_error;
        if (error || eth_addr_equals(netdev->etheraddr, mac)) {
            /* A previous attempt failed, or the address is already set:
             * nothing to do. */
            goto exit;
        }
        netdev->cache_valid &= ~VALID_ETHERADDR;
    }

    /* Tap devices must be brought down before setting the address. */
    if (is_tap_netdev(netdev_)) {
        update_flags(netdev, NETDEV_UP, 0, &old_flags);
    }
    error = set_etheraddr(netdev_get_name(netdev_), mac);
    if (!error || error == ENODEV) {
        /* Cache the outcome (ENODEV included, so we do not keep retrying a
         * vanished device). */
        netdev->ether_addr_error = error;
        netdev->cache_valid |= VALID_ETHERADDR;
        if (!error) {
            netdev->etheraddr = mac;
        }
    }

    if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
        /* Restore the tap device's previous up state. */
        update_flags(netdev, 0, NETDEV_UP, &old_flags);
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
1907
1908
/* Copies 'netdev''s MAC address to 'mac' which is passed as param.
 * Returns 0 if successful, otherwise a positive errno value from the
 * cached or freshly-performed lookup. */
static int
netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (!(netdev->cache_valid & VALID_ETHERADDR)) {
        /* Try to refresh the cached address via netlink first. */
        netdev_linux_update_via_netlink(netdev);
    }

    if (!(netdev->cache_valid & VALID_ETHERADDR)) {
        /* Fall back to ioctl if netlink fails */
        netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
                                                 &netdev->etheraddr);
        netdev->cache_valid |= VALID_ETHERADDR;
    }

    error = netdev->ether_addr_error;
    if (!error) {
        *mac = netdev->etheraddr;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
1935
1936
/* Retrieves 'netdev''s MTU into '*mtup', using the cached value when valid
 * and refreshing it via netlink (or ioctl as a fallback) otherwise.
 * Returns 0 on success, otherwise a positive errno value and '*mtup' is
 * left unmodified. */
static int
netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
{
    int error;

    if (!(netdev->cache_valid & VALID_MTU)) {
        netdev_linux_update_via_netlink(netdev);
    }

    if (!(netdev->cache_valid & VALID_MTU)) {
        /* Fall back to ioctl if netlink fails */
        struct ifreq ifr;

        memset(&ifr, 0, sizeof ifr);
        netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
            netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
        /* On ioctl failure this caches ifr_mtu == 0 alongside the error;
         * the error is what callers see, so the stale value is harmless. */
        netdev->mtu = ifr.ifr_mtu;
        netdev->cache_valid |= VALID_MTU;
    }

    error = netdev->netdev_mtu_error;
    if (!error) {
        *mtup = netdev->mtu;
    }

    return error;
}
1963
1964
/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1965
 * in bytes, not including the hardware header; thus, this is typically 1500
1966
 * bytes for Ethernet devices. */
1967
static int
1968
netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1969
0
{
1970
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1971
0
    int error;
1972
1973
0
    ovs_mutex_lock(&netdev->mutex);
1974
0
    error = netdev_linux_get_mtu__(netdev, mtup);
1975
0
    ovs_mutex_unlock(&netdev->mutex);
1976
1977
0
    return error;
1978
0
}
1979
1980
/* Sets the maximum size of transmitted (MTU) for given device using linux
 * networking ioctl interface.
 *
 * Returns 0 if successful, otherwise a positive errno value.  Returns
 * EOPNOTSUPP for devices in another network namespace. */
static int
netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ifreq ifr;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

#ifdef HAVE_AF_XDP
    if (netdev_get_class(netdev_) == &netdev_afxdp_class) {
        /* AF_XDP sockets only support a bounded frame size; reject MTUs
         * the XDP umem cannot hold. */
        error = netdev_afxdp_verify_mtu_size(netdev_, mtu);
        if (error) {
            goto exit;
        }
    }
#endif

    if (netdev->cache_valid & VALID_MTU) {
        error = netdev->netdev_mtu_error;
        if (error || netdev->mtu == mtu) {
            /* A previous attempt failed, or the MTU is already set:
             * nothing to do. */
            goto exit;
        }
        netdev->cache_valid &= ~VALID_MTU;
    }

    memset(&ifr, 0, sizeof ifr);
    ifr.ifr_mtu = mtu;

    error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
                                SIOCSIFMTU, "SIOCSIFMTU");
    if (!error || error == ENODEV) {
        /* Cache the result (ENODEV included, so a vanished device is not
         * retried on every call). */
        netdev->netdev_mtu_error = error;
        netdev->mtu = ifr.ifr_mtu;
        netdev->cache_valid |= VALID_MTU;
    }
exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
2027
2028
/* Returns the ifindex of 'netdev', if successful, as a positive number.
 * On failure, returns a negative errno value. */
static int
netdev_linux_get_ifindex(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int ifindex, error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        /* 'ifindex' stays uninitialized on this path, but the nonzero
         * 'error' below guarantees it is never read. */
        error = EOPNOTSUPP;
        goto exit;
    }
    error = get_ifindex(netdev_, &ifindex);

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error ? -error : ifindex;
}
2047
2048
static int
2049
netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
2050
0
{
2051
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2052
2053
0
    ovs_mutex_lock(&netdev->mutex);
2054
0
    if (netdev->miimon_interval > 0) {
2055
0
        *carrier = netdev->miimon;
2056
0
    } else {
2057
0
        *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
2058
0
    }
2059
0
    ovs_mutex_unlock(&netdev->mutex);
2060
2061
0
    return 0;
2062
0
}
2063
2064
static long long int
2065
netdev_linux_get_carrier_resets(const struct netdev *netdev_)
2066
0
{
2067
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2068
0
    long long int carrier_resets;
2069
2070
0
    ovs_mutex_lock(&netdev->mutex);
2071
0
    carrier_resets = netdev->carrier_resets;
2072
0
    ovs_mutex_unlock(&netdev->mutex);
2073
2074
0
    return carrier_resets;
2075
0
}
2076
2077
/* Issues MII ioctl 'cmd' ('cmd_name' is used for logging) against interface
 * 'name', copying 'data' into the request and the kernel's answer back out.
 * Returns 0 if successful, otherwise a positive errno value. */
static int
netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
                       struct mii_ioctl_data *data)
{
    struct ifreq ifr;
    int error;

    memset(&ifr, 0, sizeof ifr);
    /* MII ioctls carry 'struct mii_ioctl_data' in the storage occupied by
     * the ifr_data member, so copy it by bytes rather than storing a
     * pointer there. */
    memcpy(&ifr.ifr_data, data, sizeof *data);
    error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
    memcpy(data, &ifr.ifr_data, sizeof *data);

    return error;
}
2091
2092
/* Determines the link status of interface 'name' into '*miimon', first by
 * reading the MII basic status register and, if that fails, via
 * ETHTOOL_GLINK.  Returns 0 if successful, otherwise a positive errno value;
 * '*miimon' is false on failure. */
static int
netdev_linux_get_miimon(const char *name, bool *miimon)
{
    struct mii_ioctl_data data;
    int error;

    *miimon = false;

    memset(&data, 0, sizeof data);
    error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
    if (!error) {
        /* data.phy_id is filled out by previous SIOCGMIIPHY miimon call. */
        data.reg_num = MII_BMSR;
        error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
                                       &data);

        if (!error) {
            /* Link is up iff the BMSR link-status bit is set. */
            *miimon = !!(data.val_out & BMSR_LSTATUS);
        }
    }
    if (error) {
        struct ethtool_cmd ecmd;

        VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
                    name);

        COVERAGE_INC(netdev_get_ethtool);
        memset(&ecmd, 0, sizeof ecmd);
        error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
                                        "ETHTOOL_GLINK");
        if (!error) {
            struct ethtool_value eval;

            /* ETHTOOL_GLINK answers with a 'struct ethtool_value' overlaid
             * on the start of the command buffer. */
            memcpy(&eval, &ecmd, sizeof eval);
            *miimon = !!eval.data;
        } else {
            VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
        }
    }

    return error;
}
2134
2135
static int
2136
netdev_linux_set_miimon_interval(struct netdev *netdev_,
2137
                                 long long int interval)
2138
0
{
2139
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2140
2141
0
    ovs_mutex_lock(&netdev->mutex);
2142
0
    interval = interval > 0 ? MAX(interval, 100) : 0;
2143
0
    if (netdev->miimon_interval != interval) {
2144
0
        if (interval && !netdev->miimon_interval) {
2145
0
            atomic_count_inc(&miimon_cnt);
2146
0
        } else if (!interval && netdev->miimon_interval) {
2147
0
            atomic_count_dec(&miimon_cnt);
2148
0
        }
2149
2150
0
        netdev->miimon_interval = interval;
2151
0
        timer_set_expired(&netdev->miimon_timer);
2152
0
    }
2153
0
    ovs_mutex_unlock(&netdev->mutex);
2154
2155
0
    return 0;
2156
0
}
2157
2158
/* Polls link state via MII for every netdev-linux device whose miimon timer
 * has expired, recording state changes and re-arming the timer. */
static void
netdev_linux_miimon_run(void)
{
    struct shash device_shash;
    struct shash_node *node;

    shash_init(&device_shash);
    netdev_get_devices(&netdev_linux_class, &device_shash);
    SHASH_FOR_EACH (node, &device_shash) {
        struct netdev *netdev = node->data;
        struct netdev_linux *dev = netdev_linux_cast(netdev);
        bool miimon;

        ovs_mutex_lock(&dev->mutex);
        if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
            netdev_linux_get_miimon(dev->up.name, &miimon);
            if (miimon != dev->miimon) {
                /* Link state flipped; record it and notify of the change. */
                dev->miimon = miimon;
                netdev_linux_changed(dev, dev->ifi_flags, 0);
            }

            timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
        }
        ovs_mutex_unlock(&dev->mutex);
        /* netdev_get_devices() took a reference on each device; drop it. */
        netdev_close(netdev);
    }

    shash_destroy(&device_shash);
}
2187
2188
static void
2189
netdev_linux_miimon_wait(void)
2190
0
{
2191
0
    struct shash device_shash;
2192
0
    struct shash_node *node;
2193
2194
0
    shash_init(&device_shash);
2195
0
    netdev_get_devices(&netdev_linux_class, &device_shash);
2196
0
    SHASH_FOR_EACH (node, &device_shash) {
2197
0
        struct netdev *netdev = node->data;
2198
0
        struct netdev_linux *dev = netdev_linux_cast(netdev);
2199
2200
0
        ovs_mutex_lock(&dev->mutex);
2201
0
        if (dev->miimon_interval > 0) {
2202
0
            timer_wait(&dev->miimon_timer);
2203
0
        }
2204
0
        ovs_mutex_unlock(&dev->mutex);
2205
0
        netdev_close(netdev);
2206
0
    }
2207
0
    shash_destroy(&device_shash);
2208
0
}
2209
2210
/* Exchanges the values stored at 'a' and 'b'. */
static void
swap_uint64(uint64_t *a, uint64_t *b)
{
    uint64_t old_b = *b;

    *b = *a;
    *a = old_b;
}
2217
2218
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * 'src' is allowed to be misaligned. */
static void
netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
                                  const struct dpif_netlink_vport *vport)
{
    /* The vport stats arrive over netlink without natural 64-bit alignment,
     * hence the get_32aligned_u64() accessors. */
    dst->rx_packets = get_32aligned_u64(&vport->stats->rx_packets);
    dst->tx_packets = get_32aligned_u64(&vport->stats->tx_packets);
    dst->rx_bytes = get_32aligned_u64(&vport->stats->rx_bytes);
    dst->tx_bytes = get_32aligned_u64(&vport->stats->tx_bytes);
    dst->rx_errors = get_32aligned_u64(&vport->stats->rx_errors);
    dst->tx_errors = get_32aligned_u64(&vport->stats->tx_errors);
    dst->rx_dropped = get_32aligned_u64(&vport->stats->rx_dropped);
    dst->tx_dropped = get_32aligned_u64(&vport->stats->tx_dropped);
    /* The vport layer does not track the detail counters below; report them
     * as zero rather than leaving them uninitialized. */
    dst->multicast = 0;
    dst->collisions = 0;
    dst->rx_length_errors = 0;
    dst->rx_over_errors = 0;
    dst->rx_crc_errors = 0;
    dst->rx_frame_errors = 0;
    dst->rx_fifo_errors = 0;
    dst->rx_missed_errors = 0;
    dst->tx_aborted_errors = 0;
    dst->tx_carrier_errors = 0;
    dst->tx_fifo_errors = 0;
    dst->tx_heartbeat_errors = 0;
    dst->tx_window_errors = 0;
    dst->upcall_packets = vport->upcall_success;
    dst->upcall_errors = vport->upcall_fail;
}
2249
2250
static int
2251
get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
2252
0
{
2253
0
    struct dpif_netlink_vport reply;
2254
0
    struct ofpbuf *buf;
2255
0
    int error;
2256
2257
0
    error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
2258
0
    if (error) {
2259
0
        return error;
2260
0
    } else if (!reply.stats) {
2261
0
        ofpbuf_delete(buf);
2262
0
        return EOPNOTSUPP;
2263
0
    }
2264
2265
0
    netdev_stats_from_ovs_vport_stats(stats, &reply);
2266
2267
0
    ofpbuf_delete(buf);
2268
2269
0
    return 0;
2270
0
}
2271
2272
/* Fills 'stats' with vport-level statistics for 'netdev_', caching the
 * outcome in 'netdev->vport_stats_error'.  After a failure has been cached,
 * further lookups are skipped until the cache flag is invalidated. */
static void
get_stats_via_vport(const struct netdev *netdev_,
                    struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    if (!netdev->vport_stats_error ||
        !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
        int error;

        error = get_stats_via_vport__(netdev_, stats);
        if (error && error != ENOENT && error != ENODEV) {
            /* ENOENT/ENODEV simply mean this netdev is not an OVS vport;
             * only other errors are worth logging. */
            VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
                         "(%s)",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
        netdev->vport_stats_error = error;
        netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
    }
}
2292
2293
/* Retrieves current device stats for 'netdev-linux'.  Merges vport-level
 * stats (when the device is an OVS vport) with the kernel netdev's detail
 * counters; if only one source is available, it is used alone. */
static int
netdev_linux_get_stats(const struct netdev *netdev_,
                       struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink stats unavailable; if the vport stats already in 'stats'
         * are good, suppress the error and return them. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* stats not available from OVS then use netdev stats. */
        *stats = dev_stats;
    } else {
        /* Both sources succeeded: keep the vport packet/byte counters and
         * fold in the detail counters only the kernel netdev tracks. */
        stats->multicast           += dev_stats.multicast;
        stats->collisions          += dev_stats.collisions;
        stats->rx_length_errors    += dev_stats.rx_length_errors;
        stats->rx_over_errors      += dev_stats.rx_over_errors;
        stats->rx_crc_errors       += dev_stats.rx_crc_errors;
        stats->rx_frame_errors     += dev_stats.rx_frame_errors;
        stats->rx_fifo_errors      += dev_stats.rx_fifo_errors;
        stats->rx_missed_errors    += dev_stats.rx_missed_errors;
        stats->tx_aborted_errors   += dev_stats.tx_aborted_errors;
        stats->tx_carrier_errors   += dev_stats.tx_carrier_errors;
        stats->tx_fifo_errors      += dev_stats.tx_fifo_errors;
        stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
        stats->tx_window_errors    += dev_stats.tx_window_errors;
    }
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2331
2332
/* Retrieves current device stats for 'netdev-tap' netdev or
 * netdev-internal.  Locally-dropped packet counts maintained in software
 * ('tx_dropped'/'rx_dropped') are added on top of whichever source wins. */
static int
netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct netdev_stats dev_stats;
    int error;

    ovs_mutex_lock(&netdev->mutex);
    get_stats_via_vport(netdev_, stats);
    error = get_stats_via_netlink(netdev_, &dev_stats);
    if (error) {
        /* Netlink stats unavailable; fall back to vport stats if those
         * succeeded, suppressing the error. */
        if (!netdev->vport_stats_error) {
            error = 0;
        }
    } else if (netdev->vport_stats_error) {
        /* Transmit and receive stats will appear to be swapped relative to the
         * other ports since we are the one sending the data, not a remote
         * computer.  For consistency, we swap them back here. This does not
         * apply if we are getting stats from the vport layer because it always
         * tracks stats from the perspective of the switch. */

        *stats = dev_stats;
        swap_uint64(&stats->rx_packets, &stats->tx_packets);
        swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
        swap_uint64(&stats->rx_errors, &stats->tx_errors);
        swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
        stats->rx_length_errors = 0;
        stats->rx_over_errors = 0;
        stats->rx_crc_errors = 0;
        stats->rx_frame_errors = 0;
        stats->rx_fifo_errors = 0;
        stats->rx_missed_errors = 0;
        stats->tx_aborted_errors = 0;
        stats->tx_carrier_errors = 0;
        stats->tx_fifo_errors = 0;
        stats->tx_heartbeat_errors = 0;
        stats->tx_window_errors = 0;
    } else {
        /* Use kernel netdev's packet and byte counts since vport counters
         * do not reflect packet counts on the wire when GSO, TSO or GRO
         * are enabled. */
        stats->rx_packets = dev_stats.tx_packets;
        stats->rx_bytes = dev_stats.tx_bytes;
        stats->tx_packets = dev_stats.rx_packets;
        stats->tx_bytes = dev_stats.rx_bytes;

        stats->rx_dropped          += dev_stats.tx_dropped;
        stats->tx_dropped          += dev_stats.rx_dropped;

        stats->rx_errors           += dev_stats.tx_errors;
        stats->tx_errors           += dev_stats.rx_errors;

        stats->multicast           += dev_stats.multicast;
        stats->collisions          += dev_stats.collisions;
    }
    stats->tx_dropped += netdev->tx_dropped;
    stats->rx_dropped += netdev->rx_dropped;
    ovs_mutex_unlock(&netdev->mutex);

    return error;
}
2395
2396
static int
2397
netdev_internal_get_stats(const struct netdev *netdev_,
2398
                          struct netdev_stats *stats)
2399
0
{
2400
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2401
0
    int error;
2402
2403
0
    ovs_mutex_lock(&netdev->mutex);
2404
0
    get_stats_via_vport(netdev_, stats);
2405
0
    error = netdev->vport_stats_error;
2406
0
    ovs_mutex_unlock(&netdev->mutex);
2407
2408
0
    return error;
2409
0
}
2410
2411
/* Queries, into '*len', the number of strings in the ETH_SS_FEATURES string
 * set of 'netdev'.  Returns 0 if successful, a positive errno value from the
 * ethtool call, or -EOPNOTSUPP when the kernel does not support that string
 * set. */
static int
netdev_linux_read_stringset_info(struct netdev_linux *netdev, uint32_t *len)
{
    /* The kernel appends a variable-length array of set sizes after the
     * 'ethtool_sset_info' header; the anonymous struct overlays that
     * payload, and the 'ethtool_cmd' member sizes the buffer for the
     * generic ethtool helper. */
    union {
        struct ethtool_cmd ecmd;
        struct ethtool_sset_info hdr;
        struct {
            uint64_t pad[2];
            uint32_t sset_len[1];
        };
    } sset_info;
    int error;

    sset_info.hdr.cmd = ETHTOOL_GSSET_INFO;
    sset_info.hdr.reserved = 0;
    sset_info.hdr.sset_mask = 1ULL << ETH_SS_FEATURES;

    error = netdev_linux_do_ethtool(netdev_get_name(&netdev->up),
                                    (struct ethtool_cmd *) &sset_info,
                                    ETHTOOL_GSSET_INFO, "ETHTOOL_GSSET_INFO");
    if (error) {
        return error;
    }
    if (sset_info.hdr.sset_mask & (1ULL << ETH_SS_FEATURES)) {
        /* The kernel echoed the bit back, so sset_len[0] holds the count. */
        *len = sset_info.sset_len[0];
        return 0;
    } else {
        /* ETH_SS_FEATURES is not supported. */
        return -EOPNOTSUPP;
    }
}
2442
2443
2444
/* Reads the ETH_SS_FEATURES feature-name strings of 'netdev' into a freshly
 * allocated 'struct ethtool_gstrings' stored in '*pstrings'.  On success
 * returns 0 and the caller owns (and must free) '*pstrings'; on failure
 * returns a nonzero error and '*pstrings' is NULL. */
static int
netdev_linux_read_definitions(struct netdev_linux *netdev,
                              struct ethtool_gstrings **pstrings)
{
    struct ethtool_gstrings *strings = NULL;
    uint32_t len = 0;
    int error = 0;

    error = netdev_linux_read_stringset_info(netdev, &len);
    if (error) {
        return error;
    } else if (!len) {
        /* An empty string set is as useless as an unsupported one. */
        return -EOPNOTSUPP;
    }

    /* The kernel fills 'len' fixed-size string slots after the header. */
    strings = xzalloc(sizeof *strings + len * ETH_GSTRING_LEN);

    strings->cmd = ETHTOOL_GSTRINGS;
    strings->string_set = ETH_SS_FEATURES;
    strings->len = len;
    error = netdev_linux_do_ethtool(netdev_get_name(&netdev->up),
                                    (struct ethtool_cmd *) strings,
                                    ETHTOOL_GSTRINGS, "ETHTOOL_GSTRINGS");
    if (error) {
        goto out;
    }

    /* Force NUL termination of every slot so later strcmp()s are safe. */
    for (int i = 0; i < len; i++) {
        strings->data[(i + 1) * ETH_GSTRING_LEN - 1] = 0;
    }

    *pstrings = strings;

    return 0;
out:
    *pstrings = NULL;
    free(strings);
    return error;
}
2483
2484
/* Queries the kernel's active ethtool features for 'netdev_' and records the
 * corresponding checksum/TSO offload capabilities in 'netdev_->ol_flags'.
 * Failures are silent: the flags are simply left unset (or cleared, once the
 * feature query has succeeded far enough to reset them). */
static void
netdev_linux_set_ol(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ethtool_gfeatures *features = NULL;
    struct ethtool_gstrings *names = NULL;
    int error;

    COVERAGE_INC(netdev_get_ethtool);

    error = netdev_linux_read_definitions(netdev, &names);
    if (error) {
        return;
    }

    /* Feature bits come packed 32 per block; allocate one block per 32
     * feature names. */
    features = xzalloc(sizeof *features +
                       DIV_ROUND_UP(names->len, 32) *
                       sizeof features->features[0]);

    features->cmd = ETHTOOL_GFEATURES;
    features->size = DIV_ROUND_UP(names->len, 32);
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_),
                                    (struct ethtool_cmd *) features,
                                    ETHTOOL_GFEATURES, "ETHTOOL_GFEATURES");

    if (error) {
        goto out;
    }

/* Locate feature bit 'index' inside the array of 32-bit feature blocks. */
#define FEATURE_WORD(blocks, index, field)  ((blocks)[(index) / 32U].field)
#define FEATURE_FIELD_FLAG(index)       (1U << (index) % 32U)
#define FEATURE_BIT_IS_SET(blocks, index, field)        \
    (FEATURE_WORD(blocks, index, field) & FEATURE_FIELD_FLAG(index))

    netdev->up.ol_flags = 0;
    /* Mapping from kernel feature-name strings to OVS offload flags. */
    static const struct {
        char *string;
        uint32_t value;
    } t_list[] = {
        {"tx-checksum-ipv4", NETDEV_TX_OFFLOAD_TCP_CKSUM |
                             NETDEV_TX_OFFLOAD_UDP_CKSUM},
        {"tx-checksum-ipv6", NETDEV_TX_OFFLOAD_TCP_CKSUM |
                             NETDEV_TX_OFFLOAD_UDP_CKSUM},
        {"tx-checksum-ip-generic", NETDEV_TX_OFFLOAD_TCP_CKSUM |
                                   NETDEV_TX_OFFLOAD_UDP_CKSUM},
        {"tx-checksum-sctp", NETDEV_TX_OFFLOAD_SCTP_CKSUM},
        {"tx-tcp-segmentation", NETDEV_TX_OFFLOAD_TCP_TSO},
    };

    /* The feature name at slot i corresponds to feature bit i. */
    for (int j = 0; j < ARRAY_SIZE(t_list); j++) {
        for (int i = 0; i < names->len; i++) {
            char *name = (char *) names->data + i * ETH_GSTRING_LEN;
            if (strcmp(t_list[j].string, name) == 0) {
                if (FEATURE_BIT_IS_SET(features->features, i, active)) {
                    netdev_->ol_flags |= t_list[j].value;
                }
                break;
            }
        }
    }

out:
    free(names);
    free(features);
}
2549
2550
static void
2551
netdev_linux_read_features(struct netdev_linux *netdev)
2552
0
{
2553
0
    struct ethtool_cmd ecmd;
2554
0
    int error;
2555
2556
0
    if (netdev->cache_valid & VALID_FEATURES) {
2557
0
        return;
2558
0
    }
2559
2560
0
    COVERAGE_INC(netdev_get_ethtool);
2561
0
    memset(&ecmd, 0, sizeof ecmd);
2562
0
    error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
2563
0
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
2564
0
    if (error) {
2565
0
        goto out;
2566
0
    }
2567
2568
    /* Supported features. */
2569
0
    netdev->supported = 0;
2570
0
    if (ecmd.supported & SUPPORTED_10baseT_Half) {
2571
0
        netdev->supported |= NETDEV_F_10MB_HD;
2572
0
    }
2573
0
    if (ecmd.supported & SUPPORTED_10baseT_Full) {
2574
0
        netdev->supported |= NETDEV_F_10MB_FD;
2575
0
    }
2576
0
    if (ecmd.supported & SUPPORTED_100baseT_Half)  {
2577
0
        netdev->supported |= NETDEV_F_100MB_HD;
2578
0
    }
2579
0
    if (ecmd.supported & SUPPORTED_100baseT_Full) {
2580
0
        netdev->supported |= NETDEV_F_100MB_FD;
2581
0
    }
2582
0
    if (ecmd.supported & SUPPORTED_1000baseT_Half) {
2583
0
        netdev->supported |= NETDEV_F_1GB_HD;
2584
0
    }
2585
0
    if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
2586
0
        (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
2587
0
        netdev->supported |= NETDEV_F_1GB_FD;
2588
0
    }
2589
0
    if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
2590
0
        (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
2591
0
        (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
2592
0
        (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
2593
0
        netdev->supported |= NETDEV_F_10GB_FD;
2594
0
    }
2595
0
    if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
2596
0
        (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
2597
0
        (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
2598
0
        (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
2599
0
        netdev->supported |= NETDEV_F_40GB_FD;
2600
0
    }
2601
0
    if (ecmd.supported & SUPPORTED_TP) {
2602
0
        netdev->supported |= NETDEV_F_COPPER;
2603
0
    }
2604
0
    if (ecmd.supported & SUPPORTED_FIBRE) {
2605
0
        netdev->supported |= NETDEV_F_FIBER;
2606
0
    }
2607
0
    if (ecmd.supported & SUPPORTED_Autoneg) {
2608
0
        netdev->supported |= NETDEV_F_AUTONEG;
2609
0
    }
2610
0
    if (ecmd.supported & SUPPORTED_Pause) {
2611
0
        netdev->supported |= NETDEV_F_PAUSE;
2612
0
    }
2613
0
    if (ecmd.supported & SUPPORTED_Asym_Pause) {
2614
0
        netdev->supported |= NETDEV_F_PAUSE_ASYM;
2615
0
    }
2616
2617
    /* Advertised features. */
2618
0
    netdev->advertised = 0;
2619
0
    if (ecmd.advertising & ADVERTISED_10baseT_Half) {
2620
0
        netdev->advertised |= NETDEV_F_10MB_HD;
2621
0
    }
2622
0
    if (ecmd.advertising & ADVERTISED_10baseT_Full) {
2623
0
        netdev->advertised |= NETDEV_F_10MB_FD;
2624
0
    }
2625
0
    if (ecmd.advertising & ADVERTISED_100baseT_Half) {
2626
0
        netdev->advertised |= NETDEV_F_100MB_HD;
2627
0
    }
2628
0
    if (ecmd.advertising & ADVERTISED_100baseT_Full) {
2629
0
        netdev->advertised |= NETDEV_F_100MB_FD;
2630
0
    }
2631
0
    if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
2632
0
        netdev->advertised |= NETDEV_F_1GB_HD;
2633
0
    }
2634
0
    if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
2635
0
        (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
2636
0
        netdev->advertised |= NETDEV_F_1GB_FD;
2637
0
    }
2638
0
    if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
2639
0
        (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
2640
0
        (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
2641
0
        (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
2642
0
        netdev->advertised |= NETDEV_F_10GB_FD;
2643
0
    }
2644
0
    if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
2645
0
        (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
2646
0
        (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
2647
0
        (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
2648
0
        netdev->advertised |= NETDEV_F_40GB_FD;
2649
0
    }
2650
0
    if (ecmd.advertising & ADVERTISED_TP) {
2651
0
        netdev->advertised |= NETDEV_F_COPPER;
2652
0
    }
2653
0
    if (ecmd.advertising & ADVERTISED_FIBRE) {
2654
0
        netdev->advertised |= NETDEV_F_FIBER;
2655
0
    }
2656
0
    if (ecmd.advertising & ADVERTISED_Autoneg) {
2657
0
        netdev->advertised |= NETDEV_F_AUTONEG;
2658
0
    }
2659
0
    if (ecmd.advertising & ADVERTISED_Pause) {
2660
0
        netdev->advertised |= NETDEV_F_PAUSE;
2661
0
    }
2662
0
    if (ecmd.advertising & ADVERTISED_Asym_Pause) {
2663
0
        netdev->advertised |= NETDEV_F_PAUSE_ASYM;
2664
0
    }
2665
2666
    /* Current settings. */
2667
0
    netdev->current_speed = ethtool_cmd_speed(&ecmd);
2668
0
    if (netdev->current_speed == SPEED_10) {
2669
0
        netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
2670
0
    } else if (netdev->current_speed == SPEED_100) {
2671
0
        netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
2672
0
    } else if (netdev->current_speed == SPEED_1000) {
2673
0
        netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
2674
0
    } else if (netdev->current_speed == SPEED_10000) {
2675
0
        netdev->current = NETDEV_F_10GB_FD;
2676
0
    } else if (netdev->current_speed == 40000) {
2677
0
        netdev->current = NETDEV_F_40GB_FD;
2678
0
    } else if (netdev->current_speed == 100000) {
2679
0
        netdev->current = NETDEV_F_100GB_FD;
2680
0
    } else if (netdev->current_speed == 1000000) {
2681
0
        netdev->current = NETDEV_F_1TB_FD;
2682
0
    } else {
2683
0
        netdev->current = 0;
2684
0
    }
2685
2686
0
    if (ecmd.port == PORT_TP) {
2687
0
        netdev->current |= NETDEV_F_COPPER;
2688
0
    } else if (ecmd.port == PORT_FIBRE) {
2689
0
        netdev->current |= NETDEV_F_FIBER;
2690
0
    }
2691
2692
0
    if (ecmd.autoneg) {
2693
0
        netdev->current |= NETDEV_F_AUTONEG;
2694
0
    }
2695
2696
0
out:
2697
0
    netdev->cache_valid |= VALID_FEATURES;
2698
0
    netdev->get_features_error = error;
2699
0
}
2700
2701
/* Stores the features supported by 'netdev' into of '*current', '*advertised',
2702
 * '*supported', and '*peer'.  Each value is a bitmap of NETDEV_* bits.
2703
 * Returns 0 if successful, otherwise a positive errno value. */
2704
static int
2705
netdev_linux_get_features(const struct netdev *netdev_,
2706
                          enum netdev_features *current,
2707
                          enum netdev_features *advertised,
2708
                          enum netdev_features *supported,
2709
                          enum netdev_features *peer)
2710
0
{
2711
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2712
0
    int error;
2713
2714
0
    ovs_mutex_lock(&netdev->mutex);
2715
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
2716
0
        error = EOPNOTSUPP;
2717
0
        goto exit;
2718
0
    }
2719
2720
0
    netdev_linux_read_features(netdev);
2721
0
    if (!netdev->get_features_error) {
2722
0
        *current = netdev->current;
2723
0
        *advertised = netdev->advertised;
2724
0
        *supported = netdev->supported;
2725
0
        *peer = 0;              /* XXX */
2726
0
    }
2727
0
    error = netdev->get_features_error;
2728
2729
0
exit:
2730
0
    ovs_mutex_unlock(&netdev->mutex);
2731
0
    return error;
2732
0
}
2733
2734
static int
2735
netdev_linux_get_speed_locked(struct netdev_linux *netdev,
2736
                              uint32_t *current, uint32_t *max)
2737
0
{
2738
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
2739
0
        *current = *max = 0;
2740
0
        return EOPNOTSUPP;
2741
0
    }
2742
2743
0
    netdev_linux_read_features(netdev);
2744
0
    if (!netdev->get_features_error) {
2745
0
        *current = netdev->current_speed == SPEED_UNKNOWN
2746
0
                   ? 0 : netdev->current_speed;
2747
0
        *max = MIN(UINT32_MAX,
2748
0
                   netdev_features_to_bps(netdev->supported, 0) / 1000000ULL);
2749
0
    } else {
2750
0
        *current = *max = 0;
2751
0
    }
2752
0
    return netdev->get_features_error;
2753
0
}
2754
2755
static int
2756
netdev_linux_get_speed(const struct netdev *netdev_, uint32_t *current,
2757
                       uint32_t *max)
2758
0
{
2759
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2760
0
    int error;
2761
2762
0
    ovs_mutex_lock(&netdev->mutex);
2763
0
    error = netdev_linux_get_speed_locked(netdev, current, max);
2764
0
    ovs_mutex_unlock(&netdev->mutex);
2765
0
    return error;
2766
0
}
2767
2768
/* Set the features advertised by 'netdev' to 'advertise'. */
2769
static int
2770
netdev_linux_set_advertisements(struct netdev *netdev_,
2771
                                enum netdev_features advertise)
2772
0
{
2773
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2774
0
    struct ethtool_cmd ecmd;
2775
0
    int error;
2776
2777
0
    ovs_mutex_lock(&netdev->mutex);
2778
2779
0
    COVERAGE_INC(netdev_get_ethtool);
2780
2781
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
2782
0
        error = EOPNOTSUPP;
2783
0
        goto exit;
2784
0
    }
2785
2786
0
    memset(&ecmd, 0, sizeof ecmd);
2787
0
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2788
0
                                    ETHTOOL_GSET, "ETHTOOL_GSET");
2789
0
    if (error) {
2790
0
        goto exit;
2791
0
    }
2792
2793
0
    ecmd.advertising = 0;
2794
0
    if (advertise & NETDEV_F_10MB_HD) {
2795
0
        ecmd.advertising |= ADVERTISED_10baseT_Half;
2796
0
    }
2797
0
    if (advertise & NETDEV_F_10MB_FD) {
2798
0
        ecmd.advertising |= ADVERTISED_10baseT_Full;
2799
0
    }
2800
0
    if (advertise & NETDEV_F_100MB_HD) {
2801
0
        ecmd.advertising |= ADVERTISED_100baseT_Half;
2802
0
    }
2803
0
    if (advertise & NETDEV_F_100MB_FD) {
2804
0
        ecmd.advertising |= ADVERTISED_100baseT_Full;
2805
0
    }
2806
0
    if (advertise & NETDEV_F_1GB_HD) {
2807
0
        ecmd.advertising |= ADVERTISED_1000baseT_Half;
2808
0
    }
2809
0
    if (advertise & NETDEV_F_1GB_FD) {
2810
0
        ecmd.advertising |= ADVERTISED_1000baseT_Full;
2811
0
    }
2812
0
    if (advertise & NETDEV_F_10GB_FD) {
2813
0
        ecmd.advertising |= ADVERTISED_10000baseT_Full;
2814
0
    }
2815
0
    if (advertise & NETDEV_F_COPPER) {
2816
0
        ecmd.advertising |= ADVERTISED_TP;
2817
0
    }
2818
0
    if (advertise & NETDEV_F_FIBER) {
2819
0
        ecmd.advertising |= ADVERTISED_FIBRE;
2820
0
    }
2821
0
    if (advertise & NETDEV_F_AUTONEG) {
2822
0
        ecmd.advertising |= ADVERTISED_Autoneg;
2823
0
    }
2824
0
    if (advertise & NETDEV_F_PAUSE) {
2825
0
        ecmd.advertising |= ADVERTISED_Pause;
2826
0
    }
2827
0
    if (advertise & NETDEV_F_PAUSE_ASYM) {
2828
0
        ecmd.advertising |= ADVERTISED_Asym_Pause;
2829
0
    }
2830
0
    COVERAGE_INC(netdev_set_ethtool);
2831
0
    error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2832
0
                                    ETHTOOL_SSET, "ETHTOOL_SSET");
2833
2834
0
exit:
2835
0
    ovs_mutex_unlock(&netdev->mutex);
2836
0
    return error;
2837
0
}
2838
2839
/* Opens the nested netlink attributes that wrap a TC "police" action in
 * 'request'.  On return, '*act_offset' marks the start of the outer action
 * nest (attribute type 'prio') and '*offset' the start of the inner
 * TCA_ACT_OPTIONS nest; both must later be closed, inner first, via
 * nl_msg_act_police_end_nest().  If 'single_action' is true, the action is
 * being created independently of a filter, so a tc-policy flag attribute
 * is added as well. */
static void
nl_msg_act_police_start_nest(struct ofpbuf *request, uint32_t prio,
                             size_t *offset, size_t *act_offset,
                             bool single_action)
{
    *act_offset = nl_msg_start_nested(request, prio);
    nl_msg_put_string(request, TCA_ACT_KIND, "police");

    /* If police action is added independently from filter, we need to
     * add action flag according to tc-policy. */
    if (single_action) {
        nl_msg_put_act_tc_policy_flag(request);
    }
    *offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
}
2854
2855
/* Closes the nests opened by nl_msg_act_police_start_nest(): first adds the
 * TCA_POLICE_RESULT attribute carrying 'notexceed_act' (the verdict for
 * conforming packets), then ends the TCA_ACT_OPTIONS nest at 'offset' and
 * the outer action nest at 'act_offset', in that order. */
static void
nl_msg_act_police_end_nest(struct ofpbuf *request, size_t offset,
                           size_t act_offset, uint32_t notexceed_act)
{
    nl_msg_put_u32(request, TCA_POLICE_RESULT, notexceed_act);
    nl_msg_end_nested(request, offset);
    nl_msg_end_nested(request, act_offset);
}
2863
2864
/* Appends a complete TC "police" action to 'request', limiting traffic to
 * 'kbits_rate' kbit/s (burst 'kbits_burst' kbits) and/or 'pkts_rate'
 * packets/s (burst 'pkts_burst' packets).  Conforming packets receive the
 * 'notexceed_act' verdict.  'index' identifies the police action instance;
 * 'single_action' is forwarded to nl_msg_act_police_start_nest().
 * Does nothing if both rates are zero. */
static void
nl_msg_put_act_police(struct ofpbuf *request, uint32_t index,
                      uint64_t kbits_rate, uint64_t kbits_burst,
                      uint64_t pkts_rate, uint64_t pkts_burst,
                      uint32_t notexceed_act, bool single_action)
{
    uint64_t bytes_rate = kbits_rate / 8 * 1000;  /* kbit/s -> bytes/s. */
    size_t offset, act_offset;
    struct tc_police police;
    uint32_t prio = 0;

    if (!kbits_rate && !pkts_rate) {
        return;                 /* Nothing to police. */
    }

    tc_policer_init(&police, kbits_rate, kbits_burst);
    police.index = index;

    nl_msg_act_police_start_nest(request, ++prio, &offset, &act_offset,
                                 single_action);
    if (police.rate.rate) {
        tc_put_rtab(request, TCA_POLICE_RATE, &police.rate, bytes_rate);
    }
#ifdef HAVE_TCA_POLICE_PKTRATE64
    /* Supplement the rate with a full 64-bit value when it exceeds the
     * 32-bit range and the kernel headers support TCA_POLICE_RATE64. */
    if (bytes_rate > UINT32_MAX) {
        nl_msg_put_u64(request, TCA_POLICE_RATE64, bytes_rate);
    }
#endif
    if (pkts_rate) {
        uint64_t pkt_burst_ticks;
        /* Here tc_bytes_to_ticks is used to convert packets rather than bytes
           to ticks. */
        pkt_burst_ticks = tc_bytes_to_ticks(pkts_rate, pkts_burst);
        nl_msg_put_u64(request, TCA_POLICE_PKTRATE64, pkts_rate);
        nl_msg_put_u64(request, TCA_POLICE_PKTBURST64, pkt_burst_ticks);
    }
    nl_msg_put_unspec(request, TCA_POLICE_TBF, &police, sizeof police);
    nl_msg_act_police_end_nest(request, offset, act_offset, notexceed_act);
}
2903
2904
/* Installs a "matchall" classifier with a police action on the ingress
 * qdisc of 'netdev', rate-limiting input to 'kbits_rate'/'kbits_burst'
 * and/or 'kpkts_rate'/'kpkts_burst' (kilo-packets).  Returns 0 if
 * successful, otherwise a positive errno value (EPROTO if the kernel's
 * echoed reply is malformed). */
static int
tc_add_matchall_policer(struct netdev *netdev, uint64_t kbits_rate,
                        uint32_t kbits_burst, uint32_t kpkts_rate,
                        uint32_t kpkts_burst)
{
    uint16_t eth_type = (OVS_FORCE uint16_t) htons(ETH_P_ALL);
    size_t basic_offset, action_offset;
    uint16_t prio = TC_RESERVED_PRIORITY_POLICE;
    int ifindex, err = 0;
    struct ofpbuf request;
    struct ofpbuf *reply;
    struct tcmsg *tcmsg;
    uint32_t handle = 1;

    err = get_ifindex(netdev, &ifindex);
    if (err) {
        return err;
    }

    /* Compose RTM_NEWTFILTER: a matchall filter at the reserved policing
     * priority on the ingress qdisc, matching every ethertype.  NLM_F_ECHO
     * makes the kernel echo the created filter back in the reply. */
    tcmsg = tc_make_request(ifindex, RTM_NEWTFILTER, NLM_F_CREATE | NLM_F_ECHO,
                            &request);
    tcmsg->tcm_parent = TC_INGRESS_PARENT;
    tcmsg->tcm_info = tc_make_handle(prio, eth_type);
    tcmsg->tcm_handle = handle;

    nl_msg_put_string(&request, TCA_KIND, "matchall");
    basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    action_offset = nl_msg_start_nested(&request, TCA_MATCHALL_ACT);
    /* kpkts_* are kilo-units; the police action wants packets. */
    nl_msg_put_act_police(&request, 0, kbits_rate, kbits_burst,
                          kpkts_rate * 1000ULL, kpkts_burst * 1000ULL,
                          TC_ACT_UNSPEC, false);
    nl_msg_end_nested(&request, action_offset);
    nl_msg_end_nested(&request, basic_offset);

    err = tc_transact(&request, &reply);
    if (!err) {
        /* Sanity-check that the echoed reply carries at least the netlink
         * and tc headers before discarding it. */
        struct ofpbuf b = ofpbuf_const_initializer(reply->data, reply->size);
        struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg);
        struct tcmsg *tc = ofpbuf_try_pull(&b, sizeof *tc);

        if (!nlmsg || !tc) {
            VLOG_ERR_RL(&rl,
                        "Failed to add match all policer, malformed reply");
            ofpbuf_delete(reply);
            return EPROTO;
        }
        ofpbuf_delete(reply);
    }

    return err;
}
2955
2956
static int
2957
tc_del_matchall_policer(struct netdev *netdev)
2958
0
{
2959
0
    int prio = TC_RESERVED_PRIORITY_POLICE;
2960
0
    uint32_t block_id = 0;
2961
0
    struct tcf_id id;
2962
0
    int ifindex;
2963
0
    int err;
2964
2965
0
    err = get_ifindex(netdev, &ifindex);
2966
0
    if (err) {
2967
0
        return err;
2968
0
    }
2969
2970
0
    id = tc_make_tcf_id(ifindex, block_id, prio, TC_INGRESS);
2971
0
    err = tc_del_filter(&id, "matchall");
2972
0
    if (err) {
2973
0
        return err;
2974
0
    }
2975
2976
0
    return 0;
2977
0
}
2978
2979
/* Attempts to set input rate limiting (policing) policy.  Returns 0 if
 * successful, otherwise a positive errno value. */
static int
netdev_linux_set_policing(struct netdev *netdev_, uint32_t kbits_rate,
                          uint32_t kbits_burst, uint32_t kpkts_rate,
                          uint32_t kpkts_burst)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *netdev_name = netdev_get_name(netdev_);
    int ifindex;
    int error;

    kbits_burst = (!kbits_rate ? 0       /* Force to 0 if no rate specified. */
                   : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
                   : kbits_burst);       /* Stick with user-specified value. */

    kpkts_burst = (!kpkts_rate ? 0       /* Force to 0 if no rate specified. */
                   : !kpkts_burst ? 16   /* Default to 16 kpkts if 0. */
                   : kpkts_burst);       /* Stick with user-specified value. */

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        /* Policing is not supported on devices in another network
         * namespace. */
        error = EOPNOTSUPP;
        goto out;
    }

    /* Skip the kernel round-trip entirely when the cached settings already
     * match (or a cached error says it cannot work). */
    if (netdev->cache_valid & VALID_POLICING) {
        error = netdev->netdev_policing_error;
        if (error || (netdev->kbits_rate == kbits_rate &&
                      netdev->kpkts_rate == kpkts_rate &&
                      netdev->kbits_burst == kbits_burst &&
                      netdev->kpkts_burst == kpkts_burst)) {
            /* Assume that settings haven't changed since we last set them. */
            goto out;
        }
        netdev->cache_valid &= ~VALID_POLICING;
    }

    COVERAGE_INC(netdev_set_policing);

    /* Use matchall for policing when offloadling ovs with tc-flower. */
    if (netdev_is_flow_api_enabled()) {
        /* Delete any existing policer first, then install the new one if a
         * nonzero rate was requested. */
        error = tc_del_matchall_policer(netdev_);
        if (kbits_rate || kpkts_rate) {
            error = tc_add_matchall_policer(netdev_, kbits_rate, kbits_burst,
                                            kpkts_rate, kpkts_burst);
        }
        goto out;
    }

    error = get_ifindex(netdev_, &ifindex);
    if (error) {
        goto out;
    }

    /* Remove any existing ingress qdisc. */
    error = tc_add_del_qdisc(ifindex, false, 0, TC_INGRESS);
    if (error) {
        VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
                     netdev_name, ovs_strerror(error));
        goto out;
    }

    if (kbits_rate || kpkts_rate) {
        const char *cls_name = "matchall";

        /* Re-create the ingress qdisc and attach the policer to it. */
        error = tc_add_del_qdisc(ifindex, true, 0, TC_INGRESS);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
                         netdev_name, ovs_strerror(error));
            goto out;
        }

        error = tc_add_matchall_policer(netdev_, kbits_rate, kbits_burst,
                                        kpkts_rate, kpkts_burst);
        if (error == ENOENT) {
            cls_name = "basic";
            /* This error is returned when the matchall classifier is missing.
             * Fall back to the basic classifier.  */
            error = tc_add_policer(netdev_, kbits_rate, kbits_burst,
                                   kpkts_rate, kpkts_burst);
        }
        if (error){
            VLOG_WARN_RL(&rl, "%s: adding cls_%s policing action failed: %s",
                         netdev_name, cls_name, ovs_strerror(error));
            goto out;
        }
    }

out:
    if (!error) {
        netdev->kbits_rate = kbits_rate;
        netdev->kbits_burst = kbits_burst;
        netdev->kpkts_rate = kpkts_rate;
        netdev->kpkts_burst = kpkts_burst;
    }

    /* Cache the outcome (success or a persistent ENODEV) so identical
     * requests can short-circuit above. */
    if (!error || error == ENODEV) {
        netdev->netdev_policing_error = error;
        netdev->cache_valid |= VALID_POLICING;
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3083
3084
static int
3085
netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
3086
                           struct sset *types)
3087
0
{
3088
0
    const struct tc_ops *const *opsp;
3089
0
    for (opsp = tcs; *opsp != NULL; opsp++) {
3090
0
        const struct tc_ops *ops = *opsp;
3091
0
        if (ops->tc_install && ops->ovs_name[0] != '\0') {
3092
0
            sset_add(types, ops->ovs_name);
3093
0
        }
3094
0
    }
3095
0
    return 0;
3096
0
}
3097
3098
static const struct tc_ops *
3099
tc_lookup_ovs_name(const char *name)
3100
0
{
3101
0
    const struct tc_ops *const *opsp;
3102
3103
0
    for (opsp = tcs; *opsp != NULL; opsp++) {
3104
0
        const struct tc_ops *ops = *opsp;
3105
0
        if (!strcmp(name, ops->ovs_name)) {
3106
0
            return ops;
3107
0
        }
3108
0
    }
3109
0
    return NULL;
3110
0
}
3111
3112
static const struct tc_ops *
3113
tc_lookup_linux_name(const char *name)
3114
0
{
3115
0
    const struct tc_ops *const *opsp;
3116
3117
0
    for (opsp = tcs; *opsp != NULL; opsp++) {
3118
0
        const struct tc_ops *ops = *opsp;
3119
0
        if (ops->linux_name && !strcmp(name, ops->linux_name)) {
3120
0
            return ops;
3121
0
        }
3122
0
    }
3123
0
    return NULL;
3124
0
}
3125
3126
static struct tc_queue *
3127
tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
3128
                size_t hash)
3129
0
{
3130
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3131
0
    struct tc_queue *queue;
3132
3133
0
    HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
3134
0
        if (queue->queue_id == queue_id) {
3135
0
            return queue;
3136
0
        }
3137
0
    }
3138
0
    return NULL;
3139
0
}
3140
3141
/* Returns the queue with id 'queue_id' on 'netdev', or NULL if no such
 * queue exists.  Convenience wrapper that computes the hash for
 * tc_find_queue__(). */
static struct tc_queue *
tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
{
    return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
}
3146
3147
static int
3148
netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
3149
                                  const char *type,
3150
                                  struct netdev_qos_capabilities *caps)
3151
0
{
3152
0
    const struct tc_ops *ops = tc_lookup_ovs_name(type);
3153
0
    if (!ops) {
3154
0
        return EOPNOTSUPP;
3155
0
    }
3156
0
    caps->n_queues = ops->n_queues;
3157
0
    return 0;
3158
0
}
3159
3160
static int
3161
netdev_linux_get_qos(const struct netdev *netdev_,
3162
                     const char **typep, struct smap *details)
3163
0
{
3164
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3165
0
    int error;
3166
3167
0
    ovs_mutex_lock(&netdev->mutex);
3168
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
3169
0
        error = EOPNOTSUPP;
3170
0
        goto exit;
3171
0
    }
3172
3173
0
    error = tc_query_qdisc(netdev_);
3174
0
    if (!error) {
3175
0
        *typep = netdev->tc->ops->ovs_name;
3176
0
        error = (netdev->tc->ops->qdisc_get
3177
0
                 ? netdev->tc->ops->qdisc_get(netdev_, details)
3178
0
                 : 0);
3179
0
    }
3180
3181
0
exit:
3182
0
    ovs_mutex_unlock(&netdev->mutex);
3183
0
    return error;
3184
0
}
3185
3186
/* Replaces the QoS configuration of 'netdev_' with qdisc type 'type'
 * configured from 'details'.  Returns 0 if successful, otherwise a positive
 * errno value (EOPNOTSUPP if 'type' is unknown or cannot be installed). */
static int
netdev_linux_set_qos(struct netdev *netdev_,
                     const char *type, const struct smap *details)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const struct tc_ops *new_ops;
    int error;

    new_ops = tc_lookup_ovs_name(type);
    if (!new_ops || !new_ops->tc_install) {
        return EOPNOTSUPP;
    }

    /* The "noop" implementation is installed directly, without taking the
     * mutex or querying the kernel. */
    if (new_ops == &tc_ops_noop) {
        return new_ops->tc_install(netdev_, details);
    }

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    /* Learn what qdisc is currently installed. */
    error = tc_query_qdisc(netdev_);
    if (error) {
        goto exit;
    }

    if (new_ops == netdev->tc->ops) {
        /* Same qdisc type: reconfigure in place when supported. */
        error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
    } else {
        /* Delete existing qdisc. */
        error = tc_del_qdisc(netdev_);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: Failed to delete existing qdisc: %s",
                         netdev_get_name(netdev_), ovs_strerror(error));
            goto exit;
        }
        ovs_assert(netdev->tc == NULL);

        /* Install new qdisc. */
        error = new_ops->tc_install(netdev_, details);
        if (error) {
            VLOG_WARN_RL(&rl, "%s: Failed to install new qdisc: %s",
                         netdev_get_name(netdev_), ovs_strerror(error));
        }
        /* On success 'tc_install' must have set up netdev->tc; on failure
         * it must have left it NULL. */
        ovs_assert((error == 0) == (netdev->tc != NULL));
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3239
3240
static int
3241
netdev_linux_get_queue(const struct netdev *netdev_,
3242
                       unsigned int queue_id, struct smap *details)
3243
0
{
3244
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3245
0
    int error;
3246
3247
0
    ovs_mutex_lock(&netdev->mutex);
3248
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
3249
0
        error = EOPNOTSUPP;
3250
0
        goto exit;
3251
0
    }
3252
3253
0
    error = tc_query_qdisc(netdev_);
3254
0
    if (!error) {
3255
0
        struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
3256
0
        error = (queue
3257
0
                ? netdev->tc->ops->class_get(netdev_, queue, details)
3258
0
                : ENOENT);
3259
0
    }
3260
3261
0
exit:
3262
0
    ovs_mutex_unlock(&netdev->mutex);
3263
0
    return error;
3264
0
}
3265
3266
static int
3267
netdev_linux_set_queue(struct netdev *netdev_,
3268
                       unsigned int queue_id, const struct smap *details)
3269
0
{
3270
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3271
0
    int error;
3272
3273
0
    ovs_mutex_lock(&netdev->mutex);
3274
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
3275
0
        error = EOPNOTSUPP;
3276
0
        goto exit;
3277
0
    }
3278
3279
0
    error = tc_query_qdisc(netdev_);
3280
0
    if (!error) {
3281
0
        error = (queue_id < netdev->tc->ops->n_queues
3282
0
                 && netdev->tc->ops->class_set
3283
0
                 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
3284
0
                 : EINVAL);
3285
0
    }
3286
3287
0
exit:
3288
0
    ovs_mutex_unlock(&netdev->mutex);
3289
0
    return error;
3290
0
}
3291
3292
static int
3293
netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
3294
0
{
3295
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3296
0
    int error;
3297
3298
0
    ovs_mutex_lock(&netdev->mutex);
3299
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
3300
0
        error = EOPNOTSUPP;
3301
0
        goto exit;
3302
0
    }
3303
3304
0
    error = tc_query_qdisc(netdev_);
3305
0
    if (!error) {
3306
0
        if (netdev->tc->ops->class_delete) {
3307
0
            struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
3308
0
            error = (queue
3309
0
                     ? netdev->tc->ops->class_delete(netdev_, queue)
3310
0
                     : ENOENT);
3311
0
        } else {
3312
0
            error = EINVAL;
3313
0
        }
3314
0
    }
3315
3316
0
exit:
3317
0
    ovs_mutex_unlock(&netdev->mutex);
3318
0
    return error;
3319
0
}
3320
3321
/* Retrieves statistics for queue 'queue_id' on 'netdev_' into '*stats'.
 * Returns 0 on success, ENOENT if the queue does not exist, EOPNOTSUPP if
 * the qdisc does not report per-class statistics or the device is in a
 * remote network namespace, or another positive errno value. */
static int
netdev_linux_get_queue_stats(const struct netdev *netdev_,
                             unsigned int queue_id,
                             struct netdev_queue_stats *stats)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = tc_query_qdisc(netdev_);
    if (!error) {
        if (netdev->tc->ops->class_get_stats) {
            const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
            if (queue) {
                /* The creation time is tracked in our local queue record,
                 * not fetched from the kernel. */
                stats->created = queue->created;
                error = netdev->tc->ops->class_get_stats(netdev_, queue,
                                                         stats);
            } else {
                error = ENOENT;
            }
        } else {
            error = EOPNOTSUPP;
        }
    }

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3355
3356
/* State of an in-progress netlink dump of a netdev's traffic classes.
 * Initialized by start_queue_dump(), released by finish_queue_dump(). */
struct queue_dump_state {
    struct nl_dump dump;        /* Ongoing RTM_GETTCLASS netlink dump. */
    struct ofpbuf buf;          /* Receive buffer reused across replies. */
};
3360
3361
static bool
3362
start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
3363
0
{
3364
0
    struct ofpbuf request;
3365
0
    struct tcmsg *tcmsg;
3366
3367
0
    tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
3368
0
    if (!tcmsg) {
3369
0
        return false;
3370
0
    }
3371
0
    tcmsg->tcm_parent = 0;
3372
0
    nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
3373
0
    ofpbuf_uninit(&request);
3374
3375
0
    ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
3376
0
    return true;
3377
0
}
3378
3379
/* Releases the buffer in 'state' and completes the netlink dump, returning
 * the dump's final status (0 on success, positive errno on failure). */
static int
finish_queue_dump(struct queue_dump_state *state)
{
    ofpbuf_uninit(&state->buf);
    return nl_dump_done(&state->dump);
}
3385
3386
/* Iteration state handed out by netdev_linux_queue_dump_start(): a snapshot
 * of the queue ids present when the dump began. */
struct netdev_linux_queue_state {
    unsigned int *queues;  /* Array of queue ids. */
    size_t cur_queue;      /* Index of the next id to visit. */
    size_t n_queues;       /* Number of elements in 'queues'. */
};
3391
3392
static int
3393
netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
3394
0
{
3395
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3396
0
    int error;
3397
3398
0
    ovs_mutex_lock(&netdev->mutex);
3399
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
3400
0
        error = EOPNOTSUPP;
3401
0
        goto exit;
3402
0
    }
3403
3404
0
    error = tc_query_qdisc(netdev_);
3405
0
    if (!error) {
3406
0
        if (netdev->tc->ops->class_get) {
3407
0
            struct netdev_linux_queue_state *state;
3408
0
            struct tc_queue *queue;
3409
0
            size_t i;
3410
3411
0
            *statep = state = xmalloc(sizeof *state);
3412
0
            state->n_queues = hmap_count(&netdev->tc->queues);
3413
0
            state->cur_queue = 0;
3414
0
            state->queues = xmalloc(state->n_queues * sizeof *state->queues);
3415
3416
0
            i = 0;
3417
0
            HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
3418
0
                state->queues[i++] = queue->queue_id;
3419
0
            }
3420
0
        } else {
3421
0
            error = EOPNOTSUPP;
3422
0
        }
3423
0
    }
3424
3425
0
exit:
3426
0
    ovs_mutex_unlock(&netdev->mutex);
3427
0
    return error;
3428
0
}
3429
3430
static int
3431
netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
3432
                             unsigned int *queue_idp, struct smap *details)
3433
0
{
3434
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3435
0
    struct netdev_linux_queue_state *state = state_;
3436
0
    int error = EOF;
3437
3438
0
    ovs_mutex_lock(&netdev->mutex);
3439
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
3440
0
        error = EOPNOTSUPP;
3441
0
        goto exit;
3442
0
    }
3443
3444
0
    while (state->cur_queue < state->n_queues) {
3445
0
        unsigned int queue_id = state->queues[state->cur_queue++];
3446
0
        struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
3447
3448
0
        if (queue) {
3449
0
            *queue_idp = queue_id;
3450
0
            error = netdev->tc->ops->class_get(netdev_, queue, details);
3451
0
            break;
3452
0
        }
3453
0
    }
3454
3455
0
exit:
3456
0
    ovs_mutex_unlock(&netdev->mutex);
3457
0
    return error;
3458
0
}
3459
3460
/* Frees the iteration state allocated by netdev_linux_queue_dump_start().
 * Always succeeds. */
static int
netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
                             void *state_)
{
    struct netdev_linux_queue_state *state = state_;

    free(state->queues);
    free(state);
    return 0;
}
3470
3471
static int
3472
netdev_linux_dump_queue_stats(const struct netdev *netdev_,
3473
                              netdev_dump_queue_stats_cb *cb, void *aux)
3474
0
{
3475
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3476
0
    int error;
3477
3478
0
    ovs_mutex_lock(&netdev->mutex);
3479
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
3480
0
        error = EOPNOTSUPP;
3481
0
        goto exit;
3482
0
    }
3483
3484
0
    error = tc_query_qdisc(netdev_);
3485
0
    if (!error) {
3486
0
        struct queue_dump_state state;
3487
3488
0
        if (!netdev->tc->ops->class_dump_stats) {
3489
0
            error = EOPNOTSUPP;
3490
0
        } else if (!start_queue_dump(netdev_, &state)) {
3491
0
            error = ENODEV;
3492
0
        } else {
3493
0
            struct ofpbuf msg;
3494
0
            int retval;
3495
3496
0
            while (nl_dump_next(&state.dump, &msg, &state.buf)) {
3497
0
                retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
3498
0
                                                           cb, aux);
3499
0
                if (retval) {
3500
0
                    error = retval;
3501
0
                }
3502
0
            }
3503
3504
0
            retval = finish_queue_dump(&state);
3505
0
            if (retval) {
3506
0
                error = retval;
3507
0
            }
3508
0
        }
3509
0
    }
3510
3511
0
exit:
3512
0
    ovs_mutex_unlock(&netdev->mutex);
3513
0
    return error;
3514
0
}
3515
3516
static int
3517
netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
3518
                     struct in_addr netmask)
3519
0
{
3520
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3521
0
    int error;
3522
3523
0
    ovs_mutex_lock(&netdev->mutex);
3524
0
    if (netdev_linux_netnsid_is_remote(netdev)) {
3525
0
        error = EOPNOTSUPP;
3526
0
        goto exit;
3527
0
    }
3528
3529
0
    error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
3530
0
    if (!error) {
3531
0
        if (address.s_addr != INADDR_ANY) {
3532
0
            error = do_set_addr(netdev_, SIOCSIFNETMASK,
3533
0
                                "SIOCSIFNETMASK", netmask);
3534
0
        }
3535
0
    }
3536
3537
0
exit:
3538
0
    ovs_mutex_unlock(&netdev->mutex);
3539
0
    return error;
3540
0
}
3541
3542
/* Retrieves the addresses assigned to 'netdev_' via netdev_get_addrs(),
 * storing the address and netmask arrays in '*addr' and '*mask' and their
 * count in '*n_cnt'.  Returns 0 on success or a positive errno value;
 * EOPNOTSUPP if the device lives in a remote network namespace.
 * (Ownership of the returned arrays follows netdev_get_addrs() — caller
 * presumably frees; confirm against that helper.) */
static int
netdev_linux_get_addr_list(const struct netdev *netdev_,
                          struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    if (netdev_linux_netnsid_is_remote(netdev)) {
        error = EOPNOTSUPP;
        goto exit;
    }

    error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);

exit:
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}
3564
3565
/* Initializes 'sa' as an AF_INET sockaddr carrying 'addr' with port 0;
 * any bytes of 'sa' beyond the sockaddr_in are zeroed. */
static void
make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
{
    struct sockaddr_in sin = {
        .sin_family = AF_INET,
        .sin_addr = addr,
        .sin_port = 0,
    };

    memset(sa, 0, sizeof *sa);
    memcpy(sa, &sin, sizeof sin);
}
3577
3578
/* Performs the AF_INET ioctl 'ioctl_nr' ('ioctl_name' is used only for
 * logging) on 'netdev', passing 'addr' in the ifreq address field.
 * Returns 0 on success or a positive errno value. */
static int
do_set_addr(struct netdev *netdev,
            int ioctl_nr, const char *ioctl_name, struct in_addr addr)
{
    struct ifreq ifr;

    memset(&ifr, 0, sizeof ifr);
    make_in4_sockaddr(&ifr.ifr_addr, addr);
    return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
                               ioctl_name);
}
3589
3590
/* Adds 'router' as a default IP gateway. */
3591
static int
3592
netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
3593
0
{
3594
0
    struct in_addr any = { INADDR_ANY };
3595
0
    struct rtentry rt;
3596
0
    int error;
3597
3598
0
    memset(&rt, 0, sizeof rt);
3599
0
    make_in4_sockaddr(&rt.rt_dst, any);
3600
0
    make_in4_sockaddr(&rt.rt_gateway, router);
3601
0
    make_in4_sockaddr(&rt.rt_genmask, any);
3602
0
    rt.rt_flags = RTF_UP | RTF_GATEWAY;
3603
0
    error = af_inet_ioctl(SIOCADDRT, &rt);
3604
0
    if (error) {
3605
0
        VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
3606
0
    }
3607
0
    return error;
3608
0
}
3609
3610
/* Looks up the route to 'host' by scanning /proc/net/route.  On success,
 * stores the gateway address in '*next_hop' (0 if the host is directly
 * reachable), a malloc'd copy of the outgoing interface name in
 * '*netdev_name', and returns 0.  Returns ENXIO if no route matches, or
 * another positive errno value if the file cannot be opened. */
static int
netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
                          char **netdev_name)
{
    static const char fn[] = "/proc/net/route";
    FILE *stream;
    char line[256];
    int ln;

    *netdev_name = NULL;
    stream = fopen(fn, "r");
    if (stream == NULL) {
        VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
        /* NOTE(review): errno could in principle be clobbered by the
         * logging call above — verify, or save it beforehand. */
        return errno;
    }

    ln = 0;
    while (fgets(line, sizeof line, stream)) {
        /* The first line of /proc/net/route is a column header; skip it. */
        if (++ln >= 2) {
            char iface[17];
            ovs_be32 dest, gateway, mask;
            int refcnt, metric, mtu;
            unsigned int flags, use, window, irtt;

            if (!ovs_scan(line,
                          "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
                          " %d %u %u\n",
                          iface, &dest, &gateway, &flags, &refcnt,
                          &use, &metric, &mask, &mtu, &window, &irtt)) {
                VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
                        fn, ln, line);
                continue;
            }
            if (!(flags & RTF_UP)) {
                /* Skip routes that aren't up. */
                continue;
            }

            /* The output of 'dest', 'mask', and 'gateway' were given in
             * network byte order, so we don't need any endian
             * conversions here. */
            if ((dest & mask) == (host->s_addr & mask)) {
                if (!gateway) {
                    /* The host is directly reachable. */
                    next_hop->s_addr = 0;
                } else {
                    /* To reach the host, we must go through a gateway. */
                    next_hop->s_addr = gateway;
                }
                *netdev_name = xstrdup(iface);
                fclose(stream);
                return 0;
            }
        }
    }

    fclose(stream);
    return ENXIO;
}
3669
3670
int
3671
netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
3672
0
{
3673
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3674
0
    int error = 0;
3675
3676
0
    ovs_mutex_lock(&netdev->mutex);
3677
0
    if (!(netdev->cache_valid & VALID_DRVINFO)) {
3678
0
        struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
3679
3680
0
        COVERAGE_INC(netdev_get_ethtool);
3681
0
        memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
3682
0
        error = netdev_linux_do_ethtool(netdev->up.name,
3683
0
                                        cmd,
3684
0
                                        ETHTOOL_GDRVINFO,
3685
0
                                        "ETHTOOL_GDRVINFO");
3686
0
        if (!error) {
3687
0
            netdev->cache_valid |= VALID_DRVINFO;
3688
0
        }
3689
0
    }
3690
3691
0
    if (!error) {
3692
0
        smap_add(smap, "driver_name", netdev->drvinfo.driver);
3693
0
        smap_add(smap, "driver_version", netdev->drvinfo.version);
3694
0
        smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
3695
0
    }
3696
0
    ovs_mutex_unlock(&netdev->mutex);
3697
3698
0
    return error;
3699
0
}
3700
3701
/* get_status callback for "internal" devices: these are implemented by
 * Open vSwitch itself, so report a fixed driver name.  Always succeeds. */
static int
netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
                           struct smap *smap)
{
    smap_add(smap, "driver_name", "openvswitch");
    return 0;
}
3708
3709
static uint32_t
3710
netdev_linux_get_block_id(struct netdev *netdev_)
3711
0
{
3712
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3713
0
    uint32_t block_id = 0;
3714
3715
0
    ovs_mutex_lock(&netdev->mutex);
3716
    /* Ensure the linux netdev has had its fields populated. */
3717
0
    if (!(netdev->cache_valid & VALID_IFINDEX)) {
3718
0
        netdev_linux_update_via_netlink(netdev);
3719
0
    }
3720
3721
    /* Only assigning block ids to linux netdevs that are
3722
     * LAG primary members. */
3723
0
    if (netdev->is_lag_primary) {
3724
0
        block_id = netdev->ifindex;
3725
0
    }
3726
0
    ovs_mutex_unlock(&netdev->mutex);
3727
3728
0
    return block_id;
3729
0
}
3730
3731
/* Looks up the ARP table entry for 'ip' on 'netdev'.  If one exists and can be
3732
 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
3733
 * returns 0.  Otherwise, it returns a positive errno value; in particular,
3734
 * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */
3735
static int
3736
netdev_linux_arp_lookup(const struct netdev *netdev,
3737
                        ovs_be32 ip, struct eth_addr *mac)
3738
0
{
3739
0
    struct arpreq r;
3740
0
    struct sockaddr_in sin;
3741
0
    int retval;
3742
3743
0
    memset(&r, 0, sizeof r);
3744
0
    memset(&sin, 0, sizeof sin);
3745
0
    sin.sin_family = AF_INET;
3746
0
    sin.sin_addr.s_addr = ip;
3747
0
    sin.sin_port = 0;
3748
0
    memcpy(&r.arp_pa, &sin, sizeof sin);
3749
0
    r.arp_ha.sa_family = ARPHRD_ETHER;
3750
0
    r.arp_flags = 0;
3751
0
    ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
3752
0
    COVERAGE_INC(netdev_arp_lookup);
3753
0
    retval = af_inet_ioctl(SIOCGARP, &r);
3754
0
    if (!retval) {
3755
0
        memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
3756
0
    } else if (retval != ENXIO) {
3757
0
        VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
3758
0
                     netdev_get_name(netdev), IP_ARGS(ip),
3759
0
                     ovs_strerror(retval));
3760
0
    }
3761
0
    return retval;
3762
0
}
3763
3764
static unsigned int
3765
nd_to_iff_flags(enum netdev_flags nd)
3766
0
{
3767
0
    unsigned int iff = 0;
3768
0
    if (nd & NETDEV_UP) {
3769
0
        iff |= IFF_UP;
3770
0
    }
3771
0
    if (nd & NETDEV_PROMISC) {
3772
0
        iff |= IFF_PROMISC;
3773
0
    }
3774
0
    if (nd & NETDEV_LOOPBACK) {
3775
0
        iff |= IFF_LOOPBACK;
3776
0
    }
3777
0
    return iff;
3778
0
}
3779
3780
static int
3781
iff_to_nd_flags(unsigned int iff)
3782
0
{
3783
0
    enum netdev_flags nd = 0;
3784
0
    if (iff & IFF_UP) {
3785
0
        nd |= NETDEV_UP;
3786
0
    }
3787
0
    if (iff & IFF_PROMISC) {
3788
0
        nd |= NETDEV_PROMISC;
3789
0
    }
3790
0
    if (iff & IFF_LOOPBACK) {
3791
0
        nd |= NETDEV_LOOPBACK;
3792
0
    }
3793
0
    return nd;
3794
0
}
3795
3796
static int
3797
update_flags(struct netdev_linux *netdev, enum netdev_flags off,
3798
             enum netdev_flags on, enum netdev_flags *old_flagsp)
3799
    OVS_REQUIRES(netdev->mutex)
3800
0
{
3801
0
    unsigned int old_flags, new_flags;
3802
0
    int error = 0;
3803
3804
0
    old_flags = netdev->ifi_flags;
3805
0
    *old_flagsp = iff_to_nd_flags(old_flags);
3806
0
    new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
3807
0
    if (new_flags != old_flags) {
3808
0
        error = set_flags(netdev_get_name(&netdev->up), new_flags);
3809
0
        get_flags(&netdev->up, &netdev->ifi_flags);
3810
0
    }
3811
3812
0
    return error;
3813
0
}
3814
3815
static int
3816
netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
3817
                          enum netdev_flags on, enum netdev_flags *old_flagsp)
3818
0
{
3819
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3820
0
    int error = 0;
3821
3822
0
    ovs_mutex_lock(&netdev->mutex);
3823
0
    if (on || off) {
3824
        /* Changing flags over netlink isn't support yet. */
3825
0
        if (netdev_linux_netnsid_is_remote(netdev)) {
3826
0
            error = EOPNOTSUPP;
3827
0
            goto exit;
3828
0
        }
3829
0
        error = update_flags(netdev, off, on, old_flagsp);
3830
0
    } else {
3831
        /* Try reading flags over netlink, or fall back to ioctl. */
3832
0
        if (!netdev_linux_update_via_netlink(netdev)) {
3833
0
            *old_flagsp = iff_to_nd_flags(netdev->ifi_flags);
3834
0
        } else {
3835
0
            error = update_flags(netdev, off, on, old_flagsp);
3836
0
        }
3837
0
    }
3838
3839
0
exit:
3840
0
    ovs_mutex_unlock(&netdev->mutex);
3841
0
    return error;
3842
0
}
3843
3844
/* netdev_class callbacks shared by every Linux-backed netdev
 * implementation defined below ("system", "tap", "internal", AF_XDP). */
#define NETDEV_LINUX_CLASS_COMMON                               \
    .run = netdev_linux_run,                                    \
    .wait = netdev_linux_wait,                                  \
    .alloc = netdev_linux_alloc,                                \
    .dealloc = netdev_linux_dealloc,                            \
    .send_wait = netdev_linux_send_wait,                        \
    .set_etheraddr = netdev_linux_set_etheraddr,                \
    .get_etheraddr = netdev_linux_get_etheraddr,                \
    .get_mtu = netdev_linux_get_mtu,                            \
    .set_mtu = netdev_linux_set_mtu,                            \
    .get_ifindex = netdev_linux_get_ifindex,                    \
    .get_carrier = netdev_linux_get_carrier,                    \
    .get_carrier_resets = netdev_linux_get_carrier_resets,      \
    .set_miimon_interval = netdev_linux_set_miimon_interval,    \
    .set_advertisements = netdev_linux_set_advertisements,      \
    .set_policing = netdev_linux_set_policing,                  \
    .get_qos_types = netdev_linux_get_qos_types,                \
    .get_qos_capabilities = netdev_linux_get_qos_capabilities,  \
    .get_qos = netdev_linux_get_qos,                            \
    .set_qos = netdev_linux_set_qos,                            \
    .get_queue = netdev_linux_get_queue,                        \
    .set_queue = netdev_linux_set_queue,                        \
    .delete_queue = netdev_linux_delete_queue,                  \
    .get_queue_stats = netdev_linux_get_queue_stats,            \
    .queue_dump_start = netdev_linux_queue_dump_start,          \
    .queue_dump_next = netdev_linux_queue_dump_next,            \
    .queue_dump_done = netdev_linux_queue_dump_done,            \
    .dump_queue_stats = netdev_linux_dump_queue_stats,          \
    .set_in4 = netdev_linux_set_in4,                            \
    .get_addr_list = netdev_linux_get_addr_list,                \
    .add_router = netdev_linux_add_router,                      \
    .get_next_hop = netdev_linux_get_next_hop,                  \
    .arp_lookup = netdev_linux_arp_lookup,                      \
    .update_flags = netdev_linux_update_flags,                  \
    .rxq_alloc = netdev_linux_rxq_alloc,                        \
    .rxq_dealloc = netdev_linux_rxq_dealloc,                    \
    .rxq_wait = netdev_linux_rxq_wait,                          \
    .rxq_drain = netdev_linux_rxq_drain
3882
3883
/* Ordinary Linux network devices ("system" type). */
const struct netdev_class netdev_linux_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "system",
    .is_pmd = false,
    .construct = netdev_linux_construct,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_linux_get_stats,
    .get_features = netdev_linux_get_features,
    .get_speed = netdev_linux_get_speed,
    .get_status = netdev_linux_get_status,
    .get_block_id = netdev_linux_get_block_id,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};
3899
3900
/* TAP devices: differ from "system" in construction and stats source. */
const struct netdev_class netdev_tap_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "tap",
    .is_pmd = false,
    .construct = netdev_linux_construct_tap,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_tap_get_stats,
    .get_features = netdev_linux_get_features,
    .get_speed = netdev_linux_get_speed,
    .get_status = netdev_linux_get_status,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};
3915
3916
/* OVS-internal ports: stats and status come from internal helpers. */
const struct netdev_class netdev_internal_class = {
    NETDEV_LINUX_CLASS_COMMON,
    .type = "internal",
    .is_pmd = false,
    .construct = netdev_linux_construct,
    .destruct = netdev_linux_destruct,
    .get_stats = netdev_internal_get_stats,
    .get_status = netdev_internal_get_status,
    .send = netdev_linux_send,
    .rxq_construct = netdev_linux_rxq_construct,
    .rxq_destruct = netdev_linux_rxq_destruct,
    .rxq_recv = netdev_linux_rxq_recv,
};
3929
3930
#ifdef HAVE_AF_XDP
3931
#define NETDEV_AFXDP_CLASS_COMMON                               \
3932
    .construct = netdev_afxdp_construct,                        \
3933
    .destruct = netdev_afxdp_destruct,                          \
3934
    .get_stats = netdev_afxdp_get_stats,                        \
3935
    .get_custom_stats = netdev_afxdp_get_custom_stats,          \
3936
    .get_status = netdev_afxdp_get_status,                      \
3937
    .set_config = netdev_afxdp_set_config,                      \
3938
    .get_config = netdev_afxdp_get_config,                      \
3939
    .reconfigure = netdev_afxdp_reconfigure,                    \
3940
    .get_numa_id = netdev_linux_get_numa_id,                    \
3941
    .send = netdev_afxdp_batch_send,                            \
3942
    .rxq_construct = netdev_afxdp_rxq_construct,                \
3943
    .rxq_destruct = netdev_afxdp_rxq_destruct,                  \
3944
    .rxq_recv = netdev_afxdp_rxq_recv
3945
3946
const struct netdev_class netdev_afxdp_class = {
3947
    NETDEV_LINUX_CLASS_COMMON,
3948
    NETDEV_AFXDP_CLASS_COMMON,
3949
    .type = "afxdp",
3950
    .is_pmd = true,
3951
};
3952
3953
const struct netdev_class netdev_afxdp_nonpmd_class = {
3954
    NETDEV_LINUX_CLASS_COMMON,
3955
    NETDEV_AFXDP_CLASS_COMMON,
3956
    .type = "afxdp-nonpmd",
3957
    .is_pmd = false,
3958
};
3959
#endif
3960

3961
3962
/* CoDel is a classless qdisc, so it exposes no queues. */
#define CODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_CODEL_TARGET   1
#define TCA_CODEL_LIMIT    2
#define TCA_CODEL_INTERVAL 3

/* "linux-codel" QoS configuration, cached in netdev->tc. */
struct codel {
    struct tc tc;
    uint32_t target;     /* CoDel "target" parameter. */
    uint32_t limit;      /* CoDel "limit" parameter. */
    uint32_t interval;   /* CoDel "interval" parameter. */
};
3978
3979
/* Returns the struct codel wrapping 'netdev_''s cached tc state.  Only
 * valid while the installed tc_ops is tc_ops_codel. */
static struct codel *
codel_get__(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct codel, tc);
}
3985
3986
static void
3987
codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3988
                uint32_t interval)
3989
0
{
3990
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3991
0
    struct codel *codel;
3992
3993
0
    codel = xmalloc(sizeof *codel);
3994
0
    tc_init(&codel->tc, &tc_ops_codel);
3995
0
    codel->target = target;
3996
0
    codel->limit = limit;
3997
0
    codel->interval = interval;
3998
3999
0
    netdev->tc = &codel->tc;
4000
0
}
4001
4002
static int
4003
codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
4004
                    uint32_t interval)
4005
0
{
4006
0
    size_t opt_offset;
4007
0
    struct ofpbuf request;
4008
0
    struct tcmsg *tcmsg;
4009
0
    uint32_t otarget, olimit, ointerval;
4010
0
    int error;
4011
4012
0
    tc_del_qdisc(netdev);
4013
4014
0
    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
4015
0
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
4016
0
    if (!tcmsg) {
4017
0
        return ENODEV;
4018
0
    }
4019
0
    tcmsg->tcm_handle = tc_make_handle(1, 0);
4020
0
    tcmsg->tcm_parent = TC_H_ROOT;
4021
4022
0
    otarget = target ? target : 5000;
4023
0
    olimit = limit ? limit : 10240;
4024
0
    ointerval = interval ? interval : 100000;
4025
4026
0
    nl_msg_put_string(&request, TCA_KIND, "codel");
4027
0
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4028
0
    nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
4029
0
    nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
4030
0
    nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
4031
0
    nl_msg_end_nested(&request, opt_offset);
4032
4033
0
    error = tc_transact(&request, NULL);
4034
0
    if (error) {
4035
0
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
4036
0
        "target %u, limit %u, interval %u error %d(%s)",
4037
0
        netdev_get_name(netdev),
4038
0
        otarget, olimit, ointerval,
4039
0
        error, ovs_strerror(error));
4040
0
    }
4041
0
    return error;
4042
0
}
4043
4044
/* Extracts codel parameters from 'details' into 'codel', falling back to
 * the defaults (target 5000, limit 10240, interval 100000) for any key
 * that is absent or zero. */
static void
codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
                            const struct smap *details, struct codel *codel)
{
    codel->target = smap_get_ullong(details, "target", 0);
    codel->limit = smap_get_ullong(details, "limit", 0);
    codel->interval = smap_get_ullong(details, "interval", 0);

    if (!codel->target) {
        codel->target = 5000;
    }
    if (!codel->limit) {
        codel->limit = 10240;
    }
    if (!codel->interval) {
        codel->interval = 100000;
    }
}
4062
4063
static int
4064
codel_tc_install(struct netdev *netdev, const struct smap *details)
4065
0
{
4066
0
    int error;
4067
0
    struct codel codel;
4068
4069
0
    codel_parse_qdisc_details__(netdev, details, &codel);
4070
0
    error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
4071
0
                                codel.interval);
4072
0
    if (!error) {
4073
0
        codel_install__(netdev, codel.target, codel.limit, codel.interval);
4074
0
    }
4075
0
    return error;
4076
0
}
4077
4078
/* Parses the nested TCA_OPTIONS attributes of a "codel" qdisc netlink
 * message into 'codel'.  Returns 0 on success or EPROTO if the attributes
 * do not match the expected policy. */
static int
codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
{
    static const struct nl_policy tca_codel_policy[] = {
        [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
        [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
        [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];

    if (!nl_parse_nested(nl_options, tca_codel_policy,
                         attrs, ARRAY_SIZE(tca_codel_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
        return EPROTO;
    }

    codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
    codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
    codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
    return 0;
}
4100
4101
static int
4102
codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4103
0
{
4104
0
    struct nlattr *nlattr;
4105
0
    const char * kind;
4106
0
    int error;
4107
0
    struct codel codel;
4108
4109
0
    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4110
0
    if (error != 0) {
4111
0
        return error;
4112
0
    }
4113
4114
0
    error = codel_parse_tca_options__(nlattr, &codel);
4115
0
    if (error != 0) {
4116
0
        return error;
4117
0
    }
4118
4119
0
    codel_install__(netdev, codel.target, codel.limit, codel.interval);
4120
0
    return 0;
4121
0
}
4122
4123
4124
/* tc_ops 'tc_destroy' callback: tears down the base tc state and frees the
 * enclosing codel record. */
static void
codel_tc_destroy(struct tc *tc)
{
    struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
    tc_destroy(tc);
    free(codel);
}
4131
4132
static int
4133
codel_qdisc_get(const struct netdev *netdev, struct smap *details)
4134
0
{
4135
0
    const struct codel *codel = codel_get__(netdev);
4136
0
    smap_add_format(details, "target", "%u", codel->target);
4137
0
    smap_add_format(details, "limit", "%u", codel->limit);
4138
0
    smap_add_format(details, "interval", "%u", codel->interval);
4139
0
    return 0;
4140
0
}
4141
4142
static int
4143
codel_qdisc_set(struct netdev *netdev, const struct smap *details)
4144
0
{
4145
0
    struct codel codel;
4146
4147
0
    codel_parse_qdisc_details__(netdev, details, &codel);
4148
0
    codel_install__(netdev, codel.target, codel.limit, codel.interval);
4149
0
    codel_get__(netdev)->target = codel.target;
4150
0
    codel_get__(netdev)->limit = codel.limit;
4151
0
    codel_get__(netdev)->interval = codel.interval;
4152
0
    return 0;
4153
0
}
4154
4155
/* tc_ops for the OVS "linux-codel" QoS type, backed by the kernel "codel"
 * qdisc. */
static const struct tc_ops tc_ops_codel = {
    .linux_name = "codel",
    .ovs_name = "linux-codel",
    .n_queues = CODEL_N_QUEUES,
    .tc_install = codel_tc_install,
    .tc_load = codel_tc_load,
    .tc_destroy = codel_tc_destroy,
    .qdisc_get = codel_qdisc_get,
    .qdisc_set = codel_qdisc_set,
};
4165

4166
/* FQ-CoDel traffic control class. */

/* FQ-CoDel is a classless qdisc, so it exposes no queues. */
#define FQCODEL_N_QUEUES 0x0000

/* In sufficiently new kernel headers these are defined as enums in
 * <linux/pkt_sched.h>.  Define them here as macros to help out with older
 * kernels.  (This overrides any enum definition in the header file but that's
 * harmless.) */
#define TCA_FQ_CODEL_TARGET     1
#define TCA_FQ_CODEL_LIMIT      2
#define TCA_FQ_CODEL_INTERVAL   3
#define TCA_FQ_CODEL_ECN        4
#define TCA_FQ_CODEL_FLOWS      5
#define TCA_FQ_CODEL_QUANTUM    6

/* "linux-fq_codel" QoS configuration, cached in netdev->tc. */
struct fqcodel {
    struct tc tc;
    uint32_t target;     /* fq_codel "target" parameter. */
    uint32_t limit;      /* fq_codel "limit" parameter. */
    uint32_t interval;   /* fq_codel "interval" parameter. */
    uint32_t flows;      /* fq_codel "flows" parameter. */
    uint32_t quantum;    /* fq_codel "quantum" parameter. */
};
4189
4190
/* Returns the struct fqcodel wrapping 'netdev_''s cached tc state.  Only
 * valid while the installed tc_ops is tc_ops_fqcodel. */
static struct fqcodel *
fqcodel_get__(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
}
4196
4197
static void
4198
fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
4199
                  uint32_t interval, uint32_t flows, uint32_t quantum)
4200
0
{
4201
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4202
0
    struct fqcodel *fqcodel;
4203
4204
0
    fqcodel = xmalloc(sizeof *fqcodel);
4205
0
    tc_init(&fqcodel->tc, &tc_ops_fqcodel);
4206
0
    fqcodel->target = target;
4207
0
    fqcodel->limit = limit;
4208
0
    fqcodel->interval = interval;
4209
0
    fqcodel->flows = flows;
4210
0
    fqcodel->quantum = quantum;
4211
4212
0
    netdev->tc = &fqcodel->tc;
4213
0
}
4214
4215
/* Replaces the root qdisc on 'netdev' with an "fq_codel" qdisc, roughly
 * "tc qdisc add dev <dev> root fq_codel target ... limit ... interval ...
 * flows ... quantum ...".  A zero for any parameter selects the fallback
 * value below.  Returns 0 on success, otherwise a positive errno value. */
static int
fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
                      uint32_t interval, uint32_t flows, uint32_t quantum)
{
    size_t opt_offset;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    uint32_t otarget, olimit, ointerval, oflows,  oquantum;
    int error;

    /* Drop any existing root qdisc before installing the new one. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Substitute fallbacks for any zero (unspecified) parameter. */
    otarget = target ? target : 5000;
    olimit = limit ? limit : 10240;
    ointerval = interval ? interval : 100000;
    oflows = flows ? flows : 1024;
    oquantum = quantum ? quantum : 1514; /* fq_codel default quantum is 1514
                                            not mtu */

    /* fq_codel options travel as nested u32 attributes under TCA_OPTIONS. */
    nl_msg_put_string(&request, TCA_KIND, "fq_codel");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
    nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
        "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
        netdev_get_name(netdev),
        otarget, olimit, ointerval, oflows, oquantum,
        error, ovs_strerror(error));
    }
    return error;
}
4261
4262
static void
4263
fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
4264
                          const struct smap *details, struct fqcodel *fqcodel)
4265
0
{
4266
0
    fqcodel->target = smap_get_ullong(details, "target", 0);
4267
0
    fqcodel->limit = smap_get_ullong(details, "limit", 0);
4268
0
    fqcodel->interval = smap_get_ullong(details, "interval", 0);
4269
0
    fqcodel->flows = smap_get_ullong(details, "flows", 0);
4270
0
    fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
4271
4272
0
    if (!fqcodel->target) {
4273
0
        fqcodel->target = 5000;
4274
0
    }
4275
0
    if (!fqcodel->limit) {
4276
0
        fqcodel->limit = 10240;
4277
0
    }
4278
0
    if (!fqcodel->interval) {
4279
0
        fqcodel->interval = 1000000;
4280
0
    }
4281
0
    if (!fqcodel->flows) {
4282
0
        fqcodel->flows = 1024;
4283
0
    }
4284
0
    if (!fqcodel->quantum) {
4285
0
        fqcodel->quantum = 1514;
4286
0
    }
4287
0
}
4288
4289
static int
4290
fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
4291
0
{
4292
0
    int error;
4293
0
    struct fqcodel fqcodel;
4294
4295
0
    fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
4296
0
    error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
4297
0
                                  fqcodel.interval, fqcodel.flows,
4298
0
                                  fqcodel.quantum);
4299
0
    if (!error) {
4300
0
        fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
4301
0
                          fqcodel.interval, fqcodel.flows, fqcodel.quantum);
4302
0
    }
4303
0
    return error;
4304
0
}
4305
4306
static int
4307
fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
4308
0
{
4309
0
    static const struct nl_policy tca_fqcodel_policy[] = {
4310
0
        [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
4311
0
        [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
4312
0
        [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
4313
0
        [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
4314
0
        [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
4315
0
    };
4316
4317
0
    struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
4318
4319
0
    if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
4320
0
                         attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
4321
0
        VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
4322
0
        return EPROTO;
4323
0
    }
4324
4325
0
    fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
4326
0
    fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
4327
0
    fqcodel->interval =nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
4328
0
    fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
4329
0
    fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
4330
0
    return 0;
4331
0
}
4332
4333
static int
4334
fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4335
0
{
4336
0
    struct nlattr *nlattr;
4337
0
    const char * kind;
4338
0
    int error;
4339
0
    struct fqcodel fqcodel;
4340
4341
0
    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4342
0
    if (error != 0) {
4343
0
        return error;
4344
0
    }
4345
4346
0
    error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
4347
0
    if (error != 0) {
4348
0
        return error;
4349
0
    }
4350
4351
0
    fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
4352
0
                      fqcodel.flows, fqcodel.quantum);
4353
0
    return 0;
4354
0
}
4355
4356
static void
4357
fqcodel_tc_destroy(struct tc *tc)
4358
0
{
4359
0
    struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
4360
0
    tc_destroy(tc);
4361
0
    free(fqcodel);
4362
0
}
4363
4364
static int
4365
fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
4366
0
{
4367
0
    const struct fqcodel *fqcodel = fqcodel_get__(netdev);
4368
0
    smap_add_format(details, "target", "%u", fqcodel->target);
4369
0
    smap_add_format(details, "limit", "%u", fqcodel->limit);
4370
0
    smap_add_format(details, "interval", "%u", fqcodel->interval);
4371
0
    smap_add_format(details, "flows", "%u", fqcodel->flows);
4372
0
    smap_add_format(details, "quantum", "%u", fqcodel->quantum);
4373
0
    return 0;
4374
0
}
4375
4376
static int
4377
fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
4378
0
{
4379
0
    struct fqcodel fqcodel;
4380
4381
0
    fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
4382
0
    fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
4383
0
                      fqcodel.flows, fqcodel.quantum);
4384
0
    fqcodel_get__(netdev)->target = fqcodel.target;
4385
0
    fqcodel_get__(netdev)->limit = fqcodel.limit;
4386
0
    fqcodel_get__(netdev)->interval = fqcodel.interval;
4387
0
    fqcodel_get__(netdev)->flows = fqcodel.flows;
4388
0
    fqcodel_get__(netdev)->quantum = fqcodel.quantum;
4389
0
    return 0;
4390
0
}
4391
4392
/* Operations vector tying the OVS "linux-fq_codel" QoS type to the kernel
 * "fq_codel" qdisc.  Classless (FQCODEL_N_QUEUES queues), so the per-queue
 * callbacks are left unset. */
static const struct tc_ops tc_ops_fqcodel = {
    .linux_name = "fq_codel",
    .ovs_name = "linux-fq_codel",
    .n_queues = FQCODEL_N_QUEUES,
    .tc_install = fqcodel_tc_install,
    .tc_load = fqcodel_tc_load,
    .tc_destroy = fqcodel_tc_destroy,
    .qdisc_get = fqcodel_qdisc_get,
    .qdisc_set = fqcodel_qdisc_set,
};
4402

4403
/* SFQ traffic control class. */
4404
4405
#define SFQ_N_QUEUES 0x0000
4406
4407
/* Cached configuration of a kernel "sfq" qdisc.  'quantum' is in bytes;
 * 'perturb' is the hash perturbation period (presumably seconds, as for
 * tc-sfq(8) — it is copied straight into tc_sfq_qopt.perturb_period). */
struct sfq {
    struct tc tc;            /* Must be embedded; see CONTAINER_OF users. */
    uint32_t quantum;
    uint32_t perturb;
};
4412
4413
static struct sfq *
4414
sfq_get__(const struct netdev *netdev_)
4415
0
{
4416
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4417
0
    return CONTAINER_OF(netdev->tc, struct sfq, tc);
4418
0
}
4419
4420
static void
4421
sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
4422
0
{
4423
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4424
0
    struct sfq *sfq;
4425
4426
0
    sfq = xmalloc(sizeof *sfq);
4427
0
    tc_init(&sfq->tc, &tc_ops_sfq);
4428
0
    sfq->perturb = perturb;
4429
0
    sfq->quantum = quantum;
4430
4431
0
    netdev->tc = &sfq->tc;
4432
0
}
4433
4434
/* Replaces the root qdisc on 'netdev' with an "sfq" qdisc.  A zero
 * 'quantum' falls back to the device MTU (or the kernel default if the MTU
 * is unavailable); a zero 'perturb' falls back to 10.  Returns 0 on
 * success, otherwise a positive errno value. */
static int
sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
{
    struct tc_sfq_qopt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int mtu;
    int mtu_error, error;
    /* Fetch the MTU up front; only consulted if 'quantum' is zero. */
    mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);

    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    /* Zeroed fields keep the kernel's own defaults. */
    memset(&opt, 0, sizeof opt);
    if (!quantum) {
        if (!mtu_error) {
            opt.quantum = mtu; /* if we cannot find mtu, use default */
        }
    } else {
        opt.quantum = quantum;
    }

    if (!perturb) {
        opt.perturb_period = 10;
    } else {
        opt.perturb_period = perturb;
    }

    /* SFQ options are a flat struct, not nested attributes. */
    nl_msg_put_string(&request, TCA_KIND, "sfq");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                     "quantum %u, perturb %u error %d(%s)",
                     netdev_get_name(netdev),
                     opt.quantum, opt.perturb_period,
                     error, ovs_strerror(error));
    }
    return error;
}
4482
4483
static void
4484
sfq_parse_qdisc_details__(struct netdev *netdev,
4485
                          const struct smap *details, struct sfq *sfq)
4486
0
{
4487
0
    sfq->perturb = smap_get_ullong(details, "perturb", 0);
4488
0
    sfq->quantum = smap_get_ullong(details, "quantum", 0);
4489
4490
0
    if (!sfq->perturb) {
4491
0
        sfq->perturb = 10;
4492
0
    }
4493
4494
0
    if (!sfq->quantum) {
4495
0
        int mtu;
4496
0
        if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
4497
0
            sfq->quantum = mtu;
4498
0
        } else {
4499
0
            VLOG_WARN_RL(&rl, "when using SFQ, you must specify quantum on a "
4500
0
                         "device without mtu");
4501
0
        }
4502
0
    }
4503
0
}
4504
4505
static int
4506
sfq_tc_install(struct netdev *netdev, const struct smap *details)
4507
0
{
4508
0
    int error;
4509
0
    struct sfq sfq;
4510
4511
0
    sfq_parse_qdisc_details__(netdev, details, &sfq);
4512
0
    error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
4513
0
    if (!error) {
4514
0
        sfq_install__(netdev, sfq.quantum, sfq.perturb);
4515
0
    }
4516
0
    return error;
4517
0
}
4518
4519
static int
4520
sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4521
0
{
4522
0
    const struct tc_sfq_qopt *sfq;
4523
0
    struct nlattr *nlattr;
4524
0
    const char * kind;
4525
0
    int error;
4526
4527
0
    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4528
0
    if (error == 0) {
4529
0
        sfq = nl_attr_get(nlattr);
4530
0
        sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
4531
0
        return 0;
4532
0
    }
4533
4534
0
    return error;
4535
0
}
4536
4537
static void
4538
sfq_tc_destroy(struct tc *tc)
4539
0
{
4540
0
    struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
4541
0
    tc_destroy(tc);
4542
0
    free(sfq);
4543
0
}
4544
4545
static int
4546
sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
4547
0
{
4548
0
    const struct sfq *sfq = sfq_get__(netdev);
4549
0
    smap_add_format(details, "quantum", "%u", sfq->quantum);
4550
0
    smap_add_format(details, "perturb", "%u", sfq->perturb);
4551
0
    return 0;
4552
0
}
4553
4554
static int
4555
sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
4556
0
{
4557
0
    struct sfq sfq;
4558
4559
0
    sfq_parse_qdisc_details__(netdev, details, &sfq);
4560
0
    sfq_install__(netdev, sfq.quantum, sfq.perturb);
4561
0
    sfq_get__(netdev)->quantum = sfq.quantum;
4562
0
    sfq_get__(netdev)->perturb = sfq.perturb;
4563
0
    return 0;
4564
0
}
4565
4566
/* Operations vector tying the OVS "linux-sfq" QoS type to the kernel "sfq"
 * qdisc.  Classless (SFQ_N_QUEUES queues), so the per-queue callbacks are
 * left unset. */
static const struct tc_ops tc_ops_sfq = {
    .linux_name = "sfq",
    .ovs_name = "linux-sfq",
    .n_queues = SFQ_N_QUEUES,
    .tc_install = sfq_tc_install,
    .tc_load = sfq_tc_load,
    .tc_destroy = sfq_tc_destroy,
    .qdisc_get = sfq_qdisc_get,
    .qdisc_set = sfq_qdisc_set,
};
4576

4577
/* netem traffic control class. */
4578
4579
/* Cached configuration of a kernel "netem" qdisc.  'latency' and 'jitter'
 * are times (run through tc_time_to_ticks() before being sent to the
 * kernel); 'limit' is a queue limit (presumably packets — confirm against
 * tc-netem(8)); 'loss' is a percentage in [0, 100]. */
struct netem {
    struct tc tc;            /* Must be embedded; see CONTAINER_OF users. */
    uint32_t latency;
    uint32_t limit;
    uint32_t loss;
    uint32_t jitter;
};
4586
4587
static struct netem *
4588
netem_get__(const struct netdev *netdev_)
4589
0
{
4590
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4591
0
    return CONTAINER_OF(netdev->tc, struct netem, tc);
4592
0
}
4593
4594
static void
4595
netem_install__(struct netdev *netdev_, uint32_t latency,
4596
                uint32_t limit, uint32_t loss, uint32_t jitter)
4597
0
{
4598
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4599
0
    struct netem *netem;
4600
4601
0
    netem = xmalloc(sizeof *netem);
4602
0
    tc_init(&netem->tc, &tc_ops_netem);
4603
0
    netem->latency = latency;
4604
0
    netem->limit = limit;
4605
0
    netem->loss = loss;
4606
0
    netem->jitter = jitter;
4607
4608
0
    netdev->tc = &netem->tc;
4609
0
}
4610
4611
/* Replaces the root qdisc on 'netdev' with a "netem" qdisc.  A zero
 * 'limit' falls back to 1000; 'loss' must be a percentage in [0, 100] and
 * is scaled to the kernel's 32-bit fixed-point probability.  Returns 0 on
 * success, EINVAL for an out-of-range loss, or another positive errno
 * value on failure. */
static int
netem_setup_qdisc__(struct netdev *netdev, uint32_t latency,
                    uint32_t limit, uint32_t loss, uint32_t jitter)
{
    struct tc_netem_qopt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;

    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    memset(&opt, 0, sizeof opt);

    if (!limit) {
        opt.limit = 1000;
    } else {
        opt.limit = limit;
    }

    if (loss) {
        if (loss > 100) {
            VLOG_WARN_RL(&rl,
                         "loss should be a percentage value between 0 to 100, "
                         "loss was %u", loss);
            /* NOTE(review): 'request' allocated by
             * netdev_linux_tc_make_request() is not released on this early
             * return — possible ofpbuf leak; confirm ownership semantics. */
            return EINVAL;
        }
        /* Kernel expects loss as a fraction of UINT32_MAX. */
        opt.loss = floor(UINT32_MAX * (loss / 100.0));
    }

    /* Times are converted from user units to kernel ticks. */
    opt.latency = tc_time_to_ticks(latency);
    opt.jitter = tc_time_to_ticks(jitter);

    /* netem options are a flat struct, not nested attributes. */
    nl_msg_put_string(&request, TCA_KIND, "netem");
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
                          "latency %u, limit %u, loss %u, jitter %u "
                          "error %d(%s)",
                     netdev_get_name(netdev),
                     opt.latency, opt.limit, opt.loss, opt.jitter,
                     error, ovs_strerror(error));
    }
    return error;
}
4665
4666
static void
4667
netem_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
4668
                          const struct smap *details, struct netem *netem)
4669
0
{
4670
0
    netem->latency = smap_get_ullong(details, "latency", 0);
4671
0
    netem->limit = smap_get_ullong(details, "limit", 0);
4672
0
    netem->loss = smap_get_ullong(details, "loss", 0);
4673
0
    netem->jitter = smap_get_ullong(details, "jitter", 0);
4674
4675
0
    if (!netem->limit) {
4676
0
        netem->limit = 1000;
4677
0
    }
4678
0
}
4679
4680
static int
4681
netem_tc_install(struct netdev *netdev, const struct smap *details)
4682
0
{
4683
0
    int error;
4684
0
    struct netem netem;
4685
4686
0
    netem_parse_qdisc_details__(netdev, details, &netem);
4687
0
    error = netem_setup_qdisc__(netdev, netem.latency,
4688
0
                                netem.limit, netem.loss, netem.jitter);
4689
0
    if (!error) {
4690
0
        netem_install__(netdev, netem.latency,
4691
0
                        netem.limit, netem.loss, netem.jitter);
4692
0
    }
4693
0
    return error;
4694
0
}
4695
4696
static int
4697
netem_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4698
0
{
4699
0
    const struct tc_netem_qopt *netem;
4700
0
    struct nlattr *nlattr;
4701
0
    const char *kind;
4702
0
    int error;
4703
4704
0
    error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4705
0
    if (error == 0) {
4706
0
        netem = nl_attr_get(nlattr);
4707
0
        netem_install__(netdev, netem->latency,
4708
0
                        netem->limit, netem->loss, netem->jitter);
4709
0
        return 0;
4710
0
    }
4711
4712
0
    return error;
4713
0
}
4714
4715
static void
4716
netem_tc_destroy(struct tc *tc)
4717
0
{
4718
0
    struct netem *netem = CONTAINER_OF(tc, struct netem, tc);
4719
0
    tc_destroy(tc);
4720
0
    free(netem);
4721
0
}
4722
4723
static int
4724
netem_qdisc_get(const struct netdev *netdev, struct smap *details)
4725
0
{
4726
0
    const struct netem *netem = netem_get__(netdev);
4727
0
    smap_add_format(details, "latency", "%u", netem->latency);
4728
0
    smap_add_format(details, "limit", "%u", netem->limit);
4729
0
    smap_add_format(details, "loss", "%u", netem->loss);
4730
0
    smap_add_format(details, "jitter", "%u", netem->jitter);
4731
0
    return 0;
4732
0
}
4733
4734
static int
4735
netem_qdisc_set(struct netdev *netdev, const struct smap *details)
4736
0
{
4737
0
    struct netem netem;
4738
4739
0
    netem_parse_qdisc_details__(netdev, details, &netem);
4740
0
    netem_install__(netdev, netem.latency,
4741
0
                    netem.limit, netem.loss, netem.jitter);
4742
0
    netem_get__(netdev)->latency = netem.latency;
4743
0
    netem_get__(netdev)->limit = netem.limit;
4744
0
    netem_get__(netdev)->loss = netem.loss;
4745
0
    netem_get__(netdev)->jitter = netem.jitter;
4746
0
    return 0;
4747
0
}
4748
4749
/* Operations vector tying the OVS "linux-netem" QoS type to the kernel
 * "netem" qdisc.  Classless (zero queues), so the per-queue callbacks are
 * left unset. */
static const struct tc_ops tc_ops_netem = {
    .linux_name = "netem",
    .ovs_name = "linux-netem",
    .n_queues = 0,
    .tc_install = netem_tc_install,
    .tc_load = netem_tc_load,
    .tc_destroy = netem_tc_destroy,
    .qdisc_get = netem_qdisc_get,
    .qdisc_set = netem_qdisc_set,
};
4759

4760
/* HTB traffic control class. */
4761
4762
0
#define HTB_N_QUEUES 0xf000
4763
0
#define HTB_RATE2QUANTUM 10
4764
4765
/* Qdisc-level HTB state: only the link's overall maximum rate is kept. */
struct htb {
    struct tc tc;               /* Must be embedded; see CONTAINER_OF users. */
    uint64_t max_rate;          /* In bytes/s. */
};

/* Per-queue HTB class configuration, keyed by the embedded tc_queue. */
struct htb_class {
    struct tc_queue tc_queue;   /* Must be embedded; see htb_class_cast__(). */
    uint64_t min_rate;          /* In bytes/s. */
    uint64_t max_rate;          /* In bytes/s. */
    unsigned int burst;         /* In bytes. */
    unsigned int priority;      /* Lower values are higher priorities. */
};
4777
4778
static struct htb *
4779
htb_get__(const struct netdev *netdev_)
4780
0
{
4781
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4782
0
    return CONTAINER_OF(netdev->tc, struct htb, tc);
4783
0
}
4784
4785
static void
4786
htb_install__(struct netdev *netdev_, uint64_t max_rate)
4787
0
{
4788
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4789
0
    struct htb *htb;
4790
4791
0
    htb = xmalloc(sizeof *htb);
4792
0
    tc_init(&htb->tc, &tc_ops_htb);
4793
0
    htb->max_rate = max_rate;
4794
4795
0
    netdev->tc = &htb->tc;
4796
0
}
4797
4798
/* Create an HTB qdisc.
4799
 *
4800
 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
4801
static int
htb_setup_qdisc__(struct netdev *netdev)
{
    size_t opt_offset;
    struct tc_htb_glob opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;

    /* Drop any existing root qdisc before installing the new one. */
    tc_del_qdisc(netdev);

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(1, 0);
    tcmsg->tcm_parent = TC_H_ROOT;

    nl_msg_put_string(&request, TCA_KIND, "htb");

    /* Global HTB options: rate-to-quantum divisor, protocol version, and
     * default class 1 for unclassified traffic. */
    memset(&opt, 0, sizeof opt);
    opt.rate2quantum = HTB_RATE2QUANTUM;
    opt.version = 3;
    opt.defcls = 1;

    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
    nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
    nl_msg_end_nested(&request, opt_offset);

    /* Returns 0 on success, otherwise a positive errno value. */
    return tc_transact(&request, NULL);
}
4832
4833
/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
4834
 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
4835
/* Creates or replaces the HTB class 'handle' under 'parent' on 'netdev'
 * with the rate/ceil/burst/priority in 'class'.  Requires the device MTU
 * (to size rate tables and the minimum quantum).  Returns 0 on success,
 * otherwise a positive errno value. */
static int
htb_setup_class__(struct netdev *netdev, unsigned int handle,
                  unsigned int parent, struct htb_class *class)
{
    size_t opt_offset;
    struct tc_htb_opt opt;
    struct ofpbuf request;
    struct tcmsg *tcmsg;
    int error;
    int mtu;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    memset(&opt, 0, sizeof opt);
    tc_fill_rate(&opt.rate, class->min_rate, mtu);
    tc_fill_rate(&opt.ceil, class->max_rate, mtu);
    /* Makes sure the quantum is at least MTU.  Setting quantum will
     * make htb ignore the r2q for this class. */
    if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
        opt.quantum = mtu;
    }
    opt.buffer = tc_calc_buffer(class->min_rate, mtu, class->burst);
    opt.cbuffer = tc_calc_buffer(class->max_rate, mtu, class->burst);
    opt.prio = class->priority;

    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = handle;
    tcmsg->tcm_parent = parent;

    nl_msg_put_string(&request, TCA_KIND, "htb");
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);

#ifdef HAVE_TCA_HTB_RATE64
    /* The 32-bit fields in tc_htb_opt saturate above 4 GB/s; supply the
     * 64-bit attributes too when the rates exceed UINT32_MAX. */
    if (class->min_rate > UINT32_MAX) {
        nl_msg_put_u64(&request, TCA_HTB_RATE64, class->min_rate);
    }
    if (class->max_rate > UINT32_MAX) {
        nl_msg_put_u64(&request, TCA_HTB_CEIL64, class->max_rate);
    }
#endif
    nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);

    /* Rate tables for the kernel's packet-size-to-delay lookup. */
    tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate, class->min_rate);
    tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil, class->max_rate);
    nl_msg_end_nested(&request, opt_offset);

    error = tc_transact(&request, NULL);
    if (error) {
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
                     "min_rate=%"PRIu64" max_rate=%"PRIu64" burst=%u prio=%u "
                     "(%s)",
                     netdev_get_name(netdev),
                     tc_get_major(handle), tc_get_minor(handle),
                     tc_get_major(parent), tc_get_minor(parent),
                     class->min_rate, class->max_rate,
                     class->burst, class->priority, ovs_strerror(error));
    }
    return error;
}
4903
4904
/* Parses Netlink attributes in 'options' for HTB parameters and stores a
4905
 * description of them into 'details'.  The description complies with the
4906
 * specification given in the vswitch database documentation for linux-htb
4907
 * queue details. */
4908
static int
htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
{
    /* Expected layout of the kernel's nested TCA_OPTIONS for an HTB class;
     * the 64-bit rate attributes are optional extensions. */
    static const struct nl_policy tca_htb_policy[] = {
        [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
                            .min_len = sizeof(struct tc_htb_opt) },
#ifdef HAVE_TCA_HTB_RATE64
        [TCA_HTB_RATE64] = { .type = NL_A_U64, .optional = true },
        [TCA_HTB_CEIL64] = { .type = NL_A_U64, .optional = true },
#endif
    };

    struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
    const struct tc_htb_opt *htb;

    if (!nl_parse_nested(nl_options, tca_htb_policy,
                         attrs, ARRAY_SIZE(tca_htb_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HTB class options");
        return EPROTO;
    }

    htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
    class->min_rate = htb->rate.rate;
    class->max_rate = htb->ceil.rate;
#ifdef HAVE_TCA_HTB_RATE64
    /* When present, the 64-bit attributes override the (possibly
     * saturated) 32-bit values above. */
    if (attrs[TCA_HTB_RATE64]) {
        class->min_rate = nl_attr_get_u64(attrs[TCA_HTB_RATE64]);
    }
    if (attrs[TCA_HTB_CEIL64]) {
        class->max_rate = nl_attr_get_u64(attrs[TCA_HTB_CEIL64]);
    }
#endif
    /* Convert the kernel's buffer (in ticks) back into a byte burst. */
    class->burst = tc_ticks_to_bytes(class->min_rate, htb->buffer);
    class->priority = htb->prio;
    return 0;
}
4944
4945
static int
4946
htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4947
                  struct htb_class *options,
4948
                  struct netdev_queue_stats *stats)
4949
0
{
4950
0
    struct nlattr *nl_options;
4951
0
    unsigned int handle;
4952
0
    int error;
4953
4954
0
    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4955
0
    if (!error && queue_id) {
4956
0
        unsigned int major = tc_get_major(handle);
4957
0
        unsigned int minor = tc_get_minor(handle);
4958
0
        if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
4959
0
            *queue_id = minor - 1;
4960
0
        } else {
4961
0
            error = EPROTO;
4962
0
        }
4963
0
    }
4964
0
    if (!error && options) {
4965
0
        error = htb_parse_tca_options__(nl_options, options);
4966
0
    }
4967
0
    return error;
4968
0
}
4969
4970
/* Fills 'hc' with qdisc-level HTB settings from 'details'.  "max-rate" is
 * given in bits/s and stored in bytes/s; when absent, it falls back to the
 * device's current link speed, or NETDEV_DEFAULT_BPS if that is unknown. */
static void
htb_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
                          struct htb_class *hc)
{
    hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
    if (!hc->max_rate) {
        uint32_t current_speed;
        uint32_t max_speed OVS_UNUSED;

        netdev_linux_get_speed_locked(netdev_linux_cast(netdev),
                                      &current_speed, &max_speed);
        /* Speed is reported in Mb/s; convert to bytes/s. */
        hc->max_rate = current_speed ? current_speed / 8 * 1000000ULL
                                     : NETDEV_DEFAULT_BPS / 8;
    }
    /* The default class gets the whole link: min == max, no burst or
     * priority overrides. */
    hc->min_rate = hc->max_rate;
    hc->burst = 0;
    hc->priority = 0;
}
4988
4989
/* Fills 'hc' with per-queue HTB settings from 'details'.  Rates and burst
 * are given in bits/s (bits for burst) and stored in bytes; values are
 * clamped between the device MTU and the qdisc's max_rate.  Returns 0 on
 * success or a positive errno value if the device MTU is unavailable. */
static int
htb_parse_class_details__(struct netdev *netdev,
                          const struct smap *details, struct htb_class *hc)
{
    const struct htb *htb = htb_get__(netdev);
    int mtu, error;
    unsigned long long int max_rate_bit;

    error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
    if (error) {
        VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
                     netdev_get_name(netdev));
        return error;
    }

    /* HTB requires at least an mtu sized min-rate to send any traffic even
     * on uncongested links. */
    hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
    hc->min_rate = MAX(hc->min_rate, mtu);
    hc->min_rate = MIN(hc->min_rate, htb->max_rate);

    /* max-rate */
    max_rate_bit = smap_get_ullong(details, "max-rate", 0);
    hc->max_rate = max_rate_bit ? max_rate_bit / 8 : htb->max_rate;
    hc->max_rate = MAX(hc->max_rate, hc->min_rate);
    hc->max_rate = MIN(hc->max_rate, htb->max_rate);

    /* burst
     *
     * According to hints in the documentation that I've read, it is important
     * that 'burst' be at least as big as the largest frame that might be
     * transmitted.  Also, making 'burst' a bit bigger than necessary is OK,
     * but having it a bit too small is a problem.  Since netdev_get_mtu()
     * doesn't include the Ethernet header, we need to add at least 14 (18?) to
     * the MTU.  We actually add 64, instead of 14, as a guard against
     * additional headers get tacked on somewhere that we're not aware of. */
    hc->burst = smap_get_ullong(details, "burst", 0) / 8;
    hc->burst = MAX(hc->burst, mtu + 64);

    /* priority */
    hc->priority = smap_get_ullong(details, "priority", 0);

    return 0;
}
5033
5034
static int
5035
htb_query_class__(const struct netdev *netdev, unsigned int handle,
5036
                  unsigned int parent, struct htb_class *options,
5037
                  struct netdev_queue_stats *stats)
5038
0
{
5039
0
    struct ofpbuf *reply;
5040
0
    int error;
5041
5042
0
    error = tc_query_class(netdev, handle, parent, &reply);
5043
0
    if (!error) {
5044
0
        error = htb_parse_tcmsg__(reply, NULL, options, stats);
5045
0
        ofpbuf_delete(reply);
5046
0
    }
5047
0
    return error;
5048
0
}
5049
5050
static int
5051
htb_tc_install(struct netdev *netdev, const struct smap *details)
5052
0
{
5053
0
    int error;
5054
5055
0
    error = htb_setup_qdisc__(netdev);
5056
0
    if (!error) {
5057
0
        struct htb_class hc;
5058
5059
0
        htb_parse_qdisc_details__(netdev, details, &hc);
5060
0
        error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5061
0
                                  tc_make_handle(1, 0), &hc);
5062
0
        if (!error) {
5063
0
            htb_install__(netdev, hc.max_rate);
5064
0
        }
5065
0
    }
5066
0
    return error;
5067
0
}
5068
5069
static struct htb_class *
5070
htb_class_cast__(const struct tc_queue *queue)
5071
0
{
5072
0
    return CONTAINER_OF(queue, struct htb_class, tc_queue);
5073
0
}
5074
5075
/* Creates or updates the cached htb_class for 'queue_id' on 'netdev',
 * copying the configuration from 'hc'. */
static void
htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
                   const struct htb_class *hc)
{
    struct htb *htb = htb_get__(netdev);
    size_t hash = hash_int(queue_id, 0);
    struct tc_queue *queue;
    struct htb_class *hcp;

    /* Reuse the existing entry if the queue is already known; otherwise
     * allocate one and add it to the qdisc's queue map. */
    queue = tc_find_queue__(netdev, queue_id, hash);
    if (queue) {
        hcp = htb_class_cast__(queue);
    } else {
        hcp = xmalloc(sizeof *hcp);
        queue = &hcp->tc_queue;
        queue->queue_id = queue_id;
        queue->created = time_msec();
        hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
    }

    hcp->min_rate = hc->min_rate;
    hcp->max_rate = hc->max_rate;
    hcp->burst = hc->burst;
    hcp->priority = hc->priority;
}
5100
5101
static int
5102
htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
5103
0
{
5104
0
    struct ofpbuf msg;
5105
0
    struct queue_dump_state state;
5106
0
    struct htb_class hc;
5107
5108
    /* Get qdisc options. */
5109
0
    hc.max_rate = 0;
5110
0
    htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
5111
0
    htb_install__(netdev, hc.max_rate);
5112
5113
    /* Get queues. */
5114
0
    if (!start_queue_dump(netdev, &state)) {
5115
0
        return ENODEV;
5116
0
    }
5117
0
    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
5118
0
        unsigned int queue_id;
5119
5120
0
        if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
5121
0
            htb_update_queue__(netdev, queue_id, &hc);
5122
0
        }
5123
0
    }
5124
0
    finish_queue_dump(&state);
5125
5126
0
    return 0;
5127
0
}
5128
5129
static void
5130
htb_tc_destroy(struct tc *tc)
5131
0
{
5132
0
    struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
5133
0
    struct htb_class *hc;
5134
5135
0
    HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
5136
0
        free(hc);
5137
0
    }
5138
0
    tc_destroy(tc);
5139
0
    free(htb);
5140
0
}
5141
5142
static int
5143
htb_qdisc_get(const struct netdev *netdev, struct smap *details)
5144
0
{
5145
0
    const struct htb *htb = htb_get__(netdev);
5146
0
    smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
5147
0
    return 0;
5148
0
}
5149
5150
static int
5151
htb_qdisc_set(struct netdev *netdev, const struct smap *details)
5152
0
{
5153
0
    struct htb_class hc;
5154
0
    int error;
5155
5156
0
    htb_parse_qdisc_details__(netdev, details, &hc);
5157
0
    error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5158
0
                              tc_make_handle(1, 0), &hc);
5159
0
    if (!error) {
5160
0
        htb_get__(netdev)->max_rate = hc.max_rate;
5161
0
    }
5162
0
    return error;
5163
0
}
5164
5165
static int
5166
htb_class_get(const struct netdev *netdev OVS_UNUSED,
5167
              const struct tc_queue *queue, struct smap *details)
5168
0
{
5169
0
    const struct htb_class *hc = htb_class_cast__(queue);
5170
5171
0
    smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
5172
0
    if (hc->min_rate != hc->max_rate) {
5173
0
        smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
5174
0
    }
5175
0
    smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
5176
0
    if (hc->priority) {
5177
0
        smap_add_format(details, "priority", "%u", hc->priority);
5178
0
    }
5179
0
    return 0;
5180
0
}
5181
5182
static int
5183
htb_class_set(struct netdev *netdev, unsigned int queue_id,
5184
              const struct smap *details)
5185
0
{
5186
0
    struct htb_class hc;
5187
0
    int error;
5188
5189
0
    error = htb_parse_class_details__(netdev, details, &hc);
5190
0
    if (error) {
5191
0
        return error;
5192
0
    }
5193
5194
0
    error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
5195
0
                              tc_make_handle(1, 0xfffe), &hc);
5196
0
    if (error) {
5197
0
        return error;
5198
0
    }
5199
5200
0
    htb_update_queue__(netdev, queue_id, &hc);
5201
0
    return 0;
5202
0
}
5203
5204
static int
5205
htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
5206
0
{
5207
0
    struct htb_class *hc = htb_class_cast__(queue);
5208
0
    struct htb *htb = htb_get__(netdev);
5209
0
    int error;
5210
5211
0
    error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
5212
0
    if (!error) {
5213
0
        hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
5214
0
        free(hc);
5215
0
    }
5216
0
    return error;
5217
0
}
5218
5219
static int
5220
htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
5221
                    struct netdev_queue_stats *stats)
5222
0
{
5223
0
    return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
5224
0
                             tc_make_handle(1, 0xfffe), NULL, stats);
5225
0
}
5226
5227
static int
5228
htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
5229
                     const struct ofpbuf *nlmsg,
5230
                     netdev_dump_queue_stats_cb *cb, void *aux)
5231
0
{
5232
0
    struct netdev_queue_stats stats;
5233
0
    unsigned int handle, major, minor;
5234
0
    int error;
5235
5236
0
    error = tc_parse_class(nlmsg, &handle, NULL, &stats);
5237
0
    if (error) {
5238
0
        return error;
5239
0
    }
5240
5241
0
    major = tc_get_major(handle);
5242
0
    minor = tc_get_minor(handle);
5243
0
    if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
5244
0
        (*cb)(minor - 1, &stats, aux);
5245
0
    }
5246
0
    return 0;
5247
0
}
5248
5249
/* Dispatch table for the "linux-htb" (Hierarchical Token Bucket) QoS
 * type.  Unset callbacks default to NULL. */
static const struct tc_ops tc_ops_htb = {
    .linux_name = "htb",
    .ovs_name = "linux-htb",
    .n_queues = HTB_N_QUEUES,
    .tc_install = htb_tc_install,
    .tc_load = htb_tc_load,
    .tc_destroy = htb_tc_destroy,
    .qdisc_get = htb_qdisc_get,
    .qdisc_set = htb_qdisc_set,
    .class_get = htb_class_get,
    .class_set = htb_class_set,
    .class_delete = htb_class_delete,
    .class_get_stats = htb_class_get_stats,
    .class_dump_stats = htb_class_dump_stats
};
5264

5265
/* "linux-hfsc" traffic control class. */

/* Largest queue minor number usable under the HFSC root (class 1:N maps
 * to queue N-1). */
#define HFSC_N_QUEUES 0xf000

/* State for one HFSC qdisc instance. */
struct hfsc {
    struct tc tc;
    uint32_t max_rate;          /* In bytes/s (bits/s divided by 8). */
};

/* Cached configuration of one HFSC class (one OVS queue). */
struct hfsc_class {
    struct tc_queue tc_queue;
    uint32_t min_rate;          /* In bytes/s. */
    uint32_t max_rate;          /* In bytes/s. */
};
5279
5280
static struct hfsc *
5281
hfsc_get__(const struct netdev *netdev_)
5282
0
{
5283
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5284
0
    return CONTAINER_OF(netdev->tc, struct hfsc, tc);
5285
0
}
5286
5287
static struct hfsc_class *
5288
hfsc_class_cast__(const struct tc_queue *queue)
5289
0
{
5290
0
    return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
5291
0
}
5292
5293
static void
5294
hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
5295
0
{
5296
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5297
0
    struct hfsc *hfsc;
5298
5299
0
    hfsc = xmalloc(sizeof *hfsc);
5300
0
    tc_init(&hfsc->tc, &tc_ops_hfsc);
5301
0
    hfsc->max_rate = max_rate;
5302
0
    netdev->tc = &hfsc->tc;
5303
0
}
5304
5305
static void
5306
hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
5307
                    const struct hfsc_class *hc)
5308
0
{
5309
0
    size_t hash;
5310
0
    struct hfsc *hfsc;
5311
0
    struct hfsc_class *hcp;
5312
0
    struct tc_queue *queue;
5313
5314
0
    hfsc = hfsc_get__(netdev);
5315
0
    hash = hash_int(queue_id, 0);
5316
5317
0
    queue = tc_find_queue__(netdev, queue_id, hash);
5318
0
    if (queue) {
5319
0
        hcp = hfsc_class_cast__(queue);
5320
0
    } else {
5321
0
        hcp             = xmalloc(sizeof *hcp);
5322
0
        queue           = &hcp->tc_queue;
5323
0
        queue->queue_id = queue_id;
5324
0
        queue->created  = time_msec();
5325
0
        hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
5326
0
    }
5327
5328
0
    hcp->min_rate = hc->min_rate;
5329
0
    hcp->max_rate = hc->max_rate;
5330
0
}
5331
5332
/* Extracts min/max rates from the netlink TCA_OPTIONS of an HFSC class
 * into 'class'.
 *
 * OVS only ever creates classes with linear service curves (m1 == 0,
 * d == 0) where the real-time and fair-share curves match, so anything
 * else found in the kernel is rejected with EPROTO.  Returns 0 on
 * success. */
static int
hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
{
    const struct tc_service_curve *rsc, *fsc, *usc;
    /* All three curves (real-time, fair-share, upper-limit) are required
     * and must be at least the size of struct tc_service_curve. */
    static const struct nl_policy tca_hfsc_policy[] = {
        [TCA_HFSC_RSC] = {
            .type      = NL_A_UNSPEC,
            .optional  = false,
            .min_len   = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_FSC] = {
            .type      = NL_A_UNSPEC,
            .optional  = false,
            .min_len   = sizeof(struct tc_service_curve),
        },
        [TCA_HFSC_USC] = {
            .type      = NL_A_UNSPEC,
            .optional  = false,
            .min_len   = sizeof(struct tc_service_curve),
        },
    };
    struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];

    if (!nl_parse_nested(nl_options, tca_hfsc_policy,
                         attrs, ARRAY_SIZE(tca_hfsc_policy))) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
        return EPROTO;
    }

    rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
    fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
    usc = nl_attr_get(attrs[TCA_HFSC_USC]);

    /* Only linear curves (no initial burst segment) are supported. */
    if (rsc->m1 != 0 || rsc->d != 0 ||
        fsc->m1 != 0 || fsc->d != 0 ||
        usc->m1 != 0 || usc->d != 0) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Non-linear service curves are not supported.");
        return EPROTO;
    }

    /* OVS configures rsc == fsc; a mismatch means someone else set up
     * a distinct real-time curve. */
    if (rsc->m2 != fsc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Real-time service curves are not supported ");
        return EPROTO;
    }

    if (rsc->m2 > usc->m2) {
        VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
                     "Min-rate service curve is greater than "
                     "the max-rate service curve.");
        return EPROTO;
    }

    class->min_rate = fsc->m2;
    class->max_rate = usc->m2;
    return 0;
}
5390
5391
static int
5392
hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
5393
                   struct hfsc_class *options,
5394
                   struct netdev_queue_stats *stats)
5395
0
{
5396
0
    int error;
5397
0
    unsigned int handle;
5398
0
    struct nlattr *nl_options;
5399
5400
0
    error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
5401
0
    if (error) {
5402
0
        return error;
5403
0
    }
5404
5405
0
    if (queue_id) {
5406
0
        unsigned int major, minor;
5407
5408
0
        major = tc_get_major(handle);
5409
0
        minor = tc_get_minor(handle);
5410
0
        if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
5411
0
            *queue_id = minor - 1;
5412
0
        } else {
5413
0
            return EPROTO;
5414
0
        }
5415
0
    }
5416
5417
0
    if (options) {
5418
0
        error = hfsc_parse_tca_options__(nl_options, options);
5419
0
    }
5420
5421
0
    return error;
5422
0
}
5423
5424
static int
5425
hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
5426
                   unsigned int parent, struct hfsc_class *options,
5427
                   struct netdev_queue_stats *stats)
5428
0
{
5429
0
    int error;
5430
0
    struct ofpbuf *reply;
5431
5432
0
    error = tc_query_class(netdev, handle, parent, &reply);
5433
0
    if (error) {
5434
0
        return error;
5435
0
    }
5436
5437
0
    error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
5438
0
    ofpbuf_delete(reply);
5439
0
    return error;
5440
0
}
5441
5442
static void
5443
hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details,
5444
                           struct hfsc_class *class)
5445
0
{
5446
0
    uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
5447
0
    if (!max_rate) {
5448
0
        uint32_t current_speed;
5449
0
        uint32_t max_speed OVS_UNUSED;
5450
5451
0
        netdev_linux_get_speed_locked(netdev_linux_cast(netdev),
5452
0
                                      &current_speed, &max_speed);
5453
0
        max_rate = current_speed ? current_speed / 8 * 1000000ULL
5454
0
                                 : NETDEV_DEFAULT_BPS / 8;
5455
0
    }
5456
5457
0
    class->min_rate = max_rate;
5458
0
    class->max_rate = max_rate;
5459
0
}
5460
5461
static int
5462
hfsc_parse_class_details__(struct netdev *netdev,
5463
                           const struct smap *details,
5464
                           struct hfsc_class * class)
5465
0
{
5466
0
    const struct hfsc *hfsc;
5467
0
    uint32_t min_rate, max_rate;
5468
5469
0
    hfsc       = hfsc_get__(netdev);
5470
5471
0
    min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
5472
0
    min_rate = MAX(min_rate, 1);
5473
0
    min_rate = MIN(min_rate, hfsc->max_rate);
5474
5475
0
    max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
5476
0
    max_rate = MAX(max_rate, min_rate);
5477
0
    max_rate = MIN(max_rate, hfsc->max_rate);
5478
5479
0
    class->min_rate = min_rate;
5480
0
    class->max_rate = max_rate;
5481
5482
0
    return 0;
5483
0
}
5484
5485
/* Create an HFSC qdisc.
5486
 *
5487
 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
5488
static int
5489
hfsc_setup_qdisc__(struct netdev * netdev)
5490
0
{
5491
0
    struct tcmsg *tcmsg;
5492
0
    struct ofpbuf request;
5493
0
    struct tc_hfsc_qopt opt;
5494
5495
0
    tc_del_qdisc(netdev);
5496
5497
0
    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
5498
0
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
5499
5500
0
    if (!tcmsg) {
5501
0
        return ENODEV;
5502
0
    }
5503
5504
0
    tcmsg->tcm_handle = tc_make_handle(1, 0);
5505
0
    tcmsg->tcm_parent = TC_H_ROOT;
5506
5507
0
    memset(&opt, 0, sizeof opt);
5508
0
    opt.defcls = 1;
5509
5510
0
    nl_msg_put_string(&request, TCA_KIND, "hfsc");
5511
0
    nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
5512
5513
0
    return tc_transact(&request, NULL);
5514
0
}
5515
5516
/* Create an HFSC class.
5517
 *
5518
 * Equivalent to "tc class add <dev> parent <parent> classid <handle> hfsc
5519
 * sc rate <min_rate> ul rate <max_rate>" */
5520
static int
5521
hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
5522
                   unsigned int parent, struct hfsc_class *class)
5523
0
{
5524
0
    int error;
5525
0
    size_t opt_offset;
5526
0
    struct tcmsg *tcmsg;
5527
0
    struct ofpbuf request;
5528
0
    struct tc_service_curve min, max;
5529
5530
0
    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
5531
0
                                         &request);
5532
5533
0
    if (!tcmsg) {
5534
0
        return ENODEV;
5535
0
    }
5536
5537
0
    tcmsg->tcm_handle = handle;
5538
0
    tcmsg->tcm_parent = parent;
5539
5540
0
    min.m1 = 0;
5541
0
    min.d  = 0;
5542
0
    min.m2 = class->min_rate;
5543
5544
0
    max.m1 = 0;
5545
0
    max.d  = 0;
5546
0
    max.m2 = class->max_rate;
5547
5548
0
    nl_msg_put_string(&request, TCA_KIND, "hfsc");
5549
0
    opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
5550
0
    nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
5551
0
    nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
5552
0
    nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
5553
0
    nl_msg_end_nested(&request, opt_offset);
5554
5555
0
    error = tc_transact(&request, NULL);
5556
0
    if (error) {
5557
0
        VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
5558
0
                     "min-rate %ubps, max-rate %ubps (%s)",
5559
0
                     netdev_get_name(netdev),
5560
0
                     tc_get_major(handle), tc_get_minor(handle),
5561
0
                     tc_get_major(parent), tc_get_minor(parent),
5562
0
                     class->min_rate, class->max_rate, ovs_strerror(error));
5563
0
    }
5564
5565
0
    return error;
5566
0
}
5567
5568
static int
5569
hfsc_tc_install(struct netdev *netdev, const struct smap *details)
5570
0
{
5571
0
    int error;
5572
0
    struct hfsc_class class;
5573
5574
0
    error = hfsc_setup_qdisc__(netdev);
5575
5576
0
    if (error) {
5577
0
        return error;
5578
0
    }
5579
5580
0
    hfsc_parse_qdisc_details__(netdev, details, &class);
5581
0
    error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5582
0
                               tc_make_handle(1, 0), &class);
5583
5584
0
    if (error) {
5585
0
        return error;
5586
0
    }
5587
5588
0
    hfsc_install__(netdev, class.max_rate);
5589
0
    return 0;
5590
0
}
5591
5592
static int
5593
hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
5594
0
{
5595
0
    struct ofpbuf msg;
5596
0
    struct queue_dump_state state;
5597
0
    struct hfsc_class hc;
5598
5599
0
    hc.max_rate = 0;
5600
0
    hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
5601
0
    hfsc_install__(netdev, hc.max_rate);
5602
5603
0
    if (!start_queue_dump(netdev, &state)) {
5604
0
        return ENODEV;
5605
0
    }
5606
5607
0
    while (nl_dump_next(&state.dump, &msg, &state.buf)) {
5608
0
        unsigned int queue_id;
5609
5610
0
        if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
5611
0
            hfsc_update_queue__(netdev, queue_id, &hc);
5612
0
        }
5613
0
    }
5614
5615
0
    finish_queue_dump(&state);
5616
0
    return 0;
5617
0
}
5618
5619
static void
5620
hfsc_tc_destroy(struct tc *tc)
5621
0
{
5622
0
    struct hfsc *hfsc;
5623
0
    struct hfsc_class *hc;
5624
5625
0
    hfsc = CONTAINER_OF(tc, struct hfsc, tc);
5626
5627
0
    HMAP_FOR_EACH_SAFE (hc, tc_queue.hmap_node, &hfsc->tc.queues) {
5628
0
        hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
5629
0
        free(hc);
5630
0
    }
5631
5632
0
    tc_destroy(tc);
5633
0
    free(hfsc);
5634
0
}
5635
5636
static int
5637
hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
5638
0
{
5639
0
    const struct hfsc *hfsc;
5640
0
    hfsc = hfsc_get__(netdev);
5641
0
    smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
5642
0
    return 0;
5643
0
}
5644
5645
static int
5646
hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
5647
0
{
5648
0
    int error;
5649
0
    struct hfsc_class class;
5650
5651
0
    hfsc_parse_qdisc_details__(netdev, details, &class);
5652
0
    error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5653
0
                               tc_make_handle(1, 0), &class);
5654
5655
0
    if (!error) {
5656
0
        hfsc_get__(netdev)->max_rate = class.max_rate;
5657
0
    }
5658
5659
0
    return error;
5660
0
}
5661
5662
static int
5663
hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
5664
              const struct tc_queue *queue, struct smap *details)
5665
0
{
5666
0
    const struct hfsc_class *hc;
5667
5668
0
    hc = hfsc_class_cast__(queue);
5669
0
    smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
5670
0
    if (hc->min_rate != hc->max_rate) {
5671
0
        smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
5672
0
    }
5673
0
    return 0;
5674
0
}
5675
5676
static int
5677
hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
5678
               const struct smap *details)
5679
0
{
5680
0
    int error;
5681
0
    struct hfsc_class class;
5682
5683
0
    error = hfsc_parse_class_details__(netdev, details, &class);
5684
0
    if (error) {
5685
0
        return error;
5686
0
    }
5687
5688
0
    error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
5689
0
                               tc_make_handle(1, 0xfffe), &class);
5690
0
    if (error) {
5691
0
        return error;
5692
0
    }
5693
5694
0
    hfsc_update_queue__(netdev, queue_id, &class);
5695
0
    return 0;
5696
0
}
5697
5698
static int
5699
hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
5700
0
{
5701
0
    int error;
5702
0
    struct hfsc *hfsc;
5703
0
    struct hfsc_class *hc;
5704
5705
0
    hc   = hfsc_class_cast__(queue);
5706
0
    hfsc = hfsc_get__(netdev);
5707
5708
0
    error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
5709
0
    if (!error) {
5710
0
        hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
5711
0
        free(hc);
5712
0
    }
5713
0
    return error;
5714
0
}
5715
5716
static int
5717
hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
5718
                     struct netdev_queue_stats *stats)
5719
0
{
5720
0
    return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
5721
0
                             tc_make_handle(1, 0xfffe), NULL, stats);
5722
0
}
5723
5724
static int
5725
hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
5726
                      const struct ofpbuf *nlmsg,
5727
                      netdev_dump_queue_stats_cb *cb, void *aux)
5728
0
{
5729
0
    struct netdev_queue_stats stats;
5730
0
    unsigned int handle, major, minor;
5731
0
    int error;
5732
5733
0
    error = tc_parse_class(nlmsg, &handle, NULL, &stats);
5734
0
    if (error) {
5735
0
        return error;
5736
0
    }
5737
5738
0
    major = tc_get_major(handle);
5739
0
    minor = tc_get_minor(handle);
5740
0
    if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
5741
0
        (*cb)(minor - 1, &stats, aux);
5742
0
    }
5743
0
    return 0;
5744
0
}
5745
5746
/* Dispatch table for the "linux-hfsc" QoS type.  Unset callbacks
 * default to NULL. */
static const struct tc_ops tc_ops_hfsc = {
    .linux_name = "hfsc",
    .ovs_name = "linux-hfsc",
    .n_queues = HFSC_N_QUEUES,
    .tc_install = hfsc_tc_install,
    .tc_load = hfsc_tc_load,
    .tc_destroy = hfsc_tc_destroy,
    .qdisc_get = hfsc_qdisc_get,
    .qdisc_set = hfsc_qdisc_set,
    .class_get = hfsc_class_get,
    .class_set = hfsc_class_set,
    .class_delete = hfsc_class_delete,
    .class_get_stats = hfsc_class_get_stats,
    .class_dump_stats = hfsc_class_dump_stats,
};
5761

5762
/* "linux-noop" traffic control class. */
5763
5764
/* Points 'netdev_' at a shared, immutable tc instance for the no-op
 * class.  Nothing but a tc class implementation is allowed to write to
 * a tc, and this class never does, so the const object is safe. */
static void
noop_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    /* NOTE(review): this initializes with &tc_ops_default rather than
     * &tc_ops_noop, mirroring default_install__() below — confirm
     * whether tc_ops_noop was intended here. */
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    netdev->tc = CONST_CAST(struct tc *, &tc);
}
5772
5773
static int
5774
noop_tc_install(struct netdev *netdev,
5775
                   const struct smap *details OVS_UNUSED)
5776
0
{
5777
0
    noop_install__(netdev);
5778
0
    return 0;
5779
0
}
5780
5781
static int
5782
noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
5783
0
{
5784
0
    noop_install__(netdev);
5785
0
    return 0;
5786
0
}
5787
5788
/* Dispatch table for "linux-noop": OVS leaves the kernel qdisc alone.
 * All other callbacks default to NULL. */
static const struct tc_ops tc_ops_noop = {
    .ovs_name = "linux-noop",
    .tc_install = noop_tc_install,
    .tc_load = noop_tc_load,
};
5793

5794
/* "linux-default" traffic control class.
5795
 *
5796
 * This class represents the default, unnamed Linux qdisc.  It corresponds to
5797
 * the "" (empty string) QoS type in the OVS database. */
5798
5799
/* Points 'netdev_' at a shared, immutable tc instance for the default
 * (unnamed) Linux qdisc. */
static void
default_install__(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
}
5809
5810
static int
5811
default_tc_install(struct netdev *netdev,
5812
                   const struct smap *details OVS_UNUSED)
5813
0
{
5814
0
    default_install__(netdev);
5815
0
    return 0;
5816
0
}
5817
5818
static int
5819
default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
5820
0
{
5821
0
    default_install__(netdev);
5822
0
    return 0;
5823
0
}
5824
5825
/* Dispatch table for the default, unnamed Linux qdisc (the "" QoS type
 * in the OVS database).  All other callbacks default to NULL. */
static const struct tc_ops tc_ops_default = {
    .ovs_name = "",
    .tc_install = default_tc_install,
    .tc_load = default_tc_load,
};
5830

5831
/* "linux-other" traffic control class.
5832
 *
5833
 * */
5834
5835
/* Loads the "linux-other" tc, which stands in for any kernel qdisc that
 * OVS does not recognize.  The netlink message is ignored. */
static int
other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);

    /* Nothing but a tc class implementation is allowed to write to a tc.  This
     * class never does that, so we can legitimately use a const tc object. */
    netdev->tc = CONST_CAST(struct tc *, &tc);
    return 0;
}
5846
5847
/* Dispatch table for "linux-other": a read-only placeholder for
 * unrecognized kernel qdiscs — only tc_load is provided. */
static const struct tc_ops tc_ops_other = {
    .ovs_name = "linux-other",
    .tc_load = other_tc_load,
};
5851

5852
/* Traffic control. */

/* Number of kernel "tc" ticks per second.
 * NOTE(review): initialized and consumed outside this chunk of the file. */
static double ticks_per_s;

/* Number of kernel "jiffies" per second.  This is used for the purpose of
 * computing buffer sizes.  Generally kernel qdiscs need to be able to buffer
 * one jiffy's worth of data.
 *
 * There are two possibilities here:
 *
 *    - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
 *      approximate range of 100 to 1024.  That means that we really need to
 *      make sure that the qdisc can buffer that much data.
 *
 *    - 'buffer_hz' is an absurdly large number.  That means that the kernel
 *      has finely granular timers and there's no need to fudge additional room
 *      for buffers.  (There's no extra effort needed to implement that: the
 *      large 'buffer_hz' is used as a divisor, so practically any number will
 *      come out as 0 in the division.  Small integer results in the case of
 *      really high dividends won't have any real effect anyhow.)
 */
static unsigned int buffer_hz;
5875
5876
static struct tcmsg *
5877
netdev_linux_tc_make_request(const struct netdev *netdev, int type,
5878
                             unsigned int flags, struct ofpbuf *request)
5879
0
{
5880
0
    int ifindex;
5881
0
    int error;
5882
5883
0
    error = get_ifindex(netdev, &ifindex);
5884
0
    if (error) {
5885
0
        return NULL;
5886
0
    }
5887
5888
0
    return tc_make_request(ifindex, type, flags, request);
5889
0
}
5890
5891
/* Initializes 'tc_police' as a drop-over-rate (TC_POLICE_SHOT) policer
 * limited to 'kbits_rate' kbit/s with a burst of 'kbits_burst' kbit. */
static void
tc_policer_init(struct tc_police *tc_police, uint64_t kbits_rate,
                uint64_t kbits_burst)
{
    int mtu = 65535;

    memset(tc_police, 0, sizeof *tc_police);

    tc_police->action = TC_POLICE_SHOT;
    tc_police->mtu = mtu;
    /* kbits_rate * 1000 / 8 converts kbit/s to bytes/s. */
    tc_fill_rate(&tc_police->rate, kbits_rate * 1000 / 8, mtu);

    /* The following appears wrong in one way: In networking a kilobit is
     * usually 1000 bits but this uses 1024 bits.
     *
     * However if you "fix" those problems then "tc filter show ..." shows
     * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
     * 1,000,000 bits, whereas this actually ends up doing the right thing from
     * tc's point of view.  Whatever. */
    tc_police->burst = tc_bytes_to_ticks(
        tc_police->rate.rate, kbits_burst * 1024 / 8);
}
5913
5914
/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
5915
 * of 'kbits_burst', with a rate of 'kpkts_rate' and a burst size of
5916
 * 'kpkts_burst'.
5917
 *
5918
 * This function is equivalent to running:
5919
 *     /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
5920
 *              basic police rate <kbits_rate>kbit burst <kbits_burst>k
5921
 *              mtu 65535 drop
5922
 *
5923
 * The configuration and stats may be seen with the following command:
5924
 *     /sbin/tc -s filter show dev <devname> parent ffff:
5925
 *
5926
 * Returns 0 if successful, otherwise a positive errno value.
5927
 */
5928
static int
5929
tc_add_policer(struct netdev *netdev, uint64_t kbits_rate,
5930
               uint32_t kbits_burst, uint32_t kpkts_rate, uint32_t kpkts_burst)
5931
0
{
5932
0
    size_t basic_offset, police_offset;
5933
0
    struct ofpbuf request;
5934
0
    struct tcmsg *tcmsg;
5935
0
    int error;
5936
5937
0
    tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER,
5938
0
                                         NLM_F_EXCL | NLM_F_CREATE, &request);
5939
0
    if (!tcmsg) {
5940
0
        return ENODEV;
5941
0
    }
5942
0
    tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
5943
0
    tcmsg->tcm_info = tc_make_handle(49,
5944
0
                                     (OVS_FORCE uint16_t) htons(ETH_P_ALL));
5945
0
    nl_msg_put_string(&request, TCA_KIND, "basic");
5946
5947
0
    basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
5948
0
    police_offset = nl_msg_start_nested(&request, TCA_BASIC_ACT);
5949
0
    nl_msg_put_act_police(&request, 0, kbits_rate, kbits_burst,
5950
0
                          kpkts_rate * 1000ULL, kpkts_burst * 1000ULL,
5951
0
                          TC_ACT_UNSPEC, false);
5952
0
    nl_msg_end_nested(&request, police_offset);
5953
0
    nl_msg_end_nested(&request, basic_offset);
5954
5955
0
    error = tc_transact(&request, NULL);
5956
0
    if (error) {
5957
0
        return error;
5958
0
    }
5959
5960
0
    return 0;
5961
0
}
5962
5963
int
5964
tc_add_policer_action(uint32_t index, uint64_t kbits_rate,
5965
                      uint32_t kbits_burst, uint32_t pkts_rate,
5966
                      uint32_t pkts_burst, bool update)
5967
0
{
5968
0
    struct ofpbuf request;
5969
0
    struct tcamsg *tcamsg;
5970
0
    size_t offset;
5971
0
    int flags;
5972
0
    int error;
5973
5974
0
    flags = (update ? NLM_F_REPLACE : NLM_F_EXCL) | NLM_F_CREATE;
5975
0
    tcamsg = tc_make_action_request(RTM_NEWACTION, flags, &request);
5976
0
    if (!tcamsg) {
5977
0
        return ENODEV;
5978
0
    }
5979
5980
0
    offset = nl_msg_start_nested(&request, TCA_ACT_TAB);
5981
0
    nl_msg_put_act_police(&request, index, kbits_rate, kbits_burst, pkts_rate,
5982
0
                          pkts_burst, TC_ACT_PIPE, true);
5983
0
    nl_msg_end_nested(&request, offset);
5984
5985
0
    error = tc_transact(&request, NULL);
5986
0
    if (error) {
5987
0
        VLOG_ERR_RL(&rl, "Failed to %s police action, err=%d",
5988
0
                    update ? "update" : "add", error);
5989
0
    }
5990
5991
0
    return error;
5992
0
}
5993
5994
/* Extracts police-action counters from the kernel reply 'msg' (from an
 * RTM_GETACTION or RTM_DELACTION transaction) and accumulates them into
 * 'stats', if 'stats' is nonnull: software and hardware hit counters are
 * added to the meter's packet/byte input counts, and the dropped counter is
 * added to band 0.
 *
 * Always takes ownership of 'msg' and frees it.  Returns 0 if successful,
 * otherwise a positive errno value. */
static int
tc_update_policer_action_stats(struct ofpbuf *msg,
                               struct ofputil_meter_stats *stats)
{
    struct ofpbuf b = ofpbuf_const_initializer(msg->data, msg->size);
    struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg);
    struct tcamsg *tca = ofpbuf_try_pull(&b, sizeof *tca);
    struct ovs_flow_stats stats_dropped;
    struct ovs_flow_stats stats_hw;
    struct ovs_flow_stats stats_sw;
    const struct nlattr *act;
    struct nlattr *prio;
    int error = 0;

    /* Caller did not ask for stats; just free the reply. */
    if (!stats) {
        goto exit;
    }

    if (!nlmsg || !tca) {
        VLOG_ERR_RL(&rl, "Failed to get action stats, size error");
        error = EPROTO;
        goto exit;
    }

    act = nl_attr_find(&b, 0, TCA_ACT_TAB);
    if (!act) {
        VLOG_ERR_RL(&rl, "Failed to get action stats, can't find attribute");
        error = EPROTO;
        goto exit;
    }

    /* Step past the TCA_ACT_TAB attribute header to the first nested
     * (priority) attribute that holds the action itself. */
    prio = (struct nlattr *) act + 1;
    memset(&stats_sw, 0, sizeof stats_sw);
    memset(&stats_hw, 0, sizeof stats_hw);
    memset(&stats_dropped, 0, sizeof stats_dropped);
    error = tc_parse_action_stats(prio, &stats_sw, &stats_hw, &stats_dropped);
    if (!error) {
        stats->packet_in_count +=
            get_32aligned_u64(&stats_sw.n_packets);
        stats->byte_in_count += get_32aligned_u64(&stats_sw.n_bytes);
        stats->packet_in_count +=
            get_32aligned_u64(&stats_hw.n_packets);
        stats->byte_in_count += get_32aligned_u64(&stats_hw.n_bytes);
        /* Account dropped packets to the first band, if one exists. */
        if (stats->n_bands >= 1) {
            stats->bands[0].packet_count +=
                get_32aligned_u64(&stats_dropped.n_packets);
        }
    }

exit:
    ofpbuf_delete(msg);
    return error;
}
6047
6048
int
6049
tc_get_policer_action(uint32_t index, struct ofputil_meter_stats *stats)
6050
0
{
6051
0
    struct ofpbuf *replyp = NULL;
6052
0
    struct ofpbuf request;
6053
0
    struct tcamsg *tcamsg;
6054
0
    size_t root_offset;
6055
0
    size_t prio_offset;
6056
0
    int error;
6057
6058
0
    tcamsg = tc_make_action_request(RTM_GETACTION, 0, &request);
6059
0
    if (!tcamsg) {
6060
0
        return ENODEV;
6061
0
    }
6062
6063
0
    root_offset = nl_msg_start_nested(&request, TCA_ACT_TAB);
6064
0
    prio_offset = nl_msg_start_nested(&request, 1);
6065
0
    nl_msg_put_string(&request, TCA_ACT_KIND, "police");
6066
0
    nl_msg_put_u32(&request, TCA_ACT_INDEX, index);
6067
0
    nl_msg_end_nested(&request, prio_offset);
6068
0
    nl_msg_end_nested(&request, root_offset);
6069
6070
0
    error = tc_transact(&request, &replyp);
6071
0
    if (error) {
6072
0
        VLOG_ERR_RL(&rl, "Failed to dump police action (index: %u), err=%d",
6073
0
                    index, error);
6074
0
        return error;
6075
0
    }
6076
6077
0
    return tc_update_policer_action_stats(replyp, stats);
6078
0
}
6079
6080
int
6081
tc_del_policer_action(uint32_t index, struct ofputil_meter_stats *stats)
6082
0
{
6083
0
    struct ofpbuf *replyp = NULL;
6084
0
    struct ofpbuf request;
6085
0
    struct tcamsg *tcamsg;
6086
0
    size_t root_offset;
6087
0
    size_t prio_offset;
6088
0
    int error;
6089
6090
0
    tcamsg = tc_make_action_request(RTM_DELACTION, NLM_F_ACK, &request);
6091
0
    if (!tcamsg) {
6092
0
        return ENODEV;
6093
0
    }
6094
6095
0
    root_offset = nl_msg_start_nested(&request, TCA_ACT_TAB);
6096
0
    prio_offset = nl_msg_start_nested(&request, 1);
6097
0
    nl_msg_put_string(&request, TCA_ACT_KIND, "police");
6098
0
    nl_msg_put_u32(&request, TCA_ACT_INDEX, index);
6099
0
    nl_msg_end_nested(&request, prio_offset);
6100
0
    nl_msg_end_nested(&request, root_offset);
6101
6102
0
    error = tc_transact(&request, &replyp);
6103
0
    if (error) {
6104
0
        VLOG_ERR_RL(&rl, "Failed to delete police action (index: %u), err=%d",
6105
0
                    index, error);
6106
0
        return error;
6107
0
    }
6108
6109
0
    return tc_update_policer_action_stats(replyp, stats);
6110
0
}
6111
6112
/* Reads /proc/net/psched (once per process) to initialize the globals
 * 'ticks_per_s' and 'buffer_hz' used for tc rate/tick conversions.  If the
 * file cannot be opened or parsed, the conservative defaults assigned below
 * (ticks_per_s = 1.0, buffer_hz = 100) remain in effect. */
static void
read_psched(void)
{
    /* The values in psched are not individually very meaningful, but they are
     * important.  The tables below show some values seen in the wild.
     *
     * Some notes:
     *
     *   - "c" has always been a constant 1000000 since at least Linux 2.4.14.
     *     (Before that, there are hints that it was 1000000000.)
     *
     *   - "d" can be unrealistically large, see the comment on 'buffer_hz'
     *     above.
     *
     *                        /proc/net/psched
     *     -----------------------------------
     * [1] 000c8000 000f4240 000f4240 00000064
     * [2] 000003e8 00000400 000f4240 3b9aca00
     * [3] 000003e8 00000400 000f4240 3b9aca00
     * [4] 000003e8 00000400 000f4240 00000064
     * [5] 000003e8 00000040 000f4240 3b9aca00
     * [6] 000003e8 00000040 000f4240 000000f9
     *
     *           a         b          c             d ticks_per_s     buffer_hz
     *     ------- --------- ---------- ------------- ----------- -------------
     * [1] 819,200 1,000,000  1,000,000           100     819,200           100
     * [2]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [3]   1,000     1,024  1,000,000 1,000,000,000     976,562 1,000,000,000
     * [4]   1,000     1,024  1,000,000           100     976,562           100
     * [5]   1,000        64  1,000,000 1,000,000,000  15,625,000 1,000,000,000
     * [6]   1,000        64  1,000,000           249  15,625,000           249
     *
     * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
     * [2] 2.6.26-1-686-bigmem from Debian lenny
     * [3] 2.6.26-2-sparc64 from Debian lenny
     * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
     * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
     * [6] 2.6.34 from kernel.org on KVM
     */
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static const char fn[] = "/proc/net/psched";
    unsigned int a, b, c, d;
    FILE *stream;

    /* Only the first caller does the work; later callers return at once. */
    if (!ovsthread_once_start(&once)) {
        return;
    }

    /* Defaults, kept if anything below fails. */
    ticks_per_s = 1.0;
    buffer_hz = 100;

    stream = fopen(fn, "r");
    if (!stream) {
        VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
        goto exit;
    }

    if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
        VLOG_WARN("%s: read failed", fn);
        fclose(stream);
        goto exit;
    }
    VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
    fclose(stream);

    /* Zero values would make the conversions below divide by zero. */
    if (!a || !b || !c) {
        VLOG_WARN("%s: invalid scheduler parameters", fn);
        goto exit;
    }

    ticks_per_s = (double) a * c / b;
    if (c == 1000000) {
        buffer_hz = d;
    } else {
        VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
                  fn, a, b, c, d);
    }
    VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);

exit:
    ovsthread_once_done(&once);
}
6194
6195
/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
6196
 * rate of 'rate' bytes per second. */
6197
static unsigned int
6198
tc_ticks_to_bytes(uint64_t rate, unsigned int ticks)
6199
0
{
6200
0
    read_psched();
6201
0
    return (rate * ticks) / ticks_per_s;
6202
0
}
6203
6204
/* Returns the number of ticks that it would take to transmit 'size' bytes at a
6205
 * rate of 'rate' bytes per second. */
6206
static unsigned int
6207
tc_bytes_to_ticks(uint64_t rate, unsigned int size)
6208
0
{
6209
0
    read_psched();
6210
0
    return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
6211
0
}
6212
6213
/* Returns the number of bytes that need to be reserved for qdisc buffering at
6214
 * a transmission rate of 'rate' bytes per second. */
6215
static unsigned int
6216
tc_buffer_per_jiffy(uint64_t rate)
6217
0
{
6218
0
    read_psched();
6219
0
    return rate / buffer_hz;
6220
0
}
6221
6222
static uint32_t
6223
0
tc_time_to_ticks(uint32_t time) {
6224
0
    read_psched();
6225
0
    return time * (ticks_per_s / 1000000);
6226
0
}
6227
6228
/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
6229
 * e.g. "htb", into '*kind' (if it is nonnull).  If 'options' is nonnull,
6230
 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
6231
 * stores NULL into it if it is absent.
6232
 *
6233
 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
6234
 * 'msg'.
6235
 *
6236
 * Returns 0 if successful, otherwise a positive errno value. */
6237
static int
6238
tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
6239
               struct nlattr **options)
6240
0
{
6241
0
    static const struct nl_policy tca_policy[] = {
6242
0
        [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
6243
0
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
6244
0
    };
6245
0
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];
6246
6247
0
    if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
6248
0
                         tca_policy, ta, ARRAY_SIZE(ta))) {
6249
0
        VLOG_WARN_RL(&rl, "failed to parse qdisc message");
6250
0
        goto error;
6251
0
    }
6252
6253
0
    if (kind) {
6254
0
        *kind = nl_attr_get_string(ta[TCA_KIND]);
6255
0
    }
6256
6257
0
    if (options) {
6258
0
        *options = ta[TCA_OPTIONS];
6259
0
    }
6260
6261
0
    return 0;
6262
6263
0
error:
6264
0
    if (kind) {
6265
0
        *kind = NULL;
6266
0
    }
6267
0
    if (options) {
6268
0
        *options = NULL;
6269
0
    }
6270
0
    return EPROTO;
6271
0
}
6272
6273
/* Given Netlink 'msg' that describes a class, extracts the class handle (e.g.
 * the minor number of its class ID) into '*handlep', its TCA_OPTIONS
 * attribute into '*options', and its queue statistics into '*stats'.  Any of
 * the output arguments may be null.
 *
 * Returns 0 if successful, otherwise a positive errno value. */
static int
tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
               struct nlattr **options, struct netdev_queue_stats *stats)
{
    /* 'b' is a read-only view of 'msg'; both headers must be present. */
    struct ofpbuf b = ofpbuf_const_initializer(msg->data, msg->size);
    struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg);
    struct tcmsg *tc = ofpbuf_try_pull(&b, sizeof *tc);
    static const struct nl_policy tca_policy[] = {
        [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
        [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
    };
    struct nlattr *ta[ARRAY_SIZE(tca_policy)];

    if (!nlmsg || !tc) {
        VLOG_ERR_RL(&rl, "failed to parse class message, malformed reply");
        goto error;
    }

    if (!nl_policy_parse(&b, 0, tca_policy, ta, ARRAY_SIZE(ta))) {
        VLOG_WARN_RL(&rl, "failed to parse class message");
        goto error;
    }

    if (handlep) {
        *handlep = tc->tcm_handle;
    }

    if (options) {
        *options = ta[TCA_OPTIONS];
    }

    if (stats) {
        const struct gnet_stats_queue *gsq;
        struct gnet_stats_basic gsb;

        static const struct nl_policy stats_policy[] = {
            [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof gsb },
            [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
                                  .min_len = sizeof *gsq },
        };
        struct nlattr *sa[ARRAY_SIZE(stats_policy)];

        if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
                             sa, ARRAY_SIZE(sa))) {
            VLOG_WARN_RL(&rl, "failed to parse class stats");
            goto error;
        }

        /* Alignment issues screw up the length of struct gnet_stats_basic on
         * some arch/bitsize combinations.  Newer versions of Linux have a
         * struct gnet_stats_basic_packed, but we can't depend on that.  The
         * easiest thing to do is just to make a copy. */
        memset(&gsb, 0, sizeof gsb);
        memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
               MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
        stats->tx_bytes = gsb.bytes;
        stats->tx_packets = gsb.packets;

        gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
        stats->tx_errors = gsq->drops;
    }

    return 0;

error:
    /* Leave the outputs in a well-defined state on failure. */
    if (options) {
        *options = NULL;
    }
    if (stats) {
        memset(stats, 0, sizeof *stats);
    }
    return EPROTO;
}
6353
6354
/* Queries the kernel for class with identifier 'handle' and parent 'parent'
6355
 * on 'netdev'. */
6356
static int
6357
tc_query_class(const struct netdev *netdev,
6358
               unsigned int handle, unsigned int parent,
6359
               struct ofpbuf **replyp)
6360
0
{
6361
0
    struct ofpbuf request;
6362
0
    struct tcmsg *tcmsg;
6363
0
    int error;
6364
6365
0
    tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO,
6366
0
                                         &request);
6367
0
    if (!tcmsg) {
6368
0
        return ENODEV;
6369
0
    }
6370
0
    tcmsg->tcm_handle = handle;
6371
0
    tcmsg->tcm_parent = parent;
6372
6373
0
    error = tc_transact(&request, replyp);
6374
0
    if (error) {
6375
0
        VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
6376
0
                     netdev_get_name(netdev),
6377
0
                     tc_get_major(handle), tc_get_minor(handle),
6378
0
                     tc_get_major(parent), tc_get_minor(parent),
6379
0
                     ovs_strerror(error));
6380
0
    }
6381
0
    return error;
6382
0
}
6383
6384
/* Equivalent to "tc class del dev <name> handle <handle>". */
6385
static int
6386
tc_delete_class(const struct netdev *netdev, unsigned int handle)
6387
0
{
6388
0
    struct ofpbuf request;
6389
0
    struct tcmsg *tcmsg;
6390
0
    int error;
6391
6392
0
    tcmsg = netdev_linux_tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
6393
0
    if (!tcmsg) {
6394
0
        return ENODEV;
6395
0
    }
6396
0
    tcmsg->tcm_handle = handle;
6397
0
    tcmsg->tcm_parent = 0;
6398
6399
0
    error = tc_transact(&request, NULL);
6400
0
    if (error) {
6401
0
        VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
6402
0
                     netdev_get_name(netdev),
6403
0
                     tc_get_major(handle), tc_get_minor(handle),
6404
0
                     ovs_strerror(error));
6405
0
    }
6406
0
    return error;
6407
0
}
6408
6409
/* Equivalent to "tc qdisc del dev <name> root". */
6410
static int
6411
tc_del_qdisc(struct netdev *netdev_)
6412
0
{
6413
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6414
0
    struct ofpbuf request;
6415
0
    struct tcmsg *tcmsg;
6416
0
    int error;
6417
6418
0
    tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
6419
0
    if (!tcmsg) {
6420
0
        return ENODEV;
6421
0
    }
6422
0
    tcmsg->tcm_parent = TC_H_ROOT;
6423
6424
0
    error = tc_transact(&request, NULL);
6425
0
    if (error == EINVAL || error == ENOENT) {
6426
        /* EINVAL or ENOENT probably means that the default qdisc was in use,
6427
         * in which case we've accomplished our purpose. */
6428
0
        error = 0;
6429
0
    }
6430
0
    if (!error && netdev->tc) {
6431
0
        if (netdev->tc->ops->tc_destroy) {
6432
0
            netdev->tc->ops->tc_destroy(netdev->tc);
6433
0
        }
6434
0
        netdev->tc = NULL;
6435
0
    }
6436
0
    return error;
6437
0
}
6438
6439
static bool
6440
getqdisc_is_safe(void)
6441
0
{
6442
0
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
6443
0
    static bool safe = false;
6444
6445
0
    if (ovsthread_once_start(&once)) {
6446
0
        if (ovs_kernel_is_version_or_newer(2, 35)) {
6447
0
            safe = true;
6448
0
        } else {
6449
0
            VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel");
6450
0
        }
6451
0
        ovsthread_once_done(&once);
6452
0
    }
6453
0
    return safe;
6454
0
}
6455
6456
/* If 'netdev''s qdisc type and parameters are not yet known, queries the
 * kernel to determine what they are.  Returns 0 if successful, otherwise a
 * positive errno value. */
static int
tc_query_qdisc(const struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    struct ofpbuf request, *qdisc;
    const struct tc_ops *ops;
    struct tcmsg *tcmsg;
    int load_error;
    int error;

    if (netdev->tc) {
        /* Already known; nothing to do. */
        return 0;
    }

    /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
     * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
     * 2.6.35 without that fix backported to it.
     *
     * To avoid the OOPS, we must not make a request that would attempt to dump
     * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
     * few others.  There are a few ways that I can see to do this, but most of
     * them seem to be racy (and if you lose the race the kernel OOPSes).  The
     * technique chosen here is to assume that any non-default qdisc that we
     * create will have a class with handle 1:0.  The built-in qdiscs only have
     * a class with handle 0:0.
     *
     * On Linux 2.6.35+ we use the straightforward method because it allows us
     * to handle non-builtin qdiscs without handle 1:0 (e.g. codel).  However,
     * in such a case we get no response at all from the kernel (!) if a
     * builtin qdisc is in use (which is later caught by "!error &&
     * !qdisc->size"). */
    tcmsg = netdev_linux_tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO,
                                         &request);
    if (!tcmsg) {
        return ENODEV;
    }
    tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
    tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;

    /* Figure out what tc class to instantiate. */
    error = tc_transact(&request, &qdisc);
    if (!error && qdisc->size) {
        const char *kind;

        error = tc_parse_qdisc(qdisc, &kind, NULL);
        if (error) {
            ops = &tc_ops_other;
        } else {
            ops = tc_lookup_linux_name(kind);
            if (!ops) {
                static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
                VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);

                ops = &tc_ops_other;
            }
        }
    } else if ((!error && !qdisc->size) || error == ENOENT) {
        /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
         * set up by some other entity that doesn't have a handle 1:0.  We will
         * assume that it's the system default qdisc. */
        ops = &tc_ops_default;
        error = 0;
    } else {
        /* Who knows?  Maybe the device got deleted. */
        VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
        ops = &tc_ops_other;
    }

    /* Instantiate it.  A successful tc_load must set netdev->tc. */
    load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
    ovs_assert((load_error == 0) == (netdev->tc != NULL));
    ofpbuf_delete(qdisc);

    return error ? error : load_error;
}
6535
6536
/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
6537
   approximate the time to transmit packets of various lengths.  For an MTU of
6538
   256 or less, each entry is exact; for an MTU of 257 through 512, each entry
6539
   represents two possible packet lengths; for a MTU of 513 through 1024, four
6540
   possible lengths; and so on.
6541
6542
   Returns, for the specified 'mtu', the number of bits that packet lengths
6543
   need to be shifted right to fit within such a 256-entry table. */
6544
static int
6545
tc_calc_cell_log(unsigned int mtu)
6546
0
{
6547
0
    int cell_log;
6548
6549
0
    if (!mtu) {
6550
0
        mtu = ETH_PAYLOAD_MAX;
6551
0
    }
6552
0
    mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
6553
6554
0
    for (cell_log = 0; mtu >= 256; cell_log++) {
6555
0
        mtu >>= 1;
6556
0
    }
6557
6558
0
    return cell_log;
6559
0
}
6560
6561
/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
6562
 * of 'mtu'. */
6563
static void
6564
tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
6565
0
{
6566
0
    memset(rate, 0, sizeof *rate);
6567
0
    rate->cell_log = tc_calc_cell_log(mtu);
6568
    /* rate->overhead = 0; */           /* New in 2.6.24, not yet in some */
6569
    /* rate->cell_align = 0; */         /* distro headers. */
6570
0
    rate->mpu = ETH_TOTAL_MIN;
6571
0
    rate->rate = MIN(UINT32_MAX, Bps);
6572
0
}
6573
6574
/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
6575
 * attribute of the specified "type".
6576
 *
6577
 * A 64-bit rate can be provided via 'rate64' in bps.
6578
 * If zero, the rate in 'rate' will be used.
6579
 *
6580
 * See tc_calc_cell_log() above for a description of "rtab"s. */
6581
void
6582
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate,
6583
            uint64_t rate64)
6584
0
{
6585
0
    uint32_t *rtab;
6586
0
    unsigned int i;
6587
6588
0
    rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
6589
0
    for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
6590
0
        unsigned packet_size = (i + 1) << rate->cell_log;
6591
0
        if (packet_size < rate->mpu) {
6592
0
            packet_size = rate->mpu;
6593
0
        }
6594
0
        rtab[i] = tc_bytes_to_ticks(rate64 ? rate64 : rate->rate, packet_size);
6595
0
    }
6596
0
}
6597
6598
/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
 * burst size of 'burst_bytes'.  (If no value was requested, a 'burst_bytes' of
 * 0 is fine.) */
static int
tc_calc_buffer(uint64_t Bps, int mtu, uint64_t burst_bytes)
{
    /* Never use a burst smaller than one jiffy of traffic plus one packet. */
    unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;

    return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
}
6608

6609
/* Linux-only functions declared in netdev-linux.h  */
6610
6611
/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'.  If
 * 'enable' is true, the bit is set.  Otherwise, it is cleared.
 *
 * Returns 0 on success, EOPNOTSUPP if the device accepted the request but a
 * read-back shows the flag unchanged, or another positive errno value if an
 * ethtool request failed. */
int
netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
                              const char *flag_name, bool enable)
{
    const char *netdev_name = netdev_get_name(netdev);
    struct ethtool_value evalue;
    uint32_t new_flags;
    int error;

    /* Read the current flags. */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&evalue, 0, sizeof evalue);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
    if (error) {
        return error;
    }

    /* Compute the desired flags; skip the set if nothing would change. */
    COVERAGE_INC(netdev_set_ethtool);
    new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
    if (new_flags == evalue.data) {
        return 0;
    }
    evalue.data = new_flags;
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
    if (error) {
        return error;
    }

    /* Read the flags back to verify the change actually took effect. */
    COVERAGE_INC(netdev_get_ethtool);
    memset(&evalue, 0, sizeof evalue);
    error = netdev_linux_do_ethtool(netdev_name,
                                    (struct ethtool_cmd *)&evalue,
                                    ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
    if (error) {
        return error;
    }

    if (new_flags != evalue.data) {
        VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
                     "device %s failed", enable ? "enable" : "disable",
                     flag_name, netdev_name);
        return EOPNOTSUPP;
    }

    return 0;
}
6662

6663
/* Utility functions. */
6664
6665
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * Field-for-field copy from the kernel's 32-bit 'struct rtnl_link_stats'
 * (IFLA_STATS) into OVS's 'struct netdev_stats'. */
static void
netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
                                  const struct rtnl_link_stats *src)
{
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
}
6692
6693
/* Copies 'src' into 'dst', performing format conversion in the process.
 *
 * Field-for-field copy from the kernel's 64-bit 'struct rtnl_link_stats64'
 * (IFLA_STATS64) into OVS's 'struct netdev_stats'. */
static void
netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
                                    const struct rtnl_link_stats64 *src)
{
    dst->rx_packets = src->rx_packets;
    dst->tx_packets = src->tx_packets;
    dst->rx_bytes = src->rx_bytes;
    dst->tx_bytes = src->tx_bytes;
    dst->rx_errors = src->rx_errors;
    dst->tx_errors = src->tx_errors;
    dst->rx_dropped = src->rx_dropped;
    dst->tx_dropped = src->tx_dropped;
    dst->multicast = src->multicast;
    dst->collisions = src->collisions;
    dst->rx_length_errors = src->rx_length_errors;
    dst->rx_over_errors = src->rx_over_errors;
    dst->rx_crc_errors = src->rx_crc_errors;
    dst->rx_frame_errors = src->rx_frame_errors;
    dst->rx_fifo_errors = src->rx_fifo_errors;
    dst->rx_missed_errors = src->rx_missed_errors;
    dst->tx_aborted_errors = src->tx_aborted_errors;
    dst->tx_carrier_errors = src->tx_carrier_errors;
    dst->tx_fifo_errors = src->tx_fifo_errors;
    dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
    dst->tx_window_errors = src->tx_window_errors;
}
6720
6721
/* Retrieves interface statistics for 'netdev_' into 'stats' via an
 * RTM_GETLINK request, preferring the 64-bit IFLA_STATS64 attribute and
 * falling back to the 32-bit IFLA_STATS.  Returns 0 if successful, otherwise
 * a positive errno value. */
int
get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
{
    struct ofpbuf request;
    struct ofpbuf *reply;
    int error;

    /* Filtering all counters by default */
    memset(stats, 0xFF, sizeof(struct netdev_stats));

    ofpbuf_init(&request, 0);
    nl_msg_put_nlmsghdr(&request,
                        sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
                        RTM_GETLINK, NLM_F_REQUEST);
    ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
    nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
    error = nl_transact(NETLINK_ROUTE, &request, &reply);
    ofpbuf_uninit(&request);
    if (error) {
        return error;
    }

    if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
        const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
        if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
            const struct rtnl_link_stats64 *lstats = nl_attr_get(a);
            struct rtnl_link_stats64 aligned_lstats;

            /* The attribute payload may not be aligned well enough for
             * direct 64-bit access; copy it into an aligned buffer first if
             * needed. */
            if (!IS_PTR_ALIGNED(lstats)) {
                memcpy(&aligned_lstats, (void *) lstats,
                       sizeof aligned_lstats);
                lstats = &aligned_lstats;
            }
            netdev_stats_from_rtnl_link_stats64(stats, lstats);
            error = 0;
        } else {
            /* No usable 64-bit counters; fall back to 32-bit IFLA_STATS. */
            a = nl_attr_find(reply, 0, IFLA_STATS);
            if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
                netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
                error = 0;
            } else {
                VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
                error = EPROTO;
            }
        }
    } else {
        VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
        error = EPROTO;
    }


    ofpbuf_delete(reply);
    return error;
}
6775
6776
static int
6777
get_flags(const struct netdev *dev, unsigned int *flags)
6778
0
{
6779
0
    struct ifreq ifr;
6780
0
    int error;
6781
6782
0
    memset(&ifr, 0, sizeof ifr);
6783
0
    *flags = 0;
6784
0
    error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
6785
0
    if (!error) {
6786
0
        *flags = ifr.ifr_flags;
6787
0
    }
6788
0
    return error;
6789
0
}
6790
6791
/* Sets the interface flags of the network device named 'name' to 'flags' via
 * SIOCSIFFLAGS.  Returns 0 on success, otherwise a positive errno value. */
static int
set_flags(const char *name, unsigned int flags)
{
    struct ifreq ifr;

    memset(&ifr, 0, sizeof ifr);
    ifr.ifr_flags = flags;
    return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
}
6800
6801
int
6802
linux_get_ifindex(const char *netdev_name)
6803
0
{
6804
0
    struct ifreq ifr;
6805
0
    int error;
6806
6807
0
    memset(&ifr, 0, sizeof ifr);
6808
0
    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
6809
0
    COVERAGE_INC(netdev_get_ifindex);
6810
6811
0
    error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
6812
0
    if (error) {
6813
        /* ENODEV probably means that a vif disappeared asynchronously and
6814
         * hasn't been removed from the database yet, so reduce the log level
6815
         * to INFO for that case. */
6816
0
        VLOG_RL(&rl, error == ENODEV ? VLL_INFO : VLL_ERR,
6817
0
                "ioctl(SIOCGIFINDEX) on %s device failed: %s",
6818
0
                netdev_name, ovs_strerror(error));
6819
0
        return -error;
6820
0
    }
6821
0
    return ifr.ifr_ifindex;
6822
0
}
6823
6824
static int
6825
get_ifindex(const struct netdev *netdev_, int *ifindexp)
6826
0
{
6827
0
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
6828
6829
0
    if (!(netdev->cache_valid & VALID_IFINDEX)) {
6830
0
        netdev_linux_update_via_netlink(netdev);
6831
0
    }
6832
6833
0
    if (!(netdev->cache_valid & VALID_IFINDEX)) {
6834
        /* Fall back to ioctl if netlink fails */
6835
0
        int ifindex = linux_get_ifindex(netdev_get_name(netdev_));
6836
6837
0
        if (ifindex < 0) {
6838
0
            netdev->get_ifindex_error = -ifindex;
6839
0
            netdev->ifindex = 0;
6840
0
        } else {
6841
0
            netdev->get_ifindex_error = 0;
6842
0
            netdev->ifindex = ifindex;
6843
0
        }
6844
0
        netdev->cache_valid |= VALID_IFINDEX;
6845
0
    }
6846
6847
0
    *ifindexp = netdev->ifindex;
6848
0
    return netdev->get_ifindex_error;
6849
0
}
6850
6851
/* Queries the kernel with an RTM_GETLINK request for the current state of
 * 'netdev' and refreshes the cached flags, MTU, Ethernet address, and
 * ifindex from the reply, bumping the netdev's change sequence if anything
 * actually changed.
 *
 * Returns 0 on success, a positive errno value if the netlink transaction
 * fails, or EINVAL if the reply cannot be parsed as a relevant
 * RTM_NEWLINK message. */
static int
netdev_linux_update_via_netlink(struct netdev_linux *netdev)
{
    struct ofpbuf request;
    struct ofpbuf *reply;
    struct rtnetlink_change chg;
    struct rtnetlink_change *change = &chg;
    int error;

    ofpbuf_init(&request, 0);
    nl_msg_put_nlmsghdr(&request,
                        sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ) +
                        NL_A_U32_SIZE, RTM_GETLINK, NLM_F_REQUEST);
    ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));

    /* The correct identifiers for a Linux device are netnsid and ifindex,
     * but ifindex changes as the port is moved to another network namespace
     * and the interface name statically stored in ovsdb. */
    nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(&netdev->up));
    if (netdev_linux_netnsid_is_remote(netdev)) {
        nl_msg_put_u32(&request, IFLA_IF_NETNSID, netdev->netnsid);
    }

    /* Stats are not needed here; ask the kernel to skip building them. */
    nl_msg_put_u32(&request, IFLA_EXT_MASK, RTEXT_FILTER_SKIP_STATS);

    error = nl_transact(NETLINK_ROUTE, &request, &reply);
    ofpbuf_uninit(&request);
    if (error) {
        ofpbuf_delete(reply);
        return error;
    }

    if (rtnetlink_parse(reply, change)
        && !change->irrelevant
        && change->nlmsg_type == RTM_NEWLINK) {
        bool changed = false;
        error = 0;

        /* Update netdev from rtnl msg and increment its seq if needed. */
        /* A toggle of IFF_RUNNING counts as a carrier reset, tracked
         * separately from the general flags update below. */
        if ((change->ifi_flags ^ netdev->ifi_flags) & IFF_RUNNING) {
            netdev->carrier_resets++;
            changed = true;
        }
        if (change->ifi_flags != netdev->ifi_flags) {
            netdev->ifi_flags = change->ifi_flags;
            changed = true;
        }
        /* An MTU of 0 in the reply means "not reported"; don't cache it. */
        if (change->mtu && change->mtu != netdev->mtu) {
            netdev->mtu = change->mtu;
            netdev->cache_valid |= VALID_MTU;
            netdev->netdev_mtu_error = 0;
            changed = true;
        }
        /* An all-zeros MAC in the reply means "not reported". */
        if (!eth_addr_is_zero(change->mac)
            && !eth_addr_equals(change->mac, netdev->etheraddr)) {
            netdev->etheraddr = change->mac;
            netdev->cache_valid |= VALID_ETHERADDR;
            netdev->ether_addr_error = 0;
            changed = true;
        }
        if (change->if_index != netdev->ifindex) {
            netdev->ifindex = change->if_index;
            netdev->cache_valid |= VALID_IFINDEX;
            netdev->get_ifindex_error = 0;
            changed = true;
        }
        /* 'primary' names the primary device's kind when this port is
         * enslaved; mark the netdev if that kind is a LAG (bond/team).
         * Note: this flag is only ever set here, never cleared. */
        if (change->primary && netdev_linux_kind_is_lag(change->primary)) {
            netdev->is_lag_primary = true;
        }
        if (changed) {
            netdev_change_seq_changed(&netdev->up);
        }
    } else {
        error = EINVAL;
    }

    ofpbuf_delete(reply);
    return error;
}
6930
6931
static int
6932
get_etheraddr(const char *netdev_name, struct eth_addr *ea)
6933
0
{
6934
0
    struct ifreq ifr;
6935
0
    int hwaddr_family;
6936
0
    int error;
6937
6938
0
    memset(&ifr, 0, sizeof ifr);
6939
0
    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
6940
0
    COVERAGE_INC(netdev_get_hwaddr);
6941
0
    error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
6942
0
    if (error) {
6943
        /* ENODEV probably means that a vif disappeared asynchronously and
6944
         * hasn't been removed from the database yet, so reduce the log level
6945
         * to INFO for that case. */
6946
0
        VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
6947
0
             "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
6948
0
             netdev_name, ovs_strerror(error));
6949
0
        return error;
6950
0
    }
6951
0
    hwaddr_family = ifr.ifr_hwaddr.sa_family;
6952
0
    if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
6953
0
        hwaddr_family != ARPHRD_NONE) {
6954
0
        VLOG_INFO("%s device has unknown hardware address family %d",
6955
0
                  netdev_name, hwaddr_family);
6956
0
        return EINVAL;
6957
0
    }
6958
0
    memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
6959
0
    return 0;
6960
0
}
6961
6962
static int
6963
set_etheraddr(const char *netdev_name, const struct eth_addr mac)
6964
0
{
6965
0
    struct ifreq ifr;
6966
0
    int error;
6967
6968
0
    memset(&ifr, 0, sizeof ifr);
6969
0
    ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
6970
0
    ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
6971
0
    memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
6972
0
    COVERAGE_INC(netdev_set_hwaddr);
6973
0
    error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
6974
0
    if (error) {
6975
0
        VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
6976
0
                 netdev_name, ovs_strerror(error));
6977
0
    }
6978
0
    return error;
6979
0
}
6980
6981
static int
6982
netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
6983
                        int cmd, const char *cmd_name)
6984
0
{
6985
0
    struct ifreq ifr;
6986
0
    int error;
6987
6988
0
    memset(&ifr, 0, sizeof ifr);
6989
0
    ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
6990
0
    ifr.ifr_data = (caddr_t) ecmd;
6991
6992
0
    ecmd->cmd = cmd;
6993
0
    error = af_inet_ioctl(SIOCETHTOOL, &ifr);
6994
0
    if (error) {
6995
0
        if (error != EOPNOTSUPP) {
6996
0
            VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
6997
0
                         "failed: %s", cmd_name, name, ovs_strerror(error));
6998
0
        } else {
6999
            /* The device doesn't support this operation.  That's pretty
7000
             * common, so there's no point in logging anything. */
7001
0
        }
7002
0
    }
7003
0
    return error;
7004
0
}
7005
7006
/* Returns an AF_PACKET raw socket or a negative errno value. */
7007
static int
7008
af_packet_sock(void)
7009
0
{
7010
0
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
7011
0
    static int sock;
7012
7013
0
    if (ovsthread_once_start(&once)) {
7014
0
        sock = socket(AF_PACKET, SOCK_RAW, 0);
7015
0
        if (sock >= 0) {
7016
0
            int error = set_nonblocking(sock);
7017
0
            if (error) {
7018
0
                close(sock);
7019
0
                sock = -error;
7020
0
            } else if (userspace_tso_enabled()) {
7021
0
                int val = 1;
7022
0
                error = setsockopt(sock, SOL_PACKET, PACKET_VNET_HDR, &val,
7023
0
                                   sizeof val);
7024
0
                if (error) {
7025
0
                    error = errno;
7026
0
                    VLOG_ERR("failed to enable vnet hdr in raw socket: %s",
7027
0
                             ovs_strerror(errno));
7028
0
                    close(sock);
7029
0
                    sock = -error;
7030
0
                }
7031
0
            }
7032
0
        } else {
7033
0
            sock = -errno;
7034
0
            VLOG_ERR("failed to create packet socket: %s",
7035
0
                     ovs_strerror(errno));
7036
0
        }
7037
0
        ovsthread_once_done(&once);
7038
0
    }
7039
7040
0
    return sock;
7041
0
}
7042
7043
/* Initializes packet 'b' with features enabled in the prepended
 * struct virtio_net_hdr.  Returns 0 if successful, otherwise a
 * positive errno value.
 *
 * The header is pulled (removed) from the front of 'b' regardless of the
 * return value.  Its contents come from the kernel/raw socket and are
 * validated here before trusting any offset they carry:
 *  - EINVAL if 'b' is too short to hold the header, or if the checksum
 *    offsets point outside the packet;
 *  - ENOTSUP for GSO types OVS cannot handle (UFO, unknown types). */
static int
netdev_linux_parse_vnet_hdr(struct dp_packet *b)
{
    struct virtio_net_hdr *vnet = dp_packet_pull(b, sizeof *vnet);

    if (OVS_UNLIKELY(!vnet)) {
        return EINVAL;
    }

    /* Fast path: header carries no offload information at all. */
    if (vnet->flags == 0 && vnet->gso_type == VIRTIO_NET_HDR_GSO_NONE) {
        return 0;
    }

    if (vnet->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
        uint16_t csum_offset = (OVS_FORCE uint16_t) vnet->csum_offset;
        uint16_t csum_start = (OVS_FORCE uint16_t) vnet->csum_start;

        /* Reject offsets that fall outside the packet data. */
        if (csum_start >= dp_packet_size(b)
            || csum_start + csum_offset >= dp_packet_size(b)) {
            COVERAGE_INC(netdev_linux_invalid_l4_csum);
            return EINVAL;
        }

        /* Populate b->l4_ofs and the L4 protocol markers. */
        parse_tcp_flags(b, NULL, NULL, NULL);

        /* If the advertised checksum location matches the L4 header that
         * OVS itself parsed, mark the checksum as "partial" so later
         * stages finish it; otherwise compute it here in software. */
        if (csum_start == b->l4_ofs
            && ((csum_offset == offsetof(struct tcp_header, tcp_csum)
                 && dp_packet_l4_proto_tcp(b))
                || (csum_offset == offsetof(struct udp_header, udp_csum)
                    && dp_packet_l4_proto_udp(b))
                || (csum_offset == offsetof(struct sctp_header, sctp_csum)
                    && dp_packet_l4_proto_sctp(b)))) {
            dp_packet_l4_checksum_set_partial(b);
        } else {
            ovs_be16 *csum_l4;
            void *l4;

            COVERAGE_INC(netdev_linux_unknown_l4_csum);

            csum_l4 = dp_packet_at(b, csum_start + csum_offset,
                                   sizeof *csum_l4);
            if (!csum_l4) {
                return EINVAL;
            }

            /* Checksum everything from csum_start to the end of the
             * packet and store it at the location the header requested.
             * The field is expected to hold the pseudo-header sum, so the
             * plain one's-complement sum over the data yields the final
             * checksum. */
            l4 = dp_packet_at(b, csum_start, dp_packet_size(b) - csum_start);
            *csum_l4 = csum(l4, dp_packet_size(b) - csum_start);

            if (dp_packet_l4_proto_tcp(b)
                || dp_packet_l4_proto_udp(b)
                || dp_packet_l4_proto_sctp(b)) {
                dp_packet_l4_checksum_set_good(b);
            }
        }
    }

    /* Translate the GSO type into OVS's TSO segment-size metadata. */
    int ret = 0;
    switch (vnet->gso_type) {
    case VIRTIO_NET_HDR_GSO_TCPV4:
    case VIRTIO_NET_HDR_GSO_TCPV6:
        dp_packet_set_tso_segsz(b, (OVS_FORCE uint16_t) vnet->gso_size);
        break;

    case VIRTIO_NET_HDR_GSO_UDP:
        /* UFO is not supported. */
        VLOG_WARN_RL(&rl, "Received an unsupported packet with UFO enabled.");
        ret = ENOTSUP;
        break;

    case VIRTIO_NET_HDR_GSO_NONE:
        break;

    default:
        ret = ENOTSUP;
        VLOG_WARN_RL(&rl, "Received an unsupported packet with GSO type: 0x%x",
                     vnet->gso_type);
    }

    return ret;
}
7126
7127
/* Prepends struct virtio_net_hdr to packet 'b'.
7128
 * Returns 0 if successful, otherwise a positive errno value.
7129
 * Returns EMSGSIZE if the packet 'b' cannot be sent over MTU 'mtu'. */
7130
static int
7131
netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu)
7132
0
{
7133
0
    struct virtio_net_hdr v;
7134
0
    struct virtio_net_hdr *vnet = &v;
7135
7136
0
    if (dp_packet_get_tso_segsz(b)) {
7137
0
        uint16_t tso_segsz = dp_packet_get_tso_segsz(b);
7138
0
        const struct tcp_header *tcp;
7139
0
        const struct ip_header *ip;
7140
0
        if (dp_packet_inner_l4(b)) {
7141
0
            tcp = dp_packet_inner_l4(b);
7142
0
            ip = dp_packet_inner_l3(b);
7143
0
        } else {
7144
0
            tcp = dp_packet_l4(b);
7145
0
            ip = dp_packet_l3(b);
7146
0
        }
7147
0
        int tcp_hdr_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
7148
0
        int hdr_len = ((char *) tcp - (char *) dp_packet_eth(b))
7149
0
                      + tcp_hdr_len;
7150
0
        int max_packet_len = mtu + ETH_HEADER_LEN + VLAN_HEADER_LEN;
7151
7152
0
        if (OVS_UNLIKELY((hdr_len + tso_segsz) > max_packet_len)) {
7153
0
            VLOG_WARN_RL(&rl, "Oversized TSO packet. hdr_len: %"PRIu32", "
7154
0
                         "gso: %"PRIu16", max length: %"PRIu32".", hdr_len,
7155
0
                         tso_segsz, max_packet_len);
7156
0
            return EMSGSIZE;
7157
0
        }
7158
7159
0
        vnet->hdr_len = (OVS_FORCE __virtio16)hdr_len;
7160
0
        vnet->gso_size = (OVS_FORCE __virtio16)(tso_segsz);
7161
0
        if (IP_VER(ip->ip_ihl_ver) == 4) {
7162
0
            vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
7163
0
        } else if (IP_VER(ip->ip_ihl_ver) == 6) {
7164
0
            vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
7165
0
        } else {
7166
0
            VLOG_ERR_RL(&rl, "Unknown gso_type for TSO packet. "
7167
0
                        "Offloads: %"PRIu32, b->offloads);
7168
0
            return EINVAL;
7169
0
        }
7170
0
    } else {
7171
0
        vnet->hdr_len = 0;
7172
0
        vnet->gso_size = 0;
7173
0
        vnet->gso_type = VIRTIO_NET_HDR_GSO_NONE;
7174
0
    }
7175
7176
0
    if (dp_packet_l4_checksum_good(b)
7177
0
        && (!dp_packet_tunnel(b)
7178
0
            || dp_packet_inner_l4_checksum_good(b))) {
7179
        /* The packet has good L4 checksum. No need to validate again. */
7180
0
        vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0;
7181
0
        vnet->flags = VIRTIO_NET_HDR_F_DATA_VALID;
7182
0
    } else if (dp_packet_l4_checksum_partial(b)
7183
0
               || dp_packet_inner_l4_checksum_partial(b)) {
7184
0
        const struct ip_header *ip_hdr;
7185
0
        void *l3_off;
7186
0
        void *l4_off;
7187
0
        bool is_sctp;
7188
0
        bool is_tcp;
7189
0
        bool is_udp;
7190
7191
0
        if (dp_packet_inner_l4_checksum_partial(b)) {
7192
0
            l3_off = dp_packet_inner_l3(b);
7193
0
            l4_off = dp_packet_inner_l4(b);
7194
0
            is_tcp = dp_packet_inner_l4_proto_tcp(b);
7195
0
            is_udp = dp_packet_inner_l4_proto_udp(b);
7196
0
            is_sctp = dp_packet_inner_l4_proto_sctp(b);
7197
0
        } else {
7198
0
            l3_off = dp_packet_l3(b);
7199
0
            l4_off = dp_packet_l4(b);
7200
0
            is_tcp = dp_packet_l4_proto_tcp(b);
7201
0
            is_udp = dp_packet_l4_proto_udp(b);
7202
0
            is_sctp = dp_packet_l4_proto_sctp(b);
7203
0
        }
7204
0
        ip_hdr = l3_off;
7205
7206
        /* The csum calculation is offloaded. */
7207
0
        if (is_tcp) {
7208
            /* Virtual I/O Device (VIRTIO) Version 1.1
7209
             * 5.1.6.2 Packet Transmission
7210
             * If the driver negotiated VIRTIO_NET_F_CSUM, it can skip
7211
             * checksumming the packet:
7212
             *  - flags has the VIRTIO_NET_HDR_F_NEEDS_CSUM set,
7213
             *  - csum_start is set to the offset within the packet
7214
             *    to begin checksumming, and
7215
             *  - csum_offset indicates how many bytes after the
7216
             *    csum_start the new (16 bit ones complement) checksum
7217
             *    is placed by the device.
7218
             * The TCP checksum field in the packet is set to the sum of
7219
             * the TCP pseudo header, so that replacing it by the ones
7220
             * complement checksum of the TCP header and body will give
7221
             * the correct result. */
7222
0
            struct tcp_header *tcp_hdr = l4_off;
7223
0
            ovs_be16 csum = 0;
7224
7225
0
            if (IP_VER(ip_hdr->ip_ihl_ver) == 4) {
7226
0
                csum = ~csum_finish(packet_csum_pseudoheader(ip_hdr));
7227
0
            } else if (IP_VER(ip_hdr->ip_ihl_ver) == 6) {
7228
0
                const struct ovs_16aligned_ip6_hdr *ip6_hdr = l3_off;
7229
0
                csum = ~csum_finish(packet_csum_pseudoheader6(ip6_hdr));
7230
0
            }
7231
7232
0
            tcp_hdr->tcp_csum = csum;
7233
0
            vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
7234
0
            vnet->csum_start = (OVS_FORCE __virtio16) ((char *) l4_off -
7235
0
                                    (char *) dp_packet_data(b));
7236
0
            vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
7237
0
                                    struct tcp_header, tcp_csum);
7238
0
        } else if (is_udp) {
7239
0
            struct udp_header *udp_hdr = l4_off;
7240
0
            ovs_be16 csum = 0;
7241
7242
0
            if (IP_VER(ip_hdr->ip_ihl_ver) == 4) {
7243
0
                csum = ~csum_finish(packet_csum_pseudoheader(ip_hdr));
7244
0
            } else if (IP_VER(ip_hdr->ip_ihl_ver) == 6) {
7245
0
                const struct ovs_16aligned_ip6_hdr *ip6_hdr = l3_off;
7246
0
                csum = ~csum_finish(packet_csum_pseudoheader6(ip6_hdr));
7247
0
            }
7248
7249
0
            udp_hdr->udp_csum = csum;
7250
0
            vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
7251
0
            vnet->csum_start = (OVS_FORCE __virtio16) ((char *) udp_hdr -
7252
0
                                    (char *) dp_packet_data(b));;
7253
0
            vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
7254
0
                                    struct udp_header, udp_csum);
7255
0
        } else if (is_sctp) {
7256
            /* The Linux kernel networking stack only supports csum_start
7257
             * and csum_offset when SCTP GSO is enabled.  See kernel's
7258
             * skb_csum_hwoffload_help(). Currently there is no SCTP
7259
             * segmentation offload support in OVS. */
7260
0
            vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0;
7261
0
            vnet->flags = 0;
7262
0
        } else {
7263
            /* This should only happen when a new L4 proto
7264
             * is not covered in above checks. */
7265
0
            VLOG_WARN_RL(&rl, "Unsupported L4 checksum offload. "
7266
0
                         "Offloads: %"PRIu32, b->offloads);
7267
0
            vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0;
7268
0
            vnet->flags = 0;
7269
0
        }
7270
0
    } else {
7271
        /* Packet L4 csum is unknown. */
7272
0
        vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0;
7273
0
        vnet->flags = 0;
7274
0
    }
7275
7276
0
    dp_packet_push(b, vnet, sizeof *vnet);
7277
0
    return 0;
7278
0
}