Coverage Report

Created: 2025-08-29 06:59

/src/openvswitch/lib/conntrack.c
Line | Count | Source
1
/*
2
 * Copyright (c) 2015-2019 Nicira, Inc.
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at:
7
 *
8
 *     http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16
17
#include <config.h>
18
#include <ctype.h>
19
#include <errno.h>
20
#include <sys/types.h>
21
#include <netinet/in.h>
22
#include <netinet/icmp6.h>
23
#include <string.h>
24
25
#include "conntrack.h"
26
#include "conntrack-private.h"
27
#include "conntrack-tp.h"
28
#include "coverage.h"
29
#include "crc32c.h"
30
#include "csum.h"
31
#include "ct-dpif.h"
32
#include "dp-packet.h"
33
#include "flow.h"
34
#include "netdev.h"
35
#include "odp-netlink.h"
36
#include "odp-util.h"
37
#include "openvswitch/hmap.h"
38
#include "openvswitch/types.h"
39
#include "openvswitch/vlog.h"
40
#include "ovs-rcu.h"
41
#include "ovs-thread.h"
42
#include "openvswitch/poll-loop.h"
43
#include "random.h"
44
#include "rculist.h"
45
#include "timeval.h"
46
#include "unaligned.h"
47
48
VLOG_DEFINE_THIS_MODULE(conntrack);
49
50
COVERAGE_DEFINE(conntrack_full);
51
COVERAGE_DEFINE(conntrack_l3csum_checked);
52
COVERAGE_DEFINE(conntrack_l3csum_err);
53
COVERAGE_DEFINE(conntrack_l4csum_checked);
54
COVERAGE_DEFINE(conntrack_l4csum_err);
55
COVERAGE_DEFINE(conntrack_lookup_natted_miss);
56
COVERAGE_DEFINE(conntrack_zone_full);
57
58
struct conn_lookup_ctx {
59
    struct conn_key key;
60
    struct conn *conn;
61
    uint32_t hash;
62
    bool reply;
63
    bool icmp_related;
64
};
65
66
enum ftp_ctl_pkt {
67
    /* Control packets with address and/or port specifiers. */
68
    CT_FTP_CTL_INTEREST,
69
    /* Control packets without address and/or port specifiers. */
70
    CT_FTP_CTL_OTHER,
71
    CT_FTP_CTL_INVALID,
72
};
73
74
enum ct_alg_mode {
75
    CT_FTP_MODE_ACTIVE,
76
    CT_FTP_MODE_PASSIVE,
77
    CT_TFTP_MODE,
78
};
79
80
enum ct_alg_ctl_type {
81
    CT_ALG_CTL_NONE,
82
    CT_ALG_CTL_FTP,
83
    CT_ALG_CTL_TFTP,
84
    /* SIP is not enabled through OpenFlow and is presently only used as
85
     * an example of an alg that allows a wildcard src ip. */
86
    CT_ALG_CTL_SIP,
87
};
88
89
struct zone_limit {
90
    struct cmap_node node;
91
    struct conntrack_zone_limit czl;
92
};
93
94
static bool conn_key_extract(struct conntrack *, struct dp_packet *,
95
                             ovs_be16 dl_type, struct conn_lookup_ctx *,
96
                             uint16_t zone);
97
static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
98
static void conn_key_reverse(struct conn_key *);
99
static bool valid_new(struct dp_packet *pkt, struct conn_key *);
100
static struct conn *new_conn(struct conntrack *ct, struct dp_packet *pkt,
101
                             struct conn_key *, long long now,
102
                             uint32_t tp_id);
103
static void delete_conn__(struct conn *);
104
static void delete_conn(struct conn *);
105
static enum ct_update_res conn_update(struct conntrack *ct, struct conn *conn,
106
                                      struct dp_packet *pkt,
107
                                      struct conn_lookup_ctx *ctx,
108
                                      long long now);
109
static long long int conn_expiration(const struct conn *);
110
static bool conn_expired(const struct conn *, long long now);
111
static void conn_expire_push_front(struct conntrack *ct, struct conn *conn);
112
static void set_mark(struct dp_packet *, struct conn *,
113
                     uint32_t val, uint32_t mask);
114
static void set_label(struct dp_packet *, struct conn *,
115
                      const struct ovs_key_ct_labels *val,
116
                      const struct ovs_key_ct_labels *mask);
117
static void *clean_thread_main(void *f_);
118
119
static bool
120
nat_get_unique_tuple(struct conntrack *ct, struct conn *conn,
121
                     const struct nat_action_info_t *nat_info);
122
123
static uint8_t
124
reverse_icmp_type(uint8_t type);
125
static uint8_t
126
reverse_icmp6_type(uint8_t type);
127
static inline bool
128
extract_l3_ipv4(struct dp_packet *pkt, struct conn_key *key, const void *data,
129
                size_t size, const char **new_data);
130
static inline bool
131
extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
132
                const char **new_data);
133
static struct alg_exp_node *
134
expectation_lookup(struct hmap *alg_expectations, const struct conn_key *key,
135
                   uint32_t basis, bool src_ip_wc);
136
137
static int
138
repl_ftp_v4_addr(struct dp_packet *pkt, ovs_be32 v4_addr_rep,
139
                 char *ftp_data_v4_start,
140
                 size_t addr_offset_from_ftp_data_start, size_t addr_size);
141
142
static enum ftp_ctl_pkt
143
process_ftp_ctl_v4(struct conntrack *ct,
144
                   struct dp_packet *pkt,
145
                   const struct conn *conn_for_expectation,
146
                   ovs_be32 *v4_addr_rep,
147
                   char **ftp_data_v4_start,
148
                   size_t *addr_offset_from_ftp_data_start,
149
                   size_t *addr_size);
150
151
static enum ftp_ctl_pkt
152
detect_ftp_ctl_type(const struct conn_lookup_ctx *ctx,
153
                    struct dp_packet *pkt);
154
155
static void
156
expectation_clean(struct conntrack *ct, const struct conn_key *parent_key);
157
158
static struct ct_l4_proto *l4_protos[UINT8_MAX + 1];
159
160
static void
161
handle_ftp_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
162
               struct dp_packet *pkt, struct conn *ec, long long now,
163
               enum ftp_ctl_pkt ftp_ctl, bool nat);
164
165
static void
166
handle_tftp_ctl(struct conntrack *ct,
167
                const struct conn_lookup_ctx *ctx OVS_UNUSED,
168
                struct dp_packet *pkt, struct conn *conn_for_expectation,
169
                long long now OVS_UNUSED, enum ftp_ctl_pkt ftp_ctl OVS_UNUSED,
170
                bool nat OVS_UNUSED);
171
172
typedef void (*alg_helper)(struct conntrack *ct,
173
                           const struct conn_lookup_ctx *ctx,
174
                           struct dp_packet *pkt,
175
                           struct conn *conn_for_expectation,
176
                           long long now, enum ftp_ctl_pkt ftp_ctl,
177
                           bool nat);
178
179
static alg_helper alg_helpers[] = {
180
    [CT_ALG_CTL_NONE] = NULL,
181
    [CT_ALG_CTL_FTP] = handle_ftp_ctl,
182
    [CT_ALG_CTL_TFTP] = handle_tftp_ctl,
183
};
184
185
/* The maximum TCP or UDP port number. */
186
0
#define CT_MAX_L4_PORT 65535
187
/* String buffer used for parsing FTP string messages.
188
 * This is sized about twice what is needed to leave some
189
 * margin of error. */
190
#define LARGEST_FTP_MSG_OF_INTEREST 128
191
/* FTP port string used in active mode. */
192
0
#define FTP_PORT_CMD "PORT"
193
/* FTP pasv string used in passive mode. */
194
0
#define FTP_PASV_REPLY_CODE "227"
195
/* FTP epsv string used in passive mode. */
196
0
#define FTP_EPSV_REPLY_CODE "229"
197
/* Maximum decimal digits for port in FTP command.
198
 * The port is represented as two 3 digit numbers with the
199
 * high part a multiple of 256. */
200
0
#define MAX_FTP_PORT_DGTS 3
201
202
/* FTP extension EPRT string used for active mode. */
203
0
#define FTP_EPRT_CMD "EPRT"
204
/* FTP extension EPSV string used for passive mode. */
205
0
#define FTP_EPSV_REPLY "EXTENDED PASSIVE"
206
/* Maximum decimal digits for port in FTP extended command. */
207
0
#define MAX_EXT_FTP_PORT_DGTS 5
208
/* FTP extended command code for IPv4. */
209
0
#define FTP_AF_V4 '1'
210
/* FTP extended command code for IPv6. */
211
0
#define FTP_AF_V6 '2'
212
/* Used to indicate a wildcard L4 source port number for ALGs.
213
 * This is used for port numbers that we cannot predict in
214
 * expectations. */
215
0
#define ALG_WC_SRC_PORT 0
216
217
/* If the total number of connections goes above this value, no new connections
218
 * are accepted. */
219
#define DEFAULT_N_CONN_LIMIT 3000000
220
221
/* Does a member-by-member comparison of two conn_keys; this
222
 * function must be kept in sync with struct conn_key; returns 0
223
 * if the keys are equal or 1 if the keys are not equal. */
224
static int
225
conn_key_cmp(const struct conn_key *key1, const struct conn_key *key2)
226
0
{
227
0
    if (!memcmp(&key1->src.addr, &key2->src.addr, sizeof key1->src.addr) &&
228
0
        !memcmp(&key1->dst.addr, &key2->dst.addr, sizeof key1->dst.addr) &&
229
0
        (key1->src.icmp_id == key2->src.icmp_id) &&
230
0
        (key1->src.icmp_type == key2->src.icmp_type) &&
231
0
        (key1->src.icmp_code == key2->src.icmp_code) &&
232
0
        (key1->dst.icmp_id == key2->dst.icmp_id) &&
233
0
        (key1->dst.icmp_type == key2->dst.icmp_type) &&
234
0
        (key1->dst.icmp_code == key2->dst.icmp_code) &&
235
0
        (key1->dl_type == key2->dl_type) &&
236
0
        (key1->zone == key2->zone) &&
237
0
        (key1->nw_proto == key2->nw_proto)) {
238
239
0
        return 0;
240
0
    }
241
0
    return 1;
242
0
}
243
244
/* Initializes the connection tracker 'ct'.  The caller is responsible for
245
 * calling 'conntrack_destroy()' when the instance is not needed anymore. */
246
struct conntrack *
247
conntrack_init(void)
248
0
{
249
0
    static struct ovsthread_once setup_l4_once = OVSTHREAD_ONCE_INITIALIZER;
250
0
    struct conntrack *ct = xzalloc(sizeof *ct);
251
252
    /* This value can be used during init (e.g. timeout_policy_init()),
253
     * so set it first to ensure it is available.
254
     */
255
0
    ct->hash_basis = random_uint32();
256
257
0
    ovs_rwlock_init(&ct->resources_lock);
258
0
    ovs_rwlock_wrlock(&ct->resources_lock);
259
0
    hmap_init(&ct->alg_expectations);
260
0
    hindex_init(&ct->alg_expectation_refs);
261
0
    ovs_rwlock_unlock(&ct->resources_lock);
262
263
0
    ovs_mutex_init_adaptive(&ct->ct_lock);
264
0
    ovs_mutex_lock(&ct->ct_lock);
265
0
    for (unsigned i = 0; i < ARRAY_SIZE(ct->conns); i++) {
266
0
        cmap_init(&ct->conns[i]);
267
0
    }
268
0
    for (unsigned i = 0; i < ARRAY_SIZE(ct->exp_lists); i++) {
269
0
        rculist_init(&ct->exp_lists[i]);
270
0
    }
271
0
    cmap_init(&ct->zone_limits);
272
0
    ct->zone_limit_seq = 0;
273
0
    timeout_policy_init(ct);
274
0
    ovs_mutex_unlock(&ct->ct_lock);
275
276
0
    atomic_count_init(&ct->n_conn, 0);
277
0
    atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
278
0
    atomic_init(&ct->tcp_seq_chk, true);
279
0
    atomic_init(&ct->sweep_ms, 20000);
280
0
    atomic_init(&ct->default_zone_limit, 0);
281
0
    latch_init(&ct->clean_thread_exit);
282
0
    ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
283
0
    ct->ipf = ipf_init();
284
285
    /* Initialize the l4 protocols. */
286
0
    if (ovsthread_once_start(&setup_l4_once)) {
287
0
        for (int i = 0; i < ARRAY_SIZE(l4_protos); i++) {
288
0
            l4_protos[i] = &ct_proto_other;
289
0
        }
290
        /* IPPROTO_UDP uses ct_proto_other, so no need to initialize it. */
291
0
        l4_protos[IPPROTO_TCP] = &ct_proto_tcp;
292
0
        l4_protos[IPPROTO_ICMP] = &ct_proto_icmp4;
293
0
        l4_protos[IPPROTO_ICMPV6] = &ct_proto_icmp6;
294
295
0
        ovsthread_once_done(&setup_l4_once);
296
0
    }
297
0
    return ct;
298
0
}
299
300
static uint32_t
301
zone_key_hash(int32_t zone, uint32_t basis)
302
0
{
303
0
    size_t hash = hash_int((OVS_FORCE uint32_t) zone, basis);
304
0
    return hash;
305
0
}
306
307
static int64_t
308
zone_limit_get_limit__(struct conntrack_zone_limit *czl)
309
0
{
310
0
    int64_t limit;
311
0
    atomic_read_relaxed(&czl->limit, &limit);
312
313
0
    return limit;
314
0
}
315
316
static int64_t
317
zone_limit_get_limit(struct conntrack *ct, struct conntrack_zone_limit *czl)
318
0
{
319
0
    int64_t limit = zone_limit_get_limit__(czl);
320
321
0
    if (limit == ZONE_LIMIT_CONN_DEFAULT) {
322
0
        atomic_read_relaxed(&ct->default_zone_limit, &limit);
323
0
        limit = limit ? limit : -1;
324
0
    }
325
326
0
    return limit;
327
0
}
328
329
static struct zone_limit *
330
zone_limit_lookup_protected(struct conntrack *ct, int32_t zone)
331
    OVS_REQUIRES(ct->ct_lock)
332
0
{
333
0
    uint32_t hash = zone_key_hash(zone, ct->hash_basis);
334
0
    struct zone_limit *zl;
335
0
    CMAP_FOR_EACH_WITH_HASH_PROTECTED (zl, node, hash, &ct->zone_limits) {
336
0
        if (zl->czl.zone == zone) {
337
0
            return zl;
338
0
        }
339
0
    }
340
0
    return NULL;
341
0
}
342
343
static struct zone_limit *
344
zone_limit_lookup(struct conntrack *ct, int32_t zone)
345
0
{
346
0
    uint32_t hash = zone_key_hash(zone, ct->hash_basis);
347
0
    struct zone_limit *zl;
348
0
    CMAP_FOR_EACH_WITH_HASH (zl, node, hash, &ct->zone_limits) {
349
0
        if (zl->czl.zone == zone) {
350
0
            return zl;
351
0
        }
352
0
    }
353
0
    return NULL;
354
0
}
355
356
static struct zone_limit *
357
zone_limit_create__(struct conntrack *ct, int32_t zone, int64_t limit)
358
    OVS_REQUIRES(ct->ct_lock)
359
0
{
360
0
    struct zone_limit *zl = NULL;
361
362
0
    if (zone > DEFAULT_ZONE && zone <= MAX_ZONE) {
363
0
        zl = xmalloc(sizeof *zl);
364
0
        atomic_init(&zl->czl.limit, limit);
365
0
        atomic_count_init(&zl->czl.count, 0);
366
0
        zl->czl.zone = zone;
367
0
        zl->czl.zone_limit_seq = ct->zone_limit_seq++;
368
0
        uint32_t hash = zone_key_hash(zone, ct->hash_basis);
369
0
        cmap_insert(&ct->zone_limits, &zl->node, hash);
370
0
    }
371
372
0
    return zl;
373
0
}
374
375
static struct zone_limit *
376
zone_limit_create(struct conntrack *ct, int32_t zone, int64_t limit)
377
    OVS_REQUIRES(ct->ct_lock)
378
0
{
379
0
    struct zone_limit *zl = zone_limit_lookup_protected(ct, zone);
380
381
0
    if (zl) {
382
0
        return zl;
383
0
    }
384
385
0
    return zone_limit_create__(ct, zone, limit);
386
0
}
387
388
/* Lazily creates a new entry in the zone_limits cmap if default limit
389
 * is set and there's no entry for the zone. */
390
static struct zone_limit *
391
zone_limit_lookup_or_default(struct conntrack *ct, int32_t zone)
392
    OVS_REQUIRES(ct->ct_lock)
393
0
{
394
0
    struct zone_limit *zl = zone_limit_lookup_protected(ct, zone);
395
396
0
    if (!zl) {
397
0
        uint32_t limit;
398
0
        atomic_read_relaxed(&ct->default_zone_limit, &limit);
399
400
0
        if (limit) {
401
0
            zl = zone_limit_create__(ct, zone, ZONE_LIMIT_CONN_DEFAULT);
402
0
        }
403
0
    }
404
405
0
    return zl;
406
0
}
407
408
struct conntrack_zone_info
409
zone_limit_get(struct conntrack *ct, int32_t zone)
410
0
{
411
0
    struct conntrack_zone_info czl = {
412
0
        .zone = DEFAULT_ZONE,
413
0
        .limit = 0,
414
0
        .count = 0,
415
0
    };
416
0
    struct zone_limit *zl = zone_limit_lookup(ct, zone);
417
0
    if (zl) {
418
0
        int64_t czl_limit = zone_limit_get_limit__(&zl->czl);
419
0
        if (czl_limit > ZONE_LIMIT_CONN_DEFAULT) {
420
0
            czl.zone = zl->czl.zone;
421
0
            czl.limit = czl_limit;
422
0
        } else {
423
0
            atomic_read_relaxed(&ct->default_zone_limit, &czl.limit);
424
0
        }
425
426
0
        czl.count = atomic_count_get(&zl->czl.count);
427
0
    } else {
428
0
        atomic_read_relaxed(&ct->default_zone_limit, &czl.limit);
429
0
    }
430
431
0
    return czl;
432
0
}
433
434
static void
435
zone_limit_clean__(struct conntrack *ct, struct zone_limit *zl)
436
    OVS_REQUIRES(ct->ct_lock)
437
0
{
438
0
    uint32_t hash = zone_key_hash(zl->czl.zone, ct->hash_basis);
439
0
    cmap_remove(&ct->zone_limits, &zl->node, hash);
440
0
    ovsrcu_postpone(free, zl);
441
0
}
442
443
static void
444
zone_limit_clean(struct conntrack *ct, struct zone_limit *zl)
445
    OVS_REQUIRES(ct->ct_lock)
446
0
{
447
0
    uint32_t limit;
448
449
0
    atomic_read_relaxed(&ct->default_zone_limit, &limit);
450
    /* Do not remove the entry if the default limit is enabled, but
451
     * simply move the limit to default. */
452
0
    if (limit) {
453
0
        atomic_store_relaxed(&zl->czl.limit, ZONE_LIMIT_CONN_DEFAULT);
454
0
    } else {
455
0
        zone_limit_clean__(ct, zl);
456
0
    }
457
0
}
458
459
static void
460
zone_limit_clean_default(struct conntrack *ct)
461
    OVS_REQUIRES(ct->ct_lock)
462
0
{
463
0
    struct zone_limit *zl;
464
0
    int64_t czl_limit;
465
466
0
    atomic_store_relaxed(&ct->default_zone_limit, 0);
467
468
0
    CMAP_FOR_EACH (zl, node, &ct->zone_limits) {
469
0
        atomic_read_relaxed(&zl->czl.limit, &czl_limit);
470
0
        if (zone_limit_get_limit__(&zl->czl) == ZONE_LIMIT_CONN_DEFAULT) {
471
0
            zone_limit_clean__(ct, zl);
472
0
        }
473
0
    }
474
0
}
475
476
static bool
477
zone_limit_delete__(struct conntrack *ct, int32_t zone)
478
    OVS_REQUIRES(ct->ct_lock)
479
0
{
480
0
    struct zone_limit *zl = NULL;
481
482
0
    if (zone == DEFAULT_ZONE) {
483
0
        zone_limit_clean_default(ct);
484
0
    } else {
485
0
        zl = zone_limit_lookup_protected(ct, zone);
486
0
        if (zl) {
487
0
            zone_limit_clean(ct, zl);
488
0
        }
489
0
    }
490
491
0
    return zl != NULL;
492
0
}
493
494
int
495
zone_limit_delete(struct conntrack *ct, int32_t zone)
496
0
{
497
0
    bool deleted;
498
499
0
    ovs_mutex_lock(&ct->ct_lock);
500
0
    deleted = zone_limit_delete__(ct, zone);
501
0
    ovs_mutex_unlock(&ct->ct_lock);
502
503
0
    if (zone != DEFAULT_ZONE) {
504
0
        VLOG_INFO(deleted
505
0
                  ? "Deleted zone limit for zone %d"
506
0
                  : "Attempted delete of non-existent zone limit: zone %d",
507
0
                  zone);
508
0
    }
509
510
0
    return 0;
511
0
}
512
513
static void
514
zone_limit_update_default(struct conntrack *ct, int32_t zone, uint32_t limit)
515
0
{
516
    /* limit zero means delete default. */
517
0
    if (limit == 0) {
518
0
        ovs_mutex_lock(&ct->ct_lock);
519
0
        zone_limit_delete__(ct, zone);
520
0
        ovs_mutex_unlock(&ct->ct_lock);
521
0
    } else {
522
0
        atomic_store_relaxed(&ct->default_zone_limit, limit);
523
0
    }
524
0
}
525
526
int
527
zone_limit_update(struct conntrack *ct, int32_t zone, uint32_t limit)
528
0
{
529
0
    struct zone_limit *zl;
530
0
    int err = 0;
531
532
0
    if (zone == DEFAULT_ZONE) {
533
0
        zone_limit_update_default(ct, zone, limit);
534
0
        VLOG_INFO("Set default zone limit to %u", limit);
535
0
        return err;
536
0
    }
537
538
0
    zl = zone_limit_lookup(ct, zone);
539
0
    if (zl) {
540
0
        atomic_store_relaxed(&zl->czl.limit, limit);
541
0
        VLOG_INFO("Changed zone limit of %u for zone %d", limit, zone);
542
0
    } else {
543
0
        ovs_mutex_lock(&ct->ct_lock);
544
0
        err = zone_limit_create(ct, zone, limit) == NULL;
545
0
        ovs_mutex_unlock(&ct->ct_lock);
546
0
        if (!err) {
547
0
            VLOG_INFO("Created zone limit of %u for zone %d", limit, zone);
548
0
        } else {
549
0
            VLOG_WARN("Request to create zone limit for invalid zone %d",
550
0
                      zone);
551
0
        }
552
0
    }
553
554
0
    return err;
555
0
}
556
557
static void
558
conn_clean__(struct conntrack *ct, struct conn *conn)
559
    OVS_REQUIRES(ct->ct_lock)
560
0
{
561
0
    uint32_t hash;
562
563
0
    if (conn->alg) {
564
0
        expectation_clean(ct, &conn->key_node[CT_DIR_FWD].key);
565
0
    }
566
567
0
    hash = conn_key_hash(&conn->key_node[CT_DIR_FWD].key, ct->hash_basis);
568
0
    cmap_remove(&ct->conns[conn->key_node[CT_DIR_FWD].key.zone],
569
0
                &conn->key_node[CT_DIR_FWD].cm_node, hash);
570
571
0
    if (conn->nat_action) {
572
0
        hash = conn_key_hash(&conn->key_node[CT_DIR_REV].key,
573
0
                             ct->hash_basis);
574
0
        cmap_remove(&ct->conns[conn->key_node[CT_DIR_REV].key.zone],
575
0
                    &conn->key_node[CT_DIR_REV].cm_node, hash);
576
0
    }
577
578
0
    rculist_remove(&conn->node);
579
0
}
580
581
/* Also removes the associated nat 'conn' from the lookup
582
   datastructures. */
583
static void
584
conn_clean(struct conntrack *ct, struct conn *conn)
585
    OVS_EXCLUDED(conn->lock, ct->ct_lock)
586
0
{
587
0
    if (atomic_flag_test_and_set(&conn->reclaimed)) {
588
0
        return;
589
0
    }
590
591
0
    ovs_mutex_lock(&ct->ct_lock);
592
0
    conn_clean__(ct, conn);
593
0
    ovs_mutex_unlock(&ct->ct_lock);
594
595
0
    struct zone_limit *zl = zone_limit_lookup(ct, conn->admit_zone);
596
0
    if (zl && zl->czl.zone_limit_seq == conn->zone_limit_seq) {
597
0
        atomic_count_dec(&zl->czl.count);
598
0
    }
599
600
0
    ovsrcu_postpone(delete_conn, conn);
601
0
    atomic_count_dec(&ct->n_conn);
602
0
}
603
604
static void
605
conn_force_expire(struct conn *conn)
606
0
{
607
0
    atomic_store_relaxed(&conn->expiration, 0);
608
0
}
609
610
/* Destroys the connection tracker 'ct' and frees all the allocated memory.
611
 * The caller of this function must already have shut down packet input
612
 * and PMD threads (which would have been quiesced).  */
613
void
614
conntrack_destroy(struct conntrack *ct)
615
0
{
616
0
    struct conn *conn;
617
618
0
    latch_set(&ct->clean_thread_exit);
619
0
    pthread_join(ct->clean_thread, NULL);
620
0
    latch_destroy(&ct->clean_thread_exit);
621
622
0
    for (unsigned i = 0; i < N_EXP_LISTS; i++) {
623
0
        RCULIST_FOR_EACH (conn, node, &ct->exp_lists[i]) {
624
0
            conn_clean(ct, conn);
625
0
        }
626
0
    }
627
628
0
    struct zone_limit *zl;
629
0
    CMAP_FOR_EACH (zl, node, &ct->zone_limits) {
630
0
        uint32_t hash = zone_key_hash(zl->czl.zone, ct->hash_basis);
631
632
0
        cmap_remove(&ct->zone_limits, &zl->node, hash);
633
0
        ovsrcu_postpone(free, zl);
634
0
    }
635
636
0
    struct timeout_policy *tp;
637
0
    CMAP_FOR_EACH (tp, node, &ct->timeout_policies) {
638
0
        uint32_t hash = hash_int(tp->policy.id, ct->hash_basis);
639
640
0
        cmap_remove(&ct->timeout_policies, &tp->node, hash);
641
0
        ovsrcu_postpone(free, tp);
642
0
    }
643
644
0
    ovs_mutex_lock(&ct->ct_lock);
645
646
0
    for (unsigned i = 0; i < ARRAY_SIZE(ct->conns); i++) {
647
0
        cmap_destroy(&ct->conns[i]);
648
0
    }
649
0
    cmap_destroy(&ct->zone_limits);
650
0
    cmap_destroy(&ct->timeout_policies);
651
652
0
    ovs_mutex_unlock(&ct->ct_lock);
653
0
    ovs_mutex_destroy(&ct->ct_lock);
654
655
0
    ovs_rwlock_wrlock(&ct->resources_lock);
656
0
    struct alg_exp_node *alg_exp_node;
657
0
    HMAP_FOR_EACH_POP (alg_exp_node, node, &ct->alg_expectations) {
658
0
        free(alg_exp_node);
659
0
    }
660
0
    hmap_destroy(&ct->alg_expectations);
661
0
    hindex_destroy(&ct->alg_expectation_refs);
662
0
    ovs_rwlock_unlock(&ct->resources_lock);
663
0
    ovs_rwlock_destroy(&ct->resources_lock);
664
665
0
    ipf_destroy(ct->ipf);
666
0
    free(ct);
667
0
}
668

669
670
static bool
671
conn_key_lookup(struct conntrack *ct, const struct conn_key *key,
672
                uint32_t hash, long long now, struct conn **conn_out,
673
                bool *reply)
674
0
{
675
0
    struct conn_key_node *keyn;
676
0
    struct conn *conn = NULL;
677
0
    bool found = false;
678
679
0
    CMAP_FOR_EACH_WITH_HASH (keyn, cm_node, hash, &ct->conns[key->zone]) {
680
0
        if (keyn->dir == CT_DIR_FWD) {
681
0
            conn = CONTAINER_OF(keyn, struct conn, key_node[CT_DIR_FWD]);
682
0
        } else {
683
0
            conn = CONTAINER_OF(keyn, struct conn, key_node[CT_DIR_REV]);
684
0
        }
685
686
0
        if (conn_expired(conn, now)) {
687
0
            continue;
688
0
        }
689
690
0
        for (int i = CT_DIR_FWD; i < CT_DIRS; i++) {
691
0
            if (!conn_key_cmp(&conn->key_node[i].key, key)) {
692
0
                found = true;
693
0
                if (reply) {
694
0
                    *reply = (i == CT_DIR_REV);
695
0
                }
696
0
                goto out_found;
697
0
            }
698
0
        }
699
0
    }
700
701
0
out_found:
702
0
    if (found && conn_out) {
703
0
        *conn_out = conn;
704
0
    } else if (conn_out) {
705
0
        *conn_out = NULL;
706
0
    }
707
708
0
    return found;
709
0
}
710
711
static bool
712
conn_lookup(struct conntrack *ct, const struct conn_key *key,
713
            long long now, struct conn **conn_out, bool *reply)
714
0
{
715
0
    uint32_t hash = conn_key_hash(key, ct->hash_basis);
716
0
    return conn_key_lookup(ct, key, hash, now, conn_out, reply);
717
0
}
718
719
static void
720
write_ct_md(struct dp_packet *pkt, uint16_t zone, const struct conn *conn,
721
            const struct conn_key *key, const struct alg_exp_node *alg_exp)
722
0
{
723
0
    pkt->md.ct_state |= CS_TRACKED;
724
0
    pkt->md.ct_zone = zone;
725
726
0
    if (conn) {
727
0
        ovs_mutex_lock(&conn->lock);
728
0
        pkt->md.ct_mark = conn->mark;
729
0
        pkt->md.ct_label = conn->label;
730
0
        ovs_mutex_unlock(&conn->lock);
731
0
    } else {
732
0
        pkt->md.ct_mark = 0;
733
0
        pkt->md.ct_label = OVS_U128_ZERO;
734
0
    }
735
736
    /* Use the original direction tuple if we have it. */
737
0
    if (conn) {
738
0
        if (conn->alg_related) {
739
0
            key = &conn->parent_key;
740
0
        } else {
741
0
            key = &conn->key_node[CT_DIR_FWD].key;
742
0
        }
743
0
    } else if (alg_exp) {
744
0
        pkt->md.ct_mark = alg_exp->parent_mark;
745
0
        pkt->md.ct_label = alg_exp->parent_label;
746
0
        key = &alg_exp->parent_key;
747
0
    }
748
749
0
    pkt->md.ct_orig_tuple_ipv6 = false;
750
751
0
    if (key) {
752
0
        if (key->dl_type == htons(ETH_TYPE_IP)) {
753
0
            pkt->md.ct_orig_tuple.ipv4 = (struct ovs_key_ct_tuple_ipv4) {
754
0
                key->src.addr.ipv4,
755
0
                key->dst.addr.ipv4,
756
0
                key->nw_proto != IPPROTO_ICMP
757
0
                ? key->src.port : htons(key->src.icmp_type),
758
0
                key->nw_proto != IPPROTO_ICMP
759
0
                ? key->dst.port : htons(key->src.icmp_code),
760
0
                key->nw_proto,
761
0
            };
762
0
        } else {
763
0
            pkt->md.ct_orig_tuple_ipv6 = true;
764
0
            pkt->md.ct_orig_tuple.ipv6 = (struct ovs_key_ct_tuple_ipv6) {
765
0
                key->src.addr.ipv6,
766
0
                key->dst.addr.ipv6,
767
0
                key->nw_proto != IPPROTO_ICMPV6
768
0
                ? key->src.port : htons(key->src.icmp_type),
769
0
                key->nw_proto != IPPROTO_ICMPV6
770
0
                ? key->dst.port : htons(key->src.icmp_code),
771
0
                key->nw_proto,
772
0
            };
773
0
        }
774
0
    } else {
775
0
        memset(&pkt->md.ct_orig_tuple, 0, sizeof pkt->md.ct_orig_tuple);
776
0
    }
777
0
}
778
779
static uint8_t
780
get_ip_proto(const struct dp_packet *pkt)
781
0
{
782
0
    uint8_t ip_proto;
783
0
    struct eth_header *l2 = dp_packet_eth(pkt);
784
0
    if (l2->eth_type == htons(ETH_TYPE_IPV6)) {
785
0
        struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
786
0
        ip_proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
787
0
    } else {
788
0
        struct ip_header *l3_hdr = dp_packet_l3(pkt);
789
0
        ip_proto = l3_hdr->ip_proto;
790
0
    }
791
792
0
    return ip_proto;
793
0
}
794
795
static bool
796
is_ftp_ctl(const enum ct_alg_ctl_type ct_alg_ctl)
797
0
{
798
0
    return ct_alg_ctl == CT_ALG_CTL_FTP;
799
0
}
800
801
static enum ct_alg_ctl_type
802
get_alg_ctl_type(const struct dp_packet *pkt, const char *helper)
803
0
{
804
    /* CT_IPPORT_FTP/TFTP is used because IPPORT_FTP/TFTP is not defined
805
     * in OSX, at least in in.h. Since these values will never change, remove
806
     * the external dependency. */
807
0
    enum { CT_IPPORT_FTP = 21 };
808
0
    enum { CT_IPPORT_TFTP = 69 };
809
0
    uint8_t ip_proto = get_ip_proto(pkt);
810
0
    struct udp_header *uh = dp_packet_l4(pkt);
811
0
    struct tcp_header *th = dp_packet_l4(pkt);
812
0
    ovs_be16 ftp_port = htons(CT_IPPORT_FTP);
813
0
    ovs_be16 tftp_port = htons(CT_IPPORT_TFTP);
814
815
0
    if (helper) {
816
0
        if ((ip_proto == IPPROTO_TCP) &&
817
0
             !strncmp(helper, "ftp", strlen("ftp"))) {
818
0
            return CT_ALG_CTL_FTP;
819
0
        }
820
0
        if ((ip_proto == IPPROTO_UDP) &&
821
0
             !strncmp(helper, "tftp", strlen("tftp"))) {
822
0
            return CT_ALG_CTL_TFTP;
823
0
        }
824
0
    }
825
826
0
    if (ip_proto == IPPROTO_UDP && uh->udp_dst == tftp_port) {
827
0
        return CT_ALG_CTL_TFTP;
828
0
    } else if (ip_proto == IPPROTO_TCP &&
829
0
               (th->tcp_src == ftp_port || th->tcp_dst == ftp_port)) {
830
0
        return CT_ALG_CTL_FTP;
831
0
    }
832
0
    return CT_ALG_CTL_NONE;
833
0
}
834
835
static bool
836
alg_src_ip_wc(enum ct_alg_ctl_type alg_ctl_type)
837
0
{
838
0
    if (alg_ctl_type == CT_ALG_CTL_SIP) {
839
0
        return true;
840
0
    }
841
0
    return false;
842
0
}
843
844
static void
845
handle_alg_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
846
               struct dp_packet *pkt, enum ct_alg_ctl_type ct_alg_ctl,
847
               struct conn *conn, long long now, bool nat)
848
0
{
849
    /* ALG control packet handling with expectation creation. */
850
0
    if (OVS_UNLIKELY(alg_helpers[ct_alg_ctl] && conn && conn->alg)) {
851
0
        ovs_mutex_lock(&conn->lock);
852
0
        alg_helpers[ct_alg_ctl](ct, ctx, pkt, conn, now, CT_FTP_CTL_INTEREST,
853
0
                                nat);
854
0
        ovs_mutex_unlock(&conn->lock);
855
0
    }
856
0
}
857
858
static void
859
pat_packet(struct dp_packet *pkt, const struct conn_key *key)
860
0
{
861
0
    if (key->nw_proto == IPPROTO_TCP) {
862
0
        packet_set_tcp_port(pkt, key->dst.port, key->src.port);
863
0
    } else if (key->nw_proto == IPPROTO_UDP) {
864
0
        packet_set_udp_port(pkt, key->dst.port, key->src.port);
865
0
    } else if (key->nw_proto == IPPROTO_SCTP) {
866
0
        packet_set_sctp_port(pkt, key->dst.port, key->src.port);
867
0
    }
868
0
}
869
870
static uint16_t
871
nat_action_reverse(uint16_t nat_action)
872
0
{
873
0
    if (nat_action & NAT_ACTION_SRC) {
874
0
        nat_action ^= NAT_ACTION_SRC;
875
0
        nat_action |= NAT_ACTION_DST;
876
0
    } else if (nat_action & NAT_ACTION_DST) {
877
0
        nat_action ^= NAT_ACTION_DST;
878
0
        nat_action |= NAT_ACTION_SRC;
879
0
    }
880
0
    return nat_action;
881
0
}
882
883
static void
884
nat_packet_ipv4(struct dp_packet *pkt, const struct conn_key *key,
885
                uint16_t nat_action)
886
0
{
887
0
    struct ip_header *nh = dp_packet_l3(pkt);
888
889
0
    if (nat_action & NAT_ACTION_SRC) {
890
0
        packet_set_ipv4_addr(pkt, &nh->ip_src, key->dst.addr.ipv4);
891
0
    } else if (nat_action & NAT_ACTION_DST) {
892
0
        packet_set_ipv4_addr(pkt, &nh->ip_dst, key->src.addr.ipv4);
893
0
    }
894
0
}
895
896
static void
897
nat_packet_ipv6(struct dp_packet *pkt, const struct conn_key *key,
898
                uint16_t nat_action)
899
0
{
900
0
    struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
901
902
0
    if (nat_action & NAT_ACTION_SRC) {
903
0
        packet_set_ipv6_addr(pkt, key->nw_proto, nh6->ip6_src.be32,
904
0
                             &key->dst.addr.ipv6, true);
905
0
    } else if (nat_action & NAT_ACTION_DST) {
906
0
        packet_set_ipv6_addr(pkt, key->nw_proto, nh6->ip6_dst.be32,
907
0
                             &key->src.addr.ipv6, true);
908
0
    }
909
0
}
910
911
static void
912
nat_inner_packet(struct dp_packet *pkt, struct conn_key *key,
913
                 uint16_t nat_action)
914
0
{
915
0
    char *tail = dp_packet_tail(pkt);
916
0
    uint16_t pad = dp_packet_l2_pad_size(pkt);
917
0
    struct conn_key inner_key;
918
0
    const char *inner_l4 = NULL;
919
0
    uint16_t orig_l3_ofs = pkt->l3_ofs;
920
0
    uint16_t orig_l4_ofs = pkt->l4_ofs;
921
0
    uint32_t orig_offloads = pkt->offloads;
922
923
0
    void *l3 = dp_packet_l3(pkt);
924
0
    void *l4 = dp_packet_l4(pkt);
925
0
    void *inner_l3;
926
    /* These calls are already verified to succeed during the code path from
927
     * 'conn_key_extract()' which calls
928
     * 'extract_l4_icmp()'/'extract_l4_icmp6()'. */
929
0
    if (key->dl_type == htons(ETH_TYPE_IP)) {
930
0
        inner_l3 = (char *) l4 + sizeof(struct icmp_header);
931
0
        extract_l3_ipv4(NULL, &inner_key, inner_l3,
932
0
                        tail - ((char *) inner_l3) - pad, &inner_l4);
933
0
    } else {
934
0
        inner_l3 = (char *) l4 + sizeof(struct icmp6_data_header);
935
0
        extract_l3_ipv6(&inner_key, inner_l3, tail - ((char *) inner_l3) - pad,
936
0
                        &inner_l4);
937
0
    }
938
0
    pkt->l3_ofs += (char *) inner_l3 - (char *) l3;
939
0
    pkt->l4_ofs += inner_l4 - (char *) l4;
940
    /* Drop any offloads to force the helpers below to calculate checksums
941
     * if needed. */
942
0
    dp_packet_ip_checksum_set_unknown(pkt);
943
0
    dp_packet_l4_checksum_set_unknown(pkt);
944
945
    /* Reverse the key for inner packet. */
946
0
    struct conn_key rev_key = *key;
947
0
    conn_key_reverse(&rev_key);
948
949
0
    pat_packet(pkt, &rev_key);
950
951
0
    if (key->dl_type == htons(ETH_TYPE_IP)) {
952
0
        nat_packet_ipv4(pkt, &rev_key, nat_action);
953
954
0
        struct icmp_header *icmp = (struct icmp_header *) l4;
955
0
        icmp->icmp_csum = 0;
956
0
        icmp->icmp_csum = csum(icmp, tail - (char *) icmp - pad);
957
0
    } else {
958
0
        nat_packet_ipv6(pkt, &rev_key, nat_action);
959
960
0
        struct icmp6_data_header *icmp6 = (struct icmp6_data_header *) l4;
961
0
        icmp6->icmp6_base.icmp6_cksum = 0;
962
0
        icmp6->icmp6_base.icmp6_cksum =
963
0
            packet_csum_upperlayer6(l3, icmp6, IPPROTO_ICMPV6,
964
0
                                    tail - (char *) icmp6 - pad);
965
0
    }
966
967
0
    pkt->l3_ofs = orig_l3_ofs;
968
0
    pkt->l4_ofs = orig_l4_ofs;
969
0
    pkt->offloads = orig_offloads;
970
0
}
971
972
static void
973
nat_packet(struct dp_packet *pkt, struct conn *conn, bool reply, bool related)
974
0
{
975
0
    enum key_dir dir = reply ? CT_DIR_FWD : CT_DIR_REV;
976
0
    struct conn_key *key = &conn->key_node[dir].key;
977
0
    uint16_t nat_action = reply ? nat_action_reverse(conn->nat_action)
978
0
                                : conn->nat_action;
979
980
    /* Update ct_state. */
981
0
    if (nat_action & NAT_ACTION_SRC) {
982
0
        pkt->md.ct_state |= CS_SRC_NAT;
983
0
    } else if (nat_action & NAT_ACTION_DST) {
984
0
        pkt->md.ct_state |= CS_DST_NAT;
985
0
    }
986
987
    /* Reverse the key for outer header. */
988
0
    if (key->dl_type == htons(ETH_TYPE_IP)) {
989
0
        nat_packet_ipv4(pkt, key, nat_action);
990
0
    } else {
991
0
        nat_packet_ipv6(pkt, key, nat_action);
992
0
    }
993
994
0
    if (nat_action & NAT_ACTION_SRC || nat_action & NAT_ACTION_DST) {
995
0
        if (OVS_UNLIKELY(related)) {
996
0
            nat_action = nat_action_reverse(nat_action);
997
0
            nat_inner_packet(pkt, key, nat_action);
998
0
        } else {
999
0
            pat_packet(pkt, key);
1000
0
        }
1001
0
    }
1002
0
}
1003
1004
static void
1005
conn_seq_skew_set(struct conntrack *ct, const struct conn *conn_in,
1006
                  long long now, int seq_skew, bool seq_skew_dir)
1007
0
{
1008
0
    struct conn *conn;
1009
1010
0
    conn_lookup(ct, &conn_in->key_node[CT_DIR_FWD].key, now, &conn, NULL);
1011
0
    if (conn && seq_skew) {
1012
0
        conn->seq_skew = seq_skew;
1013
0
        conn->seq_skew_dir = seq_skew_dir;
1014
0
    }
1015
0
}
1016
1017
static bool
1018
ct_verify_helper(const char *helper, enum ct_alg_ctl_type ct_alg_ctl)
1019
0
{
1020
0
    if (ct_alg_ctl == CT_ALG_CTL_NONE) {
1021
0
        return true;
1022
0
    } else if (helper) {
1023
0
        if ((ct_alg_ctl == CT_ALG_CTL_FTP) &&
1024
0
             !strncmp(helper, "ftp", strlen("ftp"))) {
1025
0
            return true;
1026
0
        } else if ((ct_alg_ctl == CT_ALG_CTL_TFTP) &&
1027
0
                   !strncmp(helper, "tftp", strlen("tftp"))) {
1028
0
            return true;
1029
0
        } else {
1030
0
            return false;
1031
0
        }
1032
0
    } else {
1033
0
        return false;
1034
0
    }
1035
0
}
1036
1037
static struct conn *
1038
conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
1039
               struct conn_lookup_ctx *ctx, bool commit, long long now,
1040
               const struct nat_action_info_t *nat_action_info,
1041
               const char *helper, const struct alg_exp_node *alg_exp,
1042
               enum ct_alg_ctl_type ct_alg_ctl, uint32_t tp_id)
1043
    OVS_REQUIRES(ct->ct_lock)
1044
0
{
1045
0
    struct conn *nc = NULL;
1046
1047
0
    if (!valid_new(pkt, &ctx->key)) {
1048
0
        pkt->md.ct_state = CS_INVALID;
1049
0
        return nc;
1050
0
    }
1051
1052
0
    pkt->md.ct_state = CS_NEW;
1053
1054
0
    if (alg_exp) {
1055
0
        pkt->md.ct_state |= CS_RELATED;
1056
0
    }
1057
1058
0
    if (commit) {
1059
0
        int64_t czl_limit;
1060
0
        struct conn_key_node *fwd_key_node, *rev_key_node;
1061
0
        struct zone_limit *zl = zone_limit_lookup_or_default(ct,
1062
0
                                                             ctx->key.zone);
1063
0
        if (zl) {
1064
0
            czl_limit = zone_limit_get_limit(ct, &zl->czl);
1065
0
            if (czl_limit >= 0 &&
1066
0
                atomic_count_get(&zl->czl.count) >= czl_limit) {
1067
0
                COVERAGE_INC(conntrack_zone_full);
1068
0
                return nc;
1069
0
            }
1070
0
        }
1071
1072
0
        unsigned int n_conn_limit;
1073
0
        atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
1074
0
        if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
1075
0
            COVERAGE_INC(conntrack_full);
1076
0
            return nc;
1077
0
        }
1078
1079
0
        nc = new_conn(ct, pkt, &ctx->key, now, tp_id);
1080
0
        fwd_key_node = &nc->key_node[CT_DIR_FWD];
1081
0
        rev_key_node = &nc->key_node[CT_DIR_REV];
1082
0
        memcpy(&fwd_key_node->key, &ctx->key, sizeof fwd_key_node->key);
1083
0
        memcpy(&rev_key_node->key, &fwd_key_node->key,
1084
0
               sizeof rev_key_node->key);
1085
0
        conn_key_reverse(&rev_key_node->key);
1086
1087
0
        if (ct_verify_helper(helper, ct_alg_ctl)) {
1088
0
            nc->alg = nullable_xstrdup(helper);
1089
0
        }
1090
1091
0
        if (alg_exp) {
1092
0
            nc->alg_related = true;
1093
0
            nc->mark = alg_exp->parent_mark;
1094
0
            nc->label = alg_exp->parent_label;
1095
0
            nc->parent_key = alg_exp->parent_key;
1096
0
        }
1097
1098
0
        ovs_mutex_init_adaptive(&nc->lock);
1099
0
        atomic_flag_clear(&nc->reclaimed);
1100
0
        fwd_key_node->dir = CT_DIR_FWD;
1101
0
        rev_key_node->dir = CT_DIR_REV;
1102
1103
0
        if (zl) {
1104
0
            nc->admit_zone = zl->czl.zone;
1105
0
            nc->zone_limit_seq = zl->czl.zone_limit_seq;
1106
0
        } else {
1107
0
            nc->admit_zone = INVALID_ZONE;
1108
0
        }
1109
1110
0
        if (nat_action_info) {
1111
0
            nc->nat_action = nat_action_info->nat_action;
1112
1113
0
            if (alg_exp) {
1114
0
                if (alg_exp->nat_rpl_dst) {
1115
0
                    rev_key_node->key.dst.addr = alg_exp->alg_nat_repl_addr;
1116
0
                    nc->nat_action = NAT_ACTION_SRC;
1117
0
                } else {
1118
0
                    rev_key_node->key.src.addr = alg_exp->alg_nat_repl_addr;
1119
0
                    nc->nat_action = NAT_ACTION_DST;
1120
0
                }
1121
0
            } else {
1122
0
                bool nat_res = nat_get_unique_tuple(ct, nc, nat_action_info);
1123
0
                if (!nat_res) {
1124
0
                    goto nat_res_exhaustion;
1125
0
                }
1126
0
            }
1127
1128
0
            nat_packet(pkt, nc, false, ctx->icmp_related);
1129
0
            uint32_t rev_hash = conn_key_hash(&rev_key_node->key,
1130
0
                                              ct->hash_basis);
1131
0
            cmap_insert(&ct->conns[ctx->key.zone],
1132
0
                        &rev_key_node->cm_node, rev_hash);
1133
0
        }
1134
1135
0
        cmap_insert(&ct->conns[ctx->key.zone],
1136
0
                    &fwd_key_node->cm_node, ctx->hash);
1137
0
        conn_expire_push_front(ct, nc);
1138
0
        atomic_count_inc(&ct->n_conn);
1139
1140
0
        if (zl) {
1141
0
            atomic_count_inc(&zl->czl.count);
1142
0
        }
1143
1144
0
        ctx->conn = nc; /* For completeness. */
1145
0
    }
1146
1147
0
    return nc;
1148
1149
    /* This would be a user error or a DOS attack.  A user error is prevented
1150
     * by allocating enough combinations of NAT addresses when combined with
1151
     * ephemeral ports.  A DOS attack should be protected against with
1152
     * firewall rules or a separate firewall.  Also using zone partitioning
1153
     * can limit DoS impact. */
1154
0
nat_res_exhaustion:
1155
0
    delete_conn__(nc);
1156
0
    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
1157
0
    VLOG_WARN_RL(&rl, "Unable to NAT due to tuple space exhaustion - "
1158
0
                 "if DoS attack, use firewalling and/or zone partitioning.");
1159
0
    return NULL;
1160
0
}
1161
1162
static bool
1163
conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
1164
                  struct conn_lookup_ctx *ctx, struct conn *conn,
1165
                  long long now)
1166
0
{
1167
0
    bool create_new_conn = false;
1168
1169
0
    if (ctx->icmp_related) {
1170
0
        pkt->md.ct_state |= CS_RELATED;
1171
0
        if (ctx->reply) {
1172
0
            pkt->md.ct_state |= CS_REPLY_DIR;
1173
0
        }
1174
0
    } else {
1175
0
        if (conn->alg_related) {
1176
0
            pkt->md.ct_state |= CS_RELATED;
1177
0
        }
1178
1179
0
        enum ct_update_res res = conn_update(ct, conn, pkt, ctx, now);
1180
1181
0
        switch (res) {
1182
0
        case CT_UPDATE_VALID:
1183
0
            pkt->md.ct_state |= CS_ESTABLISHED;
1184
0
            pkt->md.ct_state &= ~CS_NEW;
1185
0
            if (ctx->reply) {
1186
0
                pkt->md.ct_state |= CS_REPLY_DIR;
1187
0
            }
1188
0
            break;
1189
0
        case CT_UPDATE_INVALID:
1190
0
            pkt->md.ct_state = CS_INVALID;
1191
0
            break;
1192
0
        case CT_UPDATE_NEW:
1193
0
            if (conn_lookup(ct, &conn->key_node[CT_DIR_FWD].key,
1194
0
                            now, NULL, NULL)) {
1195
0
                conn_force_expire(conn);
1196
0
            }
1197
0
            create_new_conn = true;
1198
0
            break;
1199
0
        case CT_UPDATE_VALID_NEW:
1200
0
            pkt->md.ct_state |= CS_NEW;
1201
0
            break;
1202
0
        default:
1203
0
            OVS_NOT_REACHED();
1204
0
        }
1205
0
    }
1206
0
    return create_new_conn;
1207
0
}
1208
1209
static void
1210
handle_nat(struct dp_packet *pkt, struct conn *conn,
1211
           uint16_t zone, bool reply, bool related)
1212
0
{
1213
0
    if (conn->nat_action &&
1214
0
        (!(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
1215
0
          (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT) &&
1216
0
           zone != pkt->md.ct_zone))) {
1217
1218
0
        if (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) {
1219
0
            pkt->md.ct_state &= ~(CS_SRC_NAT | CS_DST_NAT);
1220
0
        }
1221
1222
0
        nat_packet(pkt, conn, reply, related);
1223
0
    }
1224
0
}
1225
1226
static bool
1227
check_orig_tuple(struct conntrack *ct, struct dp_packet *pkt,
1228
                 struct conn_lookup_ctx *ctx_in, long long now,
1229
                 struct conn **conn,
1230
                 const struct nat_action_info_t *nat_action_info)
1231
0
{
1232
0
    if (!(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
1233
0
        (ctx_in->key.dl_type == htons(ETH_TYPE_IP) &&
1234
0
         !pkt->md.ct_orig_tuple.ipv4.ipv4_proto) ||
1235
0
        (ctx_in->key.dl_type == htons(ETH_TYPE_IPV6) &&
1236
0
         !pkt->md.ct_orig_tuple.ipv6.ipv6_proto) ||
1237
0
        nat_action_info) {
1238
0
        return false;
1239
0
    }
1240
1241
0
    struct conn_key key;
1242
0
    memset(&key, 0 , sizeof key);
1243
1244
0
    if (ctx_in->key.dl_type == htons(ETH_TYPE_IP)) {
1245
0
        key.src.addr.ipv4 = pkt->md.ct_orig_tuple.ipv4.ipv4_src;
1246
0
        key.dst.addr.ipv4 = pkt->md.ct_orig_tuple.ipv4.ipv4_dst;
1247
1248
0
        if (ctx_in->key.nw_proto == IPPROTO_ICMP) {
1249
0
            key.src.icmp_id = ctx_in->key.src.icmp_id;
1250
0
            key.dst.icmp_id = ctx_in->key.dst.icmp_id;
1251
0
            uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv4.src_port);
1252
0
            key.src.icmp_type = (uint8_t) src_port;
1253
0
            key.dst.icmp_type = reverse_icmp_type(key.src.icmp_type);
1254
0
        } else {
1255
0
            key.src.port = pkt->md.ct_orig_tuple.ipv4.src_port;
1256
0
            key.dst.port = pkt->md.ct_orig_tuple.ipv4.dst_port;
1257
0
        }
1258
0
        key.nw_proto = pkt->md.ct_orig_tuple.ipv4.ipv4_proto;
1259
0
    } else {
1260
0
        key.src.addr.ipv6 = pkt->md.ct_orig_tuple.ipv6.ipv6_src;
1261
0
        key.dst.addr.ipv6 = pkt->md.ct_orig_tuple.ipv6.ipv6_dst;
1262
1263
0
        if (ctx_in->key.nw_proto == IPPROTO_ICMPV6) {
1264
0
            key.src.icmp_id = ctx_in->key.src.icmp_id;
1265
0
            key.dst.icmp_id = ctx_in->key.dst.icmp_id;
1266
0
            uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv6.src_port);
1267
0
            key.src.icmp_type = (uint8_t) src_port;
1268
0
            key.dst.icmp_type = reverse_icmp6_type(key.src.icmp_type);
1269
0
        } else {
1270
0
            key.src.port = pkt->md.ct_orig_tuple.ipv6.src_port;
1271
0
            key.dst.port = pkt->md.ct_orig_tuple.ipv6.dst_port;
1272
0
        }
1273
0
        key.nw_proto = pkt->md.ct_orig_tuple.ipv6.ipv6_proto;
1274
0
    }
1275
1276
0
    key.dl_type = ctx_in->key.dl_type;
1277
0
    key.zone = pkt->md.ct_zone;
1278
0
    conn_lookup(ct, &key, now, conn, NULL);
1279
0
    return *conn ? true : false;
1280
0
}
1281
1282
static bool
1283
conn_update_state_alg(struct conntrack *ct, struct dp_packet *pkt,
1284
                      struct conn_lookup_ctx *ctx, struct conn *conn,
1285
                      const struct nat_action_info_t *nat_action_info,
1286
                      enum ct_alg_ctl_type ct_alg_ctl, long long now,
1287
                      bool *create_new_conn)
1288
0
{
1289
0
    if (is_ftp_ctl(ct_alg_ctl)) {
1290
        /* Keep sequence tracking in sync with the source of the
1291
         * sequence skew. */
1292
0
        ovs_mutex_lock(&conn->lock);
1293
0
        if (ctx->reply != conn->seq_skew_dir) {
1294
0
            handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
1295
0
                           !!nat_action_info);
1296
            /* conn_update_state locks for unrelated fields, so unlock. */
1297
0
            ovs_mutex_unlock(&conn->lock);
1298
0
            *create_new_conn = conn_update_state(ct, pkt, ctx, conn, now);
1299
0
        } else {
1300
            /* conn_update_state locks for unrelated fields, so unlock. */
1301
0
            ovs_mutex_unlock(&conn->lock);
1302
0
            *create_new_conn = conn_update_state(ct, pkt, ctx, conn, now);
1303
0
            ovs_mutex_lock(&conn->lock);
1304
0
            if (*create_new_conn == false) {
1305
0
                handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
1306
0
                               !!nat_action_info);
1307
0
            }
1308
0
            ovs_mutex_unlock(&conn->lock);
1309
0
        }
1310
0
        return true;
1311
0
    }
1312
0
    return false;
1313
0
}
1314
1315
static void
1316
set_cached_conn(const struct nat_action_info_t *nat_action_info,
1317
                const struct conn_lookup_ctx *ctx, struct conn *conn,
1318
                struct dp_packet *pkt)
1319
0
{
1320
0
    if (OVS_LIKELY(!nat_action_info)) {
1321
0
        pkt->md.conn = conn;
1322
0
        pkt->md.reply = ctx->reply;
1323
0
        pkt->md.icmp_related = ctx->icmp_related;
1324
0
    } else {
1325
0
        pkt->md.conn = NULL;
1326
0
    }
1327
0
}
1328
1329
static void
1330
process_one_fast(uint16_t zone, const uint32_t *setmark,
1331
                 const struct ovs_key_ct_labels *setlabel,
1332
                 const struct nat_action_info_t *nat_action_info,
1333
                 struct conn *conn, struct dp_packet *pkt)
1334
0
{
1335
0
    if (nat_action_info) {
1336
0
        handle_nat(pkt, conn, zone, pkt->md.reply, pkt->md.icmp_related);
1337
0
        pkt->md.conn = NULL;
1338
0
    }
1339
1340
0
    pkt->md.ct_zone = zone;
1341
0
    ovs_mutex_lock(&conn->lock);
1342
0
    pkt->md.ct_mark = conn->mark;
1343
0
    pkt->md.ct_label = conn->label;
1344
0
    ovs_mutex_unlock(&conn->lock);
1345
1346
0
    if (setmark) {
1347
0
        set_mark(pkt, conn, setmark[0], setmark[1]);
1348
0
    }
1349
1350
0
    if (setlabel) {
1351
0
        set_label(pkt, conn, &setlabel[0], &setlabel[1]);
1352
0
    }
1353
0
}
1354
1355
static void
1356
initial_conn_lookup(struct conntrack *ct, struct conn_lookup_ctx *ctx,
1357
                    long long now, bool natted)
1358
0
{
1359
0
    if (natted) {
1360
        /* If the packet has already been natted (e.g. a previous
1361
         * action took place), retrieve it by performing a lookup of its
1362
         * reverse key. */
1363
0
        conn_key_reverse(&ctx->key);
1364
0
    }
1365
1366
0
    conn_key_lookup(ct, &ctx->key, ctx->hash, now, &ctx->conn, &ctx->reply);
1367
1368
0
    if (natted) {
1369
0
        if (OVS_LIKELY(ctx->conn)) {
1370
0
            enum key_dir dir;
1371
0
            ctx->reply = !ctx->reply;
1372
0
            dir = ctx->reply ? CT_DIR_REV : CT_DIR_FWD;
1373
0
            ctx->key = ctx->conn->key_node[dir].key;
1374
0
            ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
1375
0
        } else {
1376
            /* A lookup failure does not necessarily imply that an
1377
             * error occurred; it may simply indicate that a conn got
1378
             * removed during the recirculation. */
1379
0
            COVERAGE_INC(conntrack_lookup_natted_miss);
1380
0
            conn_key_reverse(&ctx->key);
1381
0
        }
1382
0
    }
1383
0
}
1384
1385
static void
1386
process_one(struct conntrack *ct, struct dp_packet *pkt,
1387
            struct conn_lookup_ctx *ctx, uint16_t zone,
1388
            bool force, bool commit, long long now, const uint32_t *setmark,
1389
            const struct ovs_key_ct_labels *setlabel,
1390
            const struct nat_action_info_t *nat_action_info,
1391
            const char *helper, uint32_t tp_id)
1392
0
{
1393
    /* Reset ct_state whenever entering a new zone. */
1394
0
    if (pkt->md.ct_state && pkt->md.ct_zone != zone) {
1395
0
        pkt->md.ct_state = 0;
1396
0
    }
1397
1398
0
    bool create_new_conn = false;
1399
0
    initial_conn_lookup(ct, ctx, now, !!(pkt->md.ct_state &
1400
0
                                         (CS_SRC_NAT | CS_DST_NAT)));
1401
0
    struct conn *conn = ctx->conn;
1402
1403
    /* Delete found entry if in wrong direction. 'force' implies commit. */
1404
0
    if (OVS_UNLIKELY(force && ctx->reply && conn)) {
1405
0
        if (conn_lookup(ct, &conn->key_node[CT_DIR_FWD].key,
1406
0
                        now, NULL, NULL)) {
1407
0
            conn_force_expire(conn);
1408
0
        }
1409
0
        conn = NULL;
1410
0
    }
1411
1412
0
    if (conn && helper == NULL) {
1413
0
        helper = conn->alg;
1414
0
    }
1415
1416
0
    enum ct_alg_ctl_type ct_alg_ctl = get_alg_ctl_type(pkt, helper);
1417
1418
0
    if (OVS_LIKELY(conn)) {
1419
0
        if (OVS_LIKELY(!conn_update_state_alg(ct, pkt, ctx, conn,
1420
0
                                              nat_action_info,
1421
0
                                              ct_alg_ctl, now,
1422
0
                                              &create_new_conn))) {
1423
0
            create_new_conn = conn_update_state(ct, pkt, ctx, conn, now);
1424
0
        }
1425
0
        if (nat_action_info && !create_new_conn) {
1426
0
            handle_nat(pkt, conn, zone, ctx->reply, ctx->icmp_related);
1427
0
        }
1428
1429
0
    } else if (check_orig_tuple(ct, pkt, ctx, now, &conn, nat_action_info)) {
1430
0
        create_new_conn = conn_update_state(ct, pkt, ctx, conn, now);
1431
0
    } else {
1432
0
        if (ctx->icmp_related) {
1433
            /* An icmp related conn should always be found; no new
1434
               connection is created based on an icmp related packet. */
1435
0
            pkt->md.ct_state = CS_INVALID;
1436
0
        } else {
1437
0
            create_new_conn = true;
1438
0
        }
1439
0
    }
1440
1441
0
    const struct alg_exp_node *alg_exp = NULL;
1442
0
    struct alg_exp_node alg_exp_entry;
1443
1444
0
    if (OVS_UNLIKELY(create_new_conn)) {
1445
1446
0
        ovs_rwlock_rdlock(&ct->resources_lock);
1447
0
        alg_exp = expectation_lookup(&ct->alg_expectations, &ctx->key,
1448
0
                                     ct->hash_basis,
1449
0
                                     alg_src_ip_wc(ct_alg_ctl));
1450
0
        if (alg_exp) {
1451
0
            memcpy(&alg_exp_entry, alg_exp, sizeof alg_exp_entry);
1452
0
            alg_exp = &alg_exp_entry;
1453
0
        }
1454
0
        ovs_rwlock_unlock(&ct->resources_lock);
1455
1456
0
        ovs_mutex_lock(&ct->ct_lock);
1457
0
        if (!conn_lookup(ct, &ctx->key, now, NULL, NULL)) {
1458
0
            conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info,
1459
0
                                  helper, alg_exp, ct_alg_ctl, tp_id);
1460
0
        }
1461
0
        ovs_mutex_unlock(&ct->ct_lock);
1462
0
    }
1463
1464
0
    write_ct_md(pkt, zone, conn, &ctx->key, alg_exp);
1465
1466
0
    if (conn && setmark) {
1467
0
        set_mark(pkt, conn, setmark[0], setmark[1]);
1468
0
    }
1469
1470
0
    if (conn && setlabel) {
1471
0
        set_label(pkt, conn, &setlabel[0], &setlabel[1]);
1472
0
    }
1473
1474
0
    handle_alg_ctl(ct, ctx, pkt, ct_alg_ctl, conn, now, !!nat_action_info);
1475
1476
0
    set_cached_conn(nat_action_info, ctx, conn, pkt);
1477
0
}
1478
1479
/* Sends the packets in '*pkt_batch' through the connection tracker 'ct'.  All
1480
 * the packets must have the same 'dl_type' (IPv4 or IPv6) and should have
1481
 * the l3 and l4 offsets properly set.  Performs fragment reassembly with
1482
 * the help of ipf_preprocess_conntrack().
1483
 *
1484
 * If 'commit' is true, the packets are allowed to create new entries in the
1485
 * connection tables.  'setmark', if not NULL, should point to a two-element
1486
 * array containing a value and a mask to set the connection mark.
1487
 * 'setlabel' behaves similarly for the connection label. */
1488
int
1489
conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
1490
                  ovs_be16 dl_type, bool force, bool commit, uint16_t zone,
1491
                  const uint32_t *setmark,
1492
                  const struct ovs_key_ct_labels *setlabel,
1493
                  const char *helper,
1494
                  const struct nat_action_info_t *nat_action_info,
1495
                  long long now, uint32_t tp_id)
1496
0
{
1497
0
    odp_port_t in_port = ODPP_LOCAL;
1498
0
    struct conn_lookup_ctx ctx;
1499
0
    struct dp_packet *packet;
1500
1501
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, pkt_batch) {
1502
        /* The ipf preprocess function may consume all packets from this batch,
1503
         * so save an in_port beforehand. */
1504
0
        in_port = packet->md.in_port.odp_port;
1505
0
        break;
1506
0
    }
1507
1508
0
    ipf_preprocess_conntrack(ct->ipf, pkt_batch, now, dl_type, zone,
1509
0
                             ct->hash_basis);
1510
1511
1512
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, pkt_batch) {
1513
0
        struct conn *conn = packet->md.conn;
1514
1515
0
        if (helper == NULL && conn != NULL) {
1516
0
            helper = conn->alg;
1517
0
        }
1518
1519
0
        if (OVS_UNLIKELY(packet->md.ct_state == CS_INVALID)) {
1520
0
            write_ct_md(packet, zone, NULL, NULL, NULL);
1521
0
        } else if (conn &&
1522
0
                   conn->key_node[CT_DIR_FWD].key.zone == zone && !force &&
1523
0
                   !get_alg_ctl_type(packet, helper)) {
1524
0
            process_one_fast(zone, setmark, setlabel, nat_action_info,
1525
0
                             conn, packet);
1526
0
        } else if (OVS_UNLIKELY(!conn_key_extract(ct, packet, dl_type, &ctx,
1527
0
                                zone))) {
1528
0
            packet->md.ct_state = CS_INVALID;
1529
0
            write_ct_md(packet, zone, NULL, NULL, NULL);
1530
0
        } else {
1531
0
            process_one(ct, packet, &ctx, zone, force, commit, now, setmark,
1532
0
                        setlabel, nat_action_info, helper, tp_id);
1533
0
        }
1534
0
    }
1535
1536
0
    ipf_postprocess_conntrack(ct->ipf, pkt_batch, now, dl_type, zone, in_port);
1537
1538
0
    return 0;
1539
0
}
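A minimal caller sketch for the API documented above.  It is illustrative only: it assumes a 'struct conntrack *ct' obtained elsewhere (e.g. from conntrack_init(), which is not shown in this report) and a batch of IPv4 packets whose l3/l4 offsets are already set; the wrapper name is hypothetical.

static void
ct_commit_ipv4_batch(struct conntrack *ct, struct dp_packet_batch *batch)
{
    /* Commit new connections in zone 0 with no mark/label changes,
     * no helper, no NAT and the default timeout policy. */
    conntrack_execute(ct, batch, htons(ETH_TYPE_IP),
                      false /* force */, true /* commit */, 0 /* zone */,
                      NULL /* setmark */, NULL /* setlabel */,
                      NULL /* helper */, NULL /* nat_action_info */,
                      time_msec(), 0 /* tp_id */);
}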
1540
1541
void
1542
conntrack_clear(struct dp_packet *packet)
1543
0
{
1544
    /* According to pkt_metadata_init(), ct_state == 0 is enough to make all of
1545
     * the conntrack fields invalid. */
1546
0
    packet->md.ct_state = 0;
1547
0
    pkt_metadata_init_conn(&packet->md);
1548
0
}
1549
1550
static void
1551
set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
1552
0
{
1553
0
    ovs_mutex_lock(&conn->lock);
1554
0
    if (conn->alg_related) {
1555
0
        pkt->md.ct_mark = conn->mark;
1556
0
    } else {
1557
0
        pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
1558
0
        conn->mark = pkt->md.ct_mark;
1559
0
    }
1560
0
    ovs_mutex_unlock(&conn->lock);
1561
0
}
1562
1563
static void
1564
set_label(struct dp_packet *pkt, struct conn *conn,
1565
          const struct ovs_key_ct_labels *val,
1566
          const struct ovs_key_ct_labels *mask)
1567
0
{
1568
0
    ovs_mutex_lock(&conn->lock);
1569
0
    if (conn->alg_related) {
1570
0
        pkt->md.ct_label = conn->label;
1571
0
    } else {
1572
0
        ovs_u128 v, m;
1573
1574
0
        memcpy(&v, val, sizeof v);
1575
0
        memcpy(&m, mask, sizeof m);
1576
1577
0
        pkt->md.ct_label.u64.lo = v.u64.lo
1578
0
                              | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
1579
0
        pkt->md.ct_label.u64.hi = v.u64.hi
1580
0
                              | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
1581
0
        conn->label = pkt->md.ct_label;
1582
0
    }
1583
0
    ovs_mutex_unlock(&conn->lock);
1584
0
}
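For illustration, the value/mask arithmetic used by set_mark() and set_label() with hypothetical numbers: given an existing ct_mark of 0xff00, val = 0x1 and mask = 0xff, the new mark is 0x1 | (0xff00 & ~0xff) = 0xff01, i.e. only the bits covered by the mask are replaced and the bits of the previously stored mark outside the mask are preserved.  The same per-64-bit computation is applied to both halves of the 128-bit label.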
1585
1586

1587
int
1588
conntrack_set_sweep_interval(struct conntrack *ct, uint32_t ms)
1589
0
{
1590
0
    atomic_store_relaxed(&ct->sweep_ms, ms);
1591
0
    return 0;
1592
0
}
1593
1594
uint32_t
1595
conntrack_get_sweep_interval(struct conntrack *ct)
1596
0
{
1597
0
    uint32_t ms;
1598
0
    atomic_read_relaxed(&ct->sweep_ms, &ms);
1599
0
    return ms;
1600
0
}
1601
1602
static size_t
1603
ct_sweep(struct conntrack *ct, struct rculist *list, long long now,
1604
         size_t *cleaned_count)
1605
    OVS_NO_THREAD_SAFETY_ANALYSIS
1606
0
{
1607
0
    struct conn *conn;
1608
0
    size_t cleaned = 0;
1609
0
    size_t count = 0;
1610
1611
0
    RCULIST_FOR_EACH (conn, node, list) {
1612
0
        if (conn_expired(conn, now)) {
1613
0
            conn_clean(ct, conn);
1614
0
            cleaned++;
1615
0
        }
1616
1617
0
        count++;
1618
0
    }
1619
1620
0
    if (cleaned_count) {
1621
0
        *cleaned_count = cleaned;
1622
0
    }
1623
1624
0
    return count;
1625
0
}
1626
1627
/* Cleans up old connection entries from 'ct'.  Returns the time
1628
 * when the next wake will happen. The return value might be zero,
1629
 * meaning that an internal limit has been reached. */
1630
static long long
1631
conntrack_clean(struct conntrack *ct, long long now)
1632
0
{
1633
0
    long long next_wakeup = now + conntrack_get_sweep_interval(ct);
1634
0
    unsigned int n_conn_limit, i;
1635
0
    size_t clean_end, count = 0;
1636
0
    size_t total_cleaned = 0;
1637
1638
0
    atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
1639
0
    clean_end = n_conn_limit / 64;
1640
1641
0
    for (i = ct->next_sweep; i < N_EXP_LISTS; i++) {
1642
0
        size_t cleaned;
1643
1644
0
        if (count > clean_end) {
1645
0
            next_wakeup = 0;
1646
0
            break;
1647
0
        }
1648
1649
0
        count += ct_sweep(ct, &ct->exp_lists[i], now, &cleaned);
1650
0
        total_cleaned += cleaned;
1651
0
    }
1652
1653
0
    ct->next_sweep = (i < N_EXP_LISTS) ? i : 0;
1654
1655
0
    VLOG_DBG("conntrack cleaned %"PRIuSIZE" entries out of %"PRIuSIZE
1656
0
             " entries in %lld msec", total_cleaned, count,
1657
0
             time_msec() - now);
1658
1659
0
    return next_wakeup;
1660
0
}
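An illustrative calculation of the sweep budget above (the numbers are hypothetical, not defaults): with n_conn_limit = 64000, clean_end is 1000, so once more than roughly a thousand connections have been walked in one pass the loop stops early and the function returns 0; clean_thread_main() then reschedules after CT_CLEAN_MIN_INTERVAL_MS instead of waiting a full sweep interval, and the next pass resumes from ct->next_sweep.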
1661
1662
/* Cleanup:
1663
 *
1664
 * We must call conntrack_clean() periodically.  conntrack_clean() return
1665
 * value gives a hint on when the next cleanup must be done. */
1666
#define CT_CLEAN_MIN_INTERVAL_MS 200
1667
1668
static void *
1669
clean_thread_main(void *f_)
1670
    OVS_NO_THREAD_SAFETY_ANALYSIS
1671
0
{
1672
0
    struct conntrack *ct = f_;
1673
1674
0
    while (!latch_is_set(&ct->clean_thread_exit)) {
1675
0
        long long next_wake;
1676
0
        long long now = time_msec();
1677
0
        next_wake = conntrack_clean(ct, now);
1678
1679
0
        if (next_wake < now) {
1680
0
            poll_timer_wait_until(now + CT_CLEAN_MIN_INTERVAL_MS);
1681
0
        } else {
1682
0
            poll_timer_wait_until(next_wake);
1683
0
        }
1684
0
        latch_wait(&ct->clean_thread_exit);
1685
0
        poll_block();
1686
0
    }
1687
1688
0
    return NULL;
1689
0
}
1690

1691
/* 'data' is a pointer to the beginning of the L3 header and 'new_data' is
1692
 * used to store a pointer to the first byte after the L3 header.  'size' is
1693
 * the size of the packet beyond the data pointer. */
1694
static inline bool
1695
extract_l3_ipv4(struct dp_packet *pkt, struct conn_key *key, const void *data,
1696
                size_t size, const char **new_data)
1697
0
{
1698
0
    if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
1699
0
        return false;
1700
0
    }
1701
1702
0
    const struct ip_header *ip = data;
1703
0
    size_t ip_len = IP_IHL(ip->ip_ihl_ver) * 4;
1704
1705
0
    if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
1706
0
        return false;
1707
0
    }
1708
1709
0
    if (OVS_UNLIKELY(size < ip_len)) {
1710
0
        return false;
1711
0
    }
1712
1713
0
    if (IP_IS_LATER_FRAG(ip->ip_frag_off)) {
1714
0
        return false;
1715
0
    }
1716
1717
0
    if (pkt && dp_packet_ip_checksum_unknown(pkt)) {
1718
0
        COVERAGE_INC(conntrack_l3csum_checked);
1719
0
        if (csum(data, ip_len)) {
1720
0
            COVERAGE_INC(conntrack_l3csum_err);
1721
0
            dp_packet_ip_checksum_set_bad(pkt);
1722
0
            return false;
1723
0
        }
1724
0
        dp_packet_ip_checksum_set_good(pkt);
1725
0
    }
1726
1727
0
    if (new_data) {
1728
0
        *new_data = (char *) data + ip_len;
1729
0
    }
1730
1731
0
    key->src.addr.ipv4 = get_16aligned_be32(&ip->ip_src);
1732
0
    key->dst.addr.ipv4 = get_16aligned_be32(&ip->ip_dst);
1733
0
    key->nw_proto = ip->ip_proto;
1734
1735
0
    return true;
1736
0
}
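For reference, a worked example of the header-length check above (illustrative values): a standard IPv4 header has ip_ihl_ver == 0x45, so IP_IHL() yields 5 and ip_len = 5 * 4 = 20 == IP_HEADER_LEN; a header with options might carry 0x46 (ip_len = 24), while anything advertising an IHL below 5 (e.g. 0x44, ip_len = 16) is rejected before the checksum is even considered.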
1737
1738
/* 'data' is a pointer to the beginning of the L3 header and 'new_data' is
1739
 * used to store a pointer to the first byte after the L3 header.  'size' is
1740
 * the size of the packet beyond the data pointer. */
1741
static inline bool
1742
extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
1743
                const char **new_data)
1744
0
{
1745
0
    const struct ovs_16aligned_ip6_hdr *ip6 = data;
1746
1747
0
    if (OVS_UNLIKELY(size < sizeof *ip6)) {
1748
0
        return false;
1749
0
    }
1750
1751
0
    data = ip6 + 1;
1752
0
    size -=  sizeof *ip6;
1753
0
    uint8_t nw_proto = ip6->ip6_nxt;
1754
0
    uint8_t nw_frag = 0;
1755
1756
0
    if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag,
1757
0
                             NULL, NULL)) {
1758
0
        return false;
1759
0
    }
1760
1761
0
    if (nw_frag) {
1762
0
        return false;
1763
0
    }
1764
1765
0
    if (new_data) {
1766
0
        *new_data = data;
1767
0
    }
1768
1769
0
    memcpy(&key->src.addr.ipv6, &ip6->ip6_src, sizeof key->src.addr);
1770
0
    memcpy(&key->dst.addr.ipv6, &ip6->ip6_dst, sizeof key->dst.addr);
1771
0
    key->nw_proto = nw_proto;
1772
1773
0
    return true;
1774
0
}
1775
1776
static inline bool
1777
checksum_valid(const struct conn_key *key, const void *data, size_t size,
1778
               const void *l3)
1779
0
{
1780
0
    bool valid;
1781
1782
0
    if (key->dl_type == htons(ETH_TYPE_IP)) {
1783
0
        uint32_t csum = packet_csum_pseudoheader(l3);
1784
0
        valid = (csum_finish(csum_continue(csum, data, size)) == 0);
1785
0
    } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
1786
0
        valid = (packet_csum_upperlayer6(l3, data, key->nw_proto, size) == 0);
1787
0
    } else {
1788
0
        valid = false;
1789
0
    }
1790
1791
0
    COVERAGE_INC(conntrack_l4csum_checked);
1792
0
    if (!valid) {
1793
0
        COVERAGE_INC(conntrack_l4csum_err);
1794
0
    }
1795
1796
0
    return valid;
1797
0
}
1798
1799
static inline bool
1800
sctp_checksum_valid(const void *data, size_t size)
1801
0
{
1802
0
    struct sctp_header *sctp = (struct sctp_header *) data;
1803
0
    ovs_be32 rcvd_csum, csum;
1804
1805
0
    rcvd_csum = get_16aligned_be32(&sctp->sctp_csum);
1806
0
    put_16aligned_be32(&sctp->sctp_csum, 0);
1807
0
    csum = crc32c(data, size);
1808
0
    put_16aligned_be32(&sctp->sctp_csum, rcvd_csum);
1809
1810
0
    COVERAGE_INC(conntrack_l4csum_checked);
1811
0
    if (rcvd_csum != csum) {
1812
0
        COVERAGE_INC(conntrack_l4csum_err);
1813
0
        return false;
1814
0
    }
1815
1816
0
    return true;
1817
0
}
1818
1819
static inline bool
1820
check_l4_tcp(struct dp_packet *pkt, const struct conn_key *key,
1821
             const void *data, size_t size, const void *l3)
1822
0
{
1823
0
    const struct tcp_header *tcp = data;
1824
0
    if (size < sizeof *tcp) {
1825
0
        return false;
1826
0
    }
1827
1828
0
    size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
1829
0
    if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) {
1830
0
        return false;
1831
0
    }
1832
1833
0
    if (pkt && dp_packet_l4_checksum_unknown(pkt)) {
1834
0
        if (!checksum_valid(key, data, size, l3)) {
1835
0
            dp_packet_l4_checksum_set_bad(pkt);
1836
0
            return false;
1837
0
        }
1838
0
        dp_packet_l4_checksum_set_good(pkt);
1839
0
        dp_packet_l4_proto_set_tcp(pkt);
1840
0
    }
1841
0
    return true;
1842
0
}
1843
1844
static inline bool
1845
check_l4_udp(struct dp_packet *pkt, const struct conn_key *key,
1846
             const void *data, size_t size, const void *l3)
1847
0
{
1848
0
    const struct udp_header *udp = data;
1849
0
    if (size < sizeof *udp) {
1850
0
        return false;
1851
0
    }
1852
1853
0
    size_t udp_len = ntohs(udp->udp_len);
1854
0
    if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) {
1855
0
        return false;
1856
0
    }
1857
1858
    /* Validation must be skipped if checksum is 0 on IPv4 packets */
1859
0
    if (!(udp->udp_csum == 0 && key->dl_type == htons(ETH_TYPE_IP))
1860
0
        && (pkt && dp_packet_l4_checksum_unknown(pkt))) {
1861
0
        if (!checksum_valid(key, data, size, l3)) {
1862
0
            dp_packet_l4_checksum_set_bad(pkt);
1863
0
            return false;
1864
0
        }
1865
0
        dp_packet_l4_checksum_set_good(pkt);
1866
0
        dp_packet_l4_proto_set_udp(pkt);
1867
0
    }
1868
0
    return true;
1869
0
}
1870
1871
static inline bool
1872
sctp_check_len(const struct sctp_header *sh, size_t size)
1873
0
{
1874
0
    const struct sctp_chunk_header *sch;
1875
0
    size_t next;
1876
1877
0
    if (size < SCTP_HEADER_LEN) {
1878
0
        return false;
1879
0
    }
1880
1881
    /* rfc4960: Chunks (including Type, Length, and Value fields) are padded
1882
     * out by the sender with all zero bytes to be a multiple of 4 bytes long.
1883
     */
1884
0
    for (next = sizeof(struct sctp_header),
1885
0
         sch = SCTP_NEXT_CHUNK(sh, next);
1886
0
         next < size;
1887
0
         next += ROUND_UP(ntohs(sch->length), 4),
1888
0
         sch = SCTP_NEXT_CHUNK(sh, next)) {
1889
        /* rfc4960: This value represents the size of the chunk in bytes,
1890
         * including the Chunk Type, Chunk Flags, Chunk Length, and Chunk Value
1891
         * fields.
1892
         * Therefore, if the Chunk Value field is zero-length, the Length
1893
         * field will be set to 4. */
1894
0
        if (ntohs(sch->length) < sizeof *sch) {
1895
0
            return false;
1896
0
        }
1897
0
    }
1898
1899
0
    return (next == size);
1900
0
}
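A worked example of the chunk walk above (hypothetical packet): with size = 28, the loop starts at next = SCTP_HEADER_LEN (12 bytes of common header); a single chunk whose Length field is 13 advances next by ROUND_UP(13, 4) = 16, giving next = 28 == size, so the packet is accepted with its 3 padding bytes accounted for.  A chunk whose Length is smaller than sizeof(struct sctp_chunk_header) (4 bytes) is rejected immediately.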
1901
1902
static inline bool
1903
check_l4_sctp(struct dp_packet *pkt, const void *data, size_t size)
1904
0
{
1905
0
    if (OVS_UNLIKELY(!sctp_check_len(data, size))) {
1906
0
        return false;
1907
0
    }
1908
1909
0
    if (pkt && dp_packet_l4_checksum_unknown(pkt)) {
1910
0
        if (!sctp_checksum_valid(data, size)) {
1911
0
            dp_packet_l4_checksum_set_bad(pkt);
1912
0
            return false;
1913
0
        }
1914
0
        dp_packet_l4_checksum_set_good(pkt);
1915
0
        dp_packet_l4_proto_set_sctp(pkt);
1916
0
    }
1917
0
    return true;
1918
0
}
1919
1920
static inline bool
1921
check_l4_icmp(struct dp_packet *pkt, const void *data, size_t size)
1922
0
{
1923
0
    if (pkt) {
1924
0
        COVERAGE_INC(conntrack_l4csum_checked);
1925
0
        if (csum(data, size)) {
1926
0
            COVERAGE_INC(conntrack_l4csum_err);
1927
0
            return false;
1928
0
        }
1929
0
    }
1930
1931
0
    return true;
1932
0
}
1933
1934
static inline bool
1935
check_l4_icmp6(struct dp_packet *pkt, const struct conn_key *key,
1936
               const void *data, size_t size, const void *l3)
1937
0
{
1938
0
    return pkt ? checksum_valid(key, data, size, l3) : true;
1939
0
}
1940
1941
static inline bool
1942
extract_l4_tcp(struct conn_key *key, const void *data, size_t size,
1943
               size_t *chk_len)
1944
0
{
1945
0
    if (OVS_UNLIKELY(size < (chk_len ? *chk_len : TCP_HEADER_LEN))) {
1946
0
        return false;
1947
0
    }
1948
1949
0
    const struct tcp_header *tcp = data;
1950
0
    key->src.port = tcp->tcp_src;
1951
0
    key->dst.port = tcp->tcp_dst;
1952
1953
    /* Port 0 is invalid */
1954
0
    return key->src.port && key->dst.port;
1955
0
}
1956
1957
static inline bool
1958
extract_l4_udp(struct conn_key *key, const void *data, size_t size,
1959
               size_t *chk_len)
1960
0
{
1961
0
    if (OVS_UNLIKELY(size < (chk_len ? *chk_len : UDP_HEADER_LEN))) {
1962
0
        return false;
1963
0
    }
1964
1965
0
    const struct udp_header *udp = data;
1966
0
    key->src.port = udp->udp_src;
1967
0
    key->dst.port = udp->udp_dst;
1968
1969
    /* Port 0 is invalid */
1970
0
    return key->src.port && key->dst.port;
1971
0
}
1972
1973
static inline bool
1974
extract_l4_sctp(struct conn_key *key, const void *data, size_t size,
1975
                size_t *chk_len)
1976
0
{
1977
0
    if (OVS_UNLIKELY(size < (chk_len ? *chk_len : SCTP_HEADER_LEN))) {
1978
0
        return false;
1979
0
    }
1980
1981
0
    const struct sctp_header *sctp = data;
1982
0
    key->src.port = sctp->sctp_src;
1983
0
    key->dst.port = sctp->sctp_dst;
1984
1985
0
    return key->src.port && key->dst.port;
1986
0
}
1987
1988
static inline bool extract_l4(struct dp_packet *pkt, struct conn_key *key,
1989
                              const void *data, size_t size, bool *related,
1990
                              const void *l3, size_t *chk_len);
1991
1992
static uint8_t
1993
reverse_icmp_type(uint8_t type)
1994
0
{
1995
0
    switch (type) {
1996
0
    case ICMP4_ECHO_REQUEST:
1997
0
        return ICMP4_ECHO_REPLY;
1998
0
    case ICMP4_ECHO_REPLY:
1999
0
        return ICMP4_ECHO_REQUEST;
2000
2001
0
    case ICMP4_TIMESTAMP:
2002
0
        return ICMP4_TIMESTAMPREPLY;
2003
0
    case ICMP4_TIMESTAMPREPLY:
2004
0
        return ICMP4_TIMESTAMP;
2005
2006
0
    case ICMP4_INFOREQUEST:
2007
0
        return ICMP4_INFOREPLY;
2008
0
    case ICMP4_INFOREPLY:
2009
0
        return ICMP4_INFOREQUEST;
2010
0
    default:
2011
0
        OVS_NOT_REACHED();
2012
0
    }
2013
0
}
2014
2015
/* If 'related' is not NULL and the function is processing an ICMP
2016
 * error packet, extract the l3 and l4 fields from the nested header
2017
 * instead and set *related to true.  If 'related' is NULL we're
2018
 * already processing a nested header and no such recursion is
2019
 * possible */
2020
static inline int
2021
extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
2022
                bool *related, size_t *chk_len)
2023
0
{
2024
0
    if (OVS_UNLIKELY(size < (chk_len ? *chk_len : ICMP_HEADER_LEN))) {
2025
0
        return false;
2026
0
    }
2027
2028
0
    const struct icmp_header *icmp = data;
2029
2030
0
    switch (icmp->icmp_type) {
2031
0
    case ICMP4_ECHO_REQUEST:
2032
0
    case ICMP4_ECHO_REPLY:
2033
0
    case ICMP4_TIMESTAMP:
2034
0
    case ICMP4_TIMESTAMPREPLY:
2035
0
    case ICMP4_INFOREQUEST:
2036
0
    case ICMP4_INFOREPLY:
2037
0
        if (icmp->icmp_code != 0) {
2038
0
            return false;
2039
0
        }
2040
        /* Separate ICMP connection: identified using id */
2041
0
        key->src.icmp_id = key->dst.icmp_id = icmp->icmp_fields.echo.id;
2042
0
        key->src.icmp_type = icmp->icmp_type;
2043
0
        key->dst.icmp_type = reverse_icmp_type(icmp->icmp_type);
2044
0
        break;
2045
0
    case ICMP4_DST_UNREACH:
2046
0
    case ICMP4_TIME_EXCEEDED:
2047
0
    case ICMP4_PARAM_PROB:
2048
0
    case ICMP4_SOURCEQUENCH:
2049
0
    case ICMP4_REDIRECT: {
2050
        /* ICMP packet that is part of another connection.  We should
2051
         * extract the key from the embedded packet header. */
2052
0
        struct conn_key inner_key;
2053
0
        const char *l3 = (const char *) (icmp + 1);
2054
0
        const char *tail = (const char *) data + size;
2055
0
        const char *l4;
2056
2057
0
        if (!related) {
2058
0
            return false;
2059
0
        }
2060
2061
0
        memset(&inner_key, 0, sizeof inner_key);
2062
0
        inner_key.dl_type = htons(ETH_TYPE_IP);
2063
0
        bool ok = extract_l3_ipv4(NULL, &inner_key, l3, tail - l3, &l4);
2064
0
        if (!ok) {
2065
0
            return false;
2066
0
        }
2067
2068
0
        if (inner_key.src.addr.ipv4 != key->dst.addr.ipv4) {
2069
0
            return false;
2070
0
        }
2071
2072
0
        key->src = inner_key.src;
2073
0
        key->dst = inner_key.dst;
2074
0
        key->nw_proto = inner_key.nw_proto;
2075
0
        size_t check_len = ICMP_ERROR_DATA_L4_LEN;
2076
2077
0
        ok = extract_l4(NULL, key, l4, tail - l4, NULL, l3, &check_len);
2078
0
        if (ok) {
2079
0
            conn_key_reverse(key);
2080
0
            *related = true;
2081
0
        }
2082
0
        return ok;
2083
0
    }
2084
0
    default:
2085
0
        return false;
2086
0
    }
2087
2088
0
    return true;
2089
0
}
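A concrete, purely illustrative walk-through of the error branch above: suppose 10.0.0.1:34567 has a TCP connection to 10.0.0.2:80 and a router sends ICMP4_DST_UNREACH back to 10.0.0.1.  The embedded IPv4/TCP header still reads src = 10.0.0.1:34567, dst = 10.0.0.2:80; after the sanity check that the embedded source matches the outer destination, conn_key_reverse() turns the extracted key into src = 10.0.0.2:80, dst = 10.0.0.1:34567, which is exactly the reply direction of the existing connection, so the lookup finds it and the packet is flagged as related instead of spawning a new entry.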
2090
2091
static uint8_t
2092
reverse_icmp6_type(uint8_t type)
2093
0
{
2094
0
    switch (type) {
2095
0
    case ICMP6_ECHO_REQUEST:
2096
0
        return ICMP6_ECHO_REPLY;
2097
0
    case ICMP6_ECHO_REPLY:
2098
0
        return ICMP6_ECHO_REQUEST;
2099
0
    default:
2100
0
        OVS_NOT_REACHED();
2101
0
    }
2102
0
}
2103
2104
/* If 'related' is not NULL and the function is processing an ICMP
2105
 * error packet, extract the l3 and l4 fields from the nested header
2106
 * instead and set *related to true.  If 'related' is NULL we're
2107
 * already processing a nested header and no such recursion is
2108
 * possible */
2109
static inline bool
2110
extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
2111
                 bool *related)
2112
0
{
2113
0
    const struct icmp6_header *icmp6 = data;
2114
2115
    /* All the messages that we support need at least 4 bytes after
2116
     * the header */
2117
0
    if (size < sizeof *icmp6 + 4) {
2118
0
        return false;
2119
0
    }
2120
2121
0
    switch (icmp6->icmp6_type) {
2122
0
    case ICMP6_ECHO_REQUEST:
2123
0
    case ICMP6_ECHO_REPLY:
2124
0
        if (icmp6->icmp6_code != 0) {
2125
0
            return false;
2126
0
        }
2127
        /* Separate ICMP connection: identified using id */
2128
0
        key->src.icmp_id = key->dst.icmp_id = *(ovs_be16 *) (icmp6 + 1);
2129
0
        key->src.icmp_type = icmp6->icmp6_type;
2130
0
        key->dst.icmp_type = reverse_icmp6_type(icmp6->icmp6_type);
2131
0
        break;
2132
0
    case ICMP6_DST_UNREACH:
2133
0
    case ICMP6_PACKET_TOO_BIG:
2134
0
    case ICMP6_TIME_EXCEEDED:
2135
0
    case ICMP6_PARAM_PROB: {
2136
        /* ICMP packet that is part of another connection.  We should
2137
         * extract the key from the embedded packet header. */
2138
0
        struct conn_key inner_key;
2139
0
        const char *l3 = (const char *) icmp6 + 8;
2140
0
        const char *tail = (const char *) data + size;
2141
0
        const char *l4 = NULL;
2142
2143
0
        if (!related) {
2144
0
            return false;
2145
0
        }
2146
2147
0
        memset(&inner_key, 0, sizeof inner_key);
2148
0
        inner_key.dl_type = htons(ETH_TYPE_IPV6);
2149
0
        bool ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
2150
0
        if (!ok) {
2151
0
            return false;
2152
0
        }
2153
2154
        /* pf doesn't do this, but it seems a good idea */
2155
0
        if (!ipv6_addr_equals(&inner_key.src.addr.ipv6,
2156
0
                              &key->dst.addr.ipv6)) {
2157
0
            return false;
2158
0
        }
2159
2160
0
        key->src = inner_key.src;
2161
0
        key->dst = inner_key.dst;
2162
0
        key->nw_proto = inner_key.nw_proto;
2163
2164
0
        ok = extract_l4(NULL, key, l4, tail - l4, NULL, l3, NULL);
2165
0
        if (ok) {
2166
0
            conn_key_reverse(key);
2167
0
            *related = true;
2168
0
        }
2169
0
        return ok;
2170
0
    }
2171
0
    default:
2172
0
        return false;
2173
0
    }
2174
2175
0
    return true;
2176
0
}
2177
2178
/* Extract l4 fields into 'key', which must already contain valid l3
2179
 * members.
2180
 *
2181
 * If 'related' is not NULL and an ICMP error packet is being
2182
 * processed, the function will extract the key from the packet nested
2183
 * in the ICMP payload and set '*related' to true.
2184
 *
2185
 * 'size' here is the layer 4 size, which can be a nested size if parsing
2186
 * an ICMP or ICMP6 header.
2187
 *
2188
 * If 'related' is NULL, it means that we're already parsing a header nested
2189
 * in an ICMP error.  In this case, we skip the checksum and some length
2190
 * validations. */
2191
static inline bool
2192
extract_l4(struct dp_packet *pkt, struct conn_key *key, const void *data,
2193
           size_t size, bool *related, const void *l3, size_t *chk_len)
2194
0
{
2195
0
    if (key->nw_proto == IPPROTO_TCP) {
2196
0
        return (!related || check_l4_tcp(pkt, key, data, size, l3))
2197
0
               && extract_l4_tcp(key, data, size, chk_len);
2198
0
    } else if (key->nw_proto == IPPROTO_UDP) {
2199
0
        return (!related || check_l4_udp(pkt, key, data, size, l3))
2200
0
               && extract_l4_udp(key, data, size, chk_len);
2201
0
    } else if (key->nw_proto == IPPROTO_SCTP) {
2202
0
        return (!related || check_l4_sctp(pkt, data, size))
2203
0
               && extract_l4_sctp(key, data, size, chk_len);
2204
0
    } else if (key->dl_type == htons(ETH_TYPE_IP)
2205
0
               && key->nw_proto == IPPROTO_ICMP) {
2206
0
        return (!related || check_l4_icmp(pkt, data, size))
2207
0
               && extract_l4_icmp(key, data, size, related, chk_len);
2208
0
    } else if (key->dl_type == htons(ETH_TYPE_IPV6)
2209
0
               && key->nw_proto == IPPROTO_ICMPV6) {
2210
0
        return (!related || check_l4_icmp6(pkt, key, data, size, l3))
2211
0
               && extract_l4_icmp6(key, data, size, related);
2212
0
    }
2213
2214
    /* For all other protocols we do not have L4 keys, so keep them zero. */
2215
0
    return true;
2216
0
}
2217
2218
static bool
2219
conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
2220
                 struct conn_lookup_ctx *ctx, uint16_t zone)
2221
0
{
2222
0
    const struct eth_header *l2 = dp_packet_eth(pkt);
2223
0
    const struct ip_header *l3 = dp_packet_l3(pkt);
2224
0
    const char *l4 = dp_packet_l4(pkt);
2225
2226
0
    memset(ctx, 0, sizeof *ctx);
2227
2228
0
    if (!l2 || !l3 || !l4) {
2229
0
        return false;
2230
0
    }
2231
2232
0
    ctx->key.zone = zone;
2233
2234
    /* XXX In this function we parse the packet (again, it has already
2235
     * gone through miniflow_extract()) for two reasons:
2236
     *
2237
     * 1) To extract the l3 addresses and l4 ports.
2238
     *    We already have the l3 and l4 headers' pointers.  Extracting
2239
     *    the l3 addresses and the l4 ports is really cheap, since they
2240
     *    can be found at fixed locations.
2241
     * 2) To extract the l4 type.
2242
     *    Extracting the l4 types, for IPv6 can be quite expensive, because
2243
     *    it's not at a fixed location.
2244
     *
2245
     * Here's a way to avoid (2) with the help of the datapath.
2246
     * The datapath doesn't keep the packet's extracted flow[1], so
2247
     * using that is not an option.  We could use the packet's matching
2248
     * megaflow, but we have to make sure that the l4 type (nw_proto)
2249
     * is unwildcarded.  This means either:
2250
     *
2251
     * a) dpif-netdev unwildcards the l4 type when a new flow is installed
2252
     *    if the actions contains ct().
2253
     *
2254
     * b) ofproto-dpif-xlate unwildcards the l4 type when translating a ct()
2255
     *    action.  This is already done in different actions, but it's
2256
     *    unnecessary for the kernel.
2257
     *
2258
     * ---
2259
     * [1] The reasons for this are that keeping the flow increases
2260
     *     (slightly) the cache footprint and increases computation
2261
     *     time as we move the packet around. Most importantly, the flow
2262
     *     should be updated by the actions and this can be slow, as
2263
     *     we use a sparse representation (miniflow).
2264
     *
2265
     */
2266
0
    bool ok;
2267
0
    ctx->key.dl_type = dl_type;
2268
2269
0
    if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
2270
0
        if (dp_packet_ip_checksum_bad(pkt)) {
2271
0
            ok = false;
2272
0
            COVERAGE_INC(conntrack_l3csum_err);
2273
0
        } else {
2274
            /* Validate the checksum only when hwol is not supported and the
2275
             * packet's checksum status is not known. */
2276
0
            ok = extract_l3_ipv4(pkt, &ctx->key, l3, dp_packet_l3_size(pkt),
2277
0
                                 NULL);
2278
0
        }
2279
0
    } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
2280
0
        ok = extract_l3_ipv6(&ctx->key, l3, dp_packet_l3_size(pkt), NULL);
2281
0
    } else {
2282
0
        ok = false;
2283
0
    }
2284
2285
0
    if (ok) {
2286
0
        if (!dp_packet_l4_checksum_bad(pkt)) {
2287
            /* Validate the checksum only when hwol is not supported. */
2288
0
            if (extract_l4(pkt, &ctx->key, l4, dp_packet_l4_size(pkt),
2289
0
                           &ctx->icmp_related, l3, NULL)) {
2290
0
                ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
2291
0
                return true;
2292
0
            }
2293
0
        } else {
2294
0
            COVERAGE_INC(conntrack_l4csum_err);
2295
0
        }
2296
0
    }
2297
2298
0
    return false;
2299
0
}
2300
2301
static uint32_t
2302
ct_addr_hash_add(uint32_t hash, const union ct_addr *addr)
2303
0
{
2304
0
    BUILD_ASSERT_DECL(sizeof *addr % 4 == 0);
2305
0
    return hash_add_bytes32(hash, (const uint32_t *) addr, sizeof *addr);
2306
0
}
2307
2308
static uint32_t
2309
ct_endpoint_hash_add(uint32_t hash, const struct ct_endpoint *ep)
2310
0
{
2311
0
    BUILD_ASSERT_DECL(sizeof *ep % 4 == 0);
2312
0
    return hash_add_bytes32(hash, (const uint32_t *) ep, sizeof *ep);
2313
0
}
2314

2315
/* Symmetric */
2316
static uint32_t
2317
conn_key_hash(const struct conn_key *key, uint32_t basis)
2318
0
{
2319
0
    uint32_t hsrc, hdst, hash;
2320
0
    hsrc = hdst = basis;
2321
0
    hsrc = ct_endpoint_hash_add(hsrc, &key->src);
2322
0
    hdst = ct_endpoint_hash_add(hdst, &key->dst);
2323
2324
    /* Even if source and destination are swapped the hash will be the same. */
2325
0
    hash = hsrc ^ hdst;
2326
2327
    /* Hash the rest of the key (L3 and L4 types and zone). */
2328
0
    return hash_words((uint32_t *) (&key->dst + 1),
2329
0
                      (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
2330
0
                      hash);
2331
0
}
2332
2333
static void
2334
conn_key_reverse(struct conn_key *key)
2335
0
{
2336
0
    struct ct_endpoint tmp = key->src;
2337
0
    key->src = key->dst;
2338
0
    key->dst = tmp;
2339
0
}
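The "Symmetric" claim above can be restated as an invariant: because the two endpoint hashes are combined with XOR and the trailing fields (dl_type, nw_proto, zone) are direction-independent, swapping src and dst does not change the result.  A hypothetical self-check, usable only inside this file since both helpers are static:

static void
check_conn_key_hash_symmetry(const struct conn_key *key, uint32_t basis)
{
    struct conn_key rev = *key;

    conn_key_reverse(&rev);
    /* Must hold for any key and any basis. */
    ovs_assert(conn_key_hash(key, basis) == conn_key_hash(&rev, basis));
}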
2340
2341
static uint32_t
2342
nat_ipv6_addrs_delta(const struct in6_addr *ipv6_min,
2343
                     const struct in6_addr *ipv6_max)
2344
0
{
2345
0
    const uint8_t *ipv6_min_hi = &ipv6_min->s6_addr[0];
2346
0
    const uint8_t *ipv6_min_lo = &ipv6_min->s6_addr[0] +  sizeof(uint64_t);
2347
0
    const uint8_t *ipv6_max_hi = &ipv6_max->s6_addr[0];
2348
0
    const uint8_t *ipv6_max_lo = &ipv6_max->s6_addr[0] + sizeof(uint64_t);
2349
2350
0
    ovs_be64 addr6_64_min_hi;
2351
0
    ovs_be64 addr6_64_min_lo;
2352
0
    memcpy(&addr6_64_min_hi, ipv6_min_hi, sizeof addr6_64_min_hi);
2353
0
    memcpy(&addr6_64_min_lo, ipv6_min_lo, sizeof addr6_64_min_lo);
2354
2355
0
    ovs_be64 addr6_64_max_hi;
2356
0
    ovs_be64 addr6_64_max_lo;
2357
0
    memcpy(&addr6_64_max_hi, ipv6_max_hi, sizeof addr6_64_max_hi);
2358
0
    memcpy(&addr6_64_max_lo, ipv6_max_lo, sizeof addr6_64_max_lo);
2359
2360
0
    uint64_t diff;
2361
2362
0
    if (addr6_64_min_hi == addr6_64_max_hi &&
2363
0
        ntohll(addr6_64_min_lo) <= ntohll(addr6_64_max_lo)) {
2364
0
        diff = ntohll(addr6_64_max_lo) - ntohll(addr6_64_min_lo);
2365
0
    } else if (ntohll(addr6_64_min_hi) + 1 == ntohll(addr6_64_max_hi) &&
2366
0
               ntohll(addr6_64_min_lo) > ntohll(addr6_64_max_lo)) {
2367
0
        diff = UINT64_MAX - (ntohll(addr6_64_min_lo) -
2368
0
                             ntohll(addr6_64_max_lo) - 1);
2369
0
    } else {
2370
        /* Limit address delta supported to 32 bits or 4 billion approximately.
2371
         * Possibly, this should be visible to the user through a datapath
2372
         * support check, however the practical impact is probably nil. */
2373
0
        diff = 0xfffffffe;
2374
0
    }
2375
2376
0
    if (diff > 0xfffffffe) {
2377
0
        diff = 0xfffffffe;
2378
0
    }
2379
0
    return diff;
2380
0
}
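An illustrative pair for the delta computation above: with ipv6_min = 2001:db8::1 and ipv6_max = 2001:db8::ff the upper 64 bits are equal, so diff = 0xff - 0x1 = 0xfe; a range whose low 64-bit halves wrap across an upper-half increment takes the second branch, and anything wider than 32 bits is clamped to 0xfffffffe so that the hash-based offset used later in get_addr_in_range() stays within a 32-bit range.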
2381
2382
/* This function must be used in tandem with nat_ipv6_addrs_delta(), which
2383
 * restricts the input parameters. */
2384
static void
2385
nat_ipv6_addr_increment(struct in6_addr *ipv6, uint32_t increment)
2386
0
{
2387
0
    uint8_t *ipv6_hi = &ipv6->s6_addr[0];
2388
0
    uint8_t *ipv6_lo = &ipv6->s6_addr[0] + sizeof(ovs_be64);
2389
0
    ovs_be64 addr6_64_hi;
2390
0
    ovs_be64 addr6_64_lo;
2391
0
    memcpy(&addr6_64_hi, ipv6_hi, sizeof addr6_64_hi);
2392
0
    memcpy(&addr6_64_lo, ipv6_lo, sizeof addr6_64_lo);
2393
2394
0
    if (UINT64_MAX - increment >= ntohll(addr6_64_lo)) {
2395
0
        addr6_64_lo = htonll(increment + ntohll(addr6_64_lo));
2396
0
    } else if (addr6_64_hi != OVS_BE64_MAX) {
2397
0
        addr6_64_hi = htonll(1 + ntohll(addr6_64_hi));
2398
0
        addr6_64_lo = htonll(increment - (UINT64_MAX -
2399
0
                                          ntohll(addr6_64_lo) + 1));
2400
0
    } else {
2401
0
        OVS_NOT_REACHED();
2402
0
    }
2403
2404
0
    memcpy(ipv6_hi, &addr6_64_hi, sizeof addr6_64_hi);
2405
0
    memcpy(ipv6_lo, &addr6_64_lo, sizeof addr6_64_lo);
2406
0
}
2407
2408
static uint32_t
2409
nat_range_hash(const struct conn_key *key, uint32_t basis,
2410
               const struct nat_action_info_t *nat_info)
2411
0
{
2412
0
    uint32_t hash = basis;
2413
2414
0
    if (!basis) {
2415
0
        hash = ct_addr_hash_add(hash, &key->src.addr);
2416
0
    } else {
2417
0
        hash = ct_endpoint_hash_add(hash, &key->src);
2418
0
        hash = ct_endpoint_hash_add(hash, &key->dst);
2419
0
    }
2420
2421
0
    hash = ct_addr_hash_add(hash, &nat_info->min_addr);
2422
0
    hash = ct_addr_hash_add(hash, &nat_info->max_addr);
2423
0
    hash = hash_add(hash,
2424
0
                    ((uint32_t) nat_info->max_port << 16)
2425
0
                    | nat_info->min_port);
2426
0
    hash = hash_add(hash, (OVS_FORCE uint32_t) key->dl_type);
2427
0
    hash = hash_add(hash, key->nw_proto);
2428
0
    hash = hash_add(hash, key->zone);
2429
    /* The purpose of the second parameter is to distinguish hashes of data of
2430
     * different length; our data always has the same length so there is no
2431
     * value in counting. */
2432
0
    return hash_finish(hash, 0);
2433
0
}
2434
2435
/* Ports are stored in host byte order for convenience. */
2436
static void
2437
set_sport_range(const struct nat_action_info_t *ni, const struct conn_key *k,
2438
                uint32_t off, uint16_t *curr, uint16_t *min,
2439
                uint16_t *max)
2440
0
{
2441
0
    if (((ni->nat_action & NAT_ACTION_SNAT_ALL) == NAT_ACTION_SRC) ||
2442
0
        ((ni->nat_action & NAT_ACTION_DST))) {
2443
0
        *curr = ntohs(k->src.port);
2444
0
        if (*curr < 512) {
2445
0
            *min = 1;
2446
0
            *max = 511;
2447
0
        } else if (*curr < 1024) {
2448
0
            *min = 600;
2449
0
            *max = 1023;
2450
0
        } else {
2451
0
            *min = MIN_NAT_EPHEMERAL_PORT;
2452
0
            *max = MAX_NAT_EPHEMERAL_PORT;
2453
0
        }
2454
0
    } else {
2455
0
        *min = ni->min_port;
2456
0
        *max = ni->max_port;
2457
0
        *curr =  *min + (off % ((*max - *min) + 1));
2458
0
    }
2459
0
}
2460
2461
static void
2462
set_dport_range(const struct nat_action_info_t *ni, const struct conn_key *k,
2463
                uint32_t off, uint16_t *curr, uint16_t *min,
2464
                uint16_t *max)
2465
0
{
2466
0
    if (ni->nat_action & NAT_ACTION_DST_PORT) {
2467
0
        *min = ni->min_port;
2468
0
        *max = ni->max_port;
2469
0
        *curr = *min + (off % ((*max - *min) + 1));
2470
0
    } else {
2471
0
        *curr = ntohs(k->dst.port);
2472
0
        *min = *max = *curr;
2473
0
    }
2474
0
}
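Illustrative outcomes of the source-port policy above (example values only): a sender port of 22 is tried first and, failing that, a replacement is searched within the low band [1, 511]; a sender port in [512, 1023] is constrained to [600, 1023]; anything from 1024 up falls into [MIN_NAT_EPHEMERAL_PORT, MAX_NAT_EPHEMERAL_PORT].  Only when the NAT action carries an explicit source-port range does 'curr' come from the hash offset, computed as *min + (off % (*max - *min + 1)), which necessarily lands inside [*min, *max].  set_dport_range() mirrors this: with NAT_ACTION_DST_PORT the destination port is picked from the configured range, otherwise it is left as the packet's original destination port.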
2475
2476
/* Gets an in range address based on the hash.
2477
 * Addresses are kept in network order. */
2478
static void
2479
get_addr_in_range(union ct_addr *min, union ct_addr *max,
2480
                  union ct_addr *curr, uint32_t hash, bool ipv4)
2481
0
{
2482
0
    uint32_t offt, range;
2483
2484
0
    if (ipv4) {
2485
0
        range = (ntohl(max->ipv4) - ntohl(min->ipv4)) + 1;
2486
0
        offt = hash % range;
2487
0
        curr->ipv4 = htonl(ntohl(min->ipv4) + offt);
2488
0
    } else {
2489
0
        range = nat_ipv6_addrs_delta(&min->ipv6, &max->ipv6) + 1;
2490
        /* Range must be within 32 bits for full hash coverage. A 64 or
2491
         * 128 bit hash is unnecessary and hence not used here. Most code
2492
         * is kept common with V4; nat_ipv6_addrs_delta() will do the
2493
         * enforcement via max_ct_addr. */
2494
0
        offt = hash % range;
2495
0
        curr->ipv6 = min->ipv6;
2496
0
        nat_ipv6_addr_increment(&curr->ipv6, offt);
2497
0
    }
2498
0
}
2499
2500
static void
2501
find_addr(const struct conn_key *key, union ct_addr *min,
2502
          union ct_addr *max, union ct_addr *curr,
2503
          uint32_t hash, bool ipv4,
2504
          const struct nat_action_info_t *nat_info)
2505
0
{
2506
0
    union ct_addr zero_ip;
2507
2508
0
    memset(&zero_ip, 0, sizeof zero_ip);
2509
2510
    /* All-zero case. */
2511
0
    if (!memcmp(min, &zero_ip, sizeof *min)) {
2512
0
        if (nat_info->nat_action & NAT_ACTION_SRC) {
2513
0
            *curr = key->src.addr;
2514
0
        } else if (nat_info->nat_action & NAT_ACTION_DST) {
2515
0
            *curr = key->dst.addr;
2516
0
        }
2517
0
    } else {
2518
0
        get_addr_in_range(min, max, curr, hash, ipv4);
2519
0
    }
2520
0
}
2521
2522
static void
2523
store_addr_to_key(union ct_addr *addr, struct conn_key *key,
2524
                  uint16_t action)
2525
0
{
2526
0
    if (action & NAT_ACTION_SRC) {
2527
0
        key->dst.addr = *addr;
2528
0
    } else {
2529
0
        key->src.addr = *addr;
2530
0
    }
2531
0
}
2532
2533
static bool
2534
nat_get_unique_l4(struct conntrack *ct, struct conn_key *rev_key,
2535
                  ovs_be16 *port, uint16_t curr, uint16_t min,
2536
                  uint16_t max)
2537
0
{
2538
0
    static const unsigned int max_attempts = 128;
2539
0
    uint16_t range = max - min + 1;
2540
0
    unsigned int attempts;
2541
0
    uint16_t orig = curr;
2542
0
    unsigned int i = 0;
2543
2544
0
    attempts = range;
2545
0
    if (attempts > max_attempts) {
2546
0
        attempts = max_attempts;
2547
0
    }
2548
2549
0
another_round:
2550
0
    i = 0;
2551
0
    FOR_EACH_PORT_IN_RANGE (curr, min, max) {
2552
0
        if (i++ >= attempts) {
2553
0
            break;
2554
0
        }
2555
2556
0
        *port = htons(curr);
2557
0
        if (!conn_lookup(ct, rev_key, time_msec(), NULL, NULL)) {
2558
0
            return true;
2559
0
        }
2560
0
    }
2561
2562
0
    if (attempts < range && attempts >= 16) {
2563
0
        attempts /= 2;
2564
0
        curr = min + (random_uint32() % range);
2565
0
        goto another_round;
2566
0
    }
2567
2568
0
    *port = htons(orig);
2569
2570
0
    return false;
2571
0
}
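In words, the search above: at most 128 sequential candidates are probed starting from the current candidate port; if the configured range is larger than that and at least 16 attempts remain, the budget is halved and the scan restarts from a random port inside the range, so the total work stays bounded (128 + 64 + 32 + 16 + 8 = 248 lookups in the worst case) while still giving wide ranges a chance to find a free port.  On failure the original port is restored and exhaustion is reported to the caller.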
2572
2573
/* This function tries to get a unique tuple.
2574
 * Every iteration checks that the reverse tuple doesn't
2575
 * collide with any existing one.
2576
 *
2577
 * In case of SNAT:
2578
 *    - Pick a src IP address in the range.
2579
 *        - Try to find a source port in range (if any).
2580
 *        - If no port range exists, use the whole
2581
 *          ephemeral range (after testing the port
2582
 *          used by the sender), otherwise use the
2583
 *          specified range.
2584
 *
2585
 * In case of DNAT:
2586
 *    - Pick a dst IP address in the range.
2587
 *        - For each dport in range (if any) tries to find
2588
 *          a unique tuple.
2589
 *        - Eventually, if the previous attempt fails,
2590
 *          tries to find a source port in the ephemeral
2591
 *          range (after testing the port used by the sender).
2592
 *
2593
 * If none can be found, return exhaustion to the caller. */
2594
static bool
2595
nat_get_unique_tuple(struct conntrack *ct, struct conn *conn,
2596
                     const struct nat_action_info_t *nat_info)
2597
0
{
2598
0
    struct conn_key *fwd_key = &conn->key_node[CT_DIR_FWD].key;
2599
0
    struct conn_key *rev_key = &conn->key_node[CT_DIR_REV].key;
2600
0
    bool pat_proto = fwd_key->nw_proto == IPPROTO_TCP ||
2601
0
                     fwd_key->nw_proto == IPPROTO_UDP ||
2602
0
                     fwd_key->nw_proto == IPPROTO_SCTP;
2603
0
    uint16_t min_dport, max_dport, curr_dport;
2604
0
    uint16_t min_sport, max_sport, curr_sport;
2605
0
    union ct_addr min_addr, max_addr, addr;
2606
0
    uint32_t hash, port_off, basis;
2607
2608
0
    memset(&min_addr, 0, sizeof min_addr);
2609
0
    memset(&max_addr, 0, sizeof max_addr);
2610
0
    memset(&addr, 0, sizeof addr);
2611
2612
0
    basis = (nat_info->nat_flags & NAT_PERSISTENT) ? 0 : ct->hash_basis;
2613
0
    hash = nat_range_hash(fwd_key, basis, nat_info);
2614
2615
0
    if (nat_info->nat_flags & NAT_RANGE_RANDOM) {
2616
0
        port_off = random_uint32();
2617
0
    } else if (basis) {
2618
0
        port_off = hash;
2619
0
    } else {
2620
0
        port_off = nat_range_hash(fwd_key, ct->hash_basis, nat_info);
2621
0
    }
2622
2623
0
    min_addr = nat_info->min_addr;
2624
0
    max_addr = nat_info->max_addr;
2625
2626
0
    find_addr(fwd_key, &min_addr, &max_addr, &addr, hash,
2627
0
              (fwd_key->dl_type == htons(ETH_TYPE_IP)), nat_info);
2628
2629
0
    set_sport_range(nat_info, fwd_key, port_off, &curr_sport,
2630
0
                    &min_sport, &max_sport);
2631
0
    set_dport_range(nat_info, fwd_key, port_off, &curr_dport,
2632
0
                    &min_dport, &max_dport);
2633
2634
0
    if (pat_proto) {
2635
0
        rev_key->src.port = htons(curr_dport);
2636
0
        rev_key->dst.port = htons(curr_sport);
2637
0
    }
2638
2639
0
    store_addr_to_key(&addr, rev_key, nat_info->nat_action);
2640
2641
0
    if (!pat_proto) {
2642
0
        return !conn_lookup(ct, rev_key, time_msec(), NULL, NULL);
2643
0
    }
2644
2645
0
    bool found = false;
2646
0
    if (nat_info->nat_action & NAT_ACTION_DST_PORT) {
2647
0
        found = nat_get_unique_l4(ct, rev_key, &rev_key->src.port,
2648
0
                                  curr_dport, min_dport, max_dport);
2649
0
    }
2650
2651
0
    if (!found) {
2652
0
        found = nat_get_unique_l4(ct, rev_key, &rev_key->dst.port,
2653
0
                                  curr_sport, min_sport, max_sport);
2654
0
    }
2655
2656
0
    if (found) {
2657
0
        return true;
2658
0
    }
2659
2660
0
    return false;
2661
0
}
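A purely illustrative SNAT walk-through of the algorithm described above: for a TCP packet from 192.168.1.5:34567 with a NAT range of 10.0.0.100-10.0.0.110 and no explicit port range, find_addr() picks one of the 11 addresses from the range hash, store_addr_to_key() writes it into the reverse key's destination, set_sport_range() selects the ephemeral band starting from 34567, and nat_get_unique_l4() only moves away from that port if the resulting reverse tuple already collides with an existing connection.  For protocols without ports to translate (e.g. ICMP) the function reduces to a single conn_lookup() on the rewritten reverse key.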
2662
2663
static enum ct_update_res
2664
conn_update(struct conntrack *ct, struct conn *conn, struct dp_packet *pkt,
2665
            struct conn_lookup_ctx *ctx, long long now)
2666
0
{
2667
0
    ovs_mutex_lock(&conn->lock);
2668
0
    uint8_t nw_proto = conn->key_node[CT_DIR_FWD].key.nw_proto;
2669
0
    enum ct_update_res update_res =
2670
0
        l4_protos[nw_proto]->conn_update(ct, conn, pkt, ctx->reply, now);
2671
0
    ovs_mutex_unlock(&conn->lock);
2672
0
    return update_res;
2673
0
}
2674
2675
static void
2676
conn_expire_push_front(struct conntrack *ct, struct conn *conn)
2677
    OVS_REQUIRES(ct->ct_lock)
2678
0
{
2679
0
    unsigned int curr = ct->next_list;
2680
2681
0
    ct->next_list = (ct->next_list + 1) % N_EXP_LISTS;
2682
0
    rculist_push_front(&ct->exp_lists[curr], &conn->node);
2683
0
}
2684
2685
static long long int
2686
conn_expiration(const struct conn *conn)
2687
0
{
2688
0
    long long int expiration;
2689
2690
0
    atomic_read_relaxed(&CONST_CAST(struct conn *, conn)->expiration,
2691
0
                        &expiration);
2692
0
    return expiration;
2693
0
}
2694
2695
static bool
2696
conn_expired(const struct conn *conn, long long now)
2697
0
{
2698
0
    return now >= conn_expiration(conn);
2699
0
}
2700
2701
static bool
2702
valid_new(struct dp_packet *pkt, struct conn_key *key)
2703
0
{
2704
0
    return l4_protos[key->nw_proto]->valid_new(pkt);
2705
0
}
2706
2707
static struct conn *
2708
new_conn(struct conntrack *ct, struct dp_packet *pkt, struct conn_key *key,
2709
         long long now, uint32_t tp_id)
2710
0
{
2711
0
    return l4_protos[key->nw_proto]->new_conn(ct, pkt, now, tp_id);
2712
0
}
2713
2714
static void
2715
delete_conn__(struct conn *conn)
2716
0
{
2717
0
    free(conn->alg);
2718
0
    free(conn);
2719
0
}
2720
2721
static void
2722
delete_conn(struct conn *conn)
2723
0
{
2724
0
    ovs_mutex_destroy(&conn->lock);
2725
0
    delete_conn__(conn);
2726
0
}
2727
2728

2729
/* Convert a conntrack address 'a' into an IP address 'b' based on 'dl_type'.
2730
 *
2731
 * Note that 'dl_type' should be either "ETH_TYPE_IP" or "ETH_TYPE_IPV6"
2732
 * in network-byte order. */
2733
static void
2734
ct_endpoint_to_ct_dpif_inet_addr(const union ct_addr *a,
2735
                                 union ct_dpif_inet_addr *b,
2736
                                 ovs_be16 dl_type)
2737
0
{
2738
0
    if (dl_type == htons(ETH_TYPE_IP)) {
2739
0
        b->ip = a->ipv4;
2740
0
    } else if (dl_type == htons(ETH_TYPE_IPV6)){
2741
0
        b->in6 = a->ipv6;
2742
0
    }
2743
0
}
2744
2745
/* Convert an IP address 'a' into a conntrack address 'b' based on 'dl_type'.
2746
 *
2747
 * Note that 'dl_type' should be either "ETH_TYPE_IP" or "ETH_TYPE_IPV6"
2748
 * in network-byte order. */
2749
static void
2750
ct_dpif_inet_addr_to_ct_endpoint(const union ct_dpif_inet_addr *a,
2751
                                 union ct_addr *b, ovs_be16 dl_type)
2752
0
{
2753
0
    if (dl_type == htons(ETH_TYPE_IP)) {
2754
0
        b->ipv4 = a->ip;
2755
0
    } else if (dl_type == htons(ETH_TYPE_IPV6)){
2756
0
        b->ipv6 = a->in6;
2757
0
    }
2758
0
}
2759
2760
static void
2761
conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple)
2762
0
{
2763
0
    if (key->dl_type == htons(ETH_TYPE_IP)) {
2764
0
        tuple->l3_type = AF_INET;
2765
0
    } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
2766
0
        tuple->l3_type = AF_INET6;
2767
0
    }
2768
0
    tuple->ip_proto = key->nw_proto;
2769
0
    ct_endpoint_to_ct_dpif_inet_addr(&key->src.addr, &tuple->src,
2770
0
                                     key->dl_type);
2771
0
    ct_endpoint_to_ct_dpif_inet_addr(&key->dst.addr, &tuple->dst,
2772
0
                                     key->dl_type);
2773
2774
0
    if (key->nw_proto == IPPROTO_ICMP || key->nw_proto == IPPROTO_ICMPV6) {
2775
0
        tuple->icmp_id = key->src.icmp_id;
2776
0
        tuple->icmp_type = key->src.icmp_type;
2777
0
        tuple->icmp_code = key->src.icmp_code;
2778
0
    } else {
2779
0
        tuple->src_port = key->src.port;
2780
0
        tuple->dst_port = key->dst.port;
2781
0
    }
2782
0
}
2783
2784
static void
2785
tuple_to_conn_key(const struct ct_dpif_tuple *tuple, uint16_t zone,
2786
                  struct conn_key *key)
2787
0
{
2788
0
    if (tuple->l3_type == AF_INET) {
2789
0
        key->dl_type = htons(ETH_TYPE_IP);
2790
0
    } else if (tuple->l3_type == AF_INET6) {
2791
0
        key->dl_type = htons(ETH_TYPE_IPV6);
2792
0
    }
2793
0
    key->nw_proto = tuple->ip_proto;
2794
0
    ct_dpif_inet_addr_to_ct_endpoint(&tuple->src, &key->src.addr,
2795
0
                                     key->dl_type);
2796
0
    ct_dpif_inet_addr_to_ct_endpoint(&tuple->dst, &key->dst.addr,
2797
0
                                     key->dl_type);
2798
2799
0
    if (tuple->ip_proto == IPPROTO_ICMP || tuple->ip_proto == IPPROTO_ICMPV6) {
2800
0
        key->src.icmp_id = tuple->icmp_id;
2801
0
        key->src.icmp_type = tuple->icmp_type;
2802
0
        key->src.icmp_code = tuple->icmp_code;
2803
0
        key->dst.icmp_id = tuple->icmp_id;
2804
0
        key->dst.icmp_type = (tuple->ip_proto == IPPROTO_ICMP)
2805
0
                             ? reverse_icmp_type(tuple->icmp_type)
2806
0
                             : reverse_icmp6_type(tuple->icmp_type);
2807
0
        key->dst.icmp_code = tuple->icmp_code;
2808
0
    } else {
2809
0
        key->src.port = tuple->src_port;
2810
0
        key->dst.port = tuple->dst_port;
2811
0
    }
2812
0
    key->zone = zone;
2813
0
}
2814
2815
static void
2816
conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
2817
                      long long now)
2818
0
{
2819
0
    const struct conn_key *rev_key = &conn->key_node[CT_DIR_REV].key;
2820
0
    const struct conn_key *key = &conn->key_node[CT_DIR_FWD].key;
2821
2822
0
    memset(entry, 0, sizeof *entry);
2823
0
    conn_key_to_tuple(key, &entry->tuple_orig);
2824
0
    conn_key_to_tuple(rev_key, &entry->tuple_reply);
2825
2826
0
    if (conn->alg_related) {
2827
0
        conn_key_to_tuple(&conn->parent_key, &entry->tuple_parent);
2828
0
    }
2829
2830
0
    entry->zone = key->zone;
2831
2832
0
    ovs_mutex_lock(&conn->lock);
2833
0
    entry->mark = conn->mark;
2834
0
    memcpy(&entry->labels, &conn->label, sizeof entry->labels);
2835
2836
0
    long long expiration = conn_expiration(conn) - now;
2837
2838
0
    struct ct_l4_proto *class = l4_protos[key->nw_proto];
2839
0
    if (class->conn_get_protoinfo) {
2840
0
        class->conn_get_protoinfo(conn, &entry->protoinfo);
2841
0
    }
2842
0
    ovs_mutex_unlock(&conn->lock);
2843
2844
0
    entry->timeout = (expiration > 0) ? expiration / 1000 : 0;
2845
2846
0
    if (conn->alg) {
2847
        /* Caller is responsible for freeing. */
2848
0
        entry->helper.name = xstrdup(conn->alg);
2849
0
    }
2850
0
}
2851
2852
struct ipf *
2853
conntrack_ipf_ctx(struct conntrack *ct)
2854
0
{
2855
0
    return ct->ipf;
2856
0
}
2857
2858
int
2859
conntrack_dump_start(struct conntrack *ct, struct conntrack_dump *dump,
2860
                     const uint16_t *pzone, int *ptot_bkts)
2861
0
{
2862
0
    memset(dump, 0, sizeof(*dump));
2863
2864
0
    if (pzone) {
2865
0
        dump->zone = *pzone;
2866
0
        dump->filter_zone = true;
2867
0
        dump->current_zone = dump->zone;
2868
0
    }
2869
2870
0
    dump->ct = ct;
2871
0
    *ptot_bkts = 1; /* Need to clean up the callers. */
2872
0
    dump->cursor = cmap_cursor_start(&dump->ct->conns[dump->current_zone]);
2873
0
    return 0;
2874
0
}
2875
2876
int
2877
conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry)
2878
0
{
2879
0
    long long now = time_msec();
2880
2881
0
    struct conn_key_node *keyn;
2882
0
    struct conn *conn;
2883
2884
0
    while (true) {
2885
0
        CMAP_CURSOR_FOR_EACH_CONTINUE (keyn, cm_node, &dump->cursor) {
2886
0
            if (keyn->dir != CT_DIR_FWD) {
2887
0
                continue;
2888
0
            }
2889
2890
0
            conn = CONTAINER_OF(keyn, struct conn, key_node[CT_DIR_FWD]);
2891
0
            if (conn_expired(conn, now)) {
2892
0
                continue;
2893
0
            }
2894
2895
0
            conn_to_ct_dpif_entry(conn, entry, now);
2896
0
            return 0;
2897
0
        }
2898
2899
0
        if (dump->filter_zone || dump->current_zone == UINT16_MAX) {
2900
0
            break;
2901
0
        }
2902
0
        dump->current_zone++;
2903
0
        dump->cursor = cmap_cursor_start(&dump->ct->conns[dump->current_zone]);
2904
0
    }
2905
2906
0
    return EOF;
2907
0
}
2908
2909
int
2910
conntrack_dump_done(struct conntrack_dump *dump OVS_UNUSED)
2911
0
{
2912
0
    return 0;
2913
0
}
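A hedged usage sketch of the dump API above; 'ct' is assumed to exist and the loop body is a placeholder:

struct conntrack_dump dump;
struct ct_dpif_entry entry;
int tot_bkts;

conntrack_dump_start(ct, &dump, NULL /* all zones */, &tot_bkts);
while (!conntrack_dump_next(&dump, &entry)) {
    /* ... inspect 'entry' (tuples, mark, labels, timeout) ... */
    free(entry.helper.name);    /* Non-NULL only for ALG connections. */
}
conntrack_dump_done(&dump);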
2914
2915
static void
2916
exp_node_to_ct_dpif_exp(const struct alg_exp_node *exp,
2917
                        struct ct_dpif_exp *entry)
2918
0
{
2919
0
    memset(entry, 0, sizeof *entry);
2920
2921
0
    conn_key_to_tuple(&exp->key, &entry->tuple_orig);
2922
0
    conn_key_to_tuple(&exp->parent_key, &entry->tuple_parent);
2923
0
    entry->zone = exp->key.zone;
2924
0
    entry->mark = exp->parent_mark;
2925
0
    memcpy(&entry->labels, &exp->parent_label, sizeof entry->labels);
2926
0
    entry->protoinfo.proto = exp->key.nw_proto;
2927
0
}
2928
2929
int
2930
conntrack_exp_dump_start(struct conntrack *ct, struct conntrack_dump *dump,
2931
                         const uint16_t *pzone)
2932
0
{
2933
0
    memset(dump, 0, sizeof(*dump));
2934
2935
0
    if (pzone) {
2936
0
        dump->zone = *pzone;
2937
0
        dump->filter_zone = true;
2938
0
    }
2939
2940
0
    dump->ct = ct;
2941
2942
0
    return 0;
2943
0
}
2944
2945
int
2946
conntrack_exp_dump_next(struct conntrack_dump *dump, struct ct_dpif_exp *entry)
2947
0
{
2948
0
    struct conntrack *ct = dump->ct;
2949
0
    struct alg_exp_node *enode;
2950
0
    int ret = EOF;
2951
2952
0
    ovs_rwlock_rdlock(&ct->resources_lock);
2953
2954
0
    for (;;) {
2955
0
        struct hmap_node *node = hmap_at_position(&ct->alg_expectations,
2956
0
                                                  &dump->hmap_pos);
2957
0
        if (!node) {
2958
0
            break;
2959
0
        }
2960
2961
0
        enode = CONTAINER_OF(node, struct alg_exp_node, node);
2962
2963
0
        if (!dump->filter_zone || enode->key.zone == dump->zone) {
2964
0
            ret = 0;
2965
0
            exp_node_to_ct_dpif_exp(enode, entry);
2966
0
            break;
2967
0
        }
2968
0
    }
2969
2970
0
    ovs_rwlock_unlock(&ct->resources_lock);
2971
2972
0
    return ret;
2973
0
}
2974
2975
int
2976
conntrack_exp_dump_done(struct conntrack_dump *dump OVS_UNUSED)
2977
0
{
2978
0
    return 0;
2979
0
}
2980
2981
static int
2982
conntrack_flush_zone(struct conntrack *ct, const uint16_t zone)
2983
0
{
2984
0
    struct conn_key_node *keyn;
2985
0
    struct conn *conn;
2986
2987
0
    CMAP_FOR_EACH (keyn, cm_node, &ct->conns[zone]) {
2988
0
        if (keyn->dir != CT_DIR_FWD) {
2989
0
            continue;
2990
0
        }
2991
0
        conn = CONTAINER_OF(keyn, struct conn, key_node[CT_DIR_FWD]);
2992
0
        conn_clean(ct, conn);
2993
0
    }
2994
2995
0
    return 0;
2996
0
}
2997
2998
int
2999
conntrack_flush(struct conntrack *ct, const uint16_t *zone)
3000
0
{
3001
0
    if (zone) {
3002
0
        return conntrack_flush_zone(ct, *zone);
3003
0
    }
3004
3005
0
    for (unsigned i = 0; i < ARRAY_SIZE(ct->conns); i++) {
3006
0
        conntrack_flush_zone(ct, i);
3007
0
    }
3008
3009
0
    return 0;
3010
0
}
3011
3012
int
3013
conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
3014
                      uint16_t zone)
3015
0
{
3016
0
    struct conn_key key;
3017
0
    struct conn *conn;
3018
0
    int error = 0;
3019
3020
0
    memset(&key, 0, sizeof(key));
3021
0
    tuple_to_conn_key(tuple, zone, &key);
3022
0
    conn_lookup(ct, &key, time_msec(), &conn, NULL);
3023
3024
0
    if (conn) {
3025
0
        conn_clean(ct, conn);
3026
0
    } else {
3027
0
        VLOG_WARN("Tuple not found");
3028
0
        error = ENOENT;
3029
0
    }
3030
3031
0
    return error;
3032
0
}
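A small illustrative caller for the tuple flush above (addresses, ports and the 'ct' pointer are hypothetical); it relies on the tuple_to_conn_key() conversion shown earlier and on inet_pton() from <arpa/inet.h>:

/* Flush the IPv4 TCP connection 10.0.0.1:1234 -> 10.0.0.2:80 in zone 0. */
struct ct_dpif_tuple tuple = {
    .l3_type = AF_INET,
    .ip_proto = IPPROTO_TCP,
    .src_port = htons(1234),
    .dst_port = htons(80),
};

inet_pton(AF_INET, "10.0.0.1", &tuple.src.ip);
inet_pton(AF_INET, "10.0.0.2", &tuple.dst.ip);
if (conntrack_flush_tuple(ct, &tuple, 0) == ENOENT) {
    /* No such connection was being tracked. */
}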
3033
3034
int
3035
conntrack_set_maxconns(struct conntrack *ct, uint32_t maxconns)
3036
0
{
3037
0
    atomic_store_relaxed(&ct->n_conn_limit, maxconns);
3038
0
    return 0;
3039
0
}
3040
3041
int
3042
conntrack_get_maxconns(struct conntrack *ct, uint32_t *maxconns)
3043
0
{
3044
0
    atomic_read_relaxed(&ct->n_conn_limit, maxconns);
3045
0
    return 0;
3046
0
}
3047
3048
int
3049
conntrack_get_nconns(struct conntrack *ct, uint32_t *nconns)
3050
0
{
3051
0
    *nconns = atomic_count_get(&ct->n_conn);
3052
0
    return 0;
3053
0
}
3054
3055
int
3056
conntrack_set_tcp_seq_chk(struct conntrack *ct, bool enabled)
3057
0
{
3058
0
    atomic_store_relaxed(&ct->tcp_seq_chk, enabled);
3059
0
    return 0;
3060
0
}
3061
3062
bool
3063
conntrack_get_tcp_seq_chk(struct conntrack *ct)
3064
0
{
3065
0
    bool enabled;
3066
0
    atomic_read_relaxed(&ct->tcp_seq_chk, &enabled);
3067
0
    return enabled;
3068
0
}
3069
3070
/* This function must be called with the ct->resources read lock taken. */
3071
static struct alg_exp_node *
3072
expectation_lookup(struct hmap *alg_expectations, const struct conn_key *key,
3073
                   uint32_t basis, bool src_ip_wc)
3074
0
{
3075
0
    struct conn_key check_key;
3076
0
    memcpy(&check_key, key, sizeof check_key);
3077
0
    check_key.src.port = ALG_WC_SRC_PORT;
3078
3079
0
    if (src_ip_wc) {
3080
0
        memset(&check_key.src.addr, 0, sizeof check_key.src.addr);
3081
0
    }
3082
3083
0
    struct alg_exp_node *alg_exp_node;
3084
3085
0
    HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node,
3086
0
                             conn_key_hash(&check_key, basis),
3087
0
                             alg_expectations) {
3088
0
        if (!conn_key_cmp(&alg_exp_node->key, &check_key)) {
3089
0
            return alg_exp_node;
3090
0
        }
3091
0
    }
3092
0
    return NULL;
3093
0
}
3094
3095
/* This function must be called with the ct->resources write lock taken. */
3096
static void
3097
expectation_remove(struct hmap *alg_expectations,
3098
                   const struct conn_key *key, uint32_t basis)
3099
0
{
3100
0
    struct alg_exp_node *alg_exp_node;
3101
3102
0
    HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node, conn_key_hash(key, basis),
3103
0
                             alg_expectations) {
3104
0
        if (!conn_key_cmp(&alg_exp_node->key, key)) {
3105
0
            hmap_remove(alg_expectations, &alg_exp_node->node);
3106
0
            break;
3107
0
        }
3108
0
    }
3109
0
}
3110
3111
/* This function must be called with the ct->resources read lock taken. */
3112
static struct alg_exp_node *
3113
expectation_ref_lookup_unique(const struct hindex *alg_expectation_refs,
3114
                              const struct conn_key *parent_key,
3115
                              const struct conn_key *alg_exp_key,
3116
                              uint32_t basis)
3117
0
{
3118
0
    struct alg_exp_node *alg_exp_node;
3119
3120
0
    HINDEX_FOR_EACH_WITH_HASH (alg_exp_node, node_ref,
3121
0
                               conn_key_hash(parent_key, basis),
3122
0
                               alg_expectation_refs) {
3123
0
        if (!conn_key_cmp(&alg_exp_node->parent_key, parent_key) &&
3124
0
            !conn_key_cmp(&alg_exp_node->key, alg_exp_key)) {
3125
0
            return alg_exp_node;
3126
0
        }
3127
0
    }
3128
0
    return NULL;
3129
0
}
3130
3131
/* This function must be called with the ct->resources write lock taken. */
3132
static void
3133
expectation_ref_create(struct hindex *alg_expectation_refs,
3134
                       struct alg_exp_node *alg_exp_node,
3135
                       uint32_t basis)
3136
0
{
3137
0
    if (!expectation_ref_lookup_unique(alg_expectation_refs,
3138
0
                                       &alg_exp_node->parent_key,
3139
0
                                       &alg_exp_node->key, basis)) {
3140
0
        hindex_insert(alg_expectation_refs, &alg_exp_node->node_ref,
3141
0
                      conn_key_hash(&alg_exp_node->parent_key, basis));
3142
0
    }
3143
0
}
3144
3145
static void
3146
expectation_clean(struct conntrack *ct, const struct conn_key *parent_key)
3147
0
{
3148
0
    ovs_rwlock_wrlock(&ct->resources_lock);
3149
3150
0
    struct alg_exp_node *node;
3151
0
    HINDEX_FOR_EACH_WITH_HASH_SAFE (node, node_ref,
3152
0
                                    conn_key_hash(parent_key, ct->hash_basis),
3153
0
                                    &ct->alg_expectation_refs) {
3154
0
        if (!conn_key_cmp(&node->parent_key, parent_key)) {
3155
0
            expectation_remove(&ct->alg_expectations, &node->key,
3156
0
                               ct->hash_basis);
3157
0
            hindex_remove(&ct->alg_expectation_refs, &node->node_ref);
3158
0
            free(node);
3159
0
        }
3160
0
    }
3161
3162
0
    ovs_rwlock_unlock(&ct->resources_lock);
3163
0
}
3164
3165
static void
3166
expectation_create(struct conntrack *ct, ovs_be16 dst_port,
3167
                   const struct conn *parent_conn, bool reply, bool src_ip_wc,
3168
                   bool skip_nat)
3169
0
{
3170
0
    const struct conn_key *pconn_key, *pconn_rev_key;
3171
0
    union ct_addr src_addr;
3172
0
    union ct_addr dst_addr;
3173
0
    union ct_addr alg_nat_repl_addr;
3174
0
    struct alg_exp_node *alg_exp_node = xzalloc(sizeof *alg_exp_node);
3175
3176
0
    pconn_key = &parent_conn->key_node[CT_DIR_FWD].key;
3177
0
    pconn_rev_key = &parent_conn->key_node[CT_DIR_REV].key;
3178
3179
0
    if (reply) {
3180
0
        src_addr = pconn_key->src.addr;
3181
0
        dst_addr = pconn_key->dst.addr;
3182
0
        alg_exp_node->nat_rpl_dst = true;
3183
0
        if (skip_nat) {
3184
0
            alg_nat_repl_addr = dst_addr;
3185
0
        } else if (parent_conn->nat_action & NAT_ACTION_DST) {
3186
0
            alg_nat_repl_addr = pconn_rev_key->src.addr;
3187
0
            alg_exp_node->nat_rpl_dst = false;
3188
0
        } else {
3189
0
            alg_nat_repl_addr = pconn_rev_key->dst.addr;
3190
0
        }
3191
0
    } else {
3192
0
        src_addr = pconn_rev_key->src.addr;
3193
0
        dst_addr = pconn_rev_key->dst.addr;
3194
0
        alg_exp_node->nat_rpl_dst = false;
3195
0
        if (skip_nat) {
3196
0
            alg_nat_repl_addr = src_addr;
3197
0
        } else if (parent_conn->nat_action & NAT_ACTION_DST) {
3198
0
            alg_nat_repl_addr = pconn_key->dst.addr;
3199
0
            alg_exp_node->nat_rpl_dst = true;
3200
0
        } else {
3201
0
            alg_nat_repl_addr = pconn_key->src.addr;
3202
0
        }
3203
0
    }
3204
0
    if (src_ip_wc) {
3205
0
        memset(&src_addr, 0, sizeof src_addr);
3206
0
    }
3207
3208
0
    alg_exp_node->key.dl_type = pconn_key->dl_type;
3209
0
    alg_exp_node->key.nw_proto = pconn_key->nw_proto;
3210
0
    alg_exp_node->key.zone = pconn_key->zone;
3211
0
    alg_exp_node->key.src.addr = src_addr;
3212
0
    alg_exp_node->key.dst.addr = dst_addr;
3213
0
    alg_exp_node->key.src.port = ALG_WC_SRC_PORT;
3214
0
    alg_exp_node->key.dst.port = dst_port;
3215
0
    alg_exp_node->parent_mark = parent_conn->mark;
3216
0
    alg_exp_node->parent_label = parent_conn->label;
3217
0
    memcpy(&alg_exp_node->parent_key, pconn_key,
3218
0
           sizeof alg_exp_node->parent_key);
3219
    /* Take the write lock up front because the lookup is almost
3220
     * certain to fail, in which case a new expectation is inserted
3221
     * below (see the sketch of this pattern after this function). */
3222
0
    ovs_rwlock_wrlock(&ct->resources_lock);
3223
0
    struct alg_exp_node *alg_exp = expectation_lookup(
3224
0
        &ct->alg_expectations, &alg_exp_node->key, ct->hash_basis, src_ip_wc);
3225
0
    if (alg_exp) {
3226
0
        free(alg_exp_node);
3227
0
        ovs_rwlock_unlock(&ct->resources_lock);
3228
0
        return;
3229
0
    }
3230
3231
0
    alg_exp_node->alg_nat_repl_addr = alg_nat_repl_addr;
3232
0
    hmap_insert(&ct->alg_expectations, &alg_exp_node->node,
3233
0
                conn_key_hash(&alg_exp_node->key, ct->hash_basis));
3234
0
    expectation_ref_create(&ct->alg_expectation_refs, alg_exp_node,
3235
0
                           ct->hash_basis);
3236
0
    ovs_rwlock_unlock(&ct->resources_lock);
3237
0
}
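expectation_create() takes the write lock before looking up, on the expectation that the lookup will usually miss and an insert will follow. A minimal standalone sketch of that "write-lock, look up, insert if absent" pattern, using a plain linked list and a pthread rwlock instead of the OVS hmap and rwlock wrappers (all names below are illustrative):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct entry {
    char key[32];
    struct entry *next;
};

static struct entry *table;                        /* Guarded by table_lock. */
static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Caller must hold table_lock (read or write). */
static struct entry *
lookup__(const char *key)
{
    for (struct entry *e = table; e; e = e->next) {
        if (!strcmp(e->key, key)) {
            return e;
        }
    }
    return NULL;
}

static void
insert_if_absent(const char *key)
{
    /* Take the write lock up front: the common case is that the entry does
     * not exist yet, so starting with a read lock would usually just add an
     * extra unlock/relock. */
    pthread_rwlock_wrlock(&table_lock);
    if (!lookup__(key)) {
        struct entry *e = calloc(1, sizeof *e);
        if (e) {
            snprintf(e->key, sizeof e->key, "%s", key);
            e->next = table;
            table = e;
        }
    }
    pthread_rwlock_unlock(&table_lock);
}

int
main(void)
{
    insert_if_absent("ftp-data:21");
    insert_if_absent("ftp-data:21");   /* Duplicate: lookup hits, no insert. */
    printf("head entry: %s\n", table->key);
    return 0;
}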
3238
3239
static void
3240
replace_substring(char *substr, uint8_t substr_size,
3241
                  uint8_t total_size, char *rep_str,
3242
                  uint8_t rep_str_size)
3243
0
{
3244
0
    memmove(substr + rep_str_size, substr + substr_size,
3245
0
            total_size - substr_size);
3246
0
    memcpy(substr, rep_str, rep_str_size);
3247
0
}
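replace_substring() splices a replacement of a different length into the middle of a buffer by shifting the tail with memmove() before copying the new bytes over the old ones; the caller must ensure the buffer can absorb any growth. A standalone illustration on a local buffer, rewriting the address portion of a made-up PORT line:

#include <stdio.h>
#include <string.h>

int
main(void)
{
    /* Buffer sized so it can absorb the growth of the address field. */
    char buf[64] = "PORT 10,0,0,1,7,210\r\n";
    char *old = strstr(buf, "10,0,0,1");       /* Address to rewrite. */
    const char *rep = "198,51,100,7";          /* New address, comma form. */
    size_t old_len = strlen("10,0,0,1");
    size_t tail_len = strlen(old) + 1;         /* Tail incl. NUL terminator. */

    /* Shift the tail to make room, then drop the replacement in. */
    memmove(old + strlen(rep), old + old_len, tail_len - old_len);
    memcpy(old, rep, strlen(rep));

    fputs(buf, stdout);                        /* PORT 198,51,100,7,7,210 */
    return 0;
}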
3248
3249
static void
3250
repl_bytes(char *str, char c1, char c2, int max)
3251
0
{
3252
0
    while (*str) {
3253
0
        if (*str == c1) {
3254
0
            *str = c2;
3255
3256
0
            if (--max == 0) {
3257
0
                break;
3258
0
            }
3259
0
        }
3260
0
        str++;
3261
0
    }
3262
0
}
3263
3264
static void
3265
modify_packet(struct dp_packet *pkt, char *pkt_str, size_t size,
3266
              char *repl_str, size_t repl_size,
3267
              uint32_t orig_used_size)
3268
0
{
3269
0
    replace_substring(pkt_str, size,
3270
0
                      (const char *) dp_packet_tail(pkt) - pkt_str,
3271
0
                      repl_str, repl_size);
3272
0
    dp_packet_set_size(pkt, orig_used_size + (int) repl_size - (int) size);
3273
0
}
3274
3275
/* Replace the IPv4 address in an FTP message with the NATed address. */
3276
static int
3277
repl_ftp_v4_addr(struct dp_packet *pkt, ovs_be32 v4_addr_rep,
3278
                 char *ftp_data_start,
3279
                 size_t addr_offset_from_ftp_data_start,
3280
                 size_t addr_size)
3281
0
{
3282
0
    enum { MAX_FTP_V4_NAT_DELTA = 8 };
3283
3284
    /* EPSV mode. */
3285
0
    if (addr_offset_from_ftp_data_start == 0 &&
3286
0
        addr_size == 0) {
3287
0
        return 0;
3288
0
    }
3289
3290
    /* Do a conservative check for pathological MTU usage. */
3291
0
    uint32_t orig_used_size = dp_packet_size(pkt);
3292
0
    if (orig_used_size + MAX_FTP_V4_NAT_DELTA >
3293
0
        dp_packet_get_allocated(pkt)) {
3294
3295
0
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
3296
0
        VLOG_WARN_RL(&rl, "Unsupported effective MTU %u used with FTP V4",
3297
0
                     dp_packet_get_allocated(pkt));
3298
0
        return 0;
3299
0
    }
3300
3301
0
    char v4_addr_str[INET_ADDRSTRLEN] = {0};
3302
0
    ovs_assert(inet_ntop(AF_INET, &v4_addr_rep, v4_addr_str,
3303
0
                         sizeof v4_addr_str));
3304
0
    repl_bytes(v4_addr_str, '.', ',', 0);
3305
0
    modify_packet(pkt, ftp_data_start + addr_offset_from_ftp_data_start,
3306
0
                  addr_size, v4_addr_str, strlen(v4_addr_str),
3307
0
                  orig_used_size);
3308
0
    return (int) strlen(v4_addr_str) - (int) addr_size;
3309
0
}
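The replacement text written into the packet is just the NATed address rendered by inet_ntop() with its dots swapped for the commas that the PORT/PASV syntax uses. A standalone sketch of only that formatting step (198.51.100.7 is an arbitrary documentation address, not anything taken from the code above):

#include <arpa/inet.h>
#include <stdio.h>

int
main(void)
{
    struct in_addr nat_addr;
    char addr_str[INET_ADDRSTRLEN] = {0};

    /* Arbitrary example NAT address. */
    if (inet_pton(AF_INET, "198.51.100.7", &nat_addr) != 1) {
        return 1;
    }
    inet_ntop(AF_INET, &nat_addr, addr_str, sizeof addr_str);

    /* FTP PORT/PASV arguments separate the octets with commas, not dots. */
    for (char *p = addr_str; *p; p++) {
        if (*p == '.') {
            *p = ',';
        }
    }
    printf("%s\n", addr_str);   /* Prints "198,51,100,7". */
    return 0;
}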
3310
3311
static char *
3312
skip_non_digits(char *str)
3313
0
{
3314
0
    while (!isdigit(*str) && *str != 0) {
3315
0
        str++;
3316
0
    }
3317
0
    return str;
3318
0
}
3319
3320
static char *
3321
terminate_number_str(char *str, uint8_t max_digits)
3322
0
{
3323
0
    uint8_t digits_found = 0;
3324
0
    while (isdigit(*str) && digits_found <= max_digits) {
3325
0
        str++;
3326
0
        digits_found++;
3327
0
    }
3328
3329
0
    *str = 0;
3330
0
    return str;
3331
0
}
3332
3333
3334
static void
3335
get_ftp_ctl_msg(struct dp_packet *pkt, char *ftp_msg)
3336
0
{
3337
0
    struct tcp_header *th = dp_packet_l4(pkt);
3338
0
    char *tcp_hdr = (char *) th;
3339
0
    uint32_t tcp_payload_len = dp_packet_get_tcp_payload_length(pkt);
3340
0
    size_t tcp_payload_of_interest = MIN(tcp_payload_len,
3341
0
                                         LARGEST_FTP_MSG_OF_INTEREST);
3342
0
    size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
3343
3344
0
    ovs_strlcpy(ftp_msg, tcp_hdr + tcp_hdr_len,
3345
0
                tcp_payload_of_interest);
3346
0
}
3347
3348
static enum ftp_ctl_pkt
3349
detect_ftp_ctl_type(const struct conn_lookup_ctx *ctx,
3350
                    struct dp_packet *pkt)
3351
0
{
3352
0
    char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
3353
0
    get_ftp_ctl_msg(pkt, ftp_msg);
3354
3355
0
    if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
3356
0
        if (strncasecmp(ftp_msg, FTP_EPRT_CMD, strlen(FTP_EPRT_CMD)) &&
3357
0
            !strcasestr(ftp_msg, FTP_EPSV_REPLY)) {
3358
0
            return CT_FTP_CTL_OTHER;
3359
0
        }
3360
0
    } else {
3361
0
        if (strncasecmp(ftp_msg, FTP_PORT_CMD, strlen(FTP_PORT_CMD)) &&
3362
0
            strncasecmp(ftp_msg, FTP_EPRT_CMD, strlen(FTP_EPRT_CMD)) &&
3363
0
            strncasecmp(ftp_msg, FTP_PASV_REPLY_CODE,
3364
0
                        strlen(FTP_PASV_REPLY_CODE)) &&
3365
0
            strncasecmp(ftp_msg, FTP_EPSV_REPLY_CODE,
3366
0
                        strlen(FTP_EPSV_REPLY_CODE))) {
3367
0
            return CT_FTP_CTL_OTHER;
3368
0
        }
3369
0
    }
3370
3371
0
    return CT_FTP_CTL_INTEREST;
3372
0
}
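detect_ftp_ctl_type() is essentially a prefix test: a message is interesting only if it starts with one of the PORT/EPRT commands or the 227/229 reply codes. A small standalone classifier over literal prefixes, assuming the usual command spellings rather than reusing the FTP_* macros from the code above:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>

static bool
ftp_msg_of_interest(const char *msg)
{
    static const char *const prefixes[] = {
        "PORT ",    /* Active mode command. */
        "EPRT ",    /* Extended active mode command. */
        "227 ",     /* PASV reply code. */
        "229 ",     /* EPSV reply code. */
    };

    for (size_t i = 0; i < sizeof prefixes / sizeof prefixes[0]; i++) {
        if (!strncasecmp(msg, prefixes[i], strlen(prefixes[i]))) {
            return true;
        }
    }
    return false;
}

int
main(void)
{
    printf("%d\n", ftp_msg_of_interest("227 Entering Passive Mode."));  /* 1 */
    printf("%d\n", ftp_msg_of_interest("230 Login successful."));       /* 0 */
    return 0;
}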
3373
3374
static enum ftp_ctl_pkt
3375
process_ftp_ctl_v4(struct conntrack *ct,
3376
                   struct dp_packet *pkt,
3377
                   const struct conn *conn_for_expectation,
3378
                   ovs_be32 *v4_addr_rep,
3379
                   char **ftp_data_v4_start,
3380
                   size_t *addr_offset_from_ftp_data_start,
3381
                   size_t *addr_size)
3382
0
{
3383
0
    struct tcp_header *th = dp_packet_l4(pkt);
3384
0
    size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
3385
0
    char *tcp_hdr = (char *) th;
3386
0
    *ftp_data_v4_start = tcp_hdr + tcp_hdr_len;
3387
0
    char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
3388
0
    get_ftp_ctl_msg(pkt, ftp_msg);
3389
0
    char *ftp = ftp_msg;
3390
0
    struct in_addr ip_addr;
3391
0
    enum ct_alg_mode mode;
3392
0
    bool extended = false;
3393
3394
0
    if (!strncasecmp(ftp, FTP_PORT_CMD, strlen(FTP_PORT_CMD))) {
3395
0
        ftp = ftp_msg + strlen(FTP_PORT_CMD);
3396
0
        mode = CT_FTP_MODE_ACTIVE;
3397
0
    } else if (!strncasecmp(ftp, FTP_EPRT_CMD, strlen(FTP_EPRT_CMD))) {
3398
0
        ftp = ftp_msg + strlen(FTP_EPRT_CMD);
3399
0
        mode = CT_FTP_MODE_ACTIVE;
3400
0
        extended = true;
3401
0
    } else if (!strncasecmp(ftp, FTP_EPSV_REPLY_CODE,
3402
0
                            strlen(FTP_EPSV_REPLY_CODE))) {
3403
0
        ftp = ftp_msg + strlen(FTP_EPSV_REPLY_CODE);
3404
0
        mode = CT_FTP_MODE_PASSIVE;
3405
0
        extended = true;
3406
0
    } else {
3407
0
        ftp = ftp_msg + strlen(FTP_PASV_REPLY_CODE);
3408
0
        mode = CT_FTP_MODE_PASSIVE;
3409
0
    }
3410
3411
    /* Find first space. */
3412
0
    ftp = strchr(ftp, ' ');
3413
0
    if (!ftp) {
3414
0
        return CT_FTP_CTL_INVALID;
3415
0
    }
3416
3417
    /* Find the first digit after the space. */
3418
0
    ftp = skip_non_digits(ftp);
3419
0
    if (*ftp == 0) {
3420
0
        return CT_FTP_CTL_INVALID;
3421
0
    }
3422
3423
    /* EPRT, verify address family. */
3424
0
    if (extended && mode == CT_FTP_MODE_ACTIVE) {
3425
0
        if (ftp[0] != FTP_AF_V4 || isdigit(ftp[1])) {
3426
0
            return CT_FTP_CTL_INVALID;
3427
0
        }
3428
3429
0
        ftp = skip_non_digits(ftp + 1);
3430
0
        if (*ftp == 0) {
3431
0
            return CT_FTP_CTL_INVALID;
3432
0
        }
3433
0
    }
3434
3435
0
    if (!extended || mode == CT_FTP_MODE_ACTIVE) {
3436
0
        char *ip_addr_start = ftp;
3437
0
        *addr_offset_from_ftp_data_start = ip_addr_start - ftp_msg;
3438
0
        repl_bytes(ftp, ',', '.', 3);
3439
3440
        /* Advance to end of IP address, to terminate it. */
3441
0
        while (*ftp) {
3442
0
            if (!isdigit(*ftp) && *ftp != '.') {
3443
0
                break;
3444
0
            }
3445
0
            ftp++;
3446
0
        }
3447
0
        *ftp = 0;
3448
0
        ftp++;
3449
3450
0
        int rc2 = inet_pton(AF_INET, ip_addr_start, &ip_addr);
3451
0
        if (rc2 != 1) {
3452
0
            return CT_FTP_CTL_INVALID;
3453
0
        }
3454
3455
0
        *addr_size = ftp - ip_addr_start - 1;
3456
0
    } else {
3457
0
        *addr_size = 0;
3458
0
        *addr_offset_from_ftp_data_start = 0;
3459
0
    }
3460
3461
0
    char *save_ftp = ftp;
3462
0
    uint16_t port_hs;
3463
3464
0
    if (!extended) {
3465
0
        ftp = terminate_number_str(ftp, MAX_FTP_PORT_DGTS);
3466
0
        if (!ftp) {
3467
0
            return CT_FTP_CTL_INVALID;
3468
0
        }
3469
0
        int value;
3470
0
        if (!str_to_int(save_ftp, 10, &value)) {
3471
0
            return CT_FTP_CTL_INVALID;
3472
0
        }
3473
3474
        /* This value is the high byte of the port, so it cannot exceed 255
         * (the L4 port maximum is 65535); see the sketch after this function. */
3475
0
        if (value > 255) {
3476
0
            return CT_FTP_CTL_INVALID;
3477
0
        }
3478
3479
0
        port_hs = value;
3480
0
        port_hs <<= 8;
3481
3482
        /* Skip over comma. */
3483
0
        ftp++;
3484
0
        save_ftp = ftp;
3485
0
        bool digit_found = false;
3486
0
        while (isdigit(*ftp)) {
3487
0
            ftp++;
3488
0
            digit_found = true;
3489
0
        }
3490
0
        if (!digit_found) {
3491
0
            return CT_FTP_CTL_INVALID;
3492
0
        }
3493
0
        *ftp = 0;
3494
0
        if (!str_to_int(save_ftp, 10, &value)) {
3495
0
            return CT_FTP_CTL_INVALID;
3496
0
        }
3497
3498
0
        if (value > 255) {
3499
0
            return CT_FTP_CTL_INVALID;
3500
0
        }
3501
3502
0
        port_hs |= value;
3503
0
    } else {
3504
0
        ftp = terminate_number_str(ftp, MAX_EXT_FTP_PORT_DGTS);
3505
0
        if (!ftp) {
3506
0
            return CT_FTP_CTL_INVALID;
3507
0
        }
3508
0
        int value;
3509
0
        if (!str_to_int(save_ftp, 10, &value)) {
3510
0
            return CT_FTP_CTL_INVALID;
3511
0
        }
3512
0
        if (value > UINT16_MAX) {
3513
0
            return CT_FTP_CTL_INVALID;
3514
0
        }
3515
0
        port_hs = (uint16_t) value;
3516
0
    }
3517
3518
0
    ovs_be16 port = htons(port_hs);
3519
0
    ovs_be32 conn_ipv4_addr;
3520
3521
0
    switch (mode) {
3522
0
    case CT_FTP_MODE_ACTIVE:
3523
0
        *v4_addr_rep =
3524
0
            conn_for_expectation->key_node[CT_DIR_REV].key.dst.addr.ipv4;
3525
0
        conn_ipv4_addr =
3526
0
            conn_for_expectation->key_node[CT_DIR_FWD].key.src.addr.ipv4;
3527
0
        break;
3528
0
    case CT_FTP_MODE_PASSIVE:
3529
0
        *v4_addr_rep =
3530
0
            conn_for_expectation->key_node[CT_DIR_FWD].key.dst.addr.ipv4;
3531
0
        conn_ipv4_addr =
3532
0
            conn_for_expectation->key_node[CT_DIR_REV].key.src.addr.ipv4;
3533
0
        break;
3534
0
    case CT_TFTP_MODE:
3535
0
    default:
3536
0
        OVS_NOT_REACHED();
3537
0
    }
3538
3539
0
    if (!extended || mode == CT_FTP_MODE_ACTIVE) {
3540
0
        ovs_be32 ftp_ipv4_addr;
3541
0
        ftp_ipv4_addr = ip_addr.s_addr;
3542
        /* Although most servers will block this exploit, some may be
3543
         * less well managed. */
3544
0
        if (ftp_ipv4_addr != conn_ipv4_addr && ftp_ipv4_addr != *v4_addr_rep) {
3545
0
            return CT_FTP_CTL_INVALID;
3546
0
        }
3547
0
    }
3548
3549
0
    expectation_create(ct, port, conn_for_expectation,
3550
0
                       !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
3551
0
    return CT_FTP_CTL_INTEREST;
3552
0
}
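The hand-rolled parsing above decodes the classic "h1,h2,h3,h4,p1,p2" argument, where the port travels as two decimal octets, high byte first, so the 16-bit value is p1 * 256 + p2 and every octet must stay within 0..255. A compact standalone decoder for the same text format using sscanf(), with error handling reduced to the minimum (an illustration only, not the parser OVS uses):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Decode an FTP "h1,h2,h3,h4,p1,p2" argument into a dotted address string
 * and a host-order port.  Returns false if any field is missing or > 255. */
static bool
decode_port_arg(const char *arg, char addr[16], uint16_t *port)
{
    unsigned int h1, h2, h3, h4, p1, p2;

    if (sscanf(arg, "%u,%u,%u,%u,%u,%u", &h1, &h2, &h3, &h4, &p1, &p2) != 6
        || h1 > 255 || h2 > 255 || h3 > 255 || h4 > 255
        || p1 > 255 || p2 > 255) {
        return false;
    }
    snprintf(addr, 16, "%u.%u.%u.%u", h1, h2, h3, h4);
    *port = (uint16_t) (p1 << 8 | p2);   /* High byte first: p1 * 256 + p2. */
    return true;
}

int
main(void)
{
    char addr[16];
    uint16_t port;

    if (decode_port_arg("10,0,0,1,7,210", addr, &port)) {
        printf("%s:%u\n", addr, port);   /* Prints "10.0.0.1:2002". */
    }
    return 0;
}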
3553
3554
static char *
3555
skip_ipv6_digits(char *str)
3556
0
{
3557
0
    while (isxdigit(*str) || *str == ':' || *str == '.') {
3558
0
        str++;
3559
0
    }
3560
0
    return str;
3561
0
}
3562
3563
static enum ftp_ctl_pkt
3564
process_ftp_ctl_v6(struct conntrack *ct,
3565
                   struct dp_packet *pkt,
3566
                   const struct conn *conn_for_exp,
3567
                   union ct_addr *v6_addr_rep, char **ftp_data_start,
3568
                   size_t *addr_offset_from_ftp_data_start,
3569
                   size_t *addr_size, enum ct_alg_mode *mode)
3570
0
{
3571
0
    struct tcp_header *th = dp_packet_l4(pkt);
3572
0
    size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
3573
0
    char *tcp_hdr = (char *) th;
3574
0
    char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
3575
0
    get_ftp_ctl_msg(pkt, ftp_msg);
3576
0
    *ftp_data_start = tcp_hdr + tcp_hdr_len;
3577
0
    char *ftp = ftp_msg;
3578
0
    struct in6_addr ip6_addr;
3579
3580
0
    if (!strncasecmp(ftp, FTP_EPRT_CMD, strlen(FTP_EPRT_CMD))) {
3581
0
        ftp = ftp_msg + strlen(FTP_EPRT_CMD);
3582
0
        ftp = skip_non_digits(ftp);
3583
0
        if (*ftp != FTP_AF_V6 || isdigit(ftp[1])) {
3584
0
            return CT_FTP_CTL_INVALID;
3585
0
        }
3586
        /* Jump over delimiter. */
3587
0
        ftp += 2;
3588
3589
0
        memset(&ip6_addr, 0, sizeof ip6_addr);
3590
0
        char *ip_addr_start = ftp;
3591
0
        *addr_offset_from_ftp_data_start = ip_addr_start - ftp_msg;
3592
0
        ftp = skip_ipv6_digits(ftp);
3593
0
        *ftp = 0;
3594
0
        *addr_size = ftp - ip_addr_start;
3595
0
        int rc2 = inet_pton(AF_INET6, ip_addr_start, &ip6_addr);
3596
0
        if (rc2 != 1) {
3597
0
            return CT_FTP_CTL_INVALID;
3598
0
        }
3599
0
        ftp++;
3600
0
        *mode = CT_FTP_MODE_ACTIVE;
3601
0
    } else {
3602
0
        ftp = ftp_msg + strcspn(ftp_msg, "(");
3603
0
        ftp = skip_non_digits(ftp);
3604
0
        if (!isdigit(*ftp)) {
3605
0
            return CT_FTP_CTL_INVALID;
3606
0
        }
3607
3608
        /* Not used for passive mode. */
3609
0
        *addr_offset_from_ftp_data_start = 0;
3610
0
        *addr_size = 0;
3611
3612
0
        *mode = CT_FTP_MODE_PASSIVE;
3613
0
    }
3614
3615
0
    char *save_ftp = ftp;
3616
0
    ftp = terminate_number_str(ftp, MAX_EXT_FTP_PORT_DGTS);
3617
0
    if (!ftp) {
3618
0
        return CT_FTP_CTL_INVALID;
3619
0
    }
3620
3621
0
    int value;
3622
0
    if (!str_to_int(save_ftp, 10, &value)) {
3623
0
        return CT_FTP_CTL_INVALID;
3624
0
    }
3625
0
    if (value > CT_MAX_L4_PORT) {
3626
0
        return CT_FTP_CTL_INVALID;
3627
0
    }
3628
3629
0
    uint16_t port_hs = value;
3630
0
    ovs_be16 port = htons(port_hs);
3631
3632
0
    switch (*mode) {
3633
0
    case CT_FTP_MODE_ACTIVE:
3634
0
        *v6_addr_rep = conn_for_exp->key_node[CT_DIR_REV].key.dst.addr;
3635
        /* Although most servers will block this exploit, some may be
3636
         * less well managed. */
3637
0
        if (memcmp(&ip6_addr, &v6_addr_rep->ipv6, sizeof ip6_addr) &&
3638
0
            memcmp(&ip6_addr,
3639
0
                   &conn_for_exp->key_node[CT_DIR_FWD].key.src.addr.ipv6,
3640
0
                   sizeof ip6_addr)) {
3641
0
            return CT_FTP_CTL_INVALID;
3642
0
        }
3643
0
        break;
3644
0
    case CT_FTP_MODE_PASSIVE:
3645
0
        *v6_addr_rep = conn_for_exp->key_node[CT_DIR_FWD].key.dst.addr;
3646
0
        break;
3647
0
    case CT_TFTP_MODE:
3648
0
    default:
3649
0
        OVS_NOT_REACHED();
3650
0
    }
3651
3652
0
    expectation_create(ct, port, conn_for_exp,
3653
0
                       !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
3654
0
    return CT_FTP_CTL_INTEREST;
3655
0
}
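The extended commands wrap their fields in '|' delimiters, e.g. "EPRT |2|2001:db8::1|52397|", where the first field is the address family (1 for IPv4, 2 for IPv6). A standalone sketch that splits such an argument with strtok_r() and validates the address with inet_pton(); the sample message below is made up:

#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
    char msg[] = "EPRT |2|2001:db8::1|52397|";
    char *save = NULL;

    strtok_r(msg, "|", &save);                 /* "EPRT " command word. */
    char *family = strtok_r(NULL, "|", &save); /* "2" => IPv6. */
    char *addr = strtok_r(NULL, "|", &save);   /* "2001:db8::1" */
    char *port = strtok_r(NULL, "|", &save);   /* "52397" */

    struct in6_addr ip6;
    if (family && addr && port && family[0] == '2'
        && inet_pton(AF_INET6, addr, &ip6) == 1) {
        printf("IPv6 data connection to [%s]:%s\n", addr, port);
    }
    return 0;
}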
3656
3657
static int
3658
repl_ftp_v6_addr(struct dp_packet *pkt, union ct_addr v6_addr_rep,
3659
                 char *ftp_data_start,
3660
                 size_t addr_offset_from_ftp_data_start,
3661
                 size_t addr_size, enum ct_alg_mode mode)
3662
0
{
3663
    /* This is slightly larger than the maximum possible expansion. */
3664
0
    enum { MAX_FTP_V6_NAT_DELTA = 45 };
3665
3666
0
    if (mode == CT_FTP_MODE_PASSIVE) {
3667
0
        return 0;
3668
0
    }
3669
3670
    /* Do a conservative check for pathological MTU usage. */
3671
0
    uint32_t orig_used_size = dp_packet_size(pkt);
3672
0
    if (orig_used_size + MAX_FTP_V6_NAT_DELTA >
3673
0
        dp_packet_get_allocated(pkt)) {
3674
3675
0
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
3676
0
        VLOG_WARN_RL(&rl, "Unsupported effective MTU %u used with FTP V6",
3677
0
                     dp_packet_get_allocated(pkt));
3678
0
        return 0;
3679
0
    }
3680
3681
0
    char v6_addr_str[INET6_ADDRSTRLEN] = {0};
3682
0
    ovs_assert(inet_ntop(AF_INET6, &v6_addr_rep.ipv6, v6_addr_str,
3683
0
                         sizeof v6_addr_str));
3684
0
    modify_packet(pkt, ftp_data_start + addr_offset_from_ftp_data_start,
3685
0
                  addr_size, v6_addr_str, strlen(v6_addr_str),
3686
0
                  orig_used_size);
3687
0
    return (int) strlen(v6_addr_str) - (int) addr_size;
3688
0
}
3689
3690
/* Increment/decrement a TCP sequence number. */
3691
static void
3692
adj_seqnum(ovs_16aligned_be32 *val, int32_t inc)
3693
0
{
3694
0
    put_16aligned_be32(val, htonl(ntohl(get_16aligned_be32(val)) + inc));
3695
0
}
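The skew is applied with ordinary 32-bit modular arithmetic, so a positive adjustment near the top of the sequence space and a negative one near zero both wrap the way TCP sequence numbers are expected to. A standalone illustration using ntohl()/htonl() directly instead of the 16-bit-aligned accessors (the values are arbitrary):

#include <arpa/inet.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Add a (possibly negative) skew to a network-order sequence number. */
static uint32_t
adjust_seq(uint32_t seq_be, int32_t skew)
{
    return htonl(ntohl(seq_be) + (uint32_t) skew);
}

int
main(void)
{
    uint32_t seq = htonl(UINT32_MAX - 1);

    /* +3 wraps past the top of the 32-bit sequence space to 1. */
    printf("%" PRIu32 "\n", ntohl(adjust_seq(seq, 3)));

    /* -5 from 2 wraps backwards to 4294967293. */
    printf("%" PRIu32 "\n", ntohl(adjust_seq(htonl(2), -5)));
    return 0;
}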
3696
3697
static void
3698
handle_ftp_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
3699
               struct dp_packet *pkt, struct conn *ec, long long now,
3700
               enum ftp_ctl_pkt ftp_ctl, bool nat)
3701
0
{
3702
0
    struct ip_header *l3_hdr = dp_packet_l3(pkt);
3703
0
    ovs_be32 v4_addr_rep = 0;
3704
0
    union ct_addr v6_addr_rep;
3705
0
    size_t addr_offset_from_ftp_data_start = 0;
3706
0
    size_t addr_size = 0;
3707
0
    char *ftp_data_start;
3708
0
    enum ct_alg_mode mode = CT_FTP_MODE_ACTIVE;
3709
3710
0
    if (detect_ftp_ctl_type(ctx, pkt) != ftp_ctl) {
3711
0
        return;
3712
0
    }
3713
3714
0
    struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
3715
0
    int64_t seq_skew = 0;
3716
3717
0
    if (ftp_ctl == CT_FTP_CTL_INTEREST) {
3718
0
        enum ftp_ctl_pkt rc;
3719
0
        if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
3720
0
            rc = process_ftp_ctl_v6(ct, pkt, ec,
3721
0
                                    &v6_addr_rep, &ftp_data_start,
3722
0
                                    &addr_offset_from_ftp_data_start,
3723
0
                                    &addr_size, &mode);
3724
0
        } else {
3725
0
            rc = process_ftp_ctl_v4(ct, pkt, ec,
3726
0
                                    &v4_addr_rep, &ftp_data_start,
3727
0
                                    &addr_offset_from_ftp_data_start,
3728
0
                                    &addr_size);
3729
0
        }
3730
0
        if (rc == CT_FTP_CTL_INVALID) {
3731
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
3732
0
            VLOG_WARN_RL(&rl, "Invalid FTP control packet format");
3733
0
            pkt->md.ct_state |= CS_TRACKED | CS_INVALID;
3734
0
            return;
3735
0
        } else if (rc == CT_FTP_CTL_INTEREST) {
3736
0
            uint16_t ip_len;
3737
3738
0
            if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
3739
0
                if (nat) {
3740
0
                    seq_skew = repl_ftp_v6_addr(pkt, v6_addr_rep,
3741
0
                                   ftp_data_start,
3742
0
                                   addr_offset_from_ftp_data_start,
3743
0
                                   addr_size, mode);
3744
0
                }
3745
3746
0
                if (seq_skew) {
3747
0
                    ip_len = ntohs(nh6->ip6_ctlun.ip6_un1.ip6_un1_plen) +
3748
0
                        seq_skew;
3749
0
                    nh6->ip6_ctlun.ip6_un1.ip6_un1_plen = htons(ip_len);
3750
0
                }
3751
0
            } else {
3752
0
                if (nat) {
3753
0
                    seq_skew = repl_ftp_v4_addr(pkt, v4_addr_rep,
3754
0
                                   ftp_data_start,
3755
0
                                   addr_offset_from_ftp_data_start,
3756
0
                                   addr_size);
3757
0
                }
3758
0
                if (seq_skew) {
3759
0
                    ip_len = ntohs(l3_hdr->ip_tot_len) + seq_skew;
3760
0
                    if (dp_packet_ip_checksum_valid(pkt)) {
3761
0
                        dp_packet_ip_checksum_set_partial(pkt);
3762
0
                    } else {
3763
0
                        l3_hdr->ip_csum = recalc_csum16(l3_hdr->ip_csum,
3764
0
                                                        l3_hdr->ip_tot_len,
3765
0
                                                        htons(ip_len));
3766
0
                    }
3767
0
                    l3_hdr->ip_tot_len = htons(ip_len);
3768
0
                }
3769
0
            }
3770
0
        } else {
3771
0
            OVS_NOT_REACHED();
3772
0
        }
3773
0
    }
3774
3775
0
    struct tcp_header *th = dp_packet_l4(pkt);
3776
3777
0
    if (nat && ec->seq_skew != 0) {
3778
0
        ctx->reply != ec->seq_skew_dir ?
3779
0
            adj_seqnum(&th->tcp_ack, -ec->seq_skew) :
3780
0
            adj_seqnum(&th->tcp_seq, ec->seq_skew);
3781
0
    }
3782
3783
0
    if (dp_packet_l4_checksum_valid(pkt)) {
3784
0
        dp_packet_l4_checksum_set_partial(pkt);
3785
0
    } else {
3786
0
        th->tcp_csum = 0;
3787
0
        if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
3788
0
            th->tcp_csum = packet_csum_upperlayer6(nh6, th, ctx->key.nw_proto,
3789
0
                               dp_packet_l4_size(pkt));
3790
0
        } else {
3791
0
            uint32_t tcp_csum = packet_csum_pseudoheader(l3_hdr);
3792
0
            th->tcp_csum = csum_finish(
3793
0
                 csum_continue(tcp_csum, th, dp_packet_l4_size(pkt)));
3794
0
        }
3795
0
    }
3796
3797
0
    if (seq_skew) {
3798
0
        conn_seq_skew_set(ct, ec, now, seq_skew + ec->seq_skew,
3799
0
                          ctx->reply);
3800
0
    }
3801
0
}
3802
3803
static void
3804
handle_tftp_ctl(struct conntrack *ct,
3805
                const struct conn_lookup_ctx *ctx OVS_UNUSED,
3806
                struct dp_packet *pkt, struct conn *conn_for_expectation,
3807
                long long now OVS_UNUSED, enum ftp_ctl_pkt ftp_ctl OVS_UNUSED,
3808
                bool nat OVS_UNUSED)
3809
0
{
3810
0
    expectation_create(ct,
3811
0
                       conn_for_expectation->key_node[CT_DIR_FWD].key.src.port,
3812
0
                       conn_for_expectation,
3813
0
                       !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
3814
0
}