Coverage Report

Created: 2025-08-26 06:20

/src/openvswitch/lib/conntrack.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2015-2019 Nicira, Inc.
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at:
7
 *
8
 *     http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16
17
#include <config.h>
18
#include <ctype.h>
19
#include <errno.h>
20
#include <sys/types.h>
21
#include <netinet/in.h>
22
#include <netinet/icmp6.h>
23
#include <string.h>
24
25
#include "conntrack.h"
26
#include "conntrack-private.h"
27
#include "conntrack-tp.h"
28
#include "coverage.h"
29
#include "crc32c.h"
30
#include "csum.h"
31
#include "ct-dpif.h"
32
#include "dp-packet.h"
33
#include "flow.h"
34
#include "netdev.h"
35
#include "odp-netlink.h"
36
#include "odp-util.h"
37
#include "openvswitch/hmap.h"
38
#include "openvswitch/types.h"
39
#include "openvswitch/vlog.h"
40
#include "ovs-rcu.h"
41
#include "ovs-thread.h"
42
#include "openvswitch/poll-loop.h"
43
#include "random.h"
44
#include "rculist.h"
45
#include "timeval.h"
46
#include "unaligned.h"
47
48
VLOG_DEFINE_THIS_MODULE(conntrack);
49
50
COVERAGE_DEFINE(conntrack_full);
51
COVERAGE_DEFINE(conntrack_l3csum_checked);
52
COVERAGE_DEFINE(conntrack_l3csum_err);
53
COVERAGE_DEFINE(conntrack_l4csum_checked);
54
COVERAGE_DEFINE(conntrack_l4csum_err);
55
COVERAGE_DEFINE(conntrack_lookup_natted_miss);
56
COVERAGE_DEFINE(conntrack_zone_full);
57
58
struct conn_lookup_ctx {
59
    struct conn_key key;
60
    struct conn *conn;
61
    uint32_t hash;
62
    bool reply;
63
    bool icmp_related;
64
};
65
66
enum ftp_ctl_pkt {
67
    /* Control packets with address and/or port specifiers. */
68
    CT_FTP_CTL_INTEREST,
69
    /* Control packets without address and/or port specifiers. */
70
    CT_FTP_CTL_OTHER,
71
    CT_FTP_CTL_INVALID,
72
};
73
74
enum ct_alg_mode {
75
    CT_FTP_MODE_ACTIVE,
76
    CT_FTP_MODE_PASSIVE,
77
    CT_TFTP_MODE,
78
};
79
80
enum ct_alg_ctl_type {
81
    CT_ALG_CTL_NONE,
82
    CT_ALG_CTL_FTP,
83
    CT_ALG_CTL_TFTP,
84
    /* SIP is not enabled through OpenFlow and presently only used as
85
     * an example of an alg that allows a wildcard src ip. */
86
    CT_ALG_CTL_SIP,
87
};
88
89
struct zone_limit {
90
    struct cmap_node node;
91
    struct conntrack_zone_limit czl;
92
};
93
94
static bool conn_key_extract(struct conntrack *, struct dp_packet *,
95
                             ovs_be16 dl_type, struct conn_lookup_ctx *,
96
                             uint16_t zone);
97
static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
98
static void conn_key_reverse(struct conn_key *);
99
static bool valid_new(struct dp_packet *pkt, struct conn_key *);
100
static struct conn *new_conn(struct conntrack *ct, struct dp_packet *pkt,
101
                             struct conn_key *, long long now,
102
                             uint32_t tp_id);
103
static void delete_conn__(struct conn *);
104
static void delete_conn(struct conn *);
105
static enum ct_update_res conn_update(struct conntrack *ct, struct conn *conn,
106
                                      struct dp_packet *pkt,
107
                                      struct conn_lookup_ctx *ctx,
108
                                      long long now);
109
static long long int conn_expiration(const struct conn *);
110
static bool conn_expired(const struct conn *, long long now);
111
static void conn_expire_push_front(struct conntrack *ct, struct conn *conn);
112
static void set_mark(struct dp_packet *, struct conn *,
113
                     uint32_t val, uint32_t mask);
114
static void set_label(struct dp_packet *, struct conn *,
115
                      const struct ovs_key_ct_labels *val,
116
                      const struct ovs_key_ct_labels *mask);
117
static void *clean_thread_main(void *f_);
118
119
static bool
120
nat_get_unique_tuple(struct conntrack *ct, struct conn *conn,
121
                     const struct nat_action_info_t *nat_info);
122
123
static uint8_t
124
reverse_icmp_type(uint8_t type);
125
static uint8_t
126
reverse_icmp6_type(uint8_t type);
127
static inline bool
128
extract_l3_ipv4(struct dp_packet *pkt, struct conn_key *key, const void *data,
129
                size_t size, const char **new_data);
130
static inline bool
131
extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
132
                const char **new_data);
133
static struct alg_exp_node *
134
expectation_lookup(struct hmap *alg_expectations, const struct conn_key *key,
135
                   uint32_t basis, bool src_ip_wc);
136
137
static int
138
repl_ftp_v4_addr(struct dp_packet *pkt, ovs_be32 v4_addr_rep,
139
                 char *ftp_data_v4_start,
140
                 size_t addr_offset_from_ftp_data_start, size_t addr_size);
141
142
static enum ftp_ctl_pkt
143
process_ftp_ctl_v4(struct conntrack *ct,
144
                   struct dp_packet *pkt,
145
                   const struct conn *conn_for_expectation,
146
                   ovs_be32 *v4_addr_rep,
147
                   char **ftp_data_v4_start,
148
                   size_t *addr_offset_from_ftp_data_start,
149
                   size_t *addr_size);
150
151
static enum ftp_ctl_pkt
152
detect_ftp_ctl_type(const struct conn_lookup_ctx *ctx,
153
                    struct dp_packet *pkt);
154
155
static void
156
expectation_clean(struct conntrack *ct, const struct conn_key *parent_key);
157
158
static struct ct_l4_proto *l4_protos[UINT8_MAX + 1];
159
160
static void
161
handle_ftp_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
162
               struct dp_packet *pkt, struct conn *ec, long long now,
163
               enum ftp_ctl_pkt ftp_ctl, bool nat);
164
165
static void
166
handle_tftp_ctl(struct conntrack *ct,
167
                const struct conn_lookup_ctx *ctx OVS_UNUSED,
168
                struct dp_packet *pkt, struct conn *conn_for_expectation,
169
                long long now OVS_UNUSED, enum ftp_ctl_pkt ftp_ctl OVS_UNUSED,
170
                bool nat OVS_UNUSED);
171
172
typedef void (*alg_helper)(struct conntrack *ct,
173
                           const struct conn_lookup_ctx *ctx,
174
                           struct dp_packet *pkt,
175
                           struct conn *conn_for_expectation,
176
                           long long now, enum ftp_ctl_pkt ftp_ctl,
177
                           bool nat);
178
179
static alg_helper alg_helpers[] = {
180
    [CT_ALG_CTL_NONE] = NULL,
181
    [CT_ALG_CTL_FTP] = handle_ftp_ctl,
182
    [CT_ALG_CTL_TFTP] = handle_tftp_ctl,
183
};
184
185
/* The maximum TCP or UDP port number. */
186
0
#define CT_MAX_L4_PORT 65535
187
/* String buffer used for parsing FTP string messages.
188
 * This is sized about twice what is needed to leave some
189
 * margin of error. */
190
#define LARGEST_FTP_MSG_OF_INTEREST 128
191
/* FTP port string used in active mode. */
192
0
#define FTP_PORT_CMD "PORT"
193
/* FTP pasv string used in passive mode. */
194
0
#define FTP_PASV_REPLY_CODE "227"
195
/* Maximum decimal digits for port in FTP command.
196
 * The port is represented as two 3 digit numbers with the
197
 * high part a multiple of 256. */
198
0
#define MAX_FTP_PORT_DGTS 3
199
200
/* FTP extension EPRT string used for active mode. */
201
0
#define FTP_EPRT_CMD "EPRT"
202
/* FTP extension EPSV string used for passive mode. */
203
0
#define FTP_EPSV_REPLY "EXTENDED PASSIVE"
204
/* Maximum decimal digits for port in FTP extended command. */
205
0
#define MAX_EXT_FTP_PORT_DGTS 5
206
/* FTP extended command code for IPv6. */
207
0
#define FTP_AF_V6 '2'
208
/* Used to indicate a wildcard L4 source port number for ALGs.
209
 * This is used for port numbers that we cannot predict in
210
 * expectations. */
211
0
#define ALG_WC_SRC_PORT 0
212
213
/* If the total number of connections goes above this value, no new connections
214
 * are accepted. */
215
#define DEFAULT_N_CONN_LIMIT 3000000
216
217
/* Does a member by member comparison of two conn_keys; this
218
 * function must be kept in sync with struct conn_key; returns 0
219
 * if the keys are equal or 1 if the keys are not equal. */
220
static int
221
conn_key_cmp(const struct conn_key *key1, const struct conn_key *key2)
222
0
{
223
0
    if (!memcmp(&key1->src.addr, &key2->src.addr, sizeof key1->src.addr) &&
224
0
        !memcmp(&key1->dst.addr, &key2->dst.addr, sizeof key1->dst.addr) &&
225
0
        (key1->src.icmp_id == key2->src.icmp_id) &&
226
0
        (key1->src.icmp_type == key2->src.icmp_type) &&
227
0
        (key1->src.icmp_code == key2->src.icmp_code) &&
228
0
        (key1->dst.icmp_id == key2->dst.icmp_id) &&
229
0
        (key1->dst.icmp_type == key2->dst.icmp_type) &&
230
0
        (key1->dst.icmp_code == key2->dst.icmp_code) &&
231
0
        (key1->dl_type == key2->dl_type) &&
232
0
        (key1->zone == key2->zone) &&
233
0
        (key1->nw_proto == key2->nw_proto)) {
234
235
0
        return 0;
236
0
    }
237
0
    return 1;
238
0
}
239
240
/* Initializes the connection tracker 'ct'.  The caller is responsible for
241
 * calling 'conntrack_destroy()' when the instance is no longer needed. */
242
struct conntrack *
243
conntrack_init(void)
244
0
{
245
0
    static struct ovsthread_once setup_l4_once = OVSTHREAD_ONCE_INITIALIZER;
246
0
    struct conntrack *ct = xzalloc(sizeof *ct);
247
248
    /* This value can be used during init (e.g. timeout_policy_init()),
249
     * so set it first to ensure it is available.
250
     */
251
0
    ct->hash_basis = random_uint32();
252
253
0
    ovs_rwlock_init(&ct->resources_lock);
254
0
    ovs_rwlock_wrlock(&ct->resources_lock);
255
0
    hmap_init(&ct->alg_expectations);
256
0
    hindex_init(&ct->alg_expectation_refs);
257
0
    ovs_rwlock_unlock(&ct->resources_lock);
258
259
0
    ovs_mutex_init_adaptive(&ct->ct_lock);
260
0
    ovs_mutex_lock(&ct->ct_lock);
261
0
    for (unsigned i = 0; i < ARRAY_SIZE(ct->conns); i++) {
262
0
        cmap_init(&ct->conns[i]);
263
0
    }
264
0
    for (unsigned i = 0; i < ARRAY_SIZE(ct->exp_lists); i++) {
265
0
        rculist_init(&ct->exp_lists[i]);
266
0
    }
267
0
    cmap_init(&ct->zone_limits);
268
0
    ct->zone_limit_seq = 0;
269
0
    timeout_policy_init(ct);
270
0
    ovs_mutex_unlock(&ct->ct_lock);
271
272
0
    atomic_count_init(&ct->n_conn, 0);
273
0
    atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
274
0
    atomic_init(&ct->tcp_seq_chk, true);
275
0
    atomic_init(&ct->sweep_ms, 20000);
276
0
    atomic_init(&ct->default_zone_limit, 0);
277
0
    latch_init(&ct->clean_thread_exit);
278
0
    ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
279
0
    ct->ipf = ipf_init();
280
281
    /* Initialize the l4 protocols. */
282
0
    if (ovsthread_once_start(&setup_l4_once)) {
283
0
        for (int i = 0; i < ARRAY_SIZE(l4_protos); i++) {
284
0
            l4_protos[i] = &ct_proto_other;
285
0
        }
286
        /* IPPROTO_UDP uses ct_proto_other, so no need to initialize it. */
287
0
        l4_protos[IPPROTO_TCP] = &ct_proto_tcp;
288
0
        l4_protos[IPPROTO_ICMP] = &ct_proto_icmp4;
289
0
        l4_protos[IPPROTO_ICMPV6] = &ct_proto_icmp6;
290
291
0
        ovsthread_once_done(&setup_l4_once);
292
0
    }
293
0
    return ct;
294
0
}
295
296
static uint32_t
297
zone_key_hash(int32_t zone, uint32_t basis)
298
0
{
299
0
    size_t hash = hash_int((OVS_FORCE uint32_t) zone, basis);
300
0
    return hash;
301
0
}
302
303
static int64_t
304
zone_limit_get_limit__(struct conntrack_zone_limit *czl)
305
0
{
306
0
    int64_t limit;
307
0
    atomic_read_relaxed(&czl->limit, &limit);
308
309
0
    return limit;
310
0
}
311
312
static int64_t
313
zone_limit_get_limit(struct conntrack *ct, struct conntrack_zone_limit *czl)
314
0
{
315
0
    int64_t limit = zone_limit_get_limit__(czl);
316
317
0
    if (limit == ZONE_LIMIT_CONN_DEFAULT) {
318
0
        atomic_read_relaxed(&ct->default_zone_limit, &limit);
319
0
        limit = limit ? limit : -1;
320
0
    }
321
322
0
    return limit;
323
0
}
324
325
static struct zone_limit *
326
zone_limit_lookup_protected(struct conntrack *ct, int32_t zone)
327
    OVS_REQUIRES(ct->ct_lock)
328
0
{
329
0
    uint32_t hash = zone_key_hash(zone, ct->hash_basis);
330
0
    struct zone_limit *zl;
331
0
    CMAP_FOR_EACH_WITH_HASH_PROTECTED (zl, node, hash, &ct->zone_limits) {
332
0
        if (zl->czl.zone == zone) {
333
0
            return zl;
334
0
        }
335
0
    }
336
0
    return NULL;
337
0
}
338
339
static struct zone_limit *
340
zone_limit_lookup(struct conntrack *ct, int32_t zone)
341
0
{
342
0
    uint32_t hash = zone_key_hash(zone, ct->hash_basis);
343
0
    struct zone_limit *zl;
344
0
    CMAP_FOR_EACH_WITH_HASH (zl, node, hash, &ct->zone_limits) {
345
0
        if (zl->czl.zone == zone) {
346
0
            return zl;
347
0
        }
348
0
    }
349
0
    return NULL;
350
0
}
351
352
static struct zone_limit *
353
zone_limit_create__(struct conntrack *ct, int32_t zone, int64_t limit)
354
    OVS_REQUIRES(ct->ct_lock)
355
0
{
356
0
    struct zone_limit *zl = NULL;
357
358
0
    if (zone > DEFAULT_ZONE && zone <= MAX_ZONE) {
359
0
        zl = xmalloc(sizeof *zl);
360
0
        atomic_init(&zl->czl.limit, limit);
361
0
        atomic_count_init(&zl->czl.count, 0);
362
0
        zl->czl.zone = zone;
363
0
        zl->czl.zone_limit_seq = ct->zone_limit_seq++;
364
0
        uint32_t hash = zone_key_hash(zone, ct->hash_basis);
365
0
        cmap_insert(&ct->zone_limits, &zl->node, hash);
366
0
    }
367
368
0
    return zl;
369
0
}
370
371
static struct zone_limit *
372
zone_limit_create(struct conntrack *ct, int32_t zone, int64_t limit)
373
    OVS_REQUIRES(ct->ct_lock)
374
0
{
375
0
    struct zone_limit *zl = zone_limit_lookup_protected(ct, zone);
376
377
0
    if (zl) {
378
0
        return zl;
379
0
    }
380
381
0
    return zone_limit_create__(ct, zone, limit);
382
0
}
383
384
/* Lazily creates a new entry in the zone_limits cmap if default limit
385
 * is set and there's no entry for the zone. */
386
static struct zone_limit *
387
zone_limit_lookup_or_default(struct conntrack *ct, int32_t zone)
388
    OVS_REQUIRES(ct->ct_lock)
389
0
{
390
0
    struct zone_limit *zl = zone_limit_lookup_protected(ct, zone);
391
392
0
    if (!zl) {
393
0
        uint32_t limit;
394
0
        atomic_read_relaxed(&ct->default_zone_limit, &limit);
395
396
0
        if (limit) {
397
0
            zl = zone_limit_create__(ct, zone, ZONE_LIMIT_CONN_DEFAULT);
398
0
        }
399
0
    }
400
401
0
    return zl;
402
0
}
403
404
struct conntrack_zone_info
405
zone_limit_get(struct conntrack *ct, int32_t zone)
406
0
{
407
0
    struct conntrack_zone_info czl = {
408
0
        .zone = DEFAULT_ZONE,
409
0
        .limit = 0,
410
0
        .count = 0,
411
0
    };
412
0
    struct zone_limit *zl = zone_limit_lookup(ct, zone);
413
0
    if (zl) {
414
0
        int64_t czl_limit = zone_limit_get_limit__(&zl->czl);
415
0
        if (czl_limit > ZONE_LIMIT_CONN_DEFAULT) {
416
0
            czl.zone = zl->czl.zone;
417
0
            czl.limit = czl_limit;
418
0
        } else {
419
0
            atomic_read_relaxed(&ct->default_zone_limit, &czl.limit);
420
0
        }
421
422
0
        czl.count = atomic_count_get(&zl->czl.count);
423
0
    } else {
424
0
        atomic_read_relaxed(&ct->default_zone_limit, &czl.limit);
425
0
    }
426
427
0
    return czl;
428
0
}
429
430
static void
431
zone_limit_clean__(struct conntrack *ct, struct zone_limit *zl)
432
    OVS_REQUIRES(ct->ct_lock)
433
0
{
434
0
    uint32_t hash = zone_key_hash(zl->czl.zone, ct->hash_basis);
435
0
    cmap_remove(&ct->zone_limits, &zl->node, hash);
436
0
    ovsrcu_postpone(free, zl);
437
0
}
438
439
static void
440
zone_limit_clean(struct conntrack *ct, struct zone_limit *zl)
441
    OVS_REQUIRES(ct->ct_lock)
442
0
{
443
0
    uint32_t limit;
444
445
0
    atomic_read_relaxed(&ct->default_zone_limit, &limit);
446
    /* Do not remove the entry if the default limit is enabled, but
447
     * simply move the limit to default. */
448
0
    if (limit) {
449
0
        atomic_store_relaxed(&zl->czl.limit, ZONE_LIMIT_CONN_DEFAULT);
450
0
    } else {
451
0
        zone_limit_clean__(ct, zl);
452
0
    }
453
0
}
454
455
static void
456
zone_limit_clean_default(struct conntrack *ct)
457
    OVS_REQUIRES(ct->ct_lock)
458
0
{
459
0
    struct zone_limit *zl;
460
0
    int64_t czl_limit;
461
462
0
    atomic_store_relaxed(&ct->default_zone_limit, 0);
463
464
0
    CMAP_FOR_EACH (zl, node, &ct->zone_limits) {
465
0
        atomic_read_relaxed(&zl->czl.limit, &czl_limit);
466
0
        if (zone_limit_get_limit__(&zl->czl) == ZONE_LIMIT_CONN_DEFAULT) {
467
0
            zone_limit_clean__(ct, zl);
468
0
        }
469
0
    }
470
0
}
471
472
static bool
473
zone_limit_delete__(struct conntrack *ct, int32_t zone)
474
    OVS_REQUIRES(ct->ct_lock)
475
0
{
476
0
    struct zone_limit *zl = NULL;
477
478
0
    if (zone == DEFAULT_ZONE) {
479
0
        zone_limit_clean_default(ct);
480
0
    } else {
481
0
        zl = zone_limit_lookup_protected(ct, zone);
482
0
        if (zl) {
483
0
            zone_limit_clean(ct, zl);
484
0
        }
485
0
    }
486
487
0
    return zl != NULL;
488
0
}
489
490
int
491
zone_limit_delete(struct conntrack *ct, int32_t zone)
492
0
{
493
0
    bool deleted;
494
495
0
    ovs_mutex_lock(&ct->ct_lock);
496
0
    deleted = zone_limit_delete__(ct, zone);
497
0
    ovs_mutex_unlock(&ct->ct_lock);
498
499
0
    if (zone != DEFAULT_ZONE) {
500
0
        VLOG_INFO(deleted
501
0
                  ? "Deleted zone limit for zone %d"
502
0
                  : "Attempted delete of non-existent zone limit: zone %d",
503
0
                  zone);
504
0
    }
505
506
0
    return 0;
507
0
}
508
509
static void
510
zone_limit_update_default(struct conntrack *ct, int32_t zone, uint32_t limit)
511
0
{
512
    /* limit zero means delete default. */
513
0
    if (limit == 0) {
514
0
        ovs_mutex_lock(&ct->ct_lock);
515
0
        zone_limit_delete__(ct, zone);
516
0
        ovs_mutex_unlock(&ct->ct_lock);
517
0
    } else {
518
0
        atomic_store_relaxed(&ct->default_zone_limit, limit);
519
0
    }
520
0
}
521
522
int
523
zone_limit_update(struct conntrack *ct, int32_t zone, uint32_t limit)
524
0
{
525
0
    struct zone_limit *zl;
526
0
    int err = 0;
527
528
0
    if (zone == DEFAULT_ZONE) {
529
0
        zone_limit_update_default(ct, zone, limit);
530
0
        VLOG_INFO("Set default zone limit to %u", limit);
531
0
        return err;
532
0
    }
533
534
0
    zl = zone_limit_lookup(ct, zone);
535
0
    if (zl) {
536
0
        atomic_store_relaxed(&zl->czl.limit, limit);
537
0
        VLOG_INFO("Changed zone limit of %u for zone %d", limit, zone);
538
0
    } else {
539
0
        ovs_mutex_lock(&ct->ct_lock);
540
0
        err = zone_limit_create(ct, zone, limit) == NULL;
541
0
        ovs_mutex_unlock(&ct->ct_lock);
542
0
        if (!err) {
543
0
            VLOG_INFO("Created zone limit of %u for zone %d", limit, zone);
544
0
        } else {
545
0
            VLOG_WARN("Request to create zone limit for invalid zone %d",
546
0
                      zone);
547
0
        }
548
0
    }
549
550
0
    return err;
551
0
}
552
553
static void
554
conn_clean__(struct conntrack *ct, struct conn *conn)
555
    OVS_REQUIRES(ct->ct_lock)
556
0
{
557
0
    uint32_t hash;
558
559
0
    if (conn->alg) {
560
0
        expectation_clean(ct, &conn->key_node[CT_DIR_FWD].key);
561
0
    }
562
563
0
    hash = conn_key_hash(&conn->key_node[CT_DIR_FWD].key, ct->hash_basis);
564
0
    cmap_remove(&ct->conns[conn->key_node[CT_DIR_FWD].key.zone],
565
0
                &conn->key_node[CT_DIR_FWD].cm_node, hash);
566
567
0
    if (conn->nat_action) {
568
0
        hash = conn_key_hash(&conn->key_node[CT_DIR_REV].key,
569
0
                             ct->hash_basis);
570
0
        cmap_remove(&ct->conns[conn->key_node[CT_DIR_REV].key.zone],
571
0
                    &conn->key_node[CT_DIR_REV].cm_node, hash);
572
0
    }
573
574
0
    rculist_remove(&conn->node);
575
0
}
576
577
/* Also removes the associated nat 'conn' from the lookup
578
   data structures. */
579
static void
580
conn_clean(struct conntrack *ct, struct conn *conn)
581
    OVS_EXCLUDED(conn->lock, ct->ct_lock)
582
0
{
583
0
    if (atomic_flag_test_and_set(&conn->reclaimed)) {
584
0
        return;
585
0
    }
586
587
0
    ovs_mutex_lock(&ct->ct_lock);
588
0
    conn_clean__(ct, conn);
589
0
    ovs_mutex_unlock(&ct->ct_lock);
590
591
0
    struct zone_limit *zl = zone_limit_lookup(ct, conn->admit_zone);
592
0
    if (zl && zl->czl.zone_limit_seq == conn->zone_limit_seq) {
593
0
        atomic_count_dec(&zl->czl.count);
594
0
    }
595
596
0
    ovsrcu_postpone(delete_conn, conn);
597
0
    atomic_count_dec(&ct->n_conn);
598
0
}
599
600
static void
601
conn_force_expire(struct conn *conn)
602
0
{
603
0
    atomic_store_relaxed(&conn->expiration, 0);
604
0
}
605
606
/* Destroys the connection tracker 'ct' and frees all the allocated memory.
607
 * The caller of this function must already have shut down packet input
608
 * and PMD threads (which would have been quiesced).  */
609
void
610
conntrack_destroy(struct conntrack *ct)
611
0
{
612
0
    struct conn *conn;
613
614
0
    latch_set(&ct->clean_thread_exit);
615
0
    pthread_join(ct->clean_thread, NULL);
616
0
    latch_destroy(&ct->clean_thread_exit);
617
618
0
    for (unsigned i = 0; i < N_EXP_LISTS; i++) {
619
0
        RCULIST_FOR_EACH (conn, node, &ct->exp_lists[i]) {
620
0
            conn_clean(ct, conn);
621
0
        }
622
0
    }
623
624
0
    struct zone_limit *zl;
625
0
    CMAP_FOR_EACH (zl, node, &ct->zone_limits) {
626
0
        uint32_t hash = zone_key_hash(zl->czl.zone, ct->hash_basis);
627
628
0
        cmap_remove(&ct->zone_limits, &zl->node, hash);
629
0
        ovsrcu_postpone(free, zl);
630
0
    }
631
632
0
    struct timeout_policy *tp;
633
0
    CMAP_FOR_EACH (tp, node, &ct->timeout_policies) {
634
0
        uint32_t hash = hash_int(tp->policy.id, ct->hash_basis);
635
636
0
        cmap_remove(&ct->timeout_policies, &tp->node, hash);
637
0
        ovsrcu_postpone(free, tp);
638
0
    }
639
640
0
    ovs_mutex_lock(&ct->ct_lock);
641
642
0
    for (unsigned i = 0; i < ARRAY_SIZE(ct->conns); i++) {
643
0
        cmap_destroy(&ct->conns[i]);
644
0
    }
645
0
    cmap_destroy(&ct->zone_limits);
646
0
    cmap_destroy(&ct->timeout_policies);
647
648
0
    ovs_mutex_unlock(&ct->ct_lock);
649
0
    ovs_mutex_destroy(&ct->ct_lock);
650
651
0
    ovs_rwlock_wrlock(&ct->resources_lock);
652
0
    struct alg_exp_node *alg_exp_node;
653
0
    HMAP_FOR_EACH_POP (alg_exp_node, node, &ct->alg_expectations) {
654
0
        free(alg_exp_node);
655
0
    }
656
0
    hmap_destroy(&ct->alg_expectations);
657
0
    hindex_destroy(&ct->alg_expectation_refs);
658
0
    ovs_rwlock_unlock(&ct->resources_lock);
659
0
    ovs_rwlock_destroy(&ct->resources_lock);
660
661
0
    ipf_destroy(ct->ipf);
662
0
    free(ct);
663
0
}
664

665
666
static bool
667
conn_key_lookup(struct conntrack *ct, const struct conn_key *key,
668
                uint32_t hash, long long now, struct conn **conn_out,
669
                bool *reply)
670
0
{
671
0
    struct conn_key_node *keyn;
672
0
    struct conn *conn = NULL;
673
0
    bool found = false;
674
675
0
    CMAP_FOR_EACH_WITH_HASH (keyn, cm_node, hash, &ct->conns[key->zone]) {
676
0
        if (keyn->dir == CT_DIR_FWD) {
677
0
            conn = CONTAINER_OF(keyn, struct conn, key_node[CT_DIR_FWD]);
678
0
        } else {
679
0
            conn = CONTAINER_OF(keyn, struct conn, key_node[CT_DIR_REV]);
680
0
        }
681
682
0
        if (conn_expired(conn, now)) {
683
0
            continue;
684
0
        }
685
686
0
        for (int i = CT_DIR_FWD; i < CT_DIRS; i++) {
687
0
            if (!conn_key_cmp(&conn->key_node[i].key, key)) {
688
0
                found = true;
689
0
                if (reply) {
690
0
                    *reply = (i == CT_DIR_REV);
691
0
                }
692
0
                goto out_found;
693
0
            }
694
0
        }
695
0
    }
696
697
0
out_found:
698
0
    if (found && conn_out) {
699
0
        *conn_out = conn;
700
0
    } else if (conn_out) {
701
0
        *conn_out = NULL;
702
0
    }
703
704
0
    return found;
705
0
}
706
707
static bool
708
conn_lookup(struct conntrack *ct, const struct conn_key *key,
709
            long long now, struct conn **conn_out, bool *reply)
710
0
{
711
0
    uint32_t hash = conn_key_hash(key, ct->hash_basis);
712
0
    return conn_key_lookup(ct, key, hash, now, conn_out, reply);
713
0
}
714
715
static void
716
write_ct_md(struct dp_packet *pkt, uint16_t zone, const struct conn *conn,
717
            const struct conn_key *key, const struct alg_exp_node *alg_exp)
718
0
{
719
0
    pkt->md.ct_state |= CS_TRACKED;
720
0
    pkt->md.ct_zone = zone;
721
722
0
    if (conn) {
723
0
        ovs_mutex_lock(&conn->lock);
724
0
        pkt->md.ct_mark = conn->mark;
725
0
        pkt->md.ct_label = conn->label;
726
0
        ovs_mutex_unlock(&conn->lock);
727
0
    } else {
728
0
        pkt->md.ct_mark = 0;
729
0
        pkt->md.ct_label = OVS_U128_ZERO;
730
0
    }
731
732
    /* Use the original direction tuple if we have it. */
733
0
    if (conn) {
734
0
        if (conn->alg_related) {
735
0
            key = &conn->parent_key;
736
0
        } else {
737
0
            key = &conn->key_node[CT_DIR_FWD].key;
738
0
        }
739
0
    } else if (alg_exp) {
740
0
        pkt->md.ct_mark = alg_exp->parent_mark;
741
0
        pkt->md.ct_label = alg_exp->parent_label;
742
0
        key = &alg_exp->parent_key;
743
0
    }
744
745
0
    pkt->md.ct_orig_tuple_ipv6 = false;
746
747
0
    if (key) {
748
0
        if (key->dl_type == htons(ETH_TYPE_IP)) {
749
0
            pkt->md.ct_orig_tuple.ipv4 = (struct ovs_key_ct_tuple_ipv4) {
750
0
                key->src.addr.ipv4,
751
0
                key->dst.addr.ipv4,
752
0
                key->nw_proto != IPPROTO_ICMP
753
0
                ? key->src.port : htons(key->src.icmp_type),
754
0
                key->nw_proto != IPPROTO_ICMP
755
0
                ? key->dst.port : htons(key->src.icmp_code),
756
0
                key->nw_proto,
757
0
            };
758
0
        } else {
759
0
            pkt->md.ct_orig_tuple_ipv6 = true;
760
0
            pkt->md.ct_orig_tuple.ipv6 = (struct ovs_key_ct_tuple_ipv6) {
761
0
                key->src.addr.ipv6,
762
0
                key->dst.addr.ipv6,
763
0
                key->nw_proto != IPPROTO_ICMPV6
764
0
                ? key->src.port : htons(key->src.icmp_type),
765
0
                key->nw_proto != IPPROTO_ICMPV6
766
0
                ? key->dst.port : htons(key->src.icmp_code),
767
0
                key->nw_proto,
768
0
            };
769
0
        }
770
0
    } else {
771
0
        memset(&pkt->md.ct_orig_tuple, 0, sizeof pkt->md.ct_orig_tuple);
772
0
    }
773
0
}
774
775
static uint8_t
776
get_ip_proto(const struct dp_packet *pkt)
777
0
{
778
0
    uint8_t ip_proto;
779
0
    struct eth_header *l2 = dp_packet_eth(pkt);
780
0
    if (l2->eth_type == htons(ETH_TYPE_IPV6)) {
781
0
        struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
782
0
        ip_proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
783
0
    } else {
784
0
        struct ip_header *l3_hdr = dp_packet_l3(pkt);
785
0
        ip_proto = l3_hdr->ip_proto;
786
0
    }
787
788
0
    return ip_proto;
789
0
}
790
791
static bool
792
is_ftp_ctl(const enum ct_alg_ctl_type ct_alg_ctl)
793
0
{
794
0
    return ct_alg_ctl == CT_ALG_CTL_FTP;
795
0
}
796
797
static enum ct_alg_ctl_type
798
get_alg_ctl_type(const struct dp_packet *pkt, const char *helper)
799
0
{
800
    /* CT_IPPORT_FTP/TFTP is used because IPPORT_FTP/TFTP is not defined
801
     * in OSX, at least in in.h. Since these values will never change, remove
802
     * the external dependency. */
803
0
    enum { CT_IPPORT_FTP = 21 };
804
0
    enum { CT_IPPORT_TFTP = 69 };
805
0
    uint8_t ip_proto = get_ip_proto(pkt);
806
0
    struct udp_header *uh = dp_packet_l4(pkt);
807
0
    struct tcp_header *th = dp_packet_l4(pkt);
808
0
    ovs_be16 ftp_port = htons(CT_IPPORT_FTP);
809
0
    ovs_be16 tftp_port = htons(CT_IPPORT_TFTP);
810
811
0
    if (helper) {
812
0
        if ((ip_proto == IPPROTO_TCP) &&
813
0
             !strncmp(helper, "ftp", strlen("ftp"))) {
814
0
            return CT_ALG_CTL_FTP;
815
0
        }
816
0
        if ((ip_proto == IPPROTO_UDP) &&
817
0
             !strncmp(helper, "tftp", strlen("tftp"))) {
818
0
            return CT_ALG_CTL_TFTP;
819
0
        }
820
0
    }
821
822
0
    if (ip_proto == IPPROTO_UDP && uh->udp_dst == tftp_port) {
823
0
        return CT_ALG_CTL_TFTP;
824
0
    } else if (ip_proto == IPPROTO_TCP &&
825
0
               (th->tcp_src == ftp_port || th->tcp_dst == ftp_port)) {
826
0
        return CT_ALG_CTL_FTP;
827
0
    }
828
0
    return CT_ALG_CTL_NONE;
829
0
}
830
831
static bool
832
alg_src_ip_wc(enum ct_alg_ctl_type alg_ctl_type)
833
0
{
834
0
    if (alg_ctl_type == CT_ALG_CTL_SIP) {
835
0
        return true;
836
0
    }
837
0
    return false;
838
0
}
839
840
static void
841
handle_alg_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
842
               struct dp_packet *pkt, enum ct_alg_ctl_type ct_alg_ctl,
843
               struct conn *conn, long long now, bool nat)
844
0
{
845
    /* ALG control packet handling with expectation creation. */
846
0
    if (OVS_UNLIKELY(alg_helpers[ct_alg_ctl] && conn && conn->alg)) {
847
0
        ovs_mutex_lock(&conn->lock);
848
0
        alg_helpers[ct_alg_ctl](ct, ctx, pkt, conn, now, CT_FTP_CTL_INTEREST,
849
0
                                nat);
850
0
        ovs_mutex_unlock(&conn->lock);
851
0
    }
852
0
}
853
854
static void
855
pat_packet(struct dp_packet *pkt, const struct conn_key *key)
856
0
{
857
0
    if (key->nw_proto == IPPROTO_TCP) {
858
0
        packet_set_tcp_port(pkt, key->dst.port, key->src.port);
859
0
    } else if (key->nw_proto == IPPROTO_UDP) {
860
0
        packet_set_udp_port(pkt, key->dst.port, key->src.port);
861
0
    } else if (key->nw_proto == IPPROTO_SCTP) {
862
0
        packet_set_sctp_port(pkt, key->dst.port, key->src.port);
863
0
    }
864
0
}
865
866
static uint16_t
867
nat_action_reverse(uint16_t nat_action)
868
0
{
869
0
    if (nat_action & NAT_ACTION_SRC) {
870
0
        nat_action ^= NAT_ACTION_SRC;
871
0
        nat_action |= NAT_ACTION_DST;
872
0
    } else if (nat_action & NAT_ACTION_DST) {
873
0
        nat_action ^= NAT_ACTION_DST;
874
0
        nat_action |= NAT_ACTION_SRC;
875
0
    }
876
0
    return nat_action;
877
0
}
878
879
static void
880
nat_packet_ipv4(struct dp_packet *pkt, const struct conn_key *key,
881
                uint16_t nat_action)
882
0
{
883
0
    struct ip_header *nh = dp_packet_l3(pkt);
884
885
0
    if (nat_action & NAT_ACTION_SRC) {
886
0
        packet_set_ipv4_addr(pkt, &nh->ip_src, key->dst.addr.ipv4);
887
0
    } else if (nat_action & NAT_ACTION_DST) {
888
0
        packet_set_ipv4_addr(pkt, &nh->ip_dst, key->src.addr.ipv4);
889
0
    }
890
0
}
891
892
static void
893
nat_packet_ipv6(struct dp_packet *pkt, const struct conn_key *key,
894
                uint16_t nat_action)
895
0
{
896
0
    struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
897
898
0
    if (nat_action & NAT_ACTION_SRC) {
899
0
        packet_set_ipv6_addr(pkt, key->nw_proto, nh6->ip6_src.be32,
900
0
                             &key->dst.addr.ipv6, true);
901
0
    } else if (nat_action & NAT_ACTION_DST) {
902
0
        packet_set_ipv6_addr(pkt, key->nw_proto, nh6->ip6_dst.be32,
903
0
                             &key->src.addr.ipv6, true);
904
0
    }
905
0
}
906
907
static void
908
nat_inner_packet(struct dp_packet *pkt, struct conn_key *key,
909
                 uint16_t nat_action)
910
0
{
911
0
    char *tail = dp_packet_tail(pkt);
912
0
    uint16_t pad = dp_packet_l2_pad_size(pkt);
913
0
    struct conn_key inner_key;
914
0
    const char *inner_l4 = NULL;
915
0
    uint16_t orig_l3_ofs = pkt->l3_ofs;
916
0
    uint16_t orig_l4_ofs = pkt->l4_ofs;
917
0
    uint32_t orig_offloads = pkt->offloads;
918
919
0
    void *l3 = dp_packet_l3(pkt);
920
0
    void *l4 = dp_packet_l4(pkt);
921
0
    void *inner_l3;
922
    /* These calls are already verified to succeed during the code path from
923
     * 'conn_key_extract()' which calls
924
     * 'extract_l4_icmp()'/'extract_l4_icmp6()'. */
925
0
    if (key->dl_type == htons(ETH_TYPE_IP)) {
926
0
        inner_l3 = (char *) l4 + sizeof(struct icmp_header);
927
0
        extract_l3_ipv4(NULL, &inner_key, inner_l3,
928
0
                        tail - ((char *) inner_l3) - pad, &inner_l4);
929
0
    } else {
930
0
        inner_l3 = (char *) l4 + sizeof(struct icmp6_data_header);
931
0
        extract_l3_ipv6(&inner_key, inner_l3, tail - ((char *) inner_l3) - pad,
932
0
                        &inner_l4);
933
0
    }
934
0
    pkt->l3_ofs += (char *) inner_l3 - (char *) l3;
935
0
    pkt->l4_ofs += inner_l4 - (char *) l4;
936
    /* Drop any offloads to force the helpers below to calculate checksums
937
     * if needed. */
938
0
    dp_packet_ip_checksum_set_unknown(pkt);
939
0
    dp_packet_l4_checksum_set_unknown(pkt);
940
941
    /* Reverse the key for inner packet. */
942
0
    struct conn_key rev_key = *key;
943
0
    conn_key_reverse(&rev_key);
944
945
0
    pat_packet(pkt, &rev_key);
946
947
0
    if (key->dl_type == htons(ETH_TYPE_IP)) {
948
0
        nat_packet_ipv4(pkt, &rev_key, nat_action);
949
950
0
        struct icmp_header *icmp = (struct icmp_header *) l4;
951
0
        icmp->icmp_csum = 0;
952
0
        icmp->icmp_csum = csum(icmp, tail - (char *) icmp - pad);
953
0
    } else {
954
0
        nat_packet_ipv6(pkt, &rev_key, nat_action);
955
956
0
        struct icmp6_data_header *icmp6 = (struct icmp6_data_header *) l4;
957
0
        icmp6->icmp6_base.icmp6_cksum = 0;
958
0
        icmp6->icmp6_base.icmp6_cksum =
959
0
            packet_csum_upperlayer6(l3, icmp6, IPPROTO_ICMPV6,
960
0
                                    tail - (char *) icmp6 - pad);
961
0
    }
962
963
0
    pkt->l3_ofs = orig_l3_ofs;
964
0
    pkt->l4_ofs = orig_l4_ofs;
965
0
    pkt->offloads = orig_offloads;
966
0
}
967
968
static void
969
nat_packet(struct dp_packet *pkt, struct conn *conn, bool reply, bool related)
970
0
{
971
0
    enum key_dir dir = reply ? CT_DIR_FWD : CT_DIR_REV;
972
0
    struct conn_key *key = &conn->key_node[dir].key;
973
0
    uint16_t nat_action = reply ? nat_action_reverse(conn->nat_action)
974
0
                                : conn->nat_action;
975
976
    /* Update ct_state. */
977
0
    if (nat_action & NAT_ACTION_SRC) {
978
0
        pkt->md.ct_state |= CS_SRC_NAT;
979
0
    } else if (nat_action & NAT_ACTION_DST) {
980
0
        pkt->md.ct_state |= CS_DST_NAT;
981
0
    }
982
983
    /* Reverse the key for outer header. */
984
0
    if (key->dl_type == htons(ETH_TYPE_IP)) {
985
0
        nat_packet_ipv4(pkt, key, nat_action);
986
0
    } else {
987
0
        nat_packet_ipv6(pkt, key, nat_action);
988
0
    }
989
990
0
    if (nat_action & NAT_ACTION_SRC || nat_action & NAT_ACTION_DST) {
991
0
        if (OVS_UNLIKELY(related)) {
992
0
            nat_action = nat_action_reverse(nat_action);
993
0
            nat_inner_packet(pkt, key, nat_action);
994
0
        } else {
995
0
            pat_packet(pkt, key);
996
0
        }
997
0
    }
998
0
}
999
1000
static void
1001
conn_seq_skew_set(struct conntrack *ct, const struct conn *conn_in,
1002
                  long long now, int seq_skew, bool seq_skew_dir)
1003
0
{
1004
0
    struct conn *conn;
1005
1006
0
    conn_lookup(ct, &conn_in->key_node[CT_DIR_FWD].key, now, &conn, NULL);
1007
0
    if (conn && seq_skew) {
1008
0
        conn->seq_skew = seq_skew;
1009
0
        conn->seq_skew_dir = seq_skew_dir;
1010
0
    }
1011
0
}
1012
1013
static bool
1014
ct_verify_helper(const char *helper, enum ct_alg_ctl_type ct_alg_ctl)
1015
0
{
1016
0
    if (ct_alg_ctl == CT_ALG_CTL_NONE) {
1017
0
        return true;
1018
0
    } else if (helper) {
1019
0
        if ((ct_alg_ctl == CT_ALG_CTL_FTP) &&
1020
0
             !strncmp(helper, "ftp", strlen("ftp"))) {
1021
0
            return true;
1022
0
        } else if ((ct_alg_ctl == CT_ALG_CTL_TFTP) &&
1023
0
                   !strncmp(helper, "tftp", strlen("tftp"))) {
1024
0
            return true;
1025
0
        } else {
1026
0
            return false;
1027
0
        }
1028
0
    } else {
1029
0
        return false;
1030
0
    }
1031
0
}
1032
1033
static struct conn *
1034
conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
1035
               struct conn_lookup_ctx *ctx, bool commit, long long now,
1036
               const struct nat_action_info_t *nat_action_info,
1037
               const char *helper, const struct alg_exp_node *alg_exp,
1038
               enum ct_alg_ctl_type ct_alg_ctl, uint32_t tp_id)
1039
    OVS_REQUIRES(ct->ct_lock)
1040
0
{
1041
0
    struct conn *nc = NULL;
1042
1043
0
    if (!valid_new(pkt, &ctx->key)) {
1044
0
        pkt->md.ct_state = CS_INVALID;
1045
0
        return nc;
1046
0
    }
1047
1048
0
    pkt->md.ct_state = CS_NEW;
1049
1050
0
    if (alg_exp) {
1051
0
        pkt->md.ct_state |= CS_RELATED;
1052
0
    }
1053
1054
0
    if (commit) {
1055
0
        int64_t czl_limit;
1056
0
        struct conn_key_node *fwd_key_node, *rev_key_node;
1057
0
        struct zone_limit *zl = zone_limit_lookup_or_default(ct,
1058
0
                                                             ctx->key.zone);
1059
0
        if (zl) {
1060
0
            czl_limit = zone_limit_get_limit(ct, &zl->czl);
1061
0
            if (czl_limit >= 0 &&
1062
0
                atomic_count_get(&zl->czl.count) >= czl_limit) {
1063
0
                COVERAGE_INC(conntrack_zone_full);
1064
0
                return nc;
1065
0
            }
1066
0
        }
1067
1068
0
        unsigned int n_conn_limit;
1069
0
        atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
1070
0
        if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
1071
0
            COVERAGE_INC(conntrack_full);
1072
0
            return nc;
1073
0
        }
1074
1075
0
        nc = new_conn(ct, pkt, &ctx->key, now, tp_id);
1076
0
        fwd_key_node = &nc->key_node[CT_DIR_FWD];
1077
0
        rev_key_node = &nc->key_node[CT_DIR_REV];
1078
0
        memcpy(&fwd_key_node->key, &ctx->key, sizeof fwd_key_node->key);
1079
0
        memcpy(&rev_key_node->key, &fwd_key_node->key,
1080
0
               sizeof rev_key_node->key);
1081
0
        conn_key_reverse(&rev_key_node->key);
1082
1083
0
        if (ct_verify_helper(helper, ct_alg_ctl)) {
1084
0
            nc->alg = nullable_xstrdup(helper);
1085
0
        }
1086
1087
0
        if (alg_exp) {
1088
0
            nc->alg_related = true;
1089
0
            nc->mark = alg_exp->parent_mark;
1090
0
            nc->label = alg_exp->parent_label;
1091
0
            nc->parent_key = alg_exp->parent_key;
1092
0
        }
1093
1094
0
        ovs_mutex_init_adaptive(&nc->lock);
1095
0
        atomic_flag_clear(&nc->reclaimed);
1096
0
        fwd_key_node->dir = CT_DIR_FWD;
1097
0
        rev_key_node->dir = CT_DIR_REV;
1098
1099
0
        if (zl) {
1100
0
            nc->admit_zone = zl->czl.zone;
1101
0
            nc->zone_limit_seq = zl->czl.zone_limit_seq;
1102
0
        } else {
1103
0
            nc->admit_zone = INVALID_ZONE;
1104
0
        }
1105
1106
0
        if (nat_action_info) {
1107
0
            nc->nat_action = nat_action_info->nat_action;
1108
1109
0
            if (alg_exp) {
1110
0
                if (alg_exp->nat_rpl_dst) {
1111
0
                    rev_key_node->key.dst.addr = alg_exp->alg_nat_repl_addr;
1112
0
                    nc->nat_action = NAT_ACTION_SRC;
1113
0
                } else {
1114
0
                    rev_key_node->key.src.addr = alg_exp->alg_nat_repl_addr;
1115
0
                    nc->nat_action = NAT_ACTION_DST;
1116
0
                }
1117
0
            } else {
1118
0
                bool nat_res = nat_get_unique_tuple(ct, nc, nat_action_info);
1119
0
                if (!nat_res) {
1120
0
                    goto nat_res_exhaustion;
1121
0
                }
1122
0
            }
1123
1124
0
            nat_packet(pkt, nc, false, ctx->icmp_related);
1125
0
            uint32_t rev_hash = conn_key_hash(&rev_key_node->key,
1126
0
                                              ct->hash_basis);
1127
0
            cmap_insert(&ct->conns[ctx->key.zone],
1128
0
                        &rev_key_node->cm_node, rev_hash);
1129
0
        }
1130
1131
0
        cmap_insert(&ct->conns[ctx->key.zone],
1132
0
                    &fwd_key_node->cm_node, ctx->hash);
1133
0
        conn_expire_push_front(ct, nc);
1134
0
        atomic_count_inc(&ct->n_conn);
1135
1136
0
        if (zl) {
1137
0
            atomic_count_inc(&zl->czl.count);
1138
0
        }
1139
1140
0
        ctx->conn = nc; /* For completeness. */
1141
0
    }
1142
1143
0
    return nc;
1144
1145
    /* This would be a user error or a DoS attack.  A user error is prevented
1146
     * by allocating enough combinations of NAT addresses when combined with
1147
     * ephemeral ports.  A DoS attack should be protected against with
1148
     * firewall rules or a separate firewall.  Also using zone partitioning
1149
     * can limit DoS impact. */
1150
0
nat_res_exhaustion:
1151
0
    delete_conn__(nc);
1152
0
    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
1153
0
    VLOG_WARN_RL(&rl, "Unable to NAT due to tuple space exhaustion - "
1154
0
                 "if DoS attack, use firewalling and/or zone partitioning.");
1155
0
    return NULL;
1156
0
}
1157
1158
static bool
1159
conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
1160
                  struct conn_lookup_ctx *ctx, struct conn *conn,
1161
                  long long now)
1162
0
{
1163
0
    bool create_new_conn = false;
1164
1165
0
    if (ctx->icmp_related) {
1166
0
        pkt->md.ct_state |= CS_RELATED;
1167
0
        if (ctx->reply) {
1168
0
            pkt->md.ct_state |= CS_REPLY_DIR;
1169
0
        }
1170
0
    } else {
1171
0
        if (conn->alg_related) {
1172
0
            pkt->md.ct_state |= CS_RELATED;
1173
0
        }
1174
1175
0
        enum ct_update_res res = conn_update(ct, conn, pkt, ctx, now);
1176
1177
0
        switch (res) {
1178
0
        case CT_UPDATE_VALID:
1179
0
            pkt->md.ct_state |= CS_ESTABLISHED;
1180
0
            pkt->md.ct_state &= ~CS_NEW;
1181
0
            if (ctx->reply) {
1182
0
                pkt->md.ct_state |= CS_REPLY_DIR;
1183
0
            }
1184
0
            break;
1185
0
        case CT_UPDATE_INVALID:
1186
0
            pkt->md.ct_state = CS_INVALID;
1187
0
            break;
1188
0
        case CT_UPDATE_NEW:
1189
0
            if (conn_lookup(ct, &conn->key_node[CT_DIR_FWD].key,
1190
0
                            now, NULL, NULL)) {
1191
0
                conn_force_expire(conn);
1192
0
            }
1193
0
            create_new_conn = true;
1194
0
            break;
1195
0
        case CT_UPDATE_VALID_NEW:
1196
0
            pkt->md.ct_state |= CS_NEW;
1197
0
            break;
1198
0
        default:
1199
0
            OVS_NOT_REACHED();
1200
0
        }
1201
0
    }
1202
0
    return create_new_conn;
1203
0
}
1204
1205
static void
1206
handle_nat(struct dp_packet *pkt, struct conn *conn,
1207
           uint16_t zone, bool reply, bool related)
1208
0
{
1209
0
    if (conn->nat_action &&
1210
0
        (!(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
1211
0
          (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT) &&
1212
0
           zone != pkt->md.ct_zone))) {
1213
1214
0
        if (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) {
1215
0
            pkt->md.ct_state &= ~(CS_SRC_NAT | CS_DST_NAT);
1216
0
        }
1217
1218
0
        nat_packet(pkt, conn, reply, related);
1219
0
    }
1220
0
}
1221
1222
static bool
1223
check_orig_tuple(struct conntrack *ct, struct dp_packet *pkt,
1224
                 struct conn_lookup_ctx *ctx_in, long long now,
1225
                 struct conn **conn,
1226
                 const struct nat_action_info_t *nat_action_info)
1227
0
{
1228
0
    if (!(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
1229
0
        (ctx_in->key.dl_type == htons(ETH_TYPE_IP) &&
1230
0
         !pkt->md.ct_orig_tuple.ipv4.ipv4_proto) ||
1231
0
        (ctx_in->key.dl_type == htons(ETH_TYPE_IPV6) &&
1232
0
         !pkt->md.ct_orig_tuple.ipv6.ipv6_proto) ||
1233
0
        nat_action_info) {
1234
0
        return false;
1235
0
    }
1236
1237
0
    struct conn_key key;
1238
0
    memset(&key, 0 , sizeof key);
1239
1240
0
    if (ctx_in->key.dl_type == htons(ETH_TYPE_IP)) {
1241
0
        key.src.addr.ipv4 = pkt->md.ct_orig_tuple.ipv4.ipv4_src;
1242
0
        key.dst.addr.ipv4 = pkt->md.ct_orig_tuple.ipv4.ipv4_dst;
1243
1244
0
        if (ctx_in->key.nw_proto == IPPROTO_ICMP) {
1245
0
            key.src.icmp_id = ctx_in->key.src.icmp_id;
1246
0
            key.dst.icmp_id = ctx_in->key.dst.icmp_id;
1247
0
            uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv4.src_port);
1248
0
            key.src.icmp_type = (uint8_t) src_port;
1249
0
            key.dst.icmp_type = reverse_icmp_type(key.src.icmp_type);
1250
0
        } else {
1251
0
            key.src.port = pkt->md.ct_orig_tuple.ipv4.src_port;
1252
0
            key.dst.port = pkt->md.ct_orig_tuple.ipv4.dst_port;
1253
0
        }
1254
0
        key.nw_proto = pkt->md.ct_orig_tuple.ipv4.ipv4_proto;
1255
0
    } else {
1256
0
        key.src.addr.ipv6 = pkt->md.ct_orig_tuple.ipv6.ipv6_src;
1257
0
        key.dst.addr.ipv6 = pkt->md.ct_orig_tuple.ipv6.ipv6_dst;
1258
1259
0
        if (ctx_in->key.nw_proto == IPPROTO_ICMPV6) {
1260
0
            key.src.icmp_id = ctx_in->key.src.icmp_id;
1261
0
            key.dst.icmp_id = ctx_in->key.dst.icmp_id;
1262
0
            uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv6.src_port);
1263
0
            key.src.icmp_type = (uint8_t) src_port;
1264
0
            key.dst.icmp_type = reverse_icmp6_type(key.src.icmp_type);
1265
0
        } else {
1266
0
            key.src.port = pkt->md.ct_orig_tuple.ipv6.src_port;
1267
0
            key.dst.port = pkt->md.ct_orig_tuple.ipv6.dst_port;
1268
0
        }
1269
0
        key.nw_proto = pkt->md.ct_orig_tuple.ipv6.ipv6_proto;
1270
0
    }
1271
1272
0
    key.dl_type = ctx_in->key.dl_type;
1273
0
    key.zone = pkt->md.ct_zone;
1274
0
    conn_lookup(ct, &key, now, conn, NULL);
1275
0
    return *conn ? true : false;
1276
0
}
1277
1278
static bool
1279
conn_update_state_alg(struct conntrack *ct, struct dp_packet *pkt,
1280
                      struct conn_lookup_ctx *ctx, struct conn *conn,
1281
                      const struct nat_action_info_t *nat_action_info,
1282
                      enum ct_alg_ctl_type ct_alg_ctl, long long now,
1283
                      bool *create_new_conn)
1284
0
{
1285
0
    if (is_ftp_ctl(ct_alg_ctl)) {
1286
        /* Keep sequence tracking in sync with the source of the
1287
         * sequence skew. */
1288
0
        ovs_mutex_lock(&conn->lock);
1289
0
        if (ctx->reply != conn->seq_skew_dir) {
1290
0
            handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
1291
0
                           !!nat_action_info);
1292
            /* conn_update_state locks for unrelated fields, so unlock. */
1293
0
            ovs_mutex_unlock(&conn->lock);
1294
0
            *create_new_conn = conn_update_state(ct, pkt, ctx, conn, now);
1295
0
        } else {
1296
            /* conn_update_state locks for unrelated fields, so unlock. */
1297
0
            ovs_mutex_unlock(&conn->lock);
1298
0
            *create_new_conn = conn_update_state(ct, pkt, ctx, conn, now);
1299
0
            ovs_mutex_lock(&conn->lock);
1300
0
            if (*create_new_conn == false) {
1301
0
                handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
1302
0
                               !!nat_action_info);
1303
0
            }
1304
0
            ovs_mutex_unlock(&conn->lock);
1305
0
        }
1306
0
        return true;
1307
0
    }
1308
0
    return false;
1309
0
}
1310
1311
static void
1312
set_cached_conn(const struct nat_action_info_t *nat_action_info,
1313
                const struct conn_lookup_ctx *ctx, struct conn *conn,
1314
                struct dp_packet *pkt)
1315
0
{
1316
0
    if (OVS_LIKELY(!nat_action_info)) {
1317
0
        pkt->md.conn = conn;
1318
0
        pkt->md.reply = ctx->reply;
1319
0
        pkt->md.icmp_related = ctx->icmp_related;
1320
0
    } else {
1321
0
        pkt->md.conn = NULL;
1322
0
    }
1323
0
}
1324
1325
static void
1326
process_one_fast(uint16_t zone, const uint32_t *setmark,
1327
                 const struct ovs_key_ct_labels *setlabel,
1328
                 const struct nat_action_info_t *nat_action_info,
1329
                 struct conn *conn, struct dp_packet *pkt)
1330
0
{
1331
0
    if (nat_action_info) {
1332
0
        handle_nat(pkt, conn, zone, pkt->md.reply, pkt->md.icmp_related);
1333
0
        pkt->md.conn = NULL;
1334
0
    }
1335
1336
0
    pkt->md.ct_zone = zone;
1337
0
    ovs_mutex_lock(&conn->lock);
1338
0
    pkt->md.ct_mark = conn->mark;
1339
0
    pkt->md.ct_label = conn->label;
1340
0
    ovs_mutex_unlock(&conn->lock);
1341
1342
0
    if (setmark) {
1343
0
        set_mark(pkt, conn, setmark[0], setmark[1]);
1344
0
    }
1345
1346
0
    if (setlabel) {
1347
0
        set_label(pkt, conn, &setlabel[0], &setlabel[1]);
1348
0
    }
1349
0
}
1350
1351
static void
1352
initial_conn_lookup(struct conntrack *ct, struct conn_lookup_ctx *ctx,
1353
                    long long now, bool natted)
1354
0
{
1355
0
    if (natted) {
1356
        /* If the packet has already been natted (e.g. a previous
1357
         * action took place), retrieve it by performing a lookup of its
1358
         * reverse key. */
1359
0
        conn_key_reverse(&ctx->key);
1360
0
    }
1361
1362
0
    conn_key_lookup(ct, &ctx->key, ctx->hash, now, &ctx->conn, &ctx->reply);
1363
1364
0
    if (natted) {
1365
0
        if (OVS_LIKELY(ctx->conn)) {
1366
0
            enum key_dir dir;
1367
0
            ctx->reply = !ctx->reply;
1368
0
            dir = ctx->reply ? CT_DIR_REV : CT_DIR_FWD;
1369
0
            ctx->key = ctx->conn->key_node[dir].key;
1370
0
            ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
1371
0
        } else {
1372
            /* A lookup failure does not necessarily imply that an
1373
             * error occurred, it may simply indicate that a conn got
1374
             * removed during the recirculation. */
1375
0
            COVERAGE_INC(conntrack_lookup_natted_miss);
1376
0
            conn_key_reverse(&ctx->key);
1377
0
        }
1378
0
    }
1379
0
}
1380
1381
static void
1382
process_one(struct conntrack *ct, struct dp_packet *pkt,
1383
            struct conn_lookup_ctx *ctx, uint16_t zone,
1384
            bool force, bool commit, long long now, const uint32_t *setmark,
1385
            const struct ovs_key_ct_labels *setlabel,
1386
            const struct nat_action_info_t *nat_action_info,
1387
            const char *helper, uint32_t tp_id)
1388
0
{
1389
    /* Reset ct_state whenever entering a new zone. */
1390
0
    if (pkt->md.ct_state && pkt->md.ct_zone != zone) {
1391
0
        pkt->md.ct_state = 0;
1392
0
    }
1393
1394
0
    bool create_new_conn = false;
1395
0
    initial_conn_lookup(ct, ctx, now, !!(pkt->md.ct_state &
1396
0
                                         (CS_SRC_NAT | CS_DST_NAT)));
1397
0
    struct conn *conn = ctx->conn;
1398
1399
    /* Delete found entry if in wrong direction. 'force' implies commit. */
1400
0
    if (OVS_UNLIKELY(force && ctx->reply && conn)) {
1401
0
        if (conn_lookup(ct, &conn->key_node[CT_DIR_FWD].key,
1402
0
                        now, NULL, NULL)) {
1403
0
            conn_force_expire(conn);
1404
0
        }
1405
0
        conn = NULL;
1406
0
    }
1407
1408
0
    if (conn && helper == NULL) {
1409
0
        helper = conn->alg;
1410
0
    }
1411
1412
0
    enum ct_alg_ctl_type ct_alg_ctl = get_alg_ctl_type(pkt, helper);
1413
1414
0
    if (OVS_LIKELY(conn)) {
1415
0
        if (OVS_LIKELY(!conn_update_state_alg(ct, pkt, ctx, conn,
1416
0
                                              nat_action_info,
1417
0
                                              ct_alg_ctl, now,
1418
0
                                              &create_new_conn))) {
1419
0
            create_new_conn = conn_update_state(ct, pkt, ctx, conn, now);
1420
0
        }
1421
0
        if (nat_action_info && !create_new_conn) {
1422
0
            handle_nat(pkt, conn, zone, ctx->reply, ctx->icmp_related);
1423
0
        }
1424
1425
0
    } else if (check_orig_tuple(ct, pkt, ctx, now, &conn, nat_action_info)) {
1426
0
        create_new_conn = conn_update_state(ct, pkt, ctx, conn, now);
1427
0
    } else {
1428
0
        if (ctx->icmp_related) {
1429
            /* An icmp related conn should always be found; no new
1430
               connection is created based on an icmp related packet. */
1431
0
            pkt->md.ct_state = CS_INVALID;
1432
0
        } else {
1433
0
            create_new_conn = true;
1434
0
        }
1435
0
    }
1436
1437
0
    const struct alg_exp_node *alg_exp = NULL;
1438
0
    struct alg_exp_node alg_exp_entry;
1439
1440
0
    if (OVS_UNLIKELY(create_new_conn)) {
1441
1442
0
        ovs_rwlock_rdlock(&ct->resources_lock);
1443
0
        alg_exp = expectation_lookup(&ct->alg_expectations, &ctx->key,
1444
0
                                     ct->hash_basis,
1445
0
                                     alg_src_ip_wc(ct_alg_ctl));
1446
0
        if (alg_exp) {
1447
0
            memcpy(&alg_exp_entry, alg_exp, sizeof alg_exp_entry);
1448
0
            alg_exp = &alg_exp_entry;
1449
0
        }
1450
0
        ovs_rwlock_unlock(&ct->resources_lock);
1451
1452
0
        ovs_mutex_lock(&ct->ct_lock);
1453
0
        if (!conn_lookup(ct, &ctx->key, now, NULL, NULL)) {
1454
0
            conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info,
1455
0
                                  helper, alg_exp, ct_alg_ctl, tp_id);
1456
0
        }
1457
0
        ovs_mutex_unlock(&ct->ct_lock);
1458
0
    }
1459
1460
0
    write_ct_md(pkt, zone, conn, &ctx->key, alg_exp);
1461
1462
0
    if (conn && setmark) {
1463
0
        set_mark(pkt, conn, setmark[0], setmark[1]);
1464
0
    }
1465
1466
0
    if (conn && setlabel) {
1467
0
        set_label(pkt, conn, &setlabel[0], &setlabel[1]);
1468
0
    }
1469
1470
0
    handle_alg_ctl(ct, ctx, pkt, ct_alg_ctl, conn, now, !!nat_action_info);
1471
1472
0
    set_cached_conn(nat_action_info, ctx, conn, pkt);
1473
0
}
1474
1475
/* Sends the packets in '*pkt_batch' through the connection tracker 'ct'.  All
1476
 * the packets must have the same 'dl_type' (IPv4 or IPv6) and should have
1477
 * the l3 and l4 offsets properly set.  Performs fragment reassembly with
1478
 * the help of ipf_preprocess_conntrack().
1479
 *
1480
 * If 'commit' is true, the packets are allowed to create new entries in the
1481
 * connection tables.  'setmark', if not NULL, should point to a two
1482
 * elements array containing a value and a mask to set the connection mark.
1483
 * 'setlabel' behaves similarly for the connection label. */
1484
int
1485
conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
1486
                  ovs_be16 dl_type, bool force, bool commit, uint16_t zone,
1487
                  const uint32_t *setmark,
1488
                  const struct ovs_key_ct_labels *setlabel,
1489
                  const char *helper,
1490
                  const struct nat_action_info_t *nat_action_info,
1491
                  long long now, uint32_t tp_id)
1492
0
{
1493
0
    odp_port_t in_port = ODPP_LOCAL;
1494
0
    struct conn_lookup_ctx ctx;
1495
0
    struct dp_packet *packet;
1496
1497
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, pkt_batch) {
1498
        /* The ipf preprocess function may consume all packets from this batch,
1499
         * so save an in_port first. */
1500
0
        in_port = packet->md.in_port.odp_port;
1501
0
        break;
1502
0
    }
1503
1504
0
    ipf_preprocess_conntrack(ct->ipf, pkt_batch, now, dl_type, zone,
1505
0
                             ct->hash_basis);
1506
1507
1508
0
    DP_PACKET_BATCH_FOR_EACH (i, packet, pkt_batch) {
1509
0
        struct conn *conn = packet->md.conn;
1510
1511
0
        if (helper == NULL && conn != NULL) {
1512
0
            helper = conn->alg;
1513
0
        }
1514
1515
0
        if (OVS_UNLIKELY(packet->md.ct_state == CS_INVALID)) {
1516
0
            write_ct_md(packet, zone, NULL, NULL, NULL);
1517
0
        } else if (conn &&
1518
0
                   conn->key_node[CT_DIR_FWD].key.zone == zone && !force &&
1519
0
                   !get_alg_ctl_type(packet, helper)) {
1520
0
            process_one_fast(zone, setmark, setlabel, nat_action_info,
1521
0
                             conn, packet);
1522
0
        } else if (OVS_UNLIKELY(!conn_key_extract(ct, packet, dl_type, &ctx,
1523
0
                                zone))) {
1524
0
            packet->md.ct_state = CS_INVALID;
1525
0
            write_ct_md(packet, zone, NULL, NULL, NULL);
1526
0
        } else {
1527
0
            process_one(ct, packet, &ctx, zone, force, commit, now, setmark,
1528
0
                        setlabel, nat_action_info, helper, tp_id);
1529
0
        }
1530
0
    }
1531
1532
0
    ipf_postprocess_conntrack(ct->ipf, pkt_batch, now, dl_type, zone, in_port);
1533
1534
0
    return 0;
1535
0
}
1536
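The sketch below is illustrative only and is not part of the instrumented source above.  It shows how a datapath caller might drive conntrack_execute() on a batch, assuming a 'struct conntrack' created elsewhere (e.g. by conntrack_init()) and IPv4 traffic committed with a connection mark; the mark value/mask pair, the zero zone, and the default timeout-policy id are arbitrary example values.

static void
example_ct_commit(struct conntrack *ct, struct dp_packet_batch *batch)
{
    /* Illustrative values: set the low byte of the connection mark to 1. */
    uint32_t setmark[2] = { 0x1, 0xff };          /* { value, mask } */

    conntrack_execute(ct, batch, htons(ETH_TYPE_IP),
                      false /* force */, true /* commit */, 0 /* zone */,
                      setmark, NULL /* setlabel */, NULL /* helper */,
                      NULL /* no NAT */, time_msec(), 0 /* default tp_id */);
}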
1537
void
1538
conntrack_clear(struct dp_packet *packet)
1539
0
{
1540
    /* According to pkt_metadata_init(), ct_state == 0 is enough to make all of
1541
     * the conntrack fields invalid. */
1542
0
    packet->md.ct_state = 0;
1543
0
    pkt_metadata_init_conn(&packet->md);
1544
0
}
1545
1546
static void
1547
set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
1548
0
{
1549
0
    ovs_mutex_lock(&conn->lock);
1550
0
    if (conn->alg_related) {
1551
0
        pkt->md.ct_mark = conn->mark;
1552
0
    } else {
1553
0
        pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
1554
0
        conn->mark = pkt->md.ct_mark;
1555
0
    }
1556
0
    ovs_mutex_unlock(&conn->lock);
1557
0
}
1558
1559
static void
1560
set_label(struct dp_packet *pkt, struct conn *conn,
1561
          const struct ovs_key_ct_labels *val,
1562
          const struct ovs_key_ct_labels *mask)
1563
0
{
1564
0
    ovs_mutex_lock(&conn->lock);
1565
0
    if (conn->alg_related) {
1566
0
        pkt->md.ct_label = conn->label;
1567
0
    } else {
1568
0
        ovs_u128 v, m;
1569
1570
0
        memcpy(&v, val, sizeof v);
1571
0
        memcpy(&m, mask, sizeof m);
1572
1573
0
        pkt->md.ct_label.u64.lo = v.u64.lo
1574
0
                              | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
1575
0
        pkt->md.ct_label.u64.hi = v.u64.hi
1576
0
                              | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
1577
0
        conn->label = pkt->md.ct_label;
1578
0
    }
1579
0
    ovs_mutex_unlock(&conn->lock);
1580
0
}
1581
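The helper below is an illustrative sketch, not part of the instrumented file.  It restates the masked-update rule applied by set_mark() and set_label() above: bits selected by the mask take the new value, while all other bits of the stored mark are preserved (the caller is expected to pass a value already restricted to the mask).

static uint32_t
example_masked_mark(uint32_t old_mark, uint32_t val, uint32_t mask)
{
    /* E.g. old_mark 0xabcd0000, val 0x2a, mask 0xff -> 0xabcd002a. */
    return val | (old_mark & ~mask);
}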
1582

1583
int
1584
conntrack_set_sweep_interval(struct conntrack *ct, uint32_t ms)
1585
0
{
1586
0
    atomic_store_relaxed(&ct->sweep_ms, ms);
1587
0
    return 0;
1588
0
}
1589
1590
uint32_t
1591
conntrack_get_sweep_interval(struct conntrack *ct)
1592
0
{
1593
0
    uint32_t ms;
1594
0
    atomic_read_relaxed(&ct->sweep_ms, &ms);
1595
0
    return ms;
1596
0
}
1597
1598
static size_t
1599
ct_sweep(struct conntrack *ct, struct rculist *list, long long now,
1600
         size_t *cleaned_count)
1601
    OVS_NO_THREAD_SAFETY_ANALYSIS
1602
0
{
1603
0
    struct conn *conn;
1604
0
    size_t cleaned = 0;
1605
0
    size_t count = 0;
1606
1607
0
    RCULIST_FOR_EACH (conn, node, list) {
1608
0
        if (conn_expired(conn, now)) {
1609
0
            conn_clean(ct, conn);
1610
0
            cleaned++;
1611
0
        }
1612
1613
0
        count++;
1614
0
    }
1615
1616
0
    if (cleaned_count) {
1617
0
        *cleaned_count = cleaned;
1618
0
    }
1619
1620
0
    return count;
1621
0
}
1622
1623
/* Cleans up old connection entries from 'ct'.  Returns the time
1624
 * when the next wake will happen. The return value might be zero,
1625
 * meaning that an internal limit has been reached. */
1626
static long long
1627
conntrack_clean(struct conntrack *ct, long long now)
1628
0
{
1629
0
    long long next_wakeup = now + conntrack_get_sweep_interval(ct);
1630
0
    unsigned int n_conn_limit, i;
1631
0
    size_t clean_end, count = 0;
1632
0
    size_t total_cleaned = 0;
1633
1634
0
    atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
1635
0
    clean_end = n_conn_limit / 64;
1636
1637
0
    for (i = ct->next_sweep; i < N_EXP_LISTS; i++) {
1638
0
        size_t cleaned;
1639
1640
0
        if (count > clean_end) {
1641
0
            next_wakeup = 0;
1642
0
            break;
1643
0
        }
1644
1645
0
        count += ct_sweep(ct, &ct->exp_lists[i], now, &cleaned);
1646
0
        total_cleaned += cleaned;
1647
0
    }
1648
1649
0
    ct->next_sweep = (i < N_EXP_LISTS) ? i : 0;
1650
1651
0
    VLOG_DBG("conntrack cleaned %"PRIuSIZE" entries out of %"PRIuSIZE
1652
0
             " entries in %lld msec", total_cleaned, count,
1653
0
             time_msec() - now);
1654
1655
0
    return next_wakeup;
1656
0
}
1657
1658
/* Cleanup:
1659
 *
1660
 * We must call conntrack_clean() periodically.  conntrack_clean() return
1661
 * value gives a hint on when the next cleanup must be done. */
1662
#define CT_CLEAN_MIN_INTERVAL_MS 200
1663
1664
static void *
1665
clean_thread_main(void *f_)
1666
    OVS_NO_THREAD_SAFETY_ANALYSIS
1667
0
{
1668
0
    struct conntrack *ct = f_;
1669
1670
0
    while (!latch_is_set(&ct->clean_thread_exit)) {
1671
0
        long long next_wake;
1672
0
        long long now = time_msec();
1673
0
        next_wake = conntrack_clean(ct, now);
1674
1675
0
        if (next_wake < now) {
1676
0
            poll_timer_wait_until(now + CT_CLEAN_MIN_INTERVAL_MS);
1677
0
        } else {
1678
0
            poll_timer_wait_until(next_wake);
1679
0
        }
1680
0
        latch_wait(&ct->clean_thread_exit);
1681
0
        poll_block();
1682
0
    }
1683
1684
0
    return NULL;
1685
0
}
1686

1687
/* 'Data' is a pointer to the beginning of the L3 header and 'new_data' is
1688
 * used to store a pointer to the first byte after the L3 header.  'Size' is
1689
 * the size of the packet beyond the data pointer. */
1690
static inline bool
1691
extract_l3_ipv4(struct dp_packet *pkt, struct conn_key *key, const void *data,
1692
                size_t size, const char **new_data)
1693
0
{
1694
0
    if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
1695
0
        return false;
1696
0
    }
1697
1698
0
    const struct ip_header *ip = data;
1699
0
    size_t ip_len = IP_IHL(ip->ip_ihl_ver) * 4;
1700
1701
0
    if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
1702
0
        return false;
1703
0
    }
1704
1705
0
    if (OVS_UNLIKELY(size < ip_len)) {
1706
0
        return false;
1707
0
    }
1708
1709
0
    if (IP_IS_LATER_FRAG(ip->ip_frag_off)) {
1710
0
        return false;
1711
0
    }
1712
1713
0
    if (pkt && dp_packet_ip_checksum_unknown(pkt)) {
1714
0
        COVERAGE_INC(conntrack_l3csum_checked);
1715
0
        if (csum(data, ip_len)) {
1716
0
            COVERAGE_INC(conntrack_l3csum_err);
1717
0
            dp_packet_ip_checksum_set_bad(pkt);
1718
0
            return false;
1719
0
        }
1720
0
        dp_packet_ip_checksum_set_good(pkt);
1721
0
    }
1722
1723
0
    if (new_data) {
1724
0
        *new_data = (char *) data + ip_len;
1725
0
    }
1726
1727
0
    key->src.addr.ipv4 = get_16aligned_be32(&ip->ip_src);
1728
0
    key->dst.addr.ipv4 = get_16aligned_be32(&ip->ip_dst);
1729
0
    key->nw_proto = ip->ip_proto;
1730
1731
0
    return true;
1732
0
}
1733
1734
/* 'Data' is a pointer to the beginning of the L3 header and 'new_data' is
1735
 * used to store a pointer to the first byte after the L3 header.  'Size' is
1736
 * the size of the packet beyond the data pointer. */
1737
static inline bool
1738
extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
1739
                const char **new_data)
1740
0
{
1741
0
    const struct ovs_16aligned_ip6_hdr *ip6 = data;
1742
1743
0
    if (OVS_UNLIKELY(size < sizeof *ip6)) {
1744
0
        return false;
1745
0
    }
1746
1747
0
    data = ip6 + 1;
1748
0
    size -= sizeof *ip6;
1749
0
    uint8_t nw_proto = ip6->ip6_nxt;
1750
0
    uint8_t nw_frag = 0;
1751
1752
0
    if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag,
1753
0
                             NULL, NULL)) {
1754
0
        return false;
1755
0
    }
1756
1757
0
    if (nw_frag) {
1758
0
        return false;
1759
0
    }
1760
1761
0
    if (new_data) {
1762
0
        *new_data = data;
1763
0
    }
1764
1765
0
    memcpy(&key->src.addr.ipv6, &ip6->ip6_src, sizeof key->src.addr);
1766
0
    memcpy(&key->dst.addr.ipv6, &ip6->ip6_dst, sizeof key->dst.addr);
1767
0
    key->nw_proto = nw_proto;
1768
1769
0
    return true;
1770
0
}
1771
1772
static inline bool
1773
checksum_valid(const struct conn_key *key, const void *data, size_t size,
1774
               const void *l3)
1775
0
{
1776
0
    bool valid;
1777
1778
0
    if (key->dl_type == htons(ETH_TYPE_IP)) {
1779
0
        uint32_t csum = packet_csum_pseudoheader(l3);
1780
0
        valid = (csum_finish(csum_continue(csum, data, size)) == 0);
1781
0
    } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
1782
0
        valid = (packet_csum_upperlayer6(l3, data, key->nw_proto, size) == 0);
1783
0
    } else {
1784
0
        valid = false;
1785
0
    }
1786
1787
0
    COVERAGE_INC(conntrack_l4csum_checked);
1788
0
    if (!valid) {
1789
0
        COVERAGE_INC(conntrack_l4csum_err);
1790
0
    }
1791
1792
0
    return valid;
1793
0
}
1794
1795
static inline bool
1796
sctp_checksum_valid(const void *data, size_t size)
1797
0
{
1798
0
    struct sctp_header *sctp = (struct sctp_header *) data;
1799
0
    ovs_be32 rcvd_csum, csum;
1800
1801
0
    rcvd_csum = get_16aligned_be32(&sctp->sctp_csum);
1802
0
    put_16aligned_be32(&sctp->sctp_csum, 0);
1803
0
    csum = crc32c(data, size);
1804
0
    put_16aligned_be32(&sctp->sctp_csum, rcvd_csum);
1805
1806
0
    COVERAGE_INC(conntrack_l4csum_checked);
1807
0
    if (rcvd_csum != csum) {
1808
0
        COVERAGE_INC(conntrack_l4csum_err);
1809
0
        return false;
1810
0
    }
1811
1812
0
    return true;
1813
0
}
1814
1815
static inline bool
1816
check_l4_tcp(struct dp_packet *pkt, const struct conn_key *key,
1817
             const void *data, size_t size, const void *l3)
1818
0
{
1819
0
    const struct tcp_header *tcp = data;
1820
0
    if (size < sizeof *tcp) {
1821
0
        return false;
1822
0
    }
1823
1824
0
    size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
1825
0
    if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) {
1826
0
        return false;
1827
0
    }
1828
1829
0
    if (pkt && dp_packet_l4_checksum_unknown(pkt)) {
1830
0
        if (!checksum_valid(key, data, size, l3)) {
1831
0
            dp_packet_l4_checksum_set_bad(pkt);
1832
0
            return false;
1833
0
        }
1834
0
        dp_packet_l4_checksum_set_good(pkt);
1835
0
        dp_packet_l4_proto_set_tcp(pkt);
1836
0
    }
1837
0
    return true;
1838
0
}
1839
1840
static inline bool
1841
check_l4_udp(struct dp_packet *pkt, const struct conn_key *key,
1842
             const void *data, size_t size, const void *l3)
1843
0
{
1844
0
    const struct udp_header *udp = data;
1845
0
    if (size < sizeof *udp) {
1846
0
        return false;
1847
0
    }
1848
1849
0
    size_t udp_len = ntohs(udp->udp_len);
1850
0
    if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) {
1851
0
        return false;
1852
0
    }
1853
1854
    /* Validation must be skipped if checksum is 0 on IPv4 packets. */
1855
0
    if (!(udp->udp_csum == 0 && key->dl_type == htons(ETH_TYPE_IP))
1856
0
        && (pkt && dp_packet_l4_checksum_unknown(pkt))) {
1857
0
        if (!checksum_valid(key, data, size, l3)) {
1858
0
            dp_packet_l4_checksum_set_bad(pkt);
1859
0
            return false;
1860
0
        }
1861
0
        dp_packet_l4_checksum_set_good(pkt);
1862
0
        dp_packet_l4_proto_set_udp(pkt);
1863
0
    }
1864
0
    return true;
1865
0
}
1866
1867
static inline bool
1868
sctp_check_len(const struct sctp_header *sh, size_t size)
1869
0
{
1870
0
    const struct sctp_chunk_header *sch;
1871
0
    size_t next;
1872
1873
0
    if (size < SCTP_HEADER_LEN) {
1874
0
        return false;
1875
0
    }
1876
1877
    /* rfc4960: Chunks (including Type, Length, and Value fields) are padded
1878
     * out by the sender with all zero bytes to be a multiple of 4 bytes long.
1879
     */
1880
0
    for (next = sizeof(struct sctp_header),
1881
0
         sch = SCTP_NEXT_CHUNK(sh, next);
1882
0
         next < size;
1883
0
         next += ROUND_UP(ntohs(sch->length), 4),
1884
0
         sch = SCTP_NEXT_CHUNK(sh, next)) {
1885
        /* rfc4960: This value represents the size of the chunk in bytes,
1886
         * including the Chunk Type, Chunk Flags, Chunk Length, and Chunk Value
1887
         * fields.
1888
         * Therefore, if the Chunk Value field is zero-length, the Length
1889
         * field will be set to 4. */
1890
0
        if (ntohs(sch->length) < sizeof *sch) {
1891
0
            return false;
1892
0
        }
1893
0
    }
1894
1895
0
    return (next == size);
1896
0
}
1897
1898
static inline bool
1899
check_l4_sctp(struct dp_packet *pkt, const void *data, size_t size)
1900
0
{
1901
0
    if (OVS_UNLIKELY(!sctp_check_len(data, size))) {
1902
0
        return false;
1903
0
    }
1904
1905
0
    if (pkt && dp_packet_l4_checksum_unknown(pkt)) {
1906
0
        if (!sctp_checksum_valid(data, size)) {
1907
0
            dp_packet_l4_checksum_set_bad(pkt);
1908
0
            return false;
1909
0
        }
1910
0
        dp_packet_l4_checksum_set_good(pkt);
1911
0
        dp_packet_l4_proto_set_sctp(pkt);
1912
0
    }
1913
0
    return true;
1914
0
}
1915
1916
static inline bool
1917
check_l4_icmp(struct dp_packet *pkt, const void *data, size_t size)
1918
0
{
1919
0
    if (pkt) {
1920
0
        COVERAGE_INC(conntrack_l4csum_checked);
1921
0
        if (csum(data, size)) {
1922
0
            COVERAGE_INC(conntrack_l4csum_err);
1923
0
            return false;
1924
0
        }
1925
0
    }
1926
1927
0
    return true;
1928
0
}
1929
1930
static inline bool
1931
check_l4_icmp6(struct dp_packet *pkt, const struct conn_key *key,
1932
               const void *data, size_t size, const void *l3)
1933
0
{
1934
0
    return pkt ? checksum_valid(key, data, size, l3) : true;
1935
0
}
1936
1937
static inline bool
1938
extract_l4_tcp(struct conn_key *key, const void *data, size_t size,
1939
               size_t *chk_len)
1940
0
{
1941
0
    if (OVS_UNLIKELY(size < (chk_len ? *chk_len : TCP_HEADER_LEN))) {
1942
0
        return false;
1943
0
    }
1944
1945
0
    const struct tcp_header *tcp = data;
1946
0
    key->src.port = tcp->tcp_src;
1947
0
    key->dst.port = tcp->tcp_dst;
1948
1949
    /* Port 0 is invalid */
1950
0
    return key->src.port && key->dst.port;
1951
0
}
1952
1953
static inline bool
1954
extract_l4_udp(struct conn_key *key, const void *data, size_t size,
1955
               size_t *chk_len)
1956
0
{
1957
0
    if (OVS_UNLIKELY(size < (chk_len ? *chk_len : UDP_HEADER_LEN))) {
1958
0
        return false;
1959
0
    }
1960
1961
0
    const struct udp_header *udp = data;
1962
0
    key->src.port = udp->udp_src;
1963
0
    key->dst.port = udp->udp_dst;
1964
1965
    /* Port 0 is invalid */
1966
0
    return key->src.port && key->dst.port;
1967
0
}
1968
1969
static inline bool
1970
extract_l4_sctp(struct conn_key *key, const void *data, size_t size,
1971
                size_t *chk_len)
1972
0
{
1973
0
    if (OVS_UNLIKELY(size < (chk_len ? *chk_len : SCTP_HEADER_LEN))) {
1974
0
        return false;
1975
0
    }
1976
1977
0
    const struct sctp_header *sctp = data;
1978
0
    key->src.port = sctp->sctp_src;
1979
0
    key->dst.port = sctp->sctp_dst;
1980
1981
0
    return key->src.port && key->dst.port;
1982
0
}
1983
1984
static inline bool extract_l4(struct dp_packet *pkt, struct conn_key *key,
1985
                              const void *data, size_t size, bool *related,
1986
                              const void *l3, size_t *chk_len);
1987
1988
static uint8_t
1989
reverse_icmp_type(uint8_t type)
1990
0
{
1991
0
    switch (type) {
1992
0
    case ICMP4_ECHO_REQUEST:
1993
0
        return ICMP4_ECHO_REPLY;
1994
0
    case ICMP4_ECHO_REPLY:
1995
0
        return ICMP4_ECHO_REQUEST;
1996
1997
0
    case ICMP4_TIMESTAMP:
1998
0
        return ICMP4_TIMESTAMPREPLY;
1999
0
    case ICMP4_TIMESTAMPREPLY:
2000
0
        return ICMP4_TIMESTAMP;
2001
2002
0
    case ICMP4_INFOREQUEST:
2003
0
        return ICMP4_INFOREPLY;
2004
0
    case ICMP4_INFOREPLY:
2005
0
        return ICMP4_INFOREQUEST;
2006
0
    default:
2007
0
        OVS_NOT_REACHED();
2008
0
    }
2009
0
}
2010
2011
/* If 'related' is not NULL and the function is processing an ICMP
2012
 * error packet, extract the l3 and l4 fields from the nested header
2013
 * instead and set *related to true.  If 'related' is NULL we're
2014
 * already processing a nested header and no such recursion is
2015
 * possible */
2016
static inline bool
2017
extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
2018
                bool *related, size_t *chk_len)
2019
0
{
2020
0
    if (OVS_UNLIKELY(size < (chk_len ? *chk_len : ICMP_HEADER_LEN))) {
2021
0
        return false;
2022
0
    }
2023
2024
0
    const struct icmp_header *icmp = data;
2025
2026
0
    switch (icmp->icmp_type) {
2027
0
    case ICMP4_ECHO_REQUEST:
2028
0
    case ICMP4_ECHO_REPLY:
2029
0
    case ICMP4_TIMESTAMP:
2030
0
    case ICMP4_TIMESTAMPREPLY:
2031
0
    case ICMP4_INFOREQUEST:
2032
0
    case ICMP4_INFOREPLY:
2033
0
        if (icmp->icmp_code != 0) {
2034
0
            return false;
2035
0
        }
2036
        /* Separate ICMP connection: identified using id */
2037
0
        key->src.icmp_id = key->dst.icmp_id = icmp->icmp_fields.echo.id;
2038
0
        key->src.icmp_type = icmp->icmp_type;
2039
0
        key->dst.icmp_type = reverse_icmp_type(icmp->icmp_type);
2040
0
        break;
2041
0
    case ICMP4_DST_UNREACH:
2042
0
    case ICMP4_TIME_EXCEEDED:
2043
0
    case ICMP4_PARAM_PROB:
2044
0
    case ICMP4_SOURCEQUENCH:
2045
0
    case ICMP4_REDIRECT: {
2046
        /* This ICMP packet is part of another connection.  We should
2047
         * extract the key from the embedded packet header. */
2048
0
        struct conn_key inner_key;
2049
0
        const char *l3 = (const char *) (icmp + 1);
2050
0
        const char *tail = (const char *) data + size;
2051
0
        const char *l4;
2052
2053
0
        if (!related) {
2054
0
            return false;
2055
0
        }
2056
2057
0
        memset(&inner_key, 0, sizeof inner_key);
2058
0
        inner_key.dl_type = htons(ETH_TYPE_IP);
2059
0
        bool ok = extract_l3_ipv4(NULL, &inner_key, l3, tail - l3, &l4);
2060
0
        if (!ok) {
2061
0
            return false;
2062
0
        }
2063
2064
0
        if (inner_key.src.addr.ipv4 != key->dst.addr.ipv4) {
2065
0
            return false;
2066
0
        }
2067
2068
0
        key->src = inner_key.src;
2069
0
        key->dst = inner_key.dst;
2070
0
        key->nw_proto = inner_key.nw_proto;
2071
0
        size_t check_len = ICMP_ERROR_DATA_L4_LEN;
2072
2073
0
        ok = extract_l4(NULL, key, l4, tail - l4, NULL, l3, &check_len);
2074
0
        if (ok) {
2075
0
            conn_key_reverse(key);
2076
0
            *related = true;
2077
0
        }
2078
0
        return ok;
2079
0
    }
2080
0
    default:
2081
0
        return false;
2082
0
    }
2083
2084
0
    return true;
2085
0
}
2086
2087
static uint8_t
2088
reverse_icmp6_type(uint8_t type)
2089
0
{
2090
0
    switch (type) {
2091
0
    case ICMP6_ECHO_REQUEST:
2092
0
        return ICMP6_ECHO_REPLY;
2093
0
    case ICMP6_ECHO_REPLY:
2094
0
        return ICMP6_ECHO_REQUEST;
2095
0
    default:
2096
0
        OVS_NOT_REACHED();
2097
0
    }
2098
0
}
2099
2100
/* If 'related' is not NULL and the function is processing an ICMP
2101
 * error packet, extract the l3 and l4 fields from the nested header
2102
 * instead and set *related to true.  If 'related' is NULL we're
2103
 * already processing a nested header and no such recursion is
2104
 * possible */
2105
static inline bool
2106
extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
2107
                 bool *related)
2108
0
{
2109
0
    const struct icmp6_header *icmp6 = data;
2110
2111
    /* All the messages that we support need at least 4 bytes after
2112
     * the header */
2113
0
    if (size < sizeof *icmp6 + 4) {
2114
0
        return false;
2115
0
    }
2116
2117
0
    switch (icmp6->icmp6_type) {
2118
0
    case ICMP6_ECHO_REQUEST:
2119
0
    case ICMP6_ECHO_REPLY:
2120
0
        if (icmp6->icmp6_code != 0) {
2121
0
            return false;
2122
0
        }
2123
        /* Separate ICMP connection: identified using id */
2124
0
        key->src.icmp_id = key->dst.icmp_id = *(ovs_be16 *) (icmp6 + 1);
2125
0
        key->src.icmp_type = icmp6->icmp6_type;
2126
0
        key->dst.icmp_type = reverse_icmp6_type(icmp6->icmp6_type);
2127
0
        break;
2128
0
    case ICMP6_DST_UNREACH:
2129
0
    case ICMP6_PACKET_TOO_BIG:
2130
0
    case ICMP6_TIME_EXCEEDED:
2131
0
    case ICMP6_PARAM_PROB: {
2132
        /* This ICMP packet is part of another connection.  We should
2133
         * extract the key from the embedded packet header. */
2134
0
        struct conn_key inner_key;
2135
0
        const char *l3 = (const char *) icmp6 + 8;
2136
0
        const char *tail = (const char *) data + size;
2137
0
        const char *l4 = NULL;
2138
2139
0
        if (!related) {
2140
0
            return false;
2141
0
        }
2142
2143
0
        memset(&inner_key, 0, sizeof inner_key);
2144
0
        inner_key.dl_type = htons(ETH_TYPE_IPV6);
2145
0
        bool ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
2146
0
        if (!ok) {
2147
0
            return false;
2148
0
        }
2149
2150
        /* pf doesn't do this, but it seems a good idea */
2151
0
        if (!ipv6_addr_equals(&inner_key.src.addr.ipv6,
2152
0
                              &key->dst.addr.ipv6)) {
2153
0
            return false;
2154
0
        }
2155
2156
0
        key->src = inner_key.src;
2157
0
        key->dst = inner_key.dst;
2158
0
        key->nw_proto = inner_key.nw_proto;
2159
2160
0
        ok = extract_l4(NULL, key, l4, tail - l4, NULL, l3, NULL);
2161
0
        if (ok) {
2162
0
            conn_key_reverse(key);
2163
0
            *related = true;
2164
0
        }
2165
0
        return ok;
2166
0
    }
2167
0
    default:
2168
0
        return false;
2169
0
    }
2170
2171
0
    return true;
2172
0
}
2173
2174
/* Extract l4 fields into 'key', which must already contain valid l3
2175
 * members.
2176
 *
2177
 * If 'related' is not NULL and an ICMP error packet is being
2178
 * processed, the function will extract the key from the packet nested
2179
 * in the ICMP payload and set '*related' to true.
2180
 *
2181
 * 'size' here is the layer 4 size, which can be a nested size if parsing
2182
 * an ICMP or ICMP6 header.
2183
 *
2184
 * If 'related' is NULL, it means that we're already parsing a header nested
2185
 * in an ICMP error.  In this case, we skip the checksum and some length
2186
 * validations. */
2187
static inline bool
2188
extract_l4(struct dp_packet *pkt, struct conn_key *key, const void *data,
2189
           size_t size, bool *related, const void *l3, size_t *chk_len)
2190
0
{
2191
0
    if (key->nw_proto == IPPROTO_TCP) {
2192
0
        return (!related || check_l4_tcp(pkt, key, data, size, l3))
2193
0
               && extract_l4_tcp(key, data, size, chk_len);
2194
0
    } else if (key->nw_proto == IPPROTO_UDP) {
2195
0
        return (!related || check_l4_udp(pkt, key, data, size, l3))
2196
0
               && extract_l4_udp(key, data, size, chk_len);
2197
0
    } else if (key->nw_proto == IPPROTO_SCTP) {
2198
0
        return (!related || check_l4_sctp(pkt, data, size))
2199
0
               && extract_l4_sctp(key, data, size, chk_len);
2200
0
    } else if (key->dl_type == htons(ETH_TYPE_IP)
2201
0
               && key->nw_proto == IPPROTO_ICMP) {
2202
0
        return (!related || check_l4_icmp(pkt, data, size))
2203
0
               && extract_l4_icmp(key, data, size, related, chk_len);
2204
0
    } else if (key->dl_type == htons(ETH_TYPE_IPV6)
2205
0
               && key->nw_proto == IPPROTO_ICMPV6) {
2206
0
        return (!related || check_l4_icmp6(pkt, key, data, size, l3))
2207
0
               && extract_l4_icmp6(key, data, size, related);
2208
0
    }
2209
2210
    /* For all other protocols we do not have L4 keys, so keep them zero. */
2211
0
    return true;
2212
0
}
2213
2214
static bool
2215
conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
2216
                 struct conn_lookup_ctx *ctx, uint16_t zone)
2217
0
{
2218
0
    const struct eth_header *l2 = dp_packet_eth(pkt);
2219
0
    const struct ip_header *l3 = dp_packet_l3(pkt);
2220
0
    const char *l4 = dp_packet_l4(pkt);
2221
2222
0
    memset(ctx, 0, sizeof *ctx);
2223
2224
0
    if (!l2 || !l3 || !l4) {
2225
0
        return false;
2226
0
    }
2227
2228
0
    ctx->key.zone = zone;
2229
2230
    /* XXX In this function we parse the packet (again, it has already
2231
     * gone through miniflow_extract()) for two reasons:
2232
     *
2233
     * 1) To extract the l3 addresses and l4 ports.
2234
     *    We already have the l3 and l4 headers' pointers.  Extracting
2235
     *    the l3 addresses and the l4 ports is really cheap, since they
2236
     *    can be found at fixed locations.
2237
     * 2) To extract the l4 type.
2238
     *    Extracting the l4 types, for IPv6 can be quite expensive, because
2239
     *    it's not at a fixed location.
2240
     *
2241
     * Here's a way to avoid (2) with the help of the datapath.
2242
     * The datapath doesn't keep the packet's extracted flow[1], so
2243
     * using that is not an option.  We could use the packet's matching
2244
     * megaflow, but we have to make sure that the l4 type (nw_proto)
2245
     * is unwildcarded.  This means either:
2246
     *
2247
     * a) dpif-netdev unwildcards the l4 type when a new flow is installed
2248
     *    if the actions contains ct().
2249
     *
2250
     * b) ofproto-dpif-xlate unwildcards the l4 type when translating a ct()
2251
     *    action.  This is already done in different actions, but it's
2252
     *    unnecessary for the kernel.
2253
     *
2254
     * ---
2255
     * [1] The reasons for this are that keeping the flow increases
2256
     *     (slightly) the cache footprint and increases computation
2257
     *     time as we move the packet around. Most importantly, the flow
2258
     *     should be updated by the actions and this can be slow, as
2259
     *     we use a sparse representation (miniflow).
2260
     *
2261
     */
2262
0
    bool ok;
2263
0
    ctx->key.dl_type = dl_type;
2264
2265
0
    if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
2266
0
        if (dp_packet_ip_checksum_bad(pkt)) {
2267
0
            ok = false;
2268
0
            COVERAGE_INC(conntrack_l3csum_err);
2269
0
        } else {
2270
            /* Validate the checksum only when hwol is not supported and the
2271
             * packet's checksum status is not known. */
2272
0
            ok = extract_l3_ipv4(pkt, &ctx->key, l3, dp_packet_l3_size(pkt),
2273
0
                                 NULL);
2274
0
        }
2275
0
    } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
2276
0
        ok = extract_l3_ipv6(&ctx->key, l3, dp_packet_l3_size(pkt), NULL);
2277
0
    } else {
2278
0
        ok = false;
2279
0
    }
2280
2281
0
    if (ok) {
2282
0
        if (!dp_packet_l4_checksum_bad(pkt)) {
2283
            /* Validate the checksum only when hwol is not supported. */
2284
0
            if (extract_l4(pkt, &ctx->key, l4, dp_packet_l4_size(pkt),
2285
0
                           &ctx->icmp_related, l3, NULL)) {
2286
0
                ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
2287
0
                return true;
2288
0
            }
2289
0
        } else {
2290
0
            COVERAGE_INC(conntrack_l4csum_err);
2291
0
        }
2292
0
    }
2293
2294
0
    return false;
2295
0
}
2296
2297
static uint32_t
2298
ct_addr_hash_add(uint32_t hash, const union ct_addr *addr)
2299
0
{
2300
0
    BUILD_ASSERT_DECL(sizeof *addr % 4 == 0);
2301
0
    return hash_add_bytes32(hash, (const uint32_t *) addr, sizeof *addr);
2302
0
}
2303
2304
static uint32_t
2305
ct_endpoint_hash_add(uint32_t hash, const struct ct_endpoint *ep)
2306
0
{
2307
0
    BUILD_ASSERT_DECL(sizeof *ep % 4 == 0);
2308
0
    return hash_add_bytes32(hash, (const uint32_t *) ep, sizeof *ep);
2309
0
}
2310

2311
/* Symmetric */
2312
static uint32_t
2313
conn_key_hash(const struct conn_key *key, uint32_t basis)
2314
0
{
2315
0
    uint32_t hsrc, hdst, hash;
2316
0
    hsrc = hdst = basis;
2317
0
    hsrc = ct_endpoint_hash_add(hsrc, &key->src);
2318
0
    hdst = ct_endpoint_hash_add(hdst, &key->dst);
2319
2320
    /* Even if source and destination are swapped the hash will be the same. */
2321
0
    hash = hsrc ^ hdst;
2322
2323
    /* Hash the rest of the key (L3 and L4 types and zone). */
2324
0
    return hash_words((uint32_t *) (&key->dst + 1),
2325
0
                      (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
2326
0
                      hash);
2327
0
}
2328
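An illustrative sketch, not part of the instrumented file, of why conn_key_hash() above is direction independent: each endpoint is hashed separately from the same basis and the two results are combined with XOR, so swapping source and destination leaves the combined value unchanged.  hash_int() is assumed here as a stand-in 32-bit mixer from lib/hash.h.

static uint32_t
example_symmetric_hash(uint32_t endpoint_a, uint32_t endpoint_b,
                       uint32_t basis)
{
    uint32_t ha = hash_int(endpoint_a, basis);
    uint32_t hb = hash_int(endpoint_b, basis);

    /* XOR is commutative, so (a, b) and (b, a) hash identically. */
    return ha ^ hb;
}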
2329
static void
2330
conn_key_reverse(struct conn_key *key)
2331
0
{
2332
0
    struct ct_endpoint tmp = key->src;
2333
0
    key->src = key->dst;
2334
0
    key->dst = tmp;
2335
0
}
2336
2337
static uint32_t
2338
nat_ipv6_addrs_delta(const struct in6_addr *ipv6_min,
2339
                     const struct in6_addr *ipv6_max)
2340
0
{
2341
0
    const uint8_t *ipv6_min_hi = &ipv6_min->s6_addr[0];
2342
0
    const uint8_t *ipv6_min_lo = &ipv6_min->s6_addr[0] +  sizeof(uint64_t);
2343
0
    const uint8_t *ipv6_max_hi = &ipv6_max->s6_addr[0];
2344
0
    const uint8_t *ipv6_max_lo = &ipv6_max->s6_addr[0] + sizeof(uint64_t);
2345
2346
0
    ovs_be64 addr6_64_min_hi;
2347
0
    ovs_be64 addr6_64_min_lo;
2348
0
    memcpy(&addr6_64_min_hi, ipv6_min_hi, sizeof addr6_64_min_hi);
2349
0
    memcpy(&addr6_64_min_lo, ipv6_min_lo, sizeof addr6_64_min_lo);
2350
2351
0
    ovs_be64 addr6_64_max_hi;
2352
0
    ovs_be64 addr6_64_max_lo;
2353
0
    memcpy(&addr6_64_max_hi, ipv6_max_hi, sizeof addr6_64_max_hi);
2354
0
    memcpy(&addr6_64_max_lo, ipv6_max_lo, sizeof addr6_64_max_lo);
2355
2356
0
    uint64_t diff;
2357
2358
0
    if (addr6_64_min_hi == addr6_64_max_hi &&
2359
0
        ntohll(addr6_64_min_lo) <= ntohll(addr6_64_max_lo)) {
2360
0
        diff = ntohll(addr6_64_max_lo) - ntohll(addr6_64_min_lo);
2361
0
    } else if (ntohll(addr6_64_min_hi) + 1 == ntohll(addr6_64_max_hi) &&
2362
0
               ntohll(addr6_64_min_lo) > ntohll(addr6_64_max_lo)) {
2363
0
        diff = UINT64_MAX - (ntohll(addr6_64_min_lo) -
2364
0
                             ntohll(addr6_64_max_lo) - 1);
2365
0
    } else {
2366
        /* Limit address delta supported to 32 bits or 4 billion approximately.
2367
         * Possibly, this should be visible to the user through a datapath
2368
         * support check, however the practical impact is probably nil. */
2369
0
        diff = 0xfffffffe;
2370
0
    }
2371
2372
0
    if (diff > 0xfffffffe) {
2373
0
        diff = 0xfffffffe;
2374
0
    }
2375
0
    return diff;
2376
0
}
2377
2378
/* This function must be used in tandem with nat_ipv6_addrs_delta(), which
2379
 * restricts the input parameters. */
2380
static void
2381
nat_ipv6_addr_increment(struct in6_addr *ipv6, uint32_t increment)
2382
0
{
2383
0
    uint8_t *ipv6_hi = &ipv6->s6_addr[0];
2384
0
    uint8_t *ipv6_lo = &ipv6->s6_addr[0] + sizeof(ovs_be64);
2385
0
    ovs_be64 addr6_64_hi;
2386
0
    ovs_be64 addr6_64_lo;
2387
0
    memcpy(&addr6_64_hi, ipv6_hi, sizeof addr6_64_hi);
2388
0
    memcpy(&addr6_64_lo, ipv6_lo, sizeof addr6_64_lo);
2389
2390
0
    if (UINT64_MAX - increment >= ntohll(addr6_64_lo)) {
2391
0
        addr6_64_lo = htonll(increment + ntohll(addr6_64_lo));
2392
0
    } else if (addr6_64_hi != OVS_BE64_MAX) {
2393
0
        addr6_64_hi = htonll(1 + ntohll(addr6_64_hi));
2394
0
        addr6_64_lo = htonll(increment - (UINT64_MAX -
2395
0
                                          ntohll(addr6_64_lo) + 1));
2396
0
    } else {
2397
0
        OVS_NOT_REACHED();
2398
0
    }
2399
2400
0
    memcpy(ipv6_hi, &addr6_64_hi, sizeof addr6_64_hi);
2401
0
    memcpy(ipv6_lo, &addr6_64_lo, sizeof addr6_64_lo);
2402
0
}
2403
2404
static uint32_t
2405
nat_range_hash(const struct conn_key *key, uint32_t basis,
2406
               const struct nat_action_info_t *nat_info)
2407
0
{
2408
0
    uint32_t hash = basis;
2409
2410
0
    if (!basis) {
2411
0
        hash = ct_addr_hash_add(hash, &key->src.addr);
2412
0
    } else {
2413
0
        hash = ct_endpoint_hash_add(hash, &key->src);
2414
0
        hash = ct_endpoint_hash_add(hash, &key->dst);
2415
0
    }
2416
2417
0
    hash = ct_addr_hash_add(hash, &nat_info->min_addr);
2418
0
    hash = ct_addr_hash_add(hash, &nat_info->max_addr);
2419
0
    hash = hash_add(hash,
2420
0
                    ((uint32_t) nat_info->max_port << 16)
2421
0
                    | nat_info->min_port);
2422
0
    hash = hash_add(hash, (OVS_FORCE uint32_t) key->dl_type);
2423
0
    hash = hash_add(hash, key->nw_proto);
2424
0
    hash = hash_add(hash, key->zone);
2425
    /* The purpose of the second parameter is to distinguish hashes of data of
2426
     * different length; our data always has the same length so there is no
2427
     * value in counting. */
2428
0
    return hash_finish(hash, 0);
2429
0
}
2430
2431
/* Ports are stored in host byte order for convenience. */
2432
static void
2433
set_sport_range(const struct nat_action_info_t *ni, const struct conn_key *k,
2434
                uint32_t off, uint16_t *curr, uint16_t *min,
2435
                uint16_t *max)
2436
0
{
2437
0
    if (((ni->nat_action & NAT_ACTION_SNAT_ALL) == NAT_ACTION_SRC) ||
2438
0
        ((ni->nat_action & NAT_ACTION_DST))) {
2439
0
        *curr = ntohs(k->src.port);
2440
0
        if (*curr < 512) {
2441
0
            *min = 1;
2442
0
            *max = 511;
2443
0
        } else if (*curr < 1024) {
2444
0
            *min = 600;
2445
0
            *max = 1023;
2446
0
        } else {
2447
0
            *min = MIN_NAT_EPHEMERAL_PORT;
2448
0
            *max = MAX_NAT_EPHEMERAL_PORT;
2449
0
        }
2450
0
    } else {
2451
0
        *min = ni->min_port;
2452
0
        *max = ni->max_port;
2453
0
        *curr = *min + (off % ((*max - *min) + 1));
2454
0
    }
2455
0
}
2456
2457
static void
2458
set_dport_range(const struct nat_action_info_t *ni, const struct conn_key *k,
2459
                uint32_t off, uint16_t *curr, uint16_t *min,
2460
                uint16_t *max)
2461
0
{
2462
0
    if (ni->nat_action & NAT_ACTION_DST_PORT) {
2463
0
        *min = ni->min_port;
2464
0
        *max = ni->max_port;
2465
0
        *curr = *min + (off % ((*max - *min) + 1));
2466
0
    } else {
2467
0
        *curr = ntohs(k->dst.port);
2468
0
        *min = *max = *curr;
2469
0
    }
2470
0
}
2471
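A minimal sketch, illustrative only and not part of the instrumented file: set_sport_range() and set_dport_range() above, and get_addr_in_range() below, all map a 32-bit offset (a hash or a random value) into an inclusive [min, max] interval with the same modulo step, shown here for ports.

static uint16_t
example_pick_in_range(uint32_t off, uint16_t min, uint16_t max)
{
    /* Inclusive range size; fits in 32 bits even for [0, 65535]. */
    uint32_t range = (uint32_t) (max - min) + 1;

    return min + (off % range);
}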
2472
/* Gets an in-range address based on the hash.
2473
 * Addresses are kept in network order. */
2474
static void
2475
get_addr_in_range(union ct_addr *min, union ct_addr *max,
2476
                  union ct_addr *curr, uint32_t hash, bool ipv4)
2477
0
{
2478
0
    uint32_t offt, range;
2479
2480
0
    if (ipv4) {
2481
0
        range = (ntohl(max->ipv4) - ntohl(min->ipv4)) + 1;
2482
0
        offt = hash % range;
2483
0
        curr->ipv4 = htonl(ntohl(min->ipv4) + offt);
2484
0
    } else {
2485
0
        range = nat_ipv6_addrs_delta(&min->ipv6, &max->ipv6) + 1;
2486
        /* Range must be within 32 bits for full hash coverage. A 64 or
2487
         * 128 bit hash is unnecessary and hence not used here. Most code
2488
         * is kept common with V4; nat_ipv6_addrs_delta() will do the
2489
         * enforcement via max_ct_addr. */
2490
0
        offt = hash % range;
2491
0
        curr->ipv6 = min->ipv6;
2492
0
        nat_ipv6_addr_increment(&curr->ipv6, offt);
2493
0
    }
2494
0
}
2495
2496
static void
2497
find_addr(const struct conn_key *key, union ct_addr *min,
2498
          union ct_addr *max, union ct_addr *curr,
2499
          uint32_t hash, bool ipv4,
2500
          const struct nat_action_info_t *nat_info)
2501
0
{
2502
0
    union ct_addr zero_ip;
2503
2504
0
    memset(&zero_ip, 0, sizeof zero_ip);
2505
2506
    /* All-zero case. */
2507
0
    if (!memcmp(min, &zero_ip, sizeof *min)) {
2508
0
        if (nat_info->nat_action & NAT_ACTION_SRC) {
2509
0
            *curr = key->src.addr;
2510
0
        } else if (nat_info->nat_action & NAT_ACTION_DST) {
2511
0
            *curr = key->dst.addr;
2512
0
        }
2513
0
    } else {
2514
0
        get_addr_in_range(min, max, curr, hash, ipv4);
2515
0
    }
2516
0
}
2517
2518
static void
2519
store_addr_to_key(union ct_addr *addr, struct conn_key *key,
2520
                  uint16_t action)
2521
0
{
2522
0
    if (action & NAT_ACTION_SRC) {
2523
0
        key->dst.addr = *addr;
2524
0
    } else {
2525
0
        key->src.addr = *addr;
2526
0
    }
2527
0
}
2528
2529
static bool
2530
nat_get_unique_l4(struct conntrack *ct, struct conn_key *rev_key,
2531
                  ovs_be16 *port, uint16_t curr, uint16_t min,
2532
                  uint16_t max)
2533
0
{
2534
0
    static const unsigned int max_attempts = 128;
2535
0
    uint16_t range = max - min + 1;
2536
0
    unsigned int attempts;
2537
0
    uint16_t orig = curr;
2538
0
    unsigned int i = 0;
2539
2540
0
    attempts = range;
2541
0
    if (attempts > max_attempts) {
2542
0
        attempts = max_attempts;
2543
0
    }
2544
2545
0
another_round:
2546
0
    i = 0;
2547
0
    FOR_EACH_PORT_IN_RANGE (curr, min, max) {
2548
0
        if (i++ >= attempts) {
2549
0
            break;
2550
0
        }
2551
2552
0
        *port = htons(curr);
2553
0
        if (!conn_lookup(ct, rev_key, time_msec(), NULL, NULL)) {
2554
0
            return true;
2555
0
        }
2556
0
    }
2557
2558
0
    if (attempts < range && attempts >= 16) {
2559
0
        attempts /= 2;
2560
0
        curr = min + (random_uint32() % range);
2561
0
        goto another_round;
2562
0
    }
2563
2564
0
    *port = htons(orig);
2565
2566
0
    return false;
2567
0
}
2568
2569
/* This function tries to get a unique tuple.
2570
 * Every iteration checks that the reverse tuple doesn't
2571
 * collide with any existing one.
2572
 *
2573
 * In case of SNAT:
2574
 *    - Pick a src IP address in the range.
2575
 *        - Try to find a source port in range (if any).
2576
 *        - If no port range exists, use the whole
2577
 *          ephemeral range (after testing the port
2578
 *          used by the sender), otherwise use the
2579
 *          specified range.
2580
 *
2581
 * In case of DNAT:
2582
 *    - Pick a dst IP address in the range.
2583
 *        - For each dport in range (if any) tries to find
2584
 *          a unique tuple.
2585
 *        - Eventually, if the previous attempt fails,
2586
 *          tries to find a source port in the ephemeral
2587
 *          range (after testing the port used by the sender).
2588
 *
2589
 * If none can be found, return exhaustion to the caller. */
2590
static bool
2591
nat_get_unique_tuple(struct conntrack *ct, struct conn *conn,
2592
                     const struct nat_action_info_t *nat_info)
2593
0
{
2594
0
    struct conn_key *fwd_key = &conn->key_node[CT_DIR_FWD].key;
2595
0
    struct conn_key *rev_key = &conn->key_node[CT_DIR_REV].key;
2596
0
    bool pat_proto = fwd_key->nw_proto == IPPROTO_TCP ||
2597
0
                     fwd_key->nw_proto == IPPROTO_UDP ||
2598
0
                     fwd_key->nw_proto == IPPROTO_SCTP;
2599
0
    uint16_t min_dport, max_dport, curr_dport;
2600
0
    uint16_t min_sport, max_sport, curr_sport;
2601
0
    union ct_addr min_addr, max_addr, addr;
2602
0
    uint32_t hash, port_off, basis;
2603
2604
0
    memset(&min_addr, 0, sizeof min_addr);
2605
0
    memset(&max_addr, 0, sizeof max_addr);
2606
0
    memset(&addr, 0, sizeof addr);
2607
2608
0
    basis = (nat_info->nat_flags & NAT_PERSISTENT) ? 0 : ct->hash_basis;
2609
0
    hash = nat_range_hash(fwd_key, basis, nat_info);
2610
2611
0
    if (nat_info->nat_flags & NAT_RANGE_RANDOM) {
2612
0
        port_off = random_uint32();
2613
0
    } else if (basis) {
2614
0
        port_off = hash;
2615
0
    } else {
2616
0
        port_off = nat_range_hash(fwd_key, ct->hash_basis, nat_info);
2617
0
    }
2618
2619
0
    min_addr = nat_info->min_addr;
2620
0
    max_addr = nat_info->max_addr;
2621
2622
0
    find_addr(fwd_key, &min_addr, &max_addr, &addr, hash,
2623
0
              (fwd_key->dl_type == htons(ETH_TYPE_IP)), nat_info);
2624
2625
0
    set_sport_range(nat_info, fwd_key, port_off, &curr_sport,
2626
0
                    &min_sport, &max_sport);
2627
0
    set_dport_range(nat_info, fwd_key, port_off, &curr_dport,
2628
0
                    &min_dport, &max_dport);
2629
2630
0
    if (pat_proto) {
2631
0
        rev_key->src.port = htons(curr_dport);
2632
0
        rev_key->dst.port = htons(curr_sport);
2633
0
    }
2634
2635
0
    store_addr_to_key(&addr, rev_key, nat_info->nat_action);
2636
2637
0
    if (!pat_proto) {
2638
0
        return !conn_lookup(ct, rev_key, time_msec(), NULL, NULL);
2639
0
    }
2640
2641
0
    bool found = false;
2642
0
    if (nat_info->nat_action & NAT_ACTION_DST_PORT) {
2643
0
        found = nat_get_unique_l4(ct, rev_key, &rev_key->src.port,
2644
0
                                  curr_dport, min_dport, max_dport);
2645
0
    }
2646
2647
0
    if (!found) {
2648
0
        found = nat_get_unique_l4(ct, rev_key, &rev_key->dst.port,
2649
0
                                  curr_sport, min_sport, max_sport);
2650
0
    }
2651
2652
0
    if (found) {
2653
0
        return true;
2654
0
    }
2655
2656
0
    return false;
2657
0
}
2658
2659
static enum ct_update_res
2660
conn_update(struct conntrack *ct, struct conn *conn, struct dp_packet *pkt,
2661
            struct conn_lookup_ctx *ctx, long long now)
2662
0
{
2663
0
    ovs_mutex_lock(&conn->lock);
2664
0
    uint8_t nw_proto = conn->key_node[CT_DIR_FWD].key.nw_proto;
2665
0
    enum ct_update_res update_res =
2666
0
        l4_protos[nw_proto]->conn_update(ct, conn, pkt, ctx->reply, now);
2667
0
    ovs_mutex_unlock(&conn->lock);
2668
0
    return update_res;
2669
0
}
2670
2671
static void
2672
conn_expire_push_front(struct conntrack *ct, struct conn *conn)
2673
    OVS_REQUIRES(ct->ct_lock)
2674
0
{
2675
0
    unsigned int curr = ct->next_list;
2676
2677
0
    ct->next_list = (ct->next_list + 1) % N_EXP_LISTS;
2678
0
    rculist_push_front(&ct->exp_lists[curr], &conn->node);
2679
0
}
2680
2681
static long long int
2682
conn_expiration(const struct conn *conn)
2683
0
{
2684
0
    long long int expiration;
2685
2686
0
    atomic_read_relaxed(&CONST_CAST(struct conn *, conn)->expiration,
2687
0
                        &expiration);
2688
0
    return expiration;
2689
0
}
2690
2691
static bool
2692
conn_expired(const struct conn *conn, long long now)
2693
0
{
2694
0
    return now >= conn_expiration(conn);
2695
0
}
2696
2697
static bool
2698
valid_new(struct dp_packet *pkt, struct conn_key *key)
2699
0
{
2700
0
    return l4_protos[key->nw_proto]->valid_new(pkt);
2701
0
}
2702
2703
static struct conn *
2704
new_conn(struct conntrack *ct, struct dp_packet *pkt, struct conn_key *key,
2705
         long long now, uint32_t tp_id)
2706
0
{
2707
0
    return l4_protos[key->nw_proto]->new_conn(ct, pkt, now, tp_id);
2708
0
}
2709
2710
static void
2711
delete_conn__(struct conn *conn)
2712
0
{
2713
0
    free(conn->alg);
2714
0
    free(conn);
2715
0
}
2716
2717
static void
2718
delete_conn(struct conn *conn)
2719
0
{
2720
0
    ovs_mutex_destroy(&conn->lock);
2721
0
    delete_conn__(conn);
2722
0
}
2723
2724

2725
/* Convert a conntrack address 'a' into an IP address 'b' based on 'dl_type'.
2726
 *
2727
 * Note that 'dl_type' should be either "ETH_TYPE_IP" or "ETH_TYPE_IPV6"
2728
 * in network-byte order. */
2729
static void
2730
ct_endpoint_to_ct_dpif_inet_addr(const union ct_addr *a,
2731
                                 union ct_dpif_inet_addr *b,
2732
                                 ovs_be16 dl_type)
2733
0
{
2734
0
    if (dl_type == htons(ETH_TYPE_IP)) {
2735
0
        b->ip = a->ipv4;
2736
0
    } else if (dl_type == htons(ETH_TYPE_IPV6)) {
2737
0
        b->in6 = a->ipv6;
2738
0
    }
2739
0
}
2740
2741
/* Convert an IP address 'a' into a conntrack address 'b' based on 'dl_type'.
2742
 *
2743
 * Note that 'dl_type' should be either "ETH_TYPE_IP" or "ETH_TYPE_IPV6"
2744
 * in network-byte order. */
2745
static void
2746
ct_dpif_inet_addr_to_ct_endpoint(const union ct_dpif_inet_addr *a,
2747
                                 union ct_addr *b, ovs_be16 dl_type)
2748
0
{
2749
0
    if (dl_type == htons(ETH_TYPE_IP)) {
2750
0
        b->ipv4 = a->ip;
2751
0
    } else if (dl_type == htons(ETH_TYPE_IPV6)) {
2752
0
        b->ipv6 = a->in6;
2753
0
    }
2754
0
}
2755
2756
static void
2757
conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple)
2758
0
{
2759
0
    if (key->dl_type == htons(ETH_TYPE_IP)) {
2760
0
        tuple->l3_type = AF_INET;
2761
0
    } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
2762
0
        tuple->l3_type = AF_INET6;
2763
0
    }
2764
0
    tuple->ip_proto = key->nw_proto;
2765
0
    ct_endpoint_to_ct_dpif_inet_addr(&key->src.addr, &tuple->src,
2766
0
                                     key->dl_type);
2767
0
    ct_endpoint_to_ct_dpif_inet_addr(&key->dst.addr, &tuple->dst,
2768
0
                                     key->dl_type);
2769
2770
0
    if (key->nw_proto == IPPROTO_ICMP || key->nw_proto == IPPROTO_ICMPV6) {
2771
0
        tuple->icmp_id = key->src.icmp_id;
2772
0
        tuple->icmp_type = key->src.icmp_type;
2773
0
        tuple->icmp_code = key->src.icmp_code;
2774
0
    } else {
2775
0
        tuple->src_port = key->src.port;
2776
0
        tuple->dst_port = key->dst.port;
2777
0
    }
2778
0
}
2779
2780
static void
2781
tuple_to_conn_key(const struct ct_dpif_tuple *tuple, uint16_t zone,
2782
                  struct conn_key *key)
2783
0
{
2784
0
    if (tuple->l3_type == AF_INET) {
2785
0
        key->dl_type = htons(ETH_TYPE_IP);
2786
0
    } else if (tuple->l3_type == AF_INET6) {
2787
0
        key->dl_type = htons(ETH_TYPE_IPV6);
2788
0
    }
2789
0
    key->nw_proto = tuple->ip_proto;
2790
0
    ct_dpif_inet_addr_to_ct_endpoint(&tuple->src, &key->src.addr,
2791
0
                                     key->dl_type);
2792
0
    ct_dpif_inet_addr_to_ct_endpoint(&tuple->dst, &key->dst.addr,
2793
0
                                     key->dl_type);
2794
2795
0
    if (tuple->ip_proto == IPPROTO_ICMP || tuple->ip_proto == IPPROTO_ICMPV6) {
2796
0
        key->src.icmp_id = tuple->icmp_id;
2797
0
        key->src.icmp_type = tuple->icmp_type;
2798
0
        key->src.icmp_code = tuple->icmp_code;
2799
0
        key->dst.icmp_id = tuple->icmp_id;
2800
0
        key->dst.icmp_type = (tuple->ip_proto == IPPROTO_ICMP)
2801
0
                             ? reverse_icmp_type(tuple->icmp_type)
2802
0
                             : reverse_icmp6_type(tuple->icmp_type);
2803
0
        key->dst.icmp_code = tuple->icmp_code;
2804
0
    } else {
2805
0
        key->src.port = tuple->src_port;
2806
0
        key->dst.port = tuple->dst_port;
2807
0
    }
2808
0
    key->zone = zone;
2809
0
}
2810
2811
static void
2812
conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
2813
                      long long now)
2814
0
{
2815
0
    const struct conn_key *rev_key = &conn->key_node[CT_DIR_REV].key;
2816
0
    const struct conn_key *key = &conn->key_node[CT_DIR_FWD].key;
2817
2818
0
    memset(entry, 0, sizeof *entry);
2819
0
    conn_key_to_tuple(key, &entry->tuple_orig);
2820
0
    conn_key_to_tuple(rev_key, &entry->tuple_reply);
2821
2822
0
    if (conn->alg_related) {
2823
0
        conn_key_to_tuple(&conn->parent_key, &entry->tuple_parent);
2824
0
    }
2825
2826
0
    entry->zone = key->zone;
2827
2828
0
    ovs_mutex_lock(&conn->lock);
2829
0
    entry->mark = conn->mark;
2830
0
    memcpy(&entry->labels, &conn->label, sizeof entry->labels);
2831
2832
0
    long long expiration = conn_expiration(conn) - now;
2833
2834
0
    struct ct_l4_proto *class = l4_protos[key->nw_proto];
2835
0
    if (class->conn_get_protoinfo) {
2836
0
        class->conn_get_protoinfo(conn, &entry->protoinfo);
2837
0
    }
2838
0
    ovs_mutex_unlock(&conn->lock);
2839
2840
0
    entry->timeout = (expiration > 0) ? expiration / 1000 : 0;
2841
2842
0
    if (conn->alg) {
2843
        /* Caller is responsible for freeing. */
2844
0
        entry->helper.name = xstrdup(conn->alg);
2845
0
    }
2846
0
}
2847
2848
struct ipf *
2849
conntrack_ipf_ctx(struct conntrack *ct)
2850
0
{
2851
0
    return ct->ipf;
2852
0
}
2853
2854
int
2855
conntrack_dump_start(struct conntrack *ct, struct conntrack_dump *dump,
2856
                     const uint16_t *pzone, int *ptot_bkts)
2857
0
{
2858
0
    memset(dump, 0, sizeof(*dump));
2859
2860
0
    if (pzone) {
2861
0
        dump->zone = *pzone;
2862
0
        dump->filter_zone = true;
2863
0
        dump->current_zone = dump->zone;
2864
0
    }
2865
2866
0
    dump->ct = ct;
2867
0
    *ptot_bkts = 1; /* Need to clean up the callers. */
2868
0
    dump->cursor = cmap_cursor_start(&dump->ct->conns[dump->current_zone]);
2869
0
    return 0;
2870
0
}
2871
2872
int
2873
conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry)
2874
0
{
2875
0
    long long now = time_msec();
2876
2877
0
    struct conn_key_node *keyn;
2878
0
    struct conn *conn;
2879
2880
0
    while (true) {
2881
0
        CMAP_CURSOR_FOR_EACH_CONTINUE (keyn, cm_node, &dump->cursor) {
2882
0
            if (keyn->dir != CT_DIR_FWD) {
2883
0
                continue;
2884
0
            }
2885
2886
0
            conn = CONTAINER_OF(keyn, struct conn, key_node[CT_DIR_FWD]);
2887
0
            if (conn_expired(conn, now)) {
2888
0
                continue;
2889
0
            }
2890
2891
0
            conn_to_ct_dpif_entry(conn, entry, now);
2892
0
            return 0;
2893
0
        }
2894
2895
0
        if (dump->filter_zone || dump->current_zone == UINT16_MAX) {
2896
0
            break;
2897
0
        }
2898
0
        dump->current_zone++;
2899
0
        dump->cursor = cmap_cursor_start(&dump->ct->conns[dump->current_zone]);
2900
0
    }
2901
2902
0
    return EOF;
2903
0
}
2904
2905
int
2906
conntrack_dump_done(struct conntrack_dump *dump OVS_UNUSED)
2907
0
{
2908
0
    return 0;
2909
0
}
2910
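An illustrative caller sketch, not part of the instrumented file, showing how the dump API above can walk every tracked connection.  ct_dpif_entry_uninit() is assumed to be the ct-dpif.h helper that releases the strings duplicated by conn_to_ct_dpif_entry(); error handling is omitted.

static void
example_dump_all(struct conntrack *ct)
{
    struct conntrack_dump dump;
    struct ct_dpif_entry entry;
    int tot_bkts;

    conntrack_dump_start(ct, &dump, NULL /* all zones */, &tot_bkts);

    while (!conntrack_dump_next(&dump, &entry)) {
        /* Inspect 'entry' here, then release what was xstrdup()'d. */
        ct_dpif_entry_uninit(&entry);
    }

    conntrack_dump_done(&dump);
}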
2911
static void
2912
exp_node_to_ct_dpif_exp(const struct alg_exp_node *exp,
2913
                        struct ct_dpif_exp *entry)
2914
0
{
2915
0
    memset(entry, 0, sizeof *entry);
2916
2917
0
    conn_key_to_tuple(&exp->key, &entry->tuple_orig);
2918
0
    conn_key_to_tuple(&exp->parent_key, &entry->tuple_parent);
2919
0
    entry->zone = exp->key.zone;
2920
0
    entry->mark = exp->parent_mark;
2921
0
    memcpy(&entry->labels, &exp->parent_label, sizeof entry->labels);
2922
0
    entry->protoinfo.proto = exp->key.nw_proto;
2923
0
}
2924
2925
int
2926
conntrack_exp_dump_start(struct conntrack *ct, struct conntrack_dump *dump,
2927
                         const uint16_t *pzone)
2928
0
{
2929
0
    memset(dump, 0, sizeof(*dump));
2930
2931
0
    if (pzone) {
2932
0
        dump->zone = *pzone;
2933
0
        dump->filter_zone = true;
2934
0
    }
2935
2936
0
    dump->ct = ct;
2937
2938
0
    return 0;
2939
0
}
2940
2941
int
2942
conntrack_exp_dump_next(struct conntrack_dump *dump, struct ct_dpif_exp *entry)
2943
0
{
2944
0
    struct conntrack *ct = dump->ct;
2945
0
    struct alg_exp_node *enode;
2946
0
    int ret = EOF;
2947
2948
0
    ovs_rwlock_rdlock(&ct->resources_lock);
2949
2950
0
    for (;;) {
2951
0
        struct hmap_node *node = hmap_at_position(&ct->alg_expectations,
2952
0
                                                  &dump->hmap_pos);
2953
0
        if (!node) {
2954
0
            break;
2955
0
        }
2956
2957
0
        enode = CONTAINER_OF(node, struct alg_exp_node, node);
2958
2959
0
        if (!dump->filter_zone || enode->key.zone == dump->zone) {
2960
0
            ret = 0;
2961
0
            exp_node_to_ct_dpif_exp(enode, entry);
2962
0
            break;
2963
0
        }
2964
0
    }
2965
2966
0
    ovs_rwlock_unlock(&ct->resources_lock);
2967
2968
0
    return ret;
2969
0
}
2970
2971
int
2972
conntrack_exp_dump_done(struct conntrack_dump *dump OVS_UNUSED)
2973
0
{
2974
0
    return 0;
2975
0
}
2976
2977
static int
2978
conntrack_flush_zone(struct conntrack *ct, const uint16_t zone)
2979
0
{
2980
0
    struct conn_key_node *keyn;
2981
0
    struct conn *conn;
2982
2983
0
    CMAP_FOR_EACH (keyn, cm_node, &ct->conns[zone]) {
2984
0
        if (keyn->dir != CT_DIR_FWD) {
2985
0
            continue;
2986
0
        }
2987
0
        conn = CONTAINER_OF(keyn, struct conn, key_node[CT_DIR_FWD]);
2988
0
        conn_clean(ct, conn);
2989
0
    }
2990
2991
0
    return 0;
2992
0
}
2993
2994
int
2995
conntrack_flush(struct conntrack *ct, const uint16_t *zone)
2996
0
{
2997
0
    if (zone) {
2998
0
        return conntrack_flush_zone(ct, *zone);
2999
0
    }
3000
3001
0
    for (unsigned i = 0; i < ARRAY_SIZE(ct->conns); i++) {
3002
0
        conntrack_flush_zone(ct, i);
3003
0
    }
3004
3005
0
    return 0;
3006
0
}
3007
3008
int
3009
conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
3010
                      uint16_t zone)
3011
0
{
3012
0
    struct conn_key key;
3013
0
    struct conn *conn;
3014
0
    int error = 0;
3015
3016
0
    memset(&key, 0, sizeof(key));
3017
0
    tuple_to_conn_key(tuple, zone, &key);
3018
0
    conn_lookup(ct, &key, time_msec(), &conn, NULL);
3019
3020
0
    if (conn) {
3021
0
        conn_clean(ct, conn);
3022
0
    } else {
3023
0
        VLOG_WARN("Tuple not found");
3024
0
        error = ENOENT;
3025
0
    }
3026
3027
0
    return error;
3028
0
}
3029
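A minimal sketch, illustrative only and not part of the instrumented file, of flushing a single known TCP 5-tuple through conntrack_flush_tuple() above, assuming the ct_dpif_tuple field names used by conn_key_to_tuple()/tuple_to_conn_key(); addresses and ports are expected in network byte order.

static int
example_flush_tcp_flow(struct conntrack *ct, ovs_be32 src, ovs_be32 dst,
                       ovs_be16 src_port, ovs_be16 dst_port, uint16_t zone)
{
    struct ct_dpif_tuple tuple = {
        .l3_type = AF_INET,
        .ip_proto = IPPROTO_TCP,
        .src = { .ip = src },
        .dst = { .ip = dst },
        .src_port = src_port,
        .dst_port = dst_port,
    };

    /* Returns 0 on success, ENOENT if the tuple is not tracked. */
    return conntrack_flush_tuple(ct, &tuple, zone);
}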
3030
int
3031
conntrack_set_maxconns(struct conntrack *ct, uint32_t maxconns)
3032
0
{
3033
0
    atomic_store_relaxed(&ct->n_conn_limit, maxconns);
3034
0
    return 0;
3035
0
}
3036
3037
int
3038
conntrack_get_maxconns(struct conntrack *ct, uint32_t *maxconns)
3039
0
{
3040
0
    atomic_read_relaxed(&ct->n_conn_limit, maxconns);
3041
0
    return 0;
3042
0
}
3043
3044
int
3045
conntrack_get_nconns(struct conntrack *ct, uint32_t *nconns)
3046
0
{
3047
0
    *nconns = atomic_count_get(&ct->n_conn);
3048
0
    return 0;
3049
0
}
3050
3051
int
3052
conntrack_set_tcp_seq_chk(struct conntrack *ct, bool enabled)
3053
0
{
3054
0
    atomic_store_relaxed(&ct->tcp_seq_chk, enabled);
3055
0
    return 0;
3056
0
}
3057
3058
bool
3059
conntrack_get_tcp_seq_chk(struct conntrack *ct)
3060
0
{
3061
0
    bool enabled;
3062
0
    atomic_read_relaxed(&ct->tcp_seq_chk, &enabled);
3063
0
    return enabled;
3064
0
}
3065
3066
/* This function must be called with the ct->resources read lock taken. */
3067
static struct alg_exp_node *
3068
expectation_lookup(struct hmap *alg_expectations, const struct conn_key *key,
3069
                   uint32_t basis, bool src_ip_wc)
3070
0
{
3071
0
    struct conn_key check_key;
3072
0
    memcpy(&check_key, key, sizeof check_key);
3073
0
    check_key.src.port = ALG_WC_SRC_PORT;
3074
3075
0
    if (src_ip_wc) {
3076
0
        memset(&check_key.src.addr, 0, sizeof check_key.src.addr);
3077
0
    }
3078
3079
0
    struct alg_exp_node *alg_exp_node;
3080
3081
0
    HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node,
3082
0
                             conn_key_hash(&check_key, basis),
3083
0
                             alg_expectations) {
3084
0
        if (!conn_key_cmp(&alg_exp_node->key, &check_key)) {
3085
0
            return alg_exp_node;
3086
0
        }
3087
0
    }
3088
0
    return NULL;
3089
0
}
3090
3091
/* This function must be called with the ct->resources write lock taken. */
3092
static void
3093
expectation_remove(struct hmap *alg_expectations,
3094
                   const struct conn_key *key, uint32_t basis)
3095
0
{
3096
0
    struct alg_exp_node *alg_exp_node;
3097
3098
0
    HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node, conn_key_hash(key, basis),
3099
0
                             alg_expectations) {
3100
0
        if (!conn_key_cmp(&alg_exp_node->key, key)) {
3101
0
            hmap_remove(alg_expectations, &alg_exp_node->node);
3102
0
            break;
3103
0
        }
3104
0
    }
3105
0
}
3106
3107
/* This function must be called with the ct->resources read lock taken. */
3108
static struct alg_exp_node *
3109
expectation_ref_lookup_unique(const struct hindex *alg_expectation_refs,
3110
                              const struct conn_key *parent_key,
3111
                              const struct conn_key *alg_exp_key,
3112
                              uint32_t basis)
3113
0
{
3114
0
    struct alg_exp_node *alg_exp_node;
3115
3116
0
    HINDEX_FOR_EACH_WITH_HASH (alg_exp_node, node_ref,
3117
0
                               conn_key_hash(parent_key, basis),
3118
0
                               alg_expectation_refs) {
3119
0
        if (!conn_key_cmp(&alg_exp_node->parent_key, parent_key) &&
3120
0
            !conn_key_cmp(&alg_exp_node->key, alg_exp_key)) {
3121
0
            return alg_exp_node;
3122
0
        }
3123
0
    }
3124
0
    return NULL;
3125
0
}
3126
3127
/* This function must be called with the ct->resources write lock taken. */
3128
static void
3129
expectation_ref_create(struct hindex *alg_expectation_refs,
3130
                       struct alg_exp_node *alg_exp_node,
3131
                       uint32_t basis)
3132
0
{
3133
0
    if (!expectation_ref_lookup_unique(alg_expectation_refs,
3134
0
                                       &alg_exp_node->parent_key,
3135
0
                                       &alg_exp_node->key, basis)) {
3136
0
        hindex_insert(alg_expectation_refs, &alg_exp_node->node_ref,
3137
0
                      conn_key_hash(&alg_exp_node->parent_key, basis));
3138
0
    }
3139
0
}
3140
3141
static void
3142
expectation_clean(struct conntrack *ct, const struct conn_key *parent_key)
3143
0
{
3144
0
    ovs_rwlock_wrlock(&ct->resources_lock);
3145
3146
0
    struct alg_exp_node *node;
3147
0
    HINDEX_FOR_EACH_WITH_HASH_SAFE (node, node_ref,
3148
0
                                    conn_key_hash(parent_key, ct->hash_basis),
3149
0
                                    &ct->alg_expectation_refs) {
3150
0
        if (!conn_key_cmp(&node->parent_key, parent_key)) {
3151
0
            expectation_remove(&ct->alg_expectations, &node->key,
3152
0
                               ct->hash_basis);
3153
0
            hindex_remove(&ct->alg_expectation_refs, &node->node_ref);
3154
0
            free(node);
3155
0
        }
3156
0
    }
3157
3158
0
    ovs_rwlock_unlock(&ct->resources_lock);
3159
0
}
3160
3161
static void
3162
expectation_create(struct conntrack *ct, ovs_be16 dst_port,
3163
                   const struct conn *parent_conn, bool reply, bool src_ip_wc,
3164
                   bool skip_nat)
3165
0
{
3166
0
    const struct conn_key *pconn_key, *pconn_rev_key;
3167
0
    union ct_addr src_addr;
3168
0
    union ct_addr dst_addr;
3169
0
    union ct_addr alg_nat_repl_addr;
3170
0
    struct alg_exp_node *alg_exp_node = xzalloc(sizeof *alg_exp_node);
3171
3172
0
    pconn_key = &parent_conn->key_node[CT_DIR_FWD].key;
3173
0
    pconn_rev_key = &parent_conn->key_node[CT_DIR_REV].key;
3174
3175
0
    if (reply) {
3176
0
        src_addr = pconn_key->src.addr;
3177
0
        dst_addr = pconn_key->dst.addr;
3178
0
        alg_exp_node->nat_rpl_dst = true;
3179
0
        if (skip_nat) {
3180
0
            alg_nat_repl_addr = dst_addr;
3181
0
        } else if (parent_conn->nat_action & NAT_ACTION_DST) {
3182
0
            alg_nat_repl_addr = pconn_rev_key->src.addr;
3183
0
            alg_exp_node->nat_rpl_dst = false;
3184
0
        } else {
3185
0
            alg_nat_repl_addr = pconn_rev_key->dst.addr;
3186
0
        }
3187
0
    } else {
3188
0
        src_addr = pconn_rev_key->src.addr;
3189
0
        dst_addr = pconn_rev_key->dst.addr;
3190
0
        alg_exp_node->nat_rpl_dst = false;
3191
0
        if (skip_nat) {
3192
0
            alg_nat_repl_addr = src_addr;
3193
0
        } else if (parent_conn->nat_action & NAT_ACTION_DST) {
3194
0
            alg_nat_repl_addr = pconn_key->dst.addr;
3195
0
            alg_exp_node->nat_rpl_dst = true;
3196
0
        } else {
3197
0
            alg_nat_repl_addr = pconn_key->src.addr;
3198
0
        }
3199
0
    }
3200
0
    if (src_ip_wc) {
3201
0
        memset(&src_addr, 0, sizeof src_addr);
3202
0
    }
3203
3204
0
    alg_exp_node->key.dl_type = pconn_key->dl_type;
3205
0
    alg_exp_node->key.nw_proto = pconn_key->nw_proto;
3206
0
    alg_exp_node->key.zone = pconn_key->zone;
3207
0
    alg_exp_node->key.src.addr = src_addr;
3208
0
    alg_exp_node->key.dst.addr = dst_addr;
3209
0
    alg_exp_node->key.src.port = ALG_WC_SRC_PORT;
3210
0
    alg_exp_node->key.dst.port = dst_port;
3211
0
    alg_exp_node->parent_mark = parent_conn->mark;
3212
0
    alg_exp_node->parent_label = parent_conn->label;
3213
0
    memcpy(&alg_exp_node->parent_key, pconn_key,
3214
0
           sizeof alg_exp_node->parent_key);
3215
    /* Take the write lock up front: the lookup below is almost certain
3216
     * to fail, in which case the new expectation is inserted into
3217
     * ct->alg_expectations while still holding the lock. */
3218
0
    ovs_rwlock_wrlock(&ct->resources_lock);
3219
0
    struct alg_exp_node *alg_exp = expectation_lookup(
3220
0
        &ct->alg_expectations, &alg_exp_node->key, ct->hash_basis, src_ip_wc);
3221
0
    if (alg_exp) {
3222
0
        free(alg_exp_node);
3223
0
        ovs_rwlock_unlock(&ct->resources_lock);
3224
0
        return;
3225
0
    }
3226
3227
0
    alg_exp_node->alg_nat_repl_addr = alg_nat_repl_addr;
3228
0
    hmap_insert(&ct->alg_expectations, &alg_exp_node->node,
3229
0
                conn_key_hash(&alg_exp_node->key, ct->hash_basis));
3230
0
    expectation_ref_create(&ct->alg_expectation_refs, alg_exp_node,
3231
0
                           ct->hash_basis);
3232
0
    ovs_rwlock_unlock(&ct->resources_lock);
3233
0
}
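expectation_create() derives the key of the expected data connection from the parent control connection: dl_type, nw_proto and zone are copied from the parent, the addresses come from the appropriate direction, the destination port is the one parsed from the payload, and the source port is wildcarded with ALG_WC_SRC_PORT because the peer will pick an ephemeral port; with src_ip_wc the source address is zeroed as well. A simplified standalone sketch of that key shaping follows; struct exp_key and its fields are hypothetical stand-ins for the real struct conn_key.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical, simplified stand-in for the real struct conn_key. */
struct exp_key {
    uint16_t dl_type;
    uint8_t nw_proto;
    uint16_t zone;
    uint32_t src_addr;   /* Peer that will open the data connection. */
    uint32_t dst_addr;   /* Endpoint named in the FTP/TFTP payload. */
    uint16_t src_port;   /* Unknown in advance, so wildcarded. */
    uint16_t dst_port;   /* Parsed from the payload. */
};

enum { WC_SRC_PORT = 0 };    /* Stand-in for ALG_WC_SRC_PORT. */

static void
exp_key_init(struct exp_key *key, const struct exp_key *parent,
             uint32_t src_addr, uint32_t dst_addr, uint16_t dst_port,
             bool src_ip_wc)
{
    memset(key, 0, sizeof *key);
    key->dl_type = parent->dl_type;
    key->nw_proto = parent->nw_proto;
    key->zone = parent->zone;
    key->src_addr = src_ip_wc ? 0 : src_addr;  /* Optional src IP wildcard. */
    key->dst_addr = dst_addr;
    key->src_port = WC_SRC_PORT;               /* Any source port matches. */
    key->dst_port = dst_port;
}

int
main(void)
{
    struct exp_key parent = { 0x0800, 6, 0, 0x0a000001, 0x0a000002, 2021, 21 };
    struct exp_key exp;

    exp_key_init(&exp, &parent, parent.src_addr, parent.dst_addr, 19988, false);
    printf("expect dst port %u, wildcard src port %u\n",
           exp.dst_port, exp.src_port);
    return 0;
}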
3234
3235
static void
3236
replace_substring(char *substr, uint8_t substr_size,
3237
                  uint8_t total_size, char *rep_str,
3238
                  uint8_t rep_str_size)
3239
0
{
3240
0
    memmove(substr + rep_str_size, substr + substr_size,
3241
0
            total_size - substr_size);
3242
0
    memcpy(substr, rep_str, rep_str_size);
3243
0
}
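replace_substring() splices a replacement of a different length into a buffer in place: memmove() first shifts the tail of the buffer so it ends up immediately after where the replacement will finish, then memcpy() writes the replacement over the gap. A standalone demonstration of the same technique on a plain character buffer; splice_in_place() is a hypothetical helper mirroring the function above.

#include <stdio.h>
#include <string.h>

/* Replace 'old_size' bytes at 'span' with 'rep' ('rep_size' bytes), shifting
 * the remaining 'total_size - old_size' tail bytes.  The underlying buffer
 * must have room for the grown result. */
static void
splice_in_place(char *span, size_t old_size, size_t total_size,
                const char *rep, size_t rep_size)
{
    memmove(span + rep_size, span + old_size, total_size - old_size);
    memcpy(span, rep, rep_size);
}

int
main(void)
{
    char buf[64] = "PORT 10,0,0,1,78,20\r\n";
    char *addr = buf + strlen("PORT ");     /* Span holding the address. */
    size_t old_len = strlen("10,0,0,1");
    size_t tail = strlen(addr) + 1;         /* Include the NUL so it moves. */

    splice_in_place(addr, old_len, tail, "192,168,0,10",
                    strlen("192,168,0,10"));
    printf("%s", buf);                      /* PORT 192,168,0,10,78,20 */
    return 0;
}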
3244
3245
static void
3246
repl_bytes(char *str, char c1, char c2)
3247
0
{
3248
0
    while (*str) {
3249
0
        if (*str == c1) {
3250
0
            *str = c2;
3251
0
        }
3252
0
        str++;
3253
0
    }
3254
0
}
3255
3256
static void
3257
modify_packet(struct dp_packet *pkt, char *pkt_str, size_t size,
3258
              char *repl_str, size_t repl_size,
3259
              uint32_t orig_used_size)
3260
0
{
3261
0
    replace_substring(pkt_str, size,
3262
0
                      (const char *) dp_packet_tail(pkt) - pkt_str,
3263
0
                      repl_str, repl_size);
3264
0
    dp_packet_set_size(pkt, orig_used_size + (int) repl_size - (int) size);
3265
0
}
3266
3267
/* Replace the IPv4 address in an FTP message with the NATed address. */
3268
static int
3269
repl_ftp_v4_addr(struct dp_packet *pkt, ovs_be32 v4_addr_rep,
3270
                 char *ftp_data_start,
3271
                 size_t addr_offset_from_ftp_data_start,
3272
                 size_t addr_size OVS_UNUSED)
3273
0
{
3274
0
    enum { MAX_FTP_V4_NAT_DELTA = 8 };
3275
3276
    /* Do a conservative check for pathological MTU usage. */
3277
0
    uint32_t orig_used_size = dp_packet_size(pkt);
3278
0
    if (orig_used_size + MAX_FTP_V4_NAT_DELTA >
3279
0
        dp_packet_get_allocated(pkt)) {
3280
3281
0
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
3282
0
        VLOG_WARN_RL(&rl, "Unsupported effective MTU %u used with FTP V4",
3283
0
                     dp_packet_get_allocated(pkt));
3284
0
        return 0;
3285
0
    }
3286
3287
0
    char v4_addr_str[INET_ADDRSTRLEN] = {0};
3288
0
    ovs_assert(inet_ntop(AF_INET, &v4_addr_rep, v4_addr_str,
3289
0
                         sizeof v4_addr_str));
3290
0
    repl_bytes(v4_addr_str, '.', ',');
3291
0
    modify_packet(pkt, ftp_data_start + addr_offset_from_ftp_data_start,
3292
0
                  addr_size, v4_addr_str, strlen(v4_addr_str),
3293
0
                  orig_used_size);
3294
0
    return (int) strlen(v4_addr_str) - (int) addr_size;
3295
0
}
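For an IPv4 PORT rewrite the translated address has to appear in FTP's comma-separated octet form, so repl_ftp_v4_addr() formats it with inet_ntop() and then swaps '.' for ',' before splicing it into the payload; the value it returns, new length minus old length, is the byte delta that later drives the TCP sequence skew. A standalone illustration of just the formatting and delta computation (addresses chosen arbitrarily):

#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
    /* Pretend this is the post-NAT address, in network byte order. */
    struct in_addr nat_addr;
    if (inet_pton(AF_INET, "192.168.0.10", &nat_addr) != 1) {
        return 1;
    }

    char addr_str[INET_ADDRSTRLEN] = {0};
    inet_ntop(AF_INET, &nat_addr, addr_str, sizeof addr_str);

    /* FTP PORT arguments separate the octets with commas, not dots. */
    for (char *p = addr_str; *p; p++) {
        if (*p == '.') {
            *p = ',';
        }
    }

    const char *old_addr = "10,0,0,1";      /* Address found in the payload. */
    int seq_delta = (int) strlen(addr_str) - (int) strlen(old_addr);
    printf("rewritten \"%s\", length delta %d\n", addr_str, seq_delta);
    return 0;
}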
3296
3297
static char *
3298
skip_non_digits(char *str)
3299
0
{
3300
0
    while (!isdigit(*str) && *str != 0) {
3301
0
        str++;
3302
0
    }
3303
0
    return str;
3304
0
}
3305
3306
static char *
3307
terminate_number_str(char *str, uint8_t max_digits)
3308
0
{
3309
0
    uint8_t digits_found = 0;
3310
0
    while (isdigit(*str) && digits_found <= max_digits) {
3311
0
        str++;
3312
0
        digits_found++;
3313
0
    }
3314
3315
0
    *str = 0;
3316
0
    return str;
3317
0
}
3318
3319
3320
static void
3321
get_ftp_ctl_msg(struct dp_packet *pkt, char *ftp_msg)
3322
0
{
3323
0
    struct tcp_header *th = dp_packet_l4(pkt);
3324
0
    char *tcp_hdr = (char *) th;
3325
0
    uint32_t tcp_payload_len = dp_packet_get_tcp_payload_length(pkt);
3326
0
    size_t tcp_payload_of_interest = MIN(tcp_payload_len,
3327
0
                                         LARGEST_FTP_MSG_OF_INTEREST);
3328
0
    size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
3329
3330
0
    ovs_strlcpy(ftp_msg, tcp_hdr + tcp_hdr_len,
3331
0
                tcp_payload_of_interest);
3332
0
}
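get_ftp_ctl_msg() finds the FTP payload by skipping the TCP header, whose length in bytes is the 4-bit data-offset field multiplied by four, and copies at most LARGEST_FTP_MSG_OF_INTEREST bytes of it. A standalone sketch of the same offset arithmetic on a raw segment; the real code works on OVS's struct tcp_header via TCP_OFFSET(), and the 32-byte bound below is an arbitrary stand-in for the real limit.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
    /* 24-byte TCP header (data offset 6, i.e. one 4-byte option) followed
     * by a small payload; only the fields the sketch touches are set. */
    uint8_t segment[64] = {0};
    const char *payload = "PORT 10,0,0,1,78,20\r\n";

    segment[12] = 6 << 4;            /* Data offset: high nibble of byte 12. */
    memcpy(segment + 24, payload, strlen(payload));

    size_t tcp_hdr_len = (segment[12] >> 4) * 4;
    size_t seg_len = 24 + strlen(payload);
    size_t payload_len = seg_len - tcp_hdr_len;

    enum { MAX_OF_INTEREST = 32 };   /* Stand-in for the real bound. */
    char msg[MAX_OF_INTEREST + 1] = {0};
    size_t n = payload_len < MAX_OF_INTEREST ? payload_len : MAX_OF_INTEREST;
    memcpy(msg, segment + tcp_hdr_len, n);
    printf("%zu-byte header, payload: %s", tcp_hdr_len, msg);
    return 0;
}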
3333
3334
static enum ftp_ctl_pkt
3335
detect_ftp_ctl_type(const struct conn_lookup_ctx *ctx,
3336
                    struct dp_packet *pkt)
3337
0
{
3338
0
    char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
3339
0
    get_ftp_ctl_msg(pkt, ftp_msg);
3340
3341
0
    if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
3342
0
        if (strncasecmp(ftp_msg, FTP_EPRT_CMD, strlen(FTP_EPRT_CMD)) &&
3343
0
            !strcasestr(ftp_msg, FTP_EPSV_REPLY)) {
3344
0
            return CT_FTP_CTL_OTHER;
3345
0
        }
3346
0
    } else {
3347
0
        if (strncasecmp(ftp_msg, FTP_PORT_CMD, strlen(FTP_PORT_CMD)) &&
3348
0
            strncasecmp(ftp_msg, FTP_PASV_REPLY_CODE,
3349
0
                        strlen(FTP_PASV_REPLY_CODE))) {
3350
0
            return CT_FTP_CTL_OTHER;
3351
0
        }
3352
0
    }
3353
3354
0
    return CT_FTP_CTL_INTEREST;
3355
0
}
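detect_ftp_ctl_type() only has to decide whether a control message can carry an address or port worth rewriting: over IPv6 that means EPRT commands or EPSV (229) replies, over IPv4 PORT commands or PASV (227) replies; anything else is CT_FTP_CTL_OTHER. A simplified standalone classifier with the command strings written out literally; the real code matches against its FTP_* string macros and uses strcasestr() for the EPSV reply rather than a prefix check.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>

static bool
ftp_msg_is_of_interest(const char *msg, bool ipv6)
{
    if (ipv6) {
        /* EPRT from the client, or an EPSV (229) reply from the server. */
        return !strncasecmp(msg, "EPRT", 4) || !strncmp(msg, "229", 3);
    }
    /* PORT from the client, or a PASV (227) reply from the server. */
    return !strncasecmp(msg, "PORT", 4) || !strncmp(msg, "227", 3);
}

int
main(void)
{
    printf("%d %d %d\n",
           ftp_msg_is_of_interest("PORT 10,0,0,1,78,20", false),
           ftp_msg_is_of_interest("227 Entering Passive Mode "
                                  "(10,0,0,1,78,20)", false),
           ftp_msg_is_of_interest("USER anonymous", false));  /* 1 1 0 */
    return 0;
}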
3356
3357
static enum ftp_ctl_pkt
3358
process_ftp_ctl_v4(struct conntrack *ct,
3359
                   struct dp_packet *pkt,
3360
                   const struct conn *conn_for_expectation,
3361
                   ovs_be32 *v4_addr_rep,
3362
                   char **ftp_data_v4_start,
3363
                   size_t *addr_offset_from_ftp_data_start,
3364
                   size_t *addr_size)
3365
0
{
3366
0
    struct tcp_header *th = dp_packet_l4(pkt);
3367
0
    size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
3368
0
    char *tcp_hdr = (char *) th;
3369
0
    *ftp_data_v4_start = tcp_hdr + tcp_hdr_len;
3370
0
    char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
3371
0
    get_ftp_ctl_msg(pkt, ftp_msg);
3372
0
    char *ftp = ftp_msg;
3373
0
    enum ct_alg_mode mode;
3374
3375
0
    if (!strncasecmp(ftp, FTP_PORT_CMD, strlen(FTP_PORT_CMD))) {
3376
0
        ftp = ftp_msg + strlen(FTP_PORT_CMD);
3377
0
        mode = CT_FTP_MODE_ACTIVE;
3378
0
    } else {
3379
0
        ftp = ftp_msg + strlen(FTP_PASV_REPLY_CODE);
3380
0
        mode = CT_FTP_MODE_PASSIVE;
3381
0
    }
3382
3383
    /* Find first space. */
3384
0
    ftp = strchr(ftp, ' ');
3385
0
    if (!ftp) {
3386
0
        return CT_FTP_CTL_INVALID;
3387
0
    }
3388
3389
    /* Find the first digit after the space. */
3390
0
    ftp = skip_non_digits(ftp);
3391
0
    if (*ftp == 0) {
3392
0
        return CT_FTP_CTL_INVALID;
3393
0
    }
3394
3395
0
    char *ip_addr_start = ftp;
3396
0
    *addr_offset_from_ftp_data_start = ip_addr_start - ftp_msg;
3397
3398
0
    uint8_t comma_count = 0;
3399
0
    while (comma_count < 4 && *ftp) {
3400
0
        if (*ftp == ',') {
3401
0
            comma_count++;
3402
0
            if (comma_count == 4) {
3403
0
                *ftp = 0;
3404
0
            } else {
3405
0
                *ftp = '.';
3406
0
            }
3407
0
        }
3408
0
        ftp++;
3409
0
    }
3410
0
    if (comma_count != 4) {
3411
0
        return CT_FTP_CTL_INVALID;
3412
0
    }
3413
3414
0
    struct in_addr ip_addr;
3415
0
    int rc2 = inet_pton(AF_INET, ip_addr_start, &ip_addr);
3416
0
    if (rc2 != 1) {
3417
0
        return CT_FTP_CTL_INVALID;
3418
0
    }
3419
3420
0
    *addr_size = ftp - ip_addr_start - 1;
3421
0
    char *save_ftp = ftp;
3422
0
    ftp = terminate_number_str(ftp, MAX_FTP_PORT_DGTS);
3423
0
    if (!ftp) {
3424
0
        return CT_FTP_CTL_INVALID;
3425
0
    }
3426
0
    int value;
3427
0
    if (!str_to_int(save_ftp, 10, &value)) {
3428
0
        return CT_FTP_CTL_INVALID;
3429
0
    }
3430
3431
    /* Derived from the L4 port maximum of 65535: each port octet is at most 255. */
3432
0
    if (value > 255) {
3433
0
        return CT_FTP_CTL_INVALID;
3434
0
    }
3435
3436
0
    uint16_t port_hs = value;
3437
0
    port_hs <<= 8;
3438
3439
    /* Skip over comma. */
3440
0
    ftp++;
3441
0
    save_ftp = ftp;
3442
0
    bool digit_found = false;
3443
0
    while (isdigit(*ftp)) {
3444
0
        ftp++;
3445
0
        digit_found = true;
3446
0
    }
3447
0
    if (!digit_found) {
3448
0
        return CT_FTP_CTL_INVALID;
3449
0
    }
3450
0
    *ftp = 0;
3451
0
    if (!str_to_int(save_ftp, 10, &value)) {
3452
0
        return CT_FTP_CTL_INVALID;
3453
0
    }
3454
3455
0
    if (value > 255) {
3456
0
        return CT_FTP_CTL_INVALID;
3457
0
    }
3458
3459
0
    port_hs |= value;
3460
0
    ovs_be16 port = htons(port_hs);
3461
0
    ovs_be32 conn_ipv4_addr;
3462
3463
0
    switch (mode) {
3464
0
    case CT_FTP_MODE_ACTIVE:
3465
0
        *v4_addr_rep =
3466
0
            conn_for_expectation->key_node[CT_DIR_REV].key.dst.addr.ipv4;
3467
0
        conn_ipv4_addr =
3468
0
            conn_for_expectation->key_node[CT_DIR_FWD].key.src.addr.ipv4;
3469
0
        break;
3470
0
    case CT_FTP_MODE_PASSIVE:
3471
0
        *v4_addr_rep =
3472
0
            conn_for_expectation->key_node[CT_DIR_FWD].key.dst.addr.ipv4;
3473
0
        conn_ipv4_addr =
3474
0
            conn_for_expectation->key_node[CT_DIR_REV].key.src.addr.ipv4;
3475
0
        break;
3476
0
    case CT_TFTP_MODE:
3477
0
    default:
3478
0
        OVS_NOT_REACHED();
3479
0
    }
3480
3481
0
    ovs_be32 ftp_ipv4_addr;
3482
0
    ftp_ipv4_addr = ip_addr.s_addr;
3483
    /* Although most servers will block this exploit, there may be some
3483
     * that are less well managed. */
3485
0
    if (ftp_ipv4_addr != conn_ipv4_addr && ftp_ipv4_addr != *v4_addr_rep) {
3486
0
        return CT_FTP_CTL_INVALID;
3487
0
    }
3488
3489
0
    expectation_create(ct, port, conn_for_expectation,
3490
0
                       !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
3491
0
    return CT_FTP_CTL_INTEREST;
3492
0
}
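The PORT/PASV argument encodes both the address and the data port as six decimal octets, "h1,h2,h3,h4,p1,p2", where the port is p1 * 256 + p2; process_ftp_ctl_v4() walks the string in place, turning the first three commas into dots (and the fourth into a terminator) so inet_pton() can validate the address, and rejects any octet above 255, which is what the two "value > 255" checks above enforce. A standalone parser for the same format, under the same constraints:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    const char *arg = "10,0,0,1,78,20";     /* h1,h2,h3,h4,p1,p2 */
    unsigned int h[4], p1, p2;

    if (sscanf(arg, "%u,%u,%u,%u,%u,%u",
               &h[0], &h[1], &h[2], &h[3], &p1, &p2) != 6
        || h[0] > 255 || h[1] > 255 || h[2] > 255 || h[3] > 255
        || p1 > 255 || p2 > 255) {
        fprintf(stderr, "invalid PORT argument\n");
        return 1;
    }

    char addr[INET_ADDRSTRLEN];
    snprintf(addr, sizeof addr, "%u.%u.%u.%u", h[0], h[1], h[2], h[3]);

    struct in_addr ip;
    if (inet_pton(AF_INET, addr, &ip) != 1) {
        return 1;
    }

    uint16_t port = (uint16_t) ((p1 << 8) | p2);   /* 78 * 256 + 20 = 19988 */
    printf("expect data connection to %s port %u\n", addr, port);
    return 0;
}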
3493
3494
static char *
3495
skip_ipv6_digits(char *str)
3496
0
{
3497
0
    while (isxdigit(*str) || *str == ':' || *str == '.') {
3498
0
        str++;
3499
0
    }
3500
0
    return str;
3501
0
}
3502
3503
static enum ftp_ctl_pkt
3504
process_ftp_ctl_v6(struct conntrack *ct,
3505
                   struct dp_packet *pkt,
3506
                   const struct conn *conn_for_exp,
3507
                   union ct_addr *v6_addr_rep, char **ftp_data_start,
3508
                   size_t *addr_offset_from_ftp_data_start,
3509
                   size_t *addr_size, enum ct_alg_mode *mode)
3510
0
{
3511
0
    struct tcp_header *th = dp_packet_l4(pkt);
3512
0
    size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
3513
0
    char *tcp_hdr = (char *) th;
3514
0
    char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
3515
0
    get_ftp_ctl_msg(pkt, ftp_msg);
3516
0
    *ftp_data_start = tcp_hdr + tcp_hdr_len;
3517
0
    char *ftp = ftp_msg;
3518
0
    struct in6_addr ip6_addr;
3519
3520
0
    if (!strncasecmp(ftp, FTP_EPRT_CMD, strlen(FTP_EPRT_CMD))) {
3521
0
        ftp = ftp_msg + strlen(FTP_EPRT_CMD);
3522
0
        ftp = skip_non_digits(ftp);
3523
0
        if (*ftp != FTP_AF_V6 || isdigit(ftp[1])) {
3524
0
            return CT_FTP_CTL_INVALID;
3525
0
        }
3526
        /* Jump over delimiter. */
3527
0
        ftp += 2;
3528
3529
0
        memset(&ip6_addr, 0, sizeof ip6_addr);
3530
0
        char *ip_addr_start = ftp;
3531
0
        *addr_offset_from_ftp_data_start = ip_addr_start - ftp_msg;
3532
0
        ftp = skip_ipv6_digits(ftp);
3533
0
        *ftp = 0;
3534
0
        *addr_size = ftp - ip_addr_start;
3535
0
        int rc2 = inet_pton(AF_INET6, ip_addr_start, &ip6_addr);
3536
0
        if (rc2 != 1) {
3537
0
            return CT_FTP_CTL_INVALID;
3538
0
        }
3539
0
        ftp++;
3540
0
        *mode = CT_FTP_MODE_ACTIVE;
3541
0
    } else {
3542
0
        ftp = ftp_msg + strcspn(ftp_msg, "(");
3543
0
        ftp = skip_non_digits(ftp);
3544
0
        if (!isdigit(*ftp)) {
3545
0
            return CT_FTP_CTL_INVALID;
3546
0
        }
3547
3548
        /* Not used for passive mode. */
3549
0
        *addr_offset_from_ftp_data_start = 0;
3550
0
        *addr_size = 0;
3551
3552
0
        *mode = CT_FTP_MODE_PASSIVE;
3553
0
    }
3554
3555
0
    char *save_ftp = ftp;
3556
0
    ftp = terminate_number_str(ftp, MAX_EXT_FTP_PORT_DGTS);
3557
0
    if (!ftp) {
3558
0
        return CT_FTP_CTL_INVALID;
3559
0
    }
3560
3561
0
    int value;
3562
0
    if (!str_to_int(save_ftp, 10, &value)) {
3563
0
        return CT_FTP_CTL_INVALID;
3564
0
    }
3565
0
    if (value > CT_MAX_L4_PORT) {
3566
0
        return CT_FTP_CTL_INVALID;
3567
0
    }
3568
3569
0
    uint16_t port_hs = value;
3570
0
    ovs_be16 port = htons(port_hs);
3571
3572
0
    switch (*mode) {
3573
0
    case CT_FTP_MODE_ACTIVE:
3574
0
        *v6_addr_rep = conn_for_exp->key_node[CT_DIR_REV].key.dst.addr;
3575
        /* Although most servers will block this exploit, there may be some
3576
         * that are less well managed. */
3577
0
        if (memcmp(&ip6_addr, &v6_addr_rep->ipv6, sizeof ip6_addr) &&
3578
0
            memcmp(&ip6_addr,
3579
0
                   &conn_for_exp->key_node[CT_DIR_FWD].key.src.addr.ipv6,
3580
0
                   sizeof ip6_addr)) {
3581
0
            return CT_FTP_CTL_INVALID;
3582
0
        }
3583
0
        break;
3584
0
    case CT_FTP_MODE_PASSIVE:
3585
0
        *v6_addr_rep = conn_for_exp->key_node[CT_DIR_FWD].key.dst.addr;
3586
0
        break;
3587
0
    case CT_TFTP_MODE:
3588
0
    default:
3589
0
        OVS_NOT_REACHED();
3590
0
    }
3591
3592
0
    expectation_create(ct, port, conn_for_exp,
3593
0
                       !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
3594
0
    return CT_FTP_CTL_INTEREST;
3595
0
}
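The EPRT command uses the extended syntax "|2|<ipv6-address>|<tcp-port>|", where 2 is the address family for IPv6; process_ftp_ctl_v6() steps past the delimiter, lets inet_pton() validate the address literal, and bounds the port at CT_MAX_L4_PORT. A standalone sketch of that parse, assuming the conventional '|' delimiter:

#include <arpa/inet.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
    char msg[] = "EPRT |2|2001:db8::1|19988|";
    char *p = strchr(msg, '|');

    if (!p || p[1] != '2' || p[2] != '|') {   /* Address family 2 is IPv6. */
        return 1;
    }

    char *addr = p + 3;
    char *end = strchr(addr, '|');
    if (!end) {
        return 1;
    }
    *end = '\0';                              /* Terminate the address. */

    struct in6_addr ip6;
    if (inet_pton(AF_INET6, addr, &ip6) != 1) {
        return 1;
    }

    long port = strtol(end + 1, NULL, 10);
    if (port <= 0 || port > 65535) {          /* CT_MAX_L4_PORT equivalent. */
        return 1;
    }
    printf("expect data connection to %s port %ld\n", addr, port);
    return 0;
}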
3596
3597
static int
3598
repl_ftp_v6_addr(struct dp_packet *pkt, union ct_addr v6_addr_rep,
3599
                 char *ftp_data_start,
3600
                 size_t addr_offset_from_ftp_data_start,
3601
                 size_t addr_size, enum ct_alg_mode mode)
3602
0
{
3603
    /* Slightly larger than the largest delta that is actually possible. */
3604
0
    enum { MAX_FTP_V6_NAT_DELTA = 45 };
3605
3606
0
    if (mode == CT_FTP_MODE_PASSIVE) {
3607
0
        return 0;
3608
0
    }
3609
3610
    /* Do a conservative check for pathological MTU usage. */
3611
0
    uint32_t orig_used_size = dp_packet_size(pkt);
3612
0
    if (orig_used_size + MAX_FTP_V6_NAT_DELTA >
3613
0
        dp_packet_get_allocated(pkt)) {
3614
3615
0
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
3616
0
        VLOG_WARN_RL(&rl, "Unsupported effective MTU %u used with FTP V6",
3617
0
                     dp_packet_get_allocated(pkt));
3618
0
        return 0;
3619
0
    }
3620
3621
0
    char v6_addr_str[INET6_ADDRSTRLEN] = {0};
3622
0
    ovs_assert(inet_ntop(AF_INET6, &v6_addr_rep.ipv6, v6_addr_str,
3623
0
                         sizeof v6_addr_str));
3624
0
    modify_packet(pkt, ftp_data_start + addr_offset_from_ftp_data_start,
3625
0
                  addr_size, v6_addr_str, strlen(v6_addr_str),
3626
0
                  orig_used_size);
3627
0
    return (int) strlen(v6_addr_str) - (int) addr_size;
3628
0
}
3629
3630
/* Increment/decrement a TCP sequence number. */
3631
static void
3632
adj_seqnum(ovs_16aligned_be32 *val, int32_t inc)
3633
0
{
3634
0
    put_16aligned_be32(val, htonl(ntohl(get_16aligned_be32(val)) + inc));
3635
0
}
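adj_seqnum() adds a signed skew to a sequence or acknowledgement number that may only be 16-bit aligned, hence the get/put_16aligned_be32 round trip; the arithmetic itself is plain modulo-2^32 addition, so wraparound needs no special handling. A standalone illustration of that arithmetic on an ordinary uint32_t:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t
adjust_seq(uint32_t seq, int32_t skew)
{
    /* Unsigned 32-bit addition wraps modulo 2^32, which is exactly the
     * sequence-number arithmetic TCP expects. */
    return seq + (uint32_t) skew;
}

int
main(void)
{
    printf("%" PRIu32 "\n", adjust_seq(1000, 4));        /* 1004 */
    printf("%" PRIu32 "\n", adjust_seq(UINT32_MAX, 2));  /* wraps to 1 */
    printf("%" PRIu32 "\n", adjust_seq(3, -4));          /* wraps to UINT32_MAX */
    return 0;
}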
3636
3637
static void
3638
handle_ftp_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
3639
               struct dp_packet *pkt, struct conn *ec, long long now,
3640
               enum ftp_ctl_pkt ftp_ctl, bool nat)
3641
0
{
3642
0
    struct ip_header *l3_hdr = dp_packet_l3(pkt);
3643
0
    ovs_be32 v4_addr_rep = 0;
3644
0
    union ct_addr v6_addr_rep;
3645
0
    size_t addr_offset_from_ftp_data_start = 0;
3646
0
    size_t addr_size = 0;
3647
0
    char *ftp_data_start;
3648
0
    enum ct_alg_mode mode = CT_FTP_MODE_ACTIVE;
3649
3650
0
    if (detect_ftp_ctl_type(ctx, pkt) != ftp_ctl) {
3651
0
        return;
3652
0
    }
3653
3654
0
    struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
3655
0
    int64_t seq_skew = 0;
3656
3657
0
    if (ftp_ctl == CT_FTP_CTL_INTEREST) {
3658
0
        enum ftp_ctl_pkt rc;
3659
0
        if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
3660
0
            rc = process_ftp_ctl_v6(ct, pkt, ec,
3661
0
                                    &v6_addr_rep, &ftp_data_start,
3662
0
                                    &addr_offset_from_ftp_data_start,
3663
0
                                    &addr_size, &mode);
3664
0
        } else {
3665
0
            rc = process_ftp_ctl_v4(ct, pkt, ec,
3666
0
                                    &v4_addr_rep, &ftp_data_start,
3667
0
                                    &addr_offset_from_ftp_data_start,
3668
0
                                    &addr_size);
3669
0
        }
3670
0
        if (rc == CT_FTP_CTL_INVALID) {
3671
0
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
3672
0
            VLOG_WARN_RL(&rl, "Invalid FTP control packet format");
3673
0
            pkt->md.ct_state |= CS_TRACKED | CS_INVALID;
3674
0
            return;
3675
0
        } else if (rc == CT_FTP_CTL_INTEREST) {
3676
0
            uint16_t ip_len;
3677
3678
0
            if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
3679
0
                if (nat) {
3680
0
                    seq_skew = repl_ftp_v6_addr(pkt, v6_addr_rep,
3681
0
                                   ftp_data_start,
3682
0
                                   addr_offset_from_ftp_data_start,
3683
0
                                   addr_size, mode);
3684
0
                }
3685
3686
0
                if (seq_skew) {
3687
0
                    ip_len = ntohs(nh6->ip6_ctlun.ip6_un1.ip6_un1_plen) +
3688
0
                        seq_skew;
3689
0
                    nh6->ip6_ctlun.ip6_un1.ip6_un1_plen = htons(ip_len);
3690
0
                }
3691
0
            } else {
3692
0
                if (nat) {
3693
0
                    seq_skew = repl_ftp_v4_addr(pkt, v4_addr_rep,
3694
0
                                   ftp_data_start,
3695
0
                                   addr_offset_from_ftp_data_start,
3696
0
                                   addr_size);
3697
0
                }
3698
0
                if (seq_skew) {
3699
0
                    ip_len = ntohs(l3_hdr->ip_tot_len) + seq_skew;
3700
0
                    if (dp_packet_ip_checksum_valid(pkt)) {
3701
0
                        dp_packet_ip_checksum_set_partial(pkt);
3702
0
                    } else {
3703
0
                        l3_hdr->ip_csum = recalc_csum16(l3_hdr->ip_csum,
3704
0
                                                        l3_hdr->ip_tot_len,
3705
0
                                                        htons(ip_len));
3706
0
                    }
3707
0
                    l3_hdr->ip_tot_len = htons(ip_len);
3708
0
                }
3709
0
            }
3710
0
        } else {
3711
0
            OVS_NOT_REACHED();
3712
0
        }
3713
0
    }
3714
3715
0
    struct tcp_header *th = dp_packet_l4(pkt);
3716
3717
0
    if (nat && ec->seq_skew != 0) {
3718
0
        ctx->reply != ec->seq_skew_dir ?
3719
0
            adj_seqnum(&th->tcp_ack, -ec->seq_skew) :
3720
0
            adj_seqnum(&th->tcp_seq, ec->seq_skew);
3721
0
    }
3722
3723
0
    if (dp_packet_l4_checksum_valid(pkt)) {
3724
0
        dp_packet_l4_checksum_set_partial(pkt);
3725
0
    } else {
3726
0
        th->tcp_csum = 0;
3727
0
        if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
3728
0
            th->tcp_csum = packet_csum_upperlayer6(nh6, th, ctx->key.nw_proto,
3729
0
                               dp_packet_l4_size(pkt));
3730
0
        } else {
3731
0
            uint32_t tcp_csum = packet_csum_pseudoheader(l3_hdr);
3732
0
            th->tcp_csum = csum_finish(
3733
0
                 csum_continue(tcp_csum, th, dp_packet_l4_size(pkt)));
3734
0
        }
3735
0
    }
3736
3737
0
    if (seq_skew) {
3738
0
        conn_seq_skew_set(ct, ec, now, seq_skew + ec->seq_skew,
3739
0
                          ctx->reply);
3740
0
    }
3741
0
}
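When the address rewrite changes the payload length, handle_ftp_ctl() patches the IPv4 total length and updates the header checksum: either it marks the checksum for later recomputation (the partial/offload path) or it adjusts the existing checksum incrementally with recalc_csum16() instead of recomputing it over the whole header. A standalone sketch of that incremental update following RFC 1624 (HC' = ~(~HC + ~m + m')); this is an illustrative equivalent, not the implementation in OVS's lib/csum.c.

#include <stdint.h>
#include <stdio.h>

/* Update a 16-bit ones'-complement checksum after a single 16-bit field in
 * the covered data changes from 'old_val' to 'new_val'; all values are in
 * the same byte order as the checksummed data. */
static uint16_t
csum_update16(uint16_t old_csum, uint16_t old_val, uint16_t new_val)
{
    uint32_t sum = (uint16_t) ~old_csum;

    sum += (uint16_t) ~old_val;
    sum += new_val;
    sum = (sum & 0xffff) + (sum >> 16);   /* Fold the carries back in. */
    sum = (sum & 0xffff) + (sum >> 16);
    return (uint16_t) ~sum;
}

int
main(void)
{
    /* E.g. ip_tot_len grows by 4 bytes after the FTP address rewrite. */
    uint16_t csum = 0xb1e6;               /* Arbitrary starting checksum. */

    printf("0x%04x -> 0x%04x\n", csum, csum_update16(csum, 0x0040, 0x0044));
    return 0;
}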
3742
3743
static void
3744
handle_tftp_ctl(struct conntrack *ct,
3745
                const struct conn_lookup_ctx *ctx OVS_UNUSED,
3746
                struct dp_packet *pkt, struct conn *conn_for_expectation,
3747
                long long now OVS_UNUSED, enum ftp_ctl_pkt ftp_ctl OVS_UNUSED,
3748
                bool nat OVS_UNUSED)
3749
0
{
3750
0
    expectation_create(ct,
3751
0
                       conn_for_expectation->key_node[CT_DIR_FWD].key.src.port,
3752
0
                       conn_for_expectation,
3753
0
                       !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
3754
0
}