Coverage Report

Created: 2025-07-01 06:51

/src/openvswitch/lib/conntrack-tcp.c
Line
Count
Source (jump to first uncovered line)
1
/*-
2
 * Copyright (c) 2001 Daniel Hartmeier
3
 * Copyright (c) 2002 - 2008 Henning Brauer
4
 * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
5
 * Copyright (c) 2015, 2016 Nicira, Inc.
6
 * All rights reserved.
7
 *
8
 * Redistribution and use in source and binary forms, with or without
9
 * modification, are permitted provided that the following conditions
10
 * are met:
11
 *
12
 *    - Redistributions of source code must retain the above copyright
13
 *      notice, this list of conditions and the following disclaimer.
14
 *    - Redistributions in binary form must reproduce the above
15
 *      copyright notice, this list of conditions and the following
16
 *      disclaimer in the documentation and/or other materials provided
17
 *      with the distribution.
18
 *
19
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
23
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
24
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
25
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
29
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30
 * POSSIBILITY OF SUCH DAMAGE.
31
 *
32
 * Effort sponsored in part by the Defense Advanced Research Projects
33
 * Agency (DARPA) and Air Force Research Laboratory, Air Force
34
 * Materiel Command, USAF, under agreement number F30602-01-2-0537.
35
 *
36
 *      $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
37
 */
38
39
#include <config.h>
40
41
#include "conntrack-private.h"
42
#include "conntrack-tp.h"
43
#include "coverage.h"
44
#include "ct-dpif.h"
45
#include "dp-packet.h"
46
#include "util.h"
47
48
/* Coverage counters incremented with COVERAGE_INC() in this file:
 *   - conntrack_tcp_seq_chk_bypass: in tcp_bypass_seq_chk(), when
 *     conntrack_get_tcp_seq_chk() returns false.
 *   - conntrack_tcp_seq_chk_failed: in tcp_conn_update(), on the branch that
 *     returns CT_UPDATE_INVALID after both sequence-window tests fail.
 *   - conntrack_invalid_tcp_flags: in tcp_conn_update(), when
 *     tcp_invalid_flags() rejects the packet's flags. */
COVERAGE_DEFINE(conntrack_tcp_seq_chk_bypass);
49
COVERAGE_DEFINE(conntrack_tcp_seq_chk_failed);
50
COVERAGE_DEFINE(conntrack_invalid_tcp_flags);
51
52
/* Tracking state kept for one endpoint of a TCP connection.  'struct
 * conn_tcp' holds two of these, one per direction. */
struct tcp_peer {
53
    uint32_t               seqlo;          /* Max sequence number sent     */
54
    uint32_t               seqhi;          /* Max the other end ACKd + win */
55
    uint16_t               max_win;        /* largest window (pre scaling) */
56
    /* Low bits (CT_WSCALE_MASK) hold the shift count; CT_WSCALE_FLAG and
     * CT_WSCALE_UNKNOWN are flag bits stored in the same byte. */
    uint8_t                wscale;         /* window scaling factor        */
57
    enum ct_dpif_tcp_state state;          /* CT_DPIF_TCPS_* of this peer  */
58
};
59
60
/* TCP-specific connection: the generic 'struct conn' plus per-peer state.
 * conn_tcp_cast() recovers this structure from a pointer to 'up'.
 * tcp_conn_update() treats peer[0] as the sender of non-reply packets and
 * peer[1] as the sender of reply packets. */
struct conn_tcp {
61
    struct conn up;
62
    struct tcp_peer peer[2]; /* 'conn' lock protected. */
63
};
64
65
/* TCP option kind bytes that tcp_get_wscale() distinguishes while walking
 * the option list (EOL = 0, NOP = 1, WINDOW = 3). */
enum {
66
    TCPOPT_EOL,
67
    TCPOPT_NOP,
68
    TCPOPT_WINDOW = 3,
69
};
70
71
/* TCP sequence numbers are 32-bit integers operated
72
 * on with modular arithmetic.  These macros can be
73
 * used to compare such integers.
 *
 * Each SEQ_* macro simply forwards its arguments to the INT_MOD_* macro of
 * the same suffix, which is defined outside this file. */
74
#define SEQ_LT(a,b)     INT_MOD_LT(a, b)
75
#define SEQ_LEQ(a,b)    INT_MOD_LEQ(a, b)
76
0
#define SEQ_GT(a,b)     INT_MOD_GT(a, b)
77
0
#define SEQ_GEQ(a,b)    INT_MOD_GEQ(a, b)
78
79
#define SEQ_MIN(a, b)   INT_MOD_MIN(a, b)
80
#define SEQ_MAX(a, b)   INT_MOD_MAX(a, b)
81
82
/* Returns the 'struct conn_tcp' whose 'up' member is 'conn'.
 *
 * Note that the result is a non-const pointer even though 'conn' is taken as
 * a pointer to const. */
static struct conn_tcp*
83
conn_tcp_cast(const struct conn* conn)
84
0
{
85
0
    return CONTAINER_OF(conn, struct conn_tcp, up);
86
0
}
87
88
/* Returns true if 'flags' is a TCP flag combination that this tracker
 * rejects, false otherwise.  A combination is rejected when:
 *   - SYN is set together with RST or FIN;
 *   - SYN is clear and neither ACK nor RST is set;
 *   - ACK is clear and any of FIN, PSH or URG is set.
 *
 * pf does this in pf_normalize_tcp(), and it is called only if scrub
89
 * is enabled.  We're not scrubbing, but this check seems reasonable.  */
90
static bool
91
tcp_invalid_flags(uint16_t flags)
92
0
{
93
94
0
    if (flags & TCP_SYN) {
95
0
        if (flags & TCP_RST || flags & TCP_FIN) {
96
0
            return true;
97
0
        }
98
0
    } else {
99
        /* Illegal packet */
100
0
        if (!(flags & (TCP_ACK|TCP_RST))) {
101
0
            return true;
102
0
        }
103
0
    }
104
105
0
    if (!(flags & TCP_ACK)) {
106
        /* These flags are only valid if ACK is set */
107
0
        if ((flags & TCP_FIN) || (flags & TCP_PSH) || (flags & TCP_URG)) {
108
0
            return true;
109
0
        }
110
0
    }
111
112
0
    return false;
113
0
}
114
115
0
/* Encoding of 'wscale' in struct tcp_peer:
 *   - TCP_MAX_WSCALE: upper bound applied to a parsed shift count by
 *     tcp_get_wscale(); also used as the assumed shift when both peers are
 *     marked CT_WSCALE_UNKNOWN.
 *   - CT_WSCALE_FLAG: set by tcp_get_wscale() when a window scale option was
 *     found.
 *   - CT_WSCALE_UNKNOWN: set by tcp_new_conn() on both peers when the first
 *     packet seen does not carry SYN.
 *   - CT_WSCALE_MASK: selects the shift count bits. */
#define TCP_MAX_WSCALE 14
116
0
#define CT_WSCALE_FLAG 0x80
117
0
#define CT_WSCALE_UNKNOWN 0x40
118
0
#define CT_WSCALE_MASK 0xf
119
120
/* Walks the TCP options that follow the fixed header of 'tcp' looking for a
 * window scale option.
 *
 * Returns 0 if no window scale option is seen before the options end or an
 * end-of-list option is reached.  Otherwise returns the option's shift
 * count, capped at TCP_MAX_WSCALE, OR'ed with CT_WSCALE_FLAG.  If several
 * window scale options are present the last one seen wins.
 *
 * The amount of option data scanned is taken from the header's data offset
 * field.
 * NOTE(review): this assumes the caller has already verified that the packet
 * buffer really contains that many bytes -- confirm against the callers. */
static uint8_t
121
tcp_get_wscale(const struct tcp_header *tcp)
122
0
{
123
0
    int len = TCP_OFFSET(tcp->tcp_ctl) * 4 - sizeof *tcp;
124
0
    const uint8_t *opt = (const uint8_t *)(tcp + 1);
125
0
    uint8_t wscale = 0;
126
0
    uint8_t optlen;
127
128
    /* At least 3 bytes must remain because the window scale case reads
     * opt[2]. */
0
    while (len >= 3) {
129
0
        switch (*opt) {
130
0
        case TCPOPT_EOL:
131
0
            return wscale;
132
0
        case TCPOPT_NOP:
            /* Single-byte option: no length field to consult. */
133
0
            opt++;
134
0
            len--;
135
0
            break;
136
0
        case TCPOPT_WINDOW:
            /* NOTE(review): opt[2] is used without first checking that the
             * option's own length byte (opt[1]) covers it. */
137
0
            wscale = MIN(opt[2], TCP_MAX_WSCALE);
138
0
            wscale |= CT_WSCALE_FLAG;
139
            /* fall through */
140
0
        default:
141
0
            optlen = opt[1];
            /* Treat a length below 2 as 2 so that the loop always makes
             * progress. */
142
0
            if (optlen < 2) {
143
0
                optlen = 2;
144
0
            }
145
0
            len -= optlen;
146
0
            opt += optlen;
147
0
        }
148
0
    }
149
150
0
    return wscale;
151
0
}
152
153
/* Returns true if conntrack_get_tcp_seq_chk() reports false for 'ct', in
 * which case the conntrack_tcp_seq_chk_bypass coverage counter is also
 * incremented.  Returns false otherwise. */
static bool
154
tcp_bypass_seq_chk(struct conntrack *ct)
155
0
{
156
0
    if (!conntrack_get_tcp_seq_chk(ct)) {
157
0
        COVERAGE_INC(conntrack_tcp_seq_chk_bypass);
158
0
        return true;
159
0
    }
160
0
    return false;
161
0
}
162
163
/* Updates the TCP tracking state of 'conn_' for packet 'pkt'.
 *
 * 'reply' selects the sending peer: peer[1] when true, peer[0] otherwise;
 * the other peer is the receiver.  'ct' and 'now' are passed through to
 * conn_update_expiration().
 *
 * Returns:
 *   - CT_UPDATE_INVALID if tcp_invalid_flags() rejects the packet's flags, or
 *     if neither the strict nor the loose sequence-window test below accepts
 *     the packet;
 *   - CT_UPDATE_NEW for a SYN without ACK when both peers are at
 *     CT_DPIF_TCPS_FIN_WAIT_2 or later (both peer states are first reset to
 *     CT_DPIF_TCPS_CLOSED);
 *   - CT_UPDATE_VALID_NEW for a SYN without ACK when, failing the previous
 *     case, the sender's state is at most CT_DPIF_TCPS_SYN_SENT;
 *   - CT_UPDATE_VALID otherwise.
 *
 * A packet that fails the strict test is still handled by the strict branch
 * when tcp_bypass_seq_chk() returns true.  Only the strict branch updates the
 * connection's expiration. */
static enum ct_update_res
164
tcp_conn_update(struct conntrack *ct, struct conn *conn_,
165
                struct dp_packet *pkt, bool reply, long long now)
166
0
{
167
0
    struct conn_tcp *conn = conn_tcp_cast(conn_);
168
0
    struct tcp_header *tcp = dp_packet_l4(pkt);
169
    /* The peer that sent 'pkt' */
170
0
    struct tcp_peer *src = &conn->peer[reply ? 1 : 0];
171
    /* The peer that should receive 'pkt' */
172
0
    struct tcp_peer *dst = &conn->peer[reply ? 0 : 1];
173
    /* Window shift counts applied to the sender's and receiver's windows;
     * they stay 0 unless one of the wscale cases below applies. */
0
    uint8_t sws = 0, dws = 0;
174
0
    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
175
176
0
    uint16_t win = ntohs(tcp->tcp_winsz);
177
0
    uint32_t ack, end, seq, orig_seq;
178
0
    uint32_t p_len = dp_packet_get_tcp_payload_length(pkt);
179
180
0
    if (tcp_invalid_flags(tcp_flags)) {
181
0
        COVERAGE_INC(conntrack_invalid_tcp_flags);
182
0
        return CT_UPDATE_INVALID;
183
0
    }
184
185
    /* SYN without ACK: either restart a finished connection or (re)mark the
     * sender as having sent its SYN. */
0
    if ((tcp_flags & (TCP_SYN | TCP_ACK)) == TCP_SYN) {
186
0
        if (dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
187
0
            && src->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
188
0
            src->state = dst->state = CT_DPIF_TCPS_CLOSED;
189
0
            return CT_UPDATE_NEW;
190
0
        } else if (src->state <= CT_DPIF_TCPS_SYN_SENT) {
191
0
            src->state = CT_DPIF_TCPS_SYN_SENT;
192
0
            conn_update_expiration(ct, &conn->up, CT_TM_TCP_FIRST_PACKET, now);
193
0
            return CT_UPDATE_VALID_NEW;
194
0
        }
195
0
    }
196
197
    /* Pick the window shifts for non-SYN packets: the recorded values when
     * both peers have CT_WSCALE_FLAG set, or TCP_MAX_WSCALE when both are
     * marked CT_WSCALE_UNKNOWN. */
0
    if (src->wscale & CT_WSCALE_FLAG
198
0
        && dst->wscale & CT_WSCALE_FLAG
199
0
        && !(tcp_flags & TCP_SYN)) {
200
201
0
        sws = src->wscale & CT_WSCALE_MASK;
202
0
        dws = dst->wscale & CT_WSCALE_MASK;
203
204
0
    } else if (src->wscale & CT_WSCALE_UNKNOWN
205
0
               && dst->wscale & CT_WSCALE_UNKNOWN
206
0
               && !(tcp_flags & TCP_SYN)) {
207
208
0
        sws = TCP_MAX_WSCALE;
209
0
        dws = TCP_MAX_WSCALE;
210
0
    }
211
212
    /*
213
     * Sequence tracking algorithm from Guido van Rooij's paper:
214
     *   http://www.madison-gurkha.com/publications/tcp_filtering/
215
     *      tcp_filtering.ps
216
     */
217
218
0
    orig_seq = seq = ntohl(get_16aligned_be32(&tcp->tcp_seq));
219
0
    bool check_ackskew = true;
220
0
    if (src->state < CT_DPIF_TCPS_SYN_SENT) {
221
        /* First packet from this end. Set its state */
222
223
0
        ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));
224
225
        /* 'end' is the sequence number just past this segment; SYN and FIN
         * each add one to it. */
0
        end = seq + p_len;
226
0
        if (tcp_flags & TCP_SYN) {
227
0
            end++;
228
0
            if (dst->wscale & CT_WSCALE_FLAG) {
229
0
                src->wscale = tcp_get_wscale(tcp);
230
0
                if (src->wscale & CT_WSCALE_FLAG) {
231
                    /* Remove scale factor from initial window */
232
0
                    sws = src->wscale & CT_WSCALE_MASK;
233
0
                    win = DIV_ROUND_UP((uint32_t) win, 1 << sws);
234
0
                    dws = dst->wscale & CT_WSCALE_MASK;
235
0
                } else {
236
                    /* fixup other window */
237
0
                    dst->max_win <<= dst->wscale & CT_WSCALE_MASK;
238
                    /* in case of a retrans SYN|ACK */
239
0
                    dst->wscale = 0;
240
0
                }
241
0
            }
242
0
        }
243
0
        if (tcp_flags & TCP_FIN) {
244
0
            end++;
245
0
        }
246
247
0
        src->seqlo = seq;
248
0
        src->state = CT_DPIF_TCPS_SYN_SENT;
249
        /*
250
         * May need to slide the window (seqhi may have been set by
251
         * the crappy stack check or if we picked up the connection
252
         * after establishment)
253
         */
254
0
        if (src->seqhi == 1
255
0
                || SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) {
256
0
            src->seqhi = end + MAX(1, dst->max_win << dws);
257
            /* We are either picking up a new connection or a connection which
258
             * was already in place.  We are more permissive in terms of
259
             * ackskew checking in these cases.
260
             */
261
0
            check_ackskew = false;
262
0
        }
263
0
        if (win > src->max_win) {
264
0
            src->max_win = win;
265
0
        }
266
267
0
    } else {
268
0
        ack = ntohl(get_16aligned_be32(&tcp->tcp_ack));
269
0
        end = seq + p_len;
270
0
        if (tcp_flags & TCP_SYN) {
271
0
            end++;
272
0
        }
273
0
        if (tcp_flags & TCP_FIN) {
274
0
            end++;
275
0
        }
276
0
    }
277
278
0
    if ((tcp_flags & TCP_ACK) == 0) {
279
        /* Let it pass through the ack skew check */
280
0
        ack = dst->seqlo;
281
0
    } else if ((ack == 0
282
0
                && (tcp_flags & (TCP_ACK|TCP_RST)) == (TCP_ACK|TCP_RST))
283
0
               /* broken tcp stacks do not set ack */) {
284
        /* Many stacks (ours included) will set the ACK number in an
285
         * FIN|ACK if the SYN times out -- no sequence to ACK. */
286
0
        ack = dst->seqlo;
287
0
    }
288
289
0
    if (seq == end) {
290
        /* Ease sequencing restrictions on no data packets */
291
0
        seq = src->seqlo;
292
0
        end = seq;
293
0
    }
294
295
    /* When check_ackskew was cleared above, ackskew is 0 and therefore
     * satisfies both ackskew bounds in the strict test. */
0
    int ackskew = check_ackskew ? dst->seqlo - ack : 0;
296
0
#define MAXACKWINDOW (0xffff + 1500)    /* 1500 is an arbitrary fudge factor */
297
    /* Strict test.  tcp_bypass_seq_chk() is only evaluated when the
     * preceding conjunction is false. */
0
    if ((SEQ_GEQ(src->seqhi, end)
298
        /* Last octet inside other's window space */
299
0
        && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))
300
        /* Retrans: not more than one window back */
301
0
        && (ackskew >= -MAXACKWINDOW)
302
        /* Acking not more than one reassembled fragment backwards */
303
0
        && (ackskew <= (MAXACKWINDOW << sws))
304
        /* Acking not more than one window forward */
305
0
        && ((tcp_flags & TCP_RST) == 0 || orig_seq == src->seqlo
306
0
            || (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo)))
307
0
        || tcp_bypass_seq_chk(ct)) {
308
        /* Require an exact/+1 sequence match on resets when possible */
309
310
        /* update max window */
311
0
        if (src->max_win < win) {
312
0
            src->max_win = win;
313
0
        }
314
        /* synchronize sequencing */
315
0
        if (SEQ_GT(end, src->seqlo)) {
316
0
            src->seqlo = end;
317
0
        }
318
        /* slide the window of what the other end can send */
319
0
        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
320
0
            dst->seqhi = ack + MAX((win << sws), 1);
321
0
        }
322
323
        /* update states */
324
0
        if (tcp_flags & TCP_SYN && src->state < CT_DPIF_TCPS_SYN_SENT) {
325
0
                src->state = CT_DPIF_TCPS_SYN_SENT;
326
0
        }
327
0
        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
328
0
                src->state = CT_DPIF_TCPS_CLOSING;
329
0
        }
330
0
        if (tcp_flags & TCP_ACK) {
331
0
            if (dst->state == CT_DPIF_TCPS_SYN_SENT) {
332
0
                dst->state = CT_DPIF_TCPS_ESTABLISHED;
333
0
            } else if (dst->state == CT_DPIF_TCPS_CLOSING) {
334
0
                dst->state = CT_DPIF_TCPS_FIN_WAIT_2;
335
0
            }
336
0
        }
337
0
        if (tcp_flags & TCP_RST) {
338
0
            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
339
0
        }
340
341
        /* Choose the timeout class from the two peer states; the first
         * matching case wins. */
0
        if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2
342
0
            && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
343
0
            conn_update_expiration(ct, &conn->up, CT_TM_TCP_CLOSED, now);
344
0
        } else if (src->state >= CT_DPIF_TCPS_CLOSING
345
0
                   && dst->state >= CT_DPIF_TCPS_CLOSING) {
346
0
            conn_update_expiration(ct, &conn->up, CT_TM_TCP_FIN_WAIT, now);
347
0
        } else if (src->state < CT_DPIF_TCPS_ESTABLISHED
348
0
                   || dst->state < CT_DPIF_TCPS_ESTABLISHED) {
349
0
            conn_update_expiration(ct, &conn->up, CT_TM_TCP_OPENING, now);
350
0
        } else if (src->state >= CT_DPIF_TCPS_CLOSING
351
0
                   || dst->state >= CT_DPIF_TCPS_CLOSING) {
352
0
            conn_update_expiration(ct, &conn->up, CT_TM_TCP_CLOSING, now);
353
0
        } else {
354
0
            conn_update_expiration(ct, &conn->up, CT_TM_TCP_ESTABLISHED, now);
355
0
        }
356
0
    } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT
357
0
                || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
358
0
                || src->state >= CT_DPIF_TCPS_FIN_WAIT_2)
359
0
               && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end)
360
               /* Within a window forward of the originating packet */
361
0
               && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
362
               /* Within a window backward of the originating packet */
363
364
        /*
365
         * This currently handles three situations:
366
         *  1) Stupid stacks will shotgun SYNs before their peer
367
         *     replies.
368
         *  2) When PF catches an already established stream (the
369
         *     firewall rebooted, the state table was flushed, routes
370
         *     changed...)
371
         *  3) Packets get funky immediately after the connection
372
         *     closes (this should catch Solaris spurious ACK|FINs
373
         *     that web servers like to spew after a close)
374
         *
375
         * This must be a little more careful than the above code
376
         * since packet floods will also be caught here. We don't
377
         * update the TTL here to mitigate the damage of a packet
378
         * flood and so the same code can handle awkward establishment
379
         * and a loosened connection close.
380
         * In the establishment case, a correct peer response will
381
         * validate the connection, go through the normal state code
382
         * and keep updating the state TTL.
383
         */
384
385
        /* update max window */
386
0
        if (src->max_win < win) {
387
0
            src->max_win = win;
388
0
        }
389
        /* synchronize sequencing */
390
0
        if (SEQ_GT(end, src->seqlo)) {
391
0
            src->seqlo = end;
392
0
        }
393
        /* slide the window of what the other end can send */
394
0
        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
395
0
            dst->seqhi = ack + MAX((win << sws), 1);
396
0
        }
397
398
        /*
399
         * Cannot set dst->seqhi here since this could be a shotgunned
400
         * SYN and not an already established connection.
401
         *
         * NOTE(review): the statement just above does assign dst->seqhi,
         * which appears to contradict this comment -- confirm which of the
         * two is intended.
         */
402
403
0
        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
404
0
            src->state = CT_DPIF_TCPS_CLOSING;
405
0
        }
406
407
0
        if (tcp_flags & TCP_RST) {
408
0
            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
409
0
        }
410
0
    } else {
411
0
        COVERAGE_INC(conntrack_tcp_seq_chk_failed);
412
0
        return CT_UPDATE_INVALID;
413
0
    }
414
415
0
    return CT_UPDATE_VALID;
416
0
}
417
418
/* Returns true if 'pkt' is acceptable as the first packet of a new tracked
 * connection: its TCP flags must pass tcp_invalid_flags() and must not have
 * both SYN and ACK set.  Returns false otherwise. */
static bool
419
tcp_valid_new(struct dp_packet *pkt)
420
0
{
421
0
    struct tcp_header *tcp = dp_packet_l4(pkt);
422
0
    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
423
424
0
    if (tcp_invalid_flags(tcp_flags)) {
425
0
        return false;
426
0
    }
427
428
    /* A syn+ack is not allowed to create a connection.  We want to allow
429
     * totally new connections (syn) or already established, not partially
430
     * open (syn+ack). */
431
0
    if ((tcp_flags & TCP_SYN) && (tcp_flags & TCP_ACK)) {
432
0
        return false;
433
0
    }
434
435
0
    return true;
436
0
}
437
438
/* Allocates a zeroed 'struct conn_tcp' with xzalloc(), initializes it from
 * the first packet 'pkt' of the connection, and returns a pointer to its
 * embedded 'struct conn'.
 *
 * peer[0] is set up as the sender of 'pkt' and peer[1] as the other end.
 * 'tp_id' is stored in the connection's 'tp_id' member; 'ct' and 'now' are
 * passed to conn_init_expiration() with CT_TM_TCP_FIRST_PACKET.
 *
 * NOTE(review): the returned memory is heap-allocated here; who releases it
 * is not visible in this file. */
static struct conn *
439
tcp_new_conn(struct conntrack *ct, struct dp_packet *pkt, long long now,
440
             uint32_t tp_id)
441
0
{
442
0
    struct conn_tcp* newconn = NULL;
443
0
    struct tcp_header *tcp = dp_packet_l4(pkt);
444
0
    struct tcp_peer *src, *dst;
445
0
    uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
446
447
0
    newconn = xzalloc(sizeof *newconn);
448
449
0
    src = &newconn->peer[0];
450
0
    dst = &newconn->peer[1];
451
452
    /* seqhi starts one past the end of this segment's payload; SYN and FIN
     * each push it one further below. */
0
    src->seqlo = ntohl(get_16aligned_be32(&tcp->tcp_seq));
453
0
    src->seqhi = src->seqlo + dp_packet_get_tcp_payload_length(pkt) + 1;
454
455
0
    if (tcp_flags & TCP_SYN) {
456
0
        src->seqhi++;
457
0
        src->wscale = tcp_get_wscale(tcp);
458
0
    } else {
        /* No SYN seen, so the negotiated window scale is not known for
         * either side. */
459
0
        src->wscale = CT_WSCALE_UNKNOWN;
460
0
        dst->wscale = CT_WSCALE_UNKNOWN;
461
0
    }
462
0
    src->max_win = MAX(ntohs(tcp->tcp_winsz), 1);
463
0
    if (src->wscale & CT_WSCALE_MASK) {
464
        /* Remove scale factor from initial window */
465
0
        uint8_t sws = src->wscale & CT_WSCALE_MASK;
466
0
        src->max_win = DIV_ROUND_UP((uint32_t) src->max_win, 1 << sws);
467
0
    }
468
0
    if (tcp_flags & TCP_FIN) {
469
0
        src->seqhi++;
470
0
    }
    /* tcp_conn_update() tests for a seqhi of 1 when it handles the first
     * packet from a peer. */
471
0
    dst->seqhi = 1;
472
0
    dst->max_win = 1;
473
0
    src->state = CT_DPIF_TCPS_SYN_SENT;
474
0
    dst->state = CT_DPIF_TCPS_CLOSED;
475
476
0
    newconn->up.tp_id = tp_id;
477
0
    conn_init_expiration(ct, &newconn->up, CT_TM_TCP_FIRST_PACKET, now);
478
479
0
    return &newconn->up;
480
0
}
481
482
/* Translates the flag bits stored in 'peer->wscale' into CT_DPIF_TCPF_*
 * bits: CT_WSCALE_FLAG maps to CT_DPIF_TCPF_WINDOW_SCALE and
 * CT_WSCALE_UNKNOWN maps to CT_DPIF_TCPF_BE_LIBERAL.  Returns 0 if neither
 * bit is set. */
static uint8_t
483
tcp_peer_to_protoinfo_flags(const struct tcp_peer *peer)
484
0
{
485
0
    uint8_t res = 0;
486
487
0
    if (peer->wscale & CT_WSCALE_FLAG) {
488
0
        res |= CT_DPIF_TCPF_WINDOW_SCALE;
489
0
    }
490
491
0
    if (peer->wscale & CT_WSCALE_UNKNOWN) {
492
0
        res |= CT_DPIF_TCPF_BE_LIBERAL;
493
0
    }
494
495
0
    return res;
496
0
}
497
498
/* Fills 'protoinfo' with the TCP state of 'conn_': sets 'proto' to
 * IPPROTO_TCP and copies each peer's state, window shift count (wscale
 * masked with CT_WSCALE_MASK) and translated flags.  peer[0] supplies the
 * '*_orig' fields and peer[1] the '*_reply' fields. */
static void
499
tcp_conn_get_protoinfo(const struct conn *conn_,
500
                       struct ct_dpif_protoinfo *protoinfo)
501
0
{
502
0
    const struct conn_tcp *conn = conn_tcp_cast(conn_);
503
504
0
    protoinfo->proto = IPPROTO_TCP;
505
0
    protoinfo->tcp.state_orig = conn->peer[0].state;
506
0
    protoinfo->tcp.state_reply = conn->peer[1].state;
507
508
0
    protoinfo->tcp.wscale_orig = conn->peer[0].wscale & CT_WSCALE_MASK;
509
0
    protoinfo->tcp.wscale_reply = conn->peer[1].wscale & CT_WSCALE_MASK;
510
511
0
    protoinfo->tcp.flags_orig = tcp_peer_to_protoinfo_flags(&conn->peer[0]);
512
0
    protoinfo->tcp.flags_reply = tcp_peer_to_protoinfo_flags(&conn->peer[1]);
513
0
}
514
515
/* TCP callback table.  This is the only non-static definition in the file;
 * it exposes the static functions above through 'struct ct_l4_proto'. */
struct ct_l4_proto ct_proto_tcp = {
516
    .new_conn = tcp_new_conn,
517
    .valid_new = tcp_valid_new,
518
    .conn_update = tcp_conn_update,
519
    .conn_get_protoinfo = tcp_conn_get_protoinfo,
520
};