Coverage Report

Created: 2024-10-03 06:24

/src/SockFuzzer/third_party/xnu/bsd/netinet/mptcp.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
3
 *
4
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5
 *
6
 * This file contains Original Code and/or Modifications of Original Code
7
 * as defined in and that are subject to the Apple Public Source License
8
 * Version 2.0 (the 'License'). You may not use this file except in
9
 * compliance with the License. The rights granted to you under the License
10
 * may not be used to create, or enable the creation or redistribution of,
11
 * unlawful or unlicensed copies of an Apple operating system, or to
12
 * circumvent, violate, or enable the circumvention or violation of, any
13
 * terms of an Apple operating system software license agreement.
14
 *
15
 * Please obtain a copy of the License at
16
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17
 *
18
 * The Original Code and all software distributed under the License are
19
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23
 * Please see the License for the specific language governing rights and
24
 * limitations under the License.
25
 *
26
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27
 */
28
29
/*
30
 * A note on the MPTCP/NECP-interactions:
31
 *
32
 * MPTCP uses NECP-callbacks to get notified of interface/policy events.
33
 * MPTCP registers to these events at the MPTCP-layer for interface-events
34
 * through a call to necp_client_register_multipath_cb.
35
 * To get per-flow events (aka per TCP-subflow), we register to it with
36
 * necp_client_register_socket_flow. Both registrations happen by using the
37
 * necp-client-uuid that comes from the app.
38
 *
39
 * The locking is rather tricky. In general, we expect the lock-ordering to
40
 * happen from necp-fd -> necp-client -> mpp_lock.
41
 *
42
 * There are however some subtleties.
43
 *
44
 * 1. When registering the multipath_cb, we are holding the mpp_lock. This is
45
 * safe, because it is the very first time this MPTCP-connection goes into NECP.
46
 * As we go into NECP we take the NECP-locks and are thus guaranteed that the
47
 * NECP-locks cannot deadlock us, because any NECP-event will also take the
48
 * NECP-locks first. Either the event wins the race and thus won't find our
49
 * MPTCP-connection, or MPTCP wins the race and safely installs
50
 * the callbacks while holding the NECP lock.
51
 *
52
 * 2. When registering the subflow-callbacks we must unlock the mpp_lock. This,
53
 * because we have already registered callbacks and we might race against an
54
 * NECP-event that will match on our socket. So, we have to unlock to be safe.
55
 *
56
 * 3. When removing the multipath_cb, we do it in mp_pcbdispose(). The
57
 * so_usecount has reached 0. We must be careful to not remove the mpp_socket
58
 * pointers before we unregistered the callback. Because, again we might be
59
 * racing against an NECP-event. Unregistering must happen with an unlocked
60
 * mpp_lock, because of the lock-ordering constraint. It could be that an
61
 * NECP-event triggers before we had a chance to unregister. That's why
62
 * we need to check for the so_usecount in mptcp_session_necp_cb. If we get
63
 * there while the socket is being garbage-collected, the use-count will go
64
 * down to 0 and we exit. Removal of the multipath_cb again happens by taking
65
 * the NECP-locks so any running NECP-events will finish first and exit cleanly.
66
 *
67
 * 4. When removing the subflow-callback, we do it in in_pcbdispose(). Again,
68
 * the socket-lock must be unlocked for lock-ordering constraints. This gets a
69
 * bit tricky here, as in tcp_garbage_collect we hold the mp_so and so lock.
70
 * So, we drop the mp_so-lock as soon as the subflow is unlinked with
71
 * mptcp_subflow_del. Then, in in_pcbdispose we drop the subflow-lock.
72
 * If an NECP-event was waiting on the lock in mptcp_subflow_necp_cb, when it
73
 * gets it, it will realize that the subflow became non-MPTCP and retry (see
74
 * tcp_lock). Then it waits again on the subflow-lock. When we drop this lock
75
 * in in_pcbdispose, and enter necp_inpcb_dispose, this one will have to wait
76
 * for the NECP-lock (held by the other thread that is taking care of the NECP-
77
 * event). So, the event now finally gets the subflow-lock and then hits an
78
 * so_usecount that is 0 and exits. Eventually, we can remove the subflow from
79
 * the NECP callback.
80
 */
81
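To make points 3 and 4 above concrete, here is one possible interleaving of the teardown race that the use-count check guards against (an illustrative trace, not additional kernel code):

  garbage-collection thread              NECP-event thread
  -------------------------              -----------------
  so_usecount drops to 0                 takes the NECP-locks
                                         runs mptcp_session_necp_cb()
  unregisters the multipath_cb             sees so_usecount == 0, exits
    (takes the NECP-locks, so it
    waits for the running event)
  clears the mpp_socket pointers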
82
#include <sys/param.h>
83
#include <sys/systm.h>
84
#include <sys/kernel.h>
85
#include <sys/mbuf.h>
86
#include <sys/mcache.h>
87
#include <sys/socket.h>
88
#include <sys/socketvar.h>
89
#include <sys/syslog.h>
90
#include <sys/protosw.h>
91
92
#include <kern/zalloc.h>
93
#include <kern/locks.h>
94
95
#include <mach/sdt.h>
96
97
#include <net/if.h>
98
#include <netinet/in.h>
99
#include <netinet/in_var.h>
100
#include <netinet/tcp.h>
101
#include <netinet/tcp_fsm.h>
102
#include <netinet/tcp_seq.h>
103
#include <netinet/tcp_var.h>
104
#include <netinet/mptcp_var.h>
105
#include <netinet/mptcp.h>
106
#include <netinet/mptcp_seq.h>
107
#include <netinet/mptcp_opt.h>
108
#include <netinet/mptcp_timer.h>
109
110
int mptcp_enable = 1;
111
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
112
    &mptcp_enable, 0, "Enable Multipath TCP Support");
113
114
/*
115
 * Number of times to try negotiating MPTCP on SYN retransmissions.
116
 * We haven't seen any reports of a middlebox that is dropping all SYN-segments
117
 * that have an MPTCP-option. Thus, let's be generous and retransmit it 4 times.
118
 */
119
int mptcp_mpcap_retries = 4;
120
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mptcp_cap_retr,
121
    CTLFLAG_RW | CTLFLAG_LOCKED,
122
    &mptcp_mpcap_retries, 0, "Number of MP Capable SYN Retries");
123
124
/*
125
 * By default, DSS checksum is turned off; revisit if we ever do
126
 * MPTCP for non-SSL traffic.
127
 */
128
int mptcp_dss_csum = 0;
129
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dss_csum, CTLFLAG_RW | CTLFLAG_LOCKED,
130
    &mptcp_dss_csum, 0, "Enable DSS checksum");
131
132
/*
133
 * When mptcp_fail_thresh number of retransmissions are sent, subflow failover
134
 * is attempted on a different path.
135
 */
136
int mptcp_fail_thresh = 1;
137
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, fail, CTLFLAG_RW | CTLFLAG_LOCKED,
138
    &mptcp_fail_thresh, 0, "Failover threshold");
139
140
/*
141
 * MPTCP subflows have TCP keepalives set to ON. Set a conservative keeptime
142
 * as carrier networks mostly have a 30- to 60-minute NAT timeout.
143
 * Some carrier networks have a timeout of 10 or 15 minutes.
144
 */
145
int mptcp_subflow_keeptime = 60 * 14;
146
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, keepalive, CTLFLAG_RW | CTLFLAG_LOCKED,
147
    &mptcp_subflow_keeptime, 0, "Keepalive in seconds");
148
149
int mptcp_rtthist_rtthresh = 600;
150
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rtthist_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
151
    &mptcp_rtthist_rtthresh, 0, "Rtt threshold");
152
153
int mptcp_rtothresh = 1500;
154
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rto_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
155
    &mptcp_rtothresh, 0, "RTO threshold");
156
157
/*
158
 * Probe the preferred path when it is not in use.
159
 */
160
uint32_t mptcp_probeto = 1000;
161
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probeto, CTLFLAG_RW | CTLFLAG_LOCKED,
162
    &mptcp_probeto, 0, "Disable probing by setting to 0");
163
164
uint32_t mptcp_probecnt = 5;
165
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probecnt, CTLFLAG_RW | CTLFLAG_LOCKED,
166
    &mptcp_probecnt, 0, "Number of probe writes");
167
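All of the tunables above are exported under the net.inet.mptcp sysctl node. As a usage sketch, a userland tool can read them with sysctlbyname(3); the OID names follow directly from the SYSCTL_INT()/SYSCTL_UINT() declarations above (minimal example, error handling reduced to a perror):

#include <stdio.h>
#include <sys/types.h>
#include <sys/sysctl.h>

int
main(void)
{
	int enable = 0;
	size_t len = sizeof(enable);

	/* "net.inet.mptcp.enable" is the OID created by
	 * SYSCTL_INT(_net_inet_mptcp, OID_AUTO, enable, ...) above. */
	if (sysctlbyname("net.inet.mptcp.enable", &enable, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return 1;
	}
	printf("net.inet.mptcp.enable = %d\n", enable);
	return 0;
}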
168
static int
169
mptcp_reass_present(struct socket *mp_so)
170
0
{
171
0
  struct mptses *mpte = mpsotompte(mp_so);
172
0
  struct mptcb *mp_tp = mpte->mpte_mptcb;
173
0
  struct tseg_qent *q;
174
0
  int dowakeup = 0;
175
0
  int flags = 0;
176
177
  /*
178
   * Present data to user, advancing rcv_nxt through
179
   * completed sequence space.
180
   */
181
0
  if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
182
0
    return flags;
183
0
  }
184
0
  q = LIST_FIRST(&mp_tp->mpt_segq);
185
0
  if (!q || q->tqe_m->m_pkthdr.mp_dsn != mp_tp->mpt_rcvnxt) {
186
0
    return flags;
187
0
  }
188
189
  /*
190
   * If there is already another thread doing reassembly for this
191
   * connection, it is better to let it finish the job --
192
   * (radar 16316196)
193
   */
194
0
  if (mp_tp->mpt_flags & MPTCPF_REASS_INPROG) {
195
0
    return flags;
196
0
  }
197
198
0
  mp_tp->mpt_flags |= MPTCPF_REASS_INPROG;
199
200
0
  do {
201
0
    mp_tp->mpt_rcvnxt += q->tqe_len;
202
0
    LIST_REMOVE(q, tqe_q);
203
0
    if (mp_so->so_state & SS_CANTRCVMORE) {
204
0
      m_freem(q->tqe_m);
205
0
    } else {
206
0
      flags = !!(q->tqe_m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
207
0
      if (sbappendstream_rcvdemux(mp_so, q->tqe_m)) {
208
0
        dowakeup = 1;
209
0
      }
210
0
    }
211
0
    zfree(tcp_reass_zone, q);
212
0
    mp_tp->mpt_reassqlen--;
213
0
    q = LIST_FIRST(&mp_tp->mpt_segq);
214
0
  } while (q && q->tqe_m->m_pkthdr.mp_dsn == mp_tp->mpt_rcvnxt);
215
0
  mp_tp->mpt_flags &= ~MPTCPF_REASS_INPROG;
216
217
0
  if (dowakeup) {
218
0
    sorwakeup(mp_so); /* done with socket lock held */
219
0
  }
220
0
  return flags;
221
0
}
222
223
static int
224
mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf *m)
225
0
{
226
0
  struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb;
227
0
  u_int64_t mb_dsn = phdr->mp_dsn;
228
0
  struct tseg_qent *q;
229
0
  struct tseg_qent *p = NULL;
230
0
  struct tseg_qent *nq;
231
0
  struct tseg_qent *te = NULL;
232
0
  uint32_t qlimit;
233
234
  /*
235
   * Limit the number of segments in the reassembly queue to prevent
236
   * holding on to too many segments (and thus running out of mbufs).
237
   * Make sure to let through the missing segment that caused this
238
   * queue to build up.  Always keep one global queue entry spare to
239
   * be able to process the missing segment.
240
   */
241
0
  qlimit = MIN(MAX(100, mp_so->so_rcv.sb_hiwat >> 10),
242
0
      (tcp_autorcvbuf_max >> 10));
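  /*
   * Worked example with hypothetical values: for sb_hiwat = 128 KiB the
   * first term is MAX(100, 131072 >> 10) = 128, and for tcp_autorcvbuf_max
   * = 2 MiB the cap is 2048, giving qlimit = MIN(128, 2048) = 128 entries.
   */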
243
0
  if (mb_dsn != mp_tp->mpt_rcvnxt &&
244
0
      (mp_tp->mpt_reassqlen + 1) >= qlimit) {
245
0
    tcpstat.tcps_mptcp_rcvmemdrop++;
246
0
    m_freem(m);
247
0
    *tlenp = 0;
248
0
    return 0;
249
0
  }
250
251
  /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
252
0
  te = (struct tseg_qent *) zalloc(tcp_reass_zone);
253
0
  if (te == NULL) {
254
0
    tcpstat.tcps_mptcp_rcvmemdrop++;
255
0
    m_freem(m);
256
0
    return 0;
257
0
  }
258
259
0
  mp_tp->mpt_reassqlen++;
260
261
  /*
262
   * Find a segment which begins after this one does.
263
   */
264
0
  LIST_FOREACH(q, &mp_tp->mpt_segq, tqe_q) {
265
0
    if (MPTCP_SEQ_GT(q->tqe_m->m_pkthdr.mp_dsn, mb_dsn)) {
266
0
      break;
267
0
    }
268
0
    p = q;
269
0
  }
270
271
  /*
272
   * If there is a preceding segment, it may provide some of
273
   * our data already.  If so, drop the data from the incoming
274
   * segment.  If it provides all of our data, drop us.
275
   */
276
0
  if (p != NULL) {
277
0
    int64_t i;
278
    /* conversion to int (in i) handles seq wraparound */
279
0
    i = p->tqe_m->m_pkthdr.mp_dsn + p->tqe_len - mb_dsn;
280
0
    if (i > 0) {
281
0
      if (i >= *tlenp) {
282
0
        tcpstat.tcps_mptcp_rcvduppack++;
283
0
        m_freem(m);
284
0
        zfree(tcp_reass_zone, te);
285
0
        te = NULL;
286
0
        mp_tp->mpt_reassqlen--;
287
        /*
288
         * Try to present any queued data
289
         * at the left window edge to the user.
290
         * This is needed after the 3-WHS
291
         * completes.
292
         */
293
0
        goto out;
294
0
      }
295
0
      VERIFY(i <= INT_MAX);
296
0
      m_adj(m, (int)i);
297
0
      *tlenp -= i;
298
0
      phdr->mp_dsn += i;
299
0
    }
300
0
  }
301
302
0
  tcpstat.tcps_mp_oodata++;
303
304
  /*
305
   * While we overlap succeeding segments, trim them or,
306
   * if they are completely covered, dequeue them.
307
   */
308
0
  while (q) {
309
0
    int64_t i = (mb_dsn + *tlenp) - q->tqe_m->m_pkthdr.mp_dsn;
310
0
    if (i <= 0) {
311
0
      break;
312
0
    }
313
314
0
    if (i < q->tqe_len) {
315
0
      q->tqe_m->m_pkthdr.mp_dsn += i;
316
0
      q->tqe_len -= i;
317
318
0
      VERIFY(i <= INT_MAX);
319
0
      m_adj(q->tqe_m, (int)i);
320
0
      break;
321
0
    }
322
323
0
    nq = LIST_NEXT(q, tqe_q);
324
0
    LIST_REMOVE(q, tqe_q);
325
0
    m_freem(q->tqe_m);
326
0
    zfree(tcp_reass_zone, q);
327
0
    mp_tp->mpt_reassqlen--;
328
0
    q = nq;
329
0
  }
330
331
  /* Insert the new segment queue entry into place. */
332
0
  te->tqe_m = m;
333
0
  te->tqe_th = NULL;
334
0
  te->tqe_len = *tlenp;
335
336
0
  if (p == NULL) {
337
0
    LIST_INSERT_HEAD(&mp_tp->mpt_segq, te, tqe_q);
338
0
  } else {
339
0
    LIST_INSERT_AFTER(p, te, tqe_q);
340
0
  }
341
342
0
out:
343
0
  return mptcp_reass_present(mp_so);
344
0
}
345
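Both mptcp_reass() above and mptcp_input() below order segments with the MPTCP_SEQ_GT()/MPTCP_SEQ_LT()/MPTCP_SEQ_LEQ() macros, which compare 64-bit data sequence numbers modulo 2^64 so that wraparound is handled. A self-contained sketch of that comparison style (a hypothetical reimplementation for illustration; the kernel's definitions live in mptcp_seq.h):

#include <assert.h>
#include <stdint.h>

/* Interpret the unsigned difference as signed: a < b iff (a - b) mod 2^64
 * falls in the "negative" half of the sequence space. */
#define SEQ_LT(a, b)   ((int64_t)((a) - (b)) < 0)
#define SEQ_LEQ(a, b)  ((int64_t)((a) - (b)) <= 0)
#define SEQ_GT(a, b)   ((int64_t)((a) - (b)) > 0)

int
main(void)
{
	uint64_t near_wrap = UINT64_MAX - 5;

	/* 2 lies "after" UINT64_MAX - 5 once the sequence space wraps. */
	assert(SEQ_GT(2, near_wrap));
	assert(SEQ_LT(near_wrap, 2));
	assert(SEQ_LEQ(7, 7));
	return 0;
}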
346
/*
347
 * MPTCP input, called when data has been read from a subflow socket.
348
 */
349
void
350
mptcp_input(struct mptses *mpte, struct mbuf *m)
351
0
{
352
0
  struct socket *mp_so;
353
0
  struct mptcb *mp_tp = NULL;
354
0
  int count = 0, wakeup = 0;
355
0
  struct mbuf *save = NULL, *prev = NULL;
356
0
  struct mbuf *freelist = NULL, *tail = NULL;
357
358
0
  VERIFY(m->m_flags & M_PKTHDR);
359
360
0
  mp_so = mptetoso(mpte);
361
0
  mp_tp = mpte->mpte_mptcb;
362
363
0
  socket_lock_assert_owned(mp_so);
364
365
0
  DTRACE_MPTCP(input);
366
367
0
  mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
368
369
  /*
370
   * Each mbuf contains an MPTCP Data Sequence Map.
371
   * Process the data for reassembly, delivery to the MPTCP socket
372
   * client, etc.
373
   *
374
   */
375
0
  count = mp_so->so_rcv.sb_cc;
376
377
  /*
378
   * In the degraded fallback case, data is accepted without a DSS map.
379
   */
380
0
  if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
381
0
    struct mbuf *iter;
382
0
    int mb_dfin = 0;
383
0
fallback:
384
0
    mptcp_sbrcv_grow(mp_tp);
385
386
0
    iter = m;
387
0
    while (iter) {
388
0
      if ((iter->m_flags & M_PKTHDR) &&
389
0
          (iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)) {
390
0
        mb_dfin = 1;
391
0
      }
392
393
0
      if ((iter->m_flags & M_PKTHDR) && m_pktlen(iter) == 0) {
394
        /* Don't add zero-length packets, so skip this one! */
395
0
        if (prev == NULL) {
396
0
          m = iter->m_next;
397
0
          m_free(iter);
398
0
          iter = m;
399
0
        } else {
400
0
          prev->m_next = iter->m_next;
401
0
          m_free(iter);
402
0
          iter = prev->m_next;
403
0
        }
404
405
        /* It was a zero-length packet so next one must be a pkthdr */
406
0
        VERIFY(iter == NULL || iter->m_flags & M_PKTHDR);
407
0
      } else {
408
0
        prev = iter;
409
0
        iter = iter->m_next;
410
0
      }
411
0
    }
412
413
    /*
414
     * Assume a degraded flow, as this may be the first packet
415
     * without DSS and the subflow state is not updated yet.
416
     */
417
0
    if (sbappendstream_rcvdemux(mp_so, m)) {
418
0
      sorwakeup(mp_so);
419
0
    }
420
421
0
    DTRACE_MPTCP5(receive__degraded, struct mbuf *, m,
422
0
        struct socket *, mp_so,
423
0
        struct sockbuf *, &mp_so->so_rcv,
424
0
        struct sockbuf *, &mp_so->so_snd,
425
0
        struct mptses *, mpte);
426
0
    count = mp_so->so_rcv.sb_cc - count;
427
428
0
    mp_tp->mpt_rcvnxt += count;
429
430
0
    if (mb_dfin) {
431
0
      mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
432
0
      socantrcvmore(mp_so);
433
0
    }
434
0
    return;
435
0
  }
436
437
0
  do {
438
0
    u_int64_t mb_dsn;
439
0
    int32_t mb_datalen;
440
0
    int64_t todrop;
441
0
    int mb_dfin = 0;
442
443
0
    VERIFY(m->m_flags & M_PKTHDR);
444
445
    /* If fallback occurs, mbufs will not have PKTF_MPTCP set */
446
0
    if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
447
0
      goto fallback;
448
0
    }
449
450
0
    save = m->m_next;
451
    /*
452
     * A single TCP packet formed of multiple mbufs
453
     * holds DSS mapping in the first mbuf of the chain.
454
     * Other mbufs in the chain may have M_PKTHDR set
455
     * even though they belong to the same TCP packet
456
     * and therefore use the DSS mapping stored in the
457
     * first mbuf of the mbuf chain. mptcp_input() can
458
     * get an mbuf chain with multiple TCP packets.
459
     */
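    /*
     * Illustrative example: a chain m1(PKTF_MPTCP) -> m2 -> m3(PKTF_MPTCP)
     * carries two TCP packets.  The walk below stops at m3, prev ends up
     * pointing at m2, and the chain is cut there, so m1->m2 are processed
     * under m1's DSS mapping and m3 starts the next iteration.
     */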
460
0
    while (save && (!(save->m_flags & M_PKTHDR) ||
461
0
        !(save->m_pkthdr.pkt_flags & PKTF_MPTCP))) {
462
0
      prev = save;
463
0
      save = save->m_next;
464
0
    }
465
0
    if (prev) {
466
0
      prev->m_next = NULL;
467
0
    } else {
468
0
      m->m_next = NULL;
469
0
    }
470
471
0
    mb_dsn = m->m_pkthdr.mp_dsn;
472
0
    mb_datalen = m->m_pkthdr.mp_rlen;
473
474
0
    todrop = (mb_dsn + mb_datalen) - (mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd);
475
0
    if (todrop > 0) {
476
0
      tcpstat.tcps_mptcp_rcvpackafterwin++;
477
478
0
      os_log_info(mptcp_log_handle, "%s - %lx: dropping dsn %u dlen %u rcvnxt %u rcvwnd %u todrop %lld\n",
479
0
          __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
480
0
          (uint32_t)mb_dsn, mb_datalen, (uint32_t)mp_tp->mpt_rcvnxt,
481
0
          mp_tp->mpt_rcvwnd, todrop);
482
483
0
      if (todrop >= mb_datalen) {
484
0
        if (freelist == NULL) {
485
0
          freelist = m;
486
0
        } else {
487
0
          tail->m_next = m;
488
0
        }
489
490
0
        if (prev != NULL) {
491
0
          tail = prev;
492
0
        } else {
493
0
          tail = m;
494
0
        }
495
496
0
        m = save;
497
0
        prev = save = NULL;
498
0
        continue;
499
0
      } else {
500
0
        VERIFY(todrop <= INT_MAX);
501
0
        m_adj(m, (int)-todrop);
502
0
        mb_datalen -= todrop;
503
0
        m->m_pkthdr.mp_rlen -= todrop;
504
0
      }
505
506
      /*
507
       * We drop from the right edge of the mbuf, thus the
508
       * DATA_FIN is dropped as well
509
       */
510
0
      m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
511
0
    }
512
513
0
    if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvnxt)) {
514
0
      if (MPTCP_SEQ_LEQ((mb_dsn + mb_datalen),
515
0
          mp_tp->mpt_rcvnxt)) {
516
0
        if (freelist == NULL) {
517
0
          freelist = m;
518
0
        } else {
519
0
          tail->m_next = m;
520
0
        }
521
522
0
        if (prev != NULL) {
523
0
          tail = prev;
524
0
        } else {
525
0
          tail = m;
526
0
        }
527
528
0
        m = save;
529
0
        prev = save = NULL;
530
0
        continue;
531
0
      } else {
532
0
        VERIFY((mp_tp->mpt_rcvnxt - mb_dsn) <= INT_MAX);
533
0
        m_adj(m, (int)(mp_tp->mpt_rcvnxt - mb_dsn));
534
0
        mb_datalen -= (mp_tp->mpt_rcvnxt - mb_dsn);
535
0
        mb_dsn = mp_tp->mpt_rcvnxt;
536
0
        VERIFY(mb_datalen >= 0 && mb_datalen <= USHRT_MAX);
537
0
        m->m_pkthdr.mp_rlen = (uint16_t)mb_datalen;
538
0
        m->m_pkthdr.mp_dsn = mb_dsn;
539
0
      }
540
0
    }
541
542
0
    if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvnxt) ||
543
0
        !LIST_EMPTY(&mp_tp->mpt_segq)) {
544
0
      mb_dfin = mptcp_reass(mp_so, &m->m_pkthdr, &mb_datalen, m);
545
546
0
      goto next;
547
0
    }
548
0
    mb_dfin = !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
549
550
0
    mptcp_sbrcv_grow(mp_tp);
551
552
0
    if (sbappendstream_rcvdemux(mp_so, m)) {
553
0
      wakeup = 1;
554
0
    }
555
556
0
    DTRACE_MPTCP6(receive, struct mbuf *, m, struct socket *, mp_so,
557
0
        struct sockbuf *, &mp_so->so_rcv,
558
0
        struct sockbuf *, &mp_so->so_snd,
559
0
        struct mptses *, mpte,
560
0
        struct mptcb *, mp_tp);
561
0
    count = mp_so->so_rcv.sb_cc - count;
562
0
    tcpstat.tcps_mp_rcvtotal++;
563
0
    tcpstat.tcps_mp_rcvbytes += count;
564
565
0
    mp_tp->mpt_rcvnxt += count;
566
567
0
next:
568
0
    if (mb_dfin) {
569
0
      mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
570
0
      socantrcvmore(mp_so);
571
0
    }
572
0
    m = save;
573
0
    prev = save = NULL;
574
0
    count = mp_so->so_rcv.sb_cc;
575
0
  } while (m);
576
577
0
  if (freelist) {
578
0
    m_freem(freelist);
579
0
  }
580
581
0
  if (wakeup) {
582
0
    sorwakeup(mp_so);
583
0
  }
584
0
}
585
586
boolean_t
587
mptcp_can_send_more(struct mptcb *mp_tp, boolean_t ignore_reinject)
588
0
{
589
0
  struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
590
591
  /*
592
   * Always send if there is data in the reinject-queue.
593
   */
594
0
  if (!ignore_reinject && mp_tp->mpt_mpte->mpte_reinjectq) {
595
0
    return TRUE;
596
0
  }
597
598
  /*
599
   * Don't send if:
600
   *
601
   * 1. snd_nxt >= snd_max : means basically everything has been sent.
602
   *    Except when using TFO, we might be doing a 0-byte write.
603
   * 2. snd_una + snd_wnd <= snd_nxt: No space in the receiver's window
604
   * 3. snd_nxt + 1 == snd_max and we are closing: A DATA_FIN is scheduled.
605
   */
606
607
0
  if (!(mp_so->so_flags1 & SOF1_PRECONNECT_DATA) && MPTCP_SEQ_GEQ(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
608
0
    return FALSE;
609
0
  }
610
611
0
  if (MPTCP_SEQ_LEQ(mp_tp->mpt_snduna + mp_tp->mpt_sndwnd, mp_tp->mpt_sndnxt)) {
612
0
    return FALSE;
613
0
  }
614
615
0
  if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax && mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
616
0
    return FALSE;
617
0
  }
618
619
0
  if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2) {
620
0
    return FALSE;
621
0
  }
622
623
0
  return TRUE;
624
0
}
625
626
/*
627
 * MPTCP output.
628
 */
629
int
630
mptcp_output(struct mptses *mpte)
631
0
{
632
0
  struct mptcb *mp_tp;
633
0
  struct mptsub *mpts;
634
0
  struct mptsub *mpts_tried = NULL;
635
0
  struct socket *mp_so;
636
0
  struct mptsub *preferred_mpts = NULL;
637
0
  uint64_t old_snd_nxt;
638
0
  int error = 0;
639
640
0
  mp_so = mptetoso(mpte);
641
0
  mp_tp = mpte->mpte_mptcb;
642
643
0
  socket_lock_assert_owned(mp_so);
644
645
0
  if (mp_so->so_flags & SOF_DEFUNCT) {
646
0
    return 0;
647
0
  }
648
649
0
  VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL));
650
0
  mpte->mpte_mppcb->mpp_flags |= MPP_WUPCALL;
651
652
0
  old_snd_nxt = mp_tp->mpt_sndnxt;
653
  // nedwill: limit attempts to avoid infinite loop
654
0
  int attempts = 0;
655
0
  while (mptcp_can_send_more(mp_tp, FALSE) && attempts++ < 16) {
656
    /* get the "best" subflow to be used for transmission */
657
0
    mpts = mptcp_get_subflow(mpte, &preferred_mpts);
658
0
    if (mpts == NULL) {
659
0
      mptcplog((LOG_INFO, "%s: no subflow\n", __func__),
660
0
          MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
661
0
      break;
662
0
    }
663
664
    /* In case there's just one flow, we reattempt later */
665
0
    if (mpts_tried != NULL &&
666
0
        (mpts == mpts_tried || (mpts->mpts_flags & MPTSF_FAILINGOVER))) {
667
0
      mpts_tried->mpts_flags &= ~MPTSF_FAILINGOVER;
668
0
      mpts_tried->mpts_flags |= MPTSF_ACTIVE;
669
0
      mptcp_start_timer(mpte, MPTT_REXMT);
670
0
      break;
671
0
    }
672
673
    /*
674
     * Automatic sizing of send socket buffer. Increase the send
675
     * socket buffer size if all of the following criteria are met:
676
     *  1. the receiver has enough buffer space for this data, and
677
     *  2. the send buffer is filled to 7/8th with data (so we actually
678
     *     have data to make use of it).
679
     */
680
0
    if ((mp_so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE &&
681
0
        tcp_cansbgrow(&mp_so->so_snd)) {
682
0
      if ((mp_tp->mpt_sndwnd / 4 * 5) >= mp_so->so_snd.sb_hiwat &&
683
0
          mp_so->so_snd.sb_cc >= (mp_so->so_snd.sb_hiwat / 8 * 7)) {
684
0
        if (sbreserve(&mp_so->so_snd,
685
0
            min(mp_so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
686
0
            tcp_autosndbuf_max)) == 1) {
687
0
          mp_so->so_snd.sb_idealsize = mp_so->so_snd.sb_hiwat;
688
0
        }
689
0
      }
690
0
    }
691
692
0
    DTRACE_MPTCP3(output, struct mptses *, mpte, struct mptsub *, mpts,
693
0
        struct socket *, mp_so);
694
0
    error = mptcp_subflow_output(mpte, mpts, 0);
695
0
    if (error) {
696
      /* can be a temporary loss of source address or other error */
697
0
      mpts->mpts_flags |= MPTSF_FAILINGOVER;
698
0
      mpts->mpts_flags &= ~MPTSF_ACTIVE;
699
0
      mpts_tried = mpts;
700
0
      if (error != ECANCELED) {
701
0
        os_log_error(mptcp_log_handle, "%s - %lx: Error = %d mpts_flags %#x\n",
702
0
            __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
703
0
            error, mpts->mpts_flags);
704
0
      }
705
0
      break;
706
0
    }
707
    /* The model is to have only one active flow at a time */
708
0
    mpts->mpts_flags |= MPTSF_ACTIVE;
709
0
    mpts->mpts_probesoon = mpts->mpts_probecnt = 0;
710
711
    /* Allows us to update the smoothed rtt */
712
0
    if (mptcp_probeto && mpts != preferred_mpts && preferred_mpts != NULL) {
713
0
      if (preferred_mpts->mpts_probesoon) {
714
0
        if ((tcp_now - preferred_mpts->mpts_probesoon) > mptcp_probeto) {
715
0
          mptcp_subflow_output(mpte, preferred_mpts, MPTCP_SUBOUT_PROBING);
716
0
          if (preferred_mpts->mpts_probecnt >= mptcp_probecnt) {
717
0
            preferred_mpts->mpts_probesoon = 0;
718
0
            preferred_mpts->mpts_probecnt = 0;
719
0
          }
720
0
        }
721
0
      } else {
722
0
        preferred_mpts->mpts_probesoon = tcp_now;
723
0
        preferred_mpts->mpts_probecnt = 0;
724
0
      }
725
0
    }
726
727
0
    if (mpte->mpte_active_sub == NULL) {
728
0
      mpte->mpte_active_sub = mpts;
729
0
    } else if (mpte->mpte_active_sub != mpts) {
730
0
      mpte->mpte_active_sub->mpts_flags &= ~MPTSF_ACTIVE;
731
0
      mpte->mpte_active_sub = mpts;
732
733
0
      mptcpstats_inc_switch(mpte, mpts);
734
0
    }
735
0
  }
736
737
0
  if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
738
0
    if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax &&
739
0
        mp_tp->mpt_snduna == mp_tp->mpt_sndnxt) {
740
0
      mptcp_finish_usrclosed(mpte);
741
0
    }
742
0
  }
743
744
0
  mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_WUPCALL);
745
746
  /* subflow errors should not be percolated back up */
747
0
  return 0;
748
0
}
749
750
751
static struct mptsub *
752
mptcp_choose_subflow(struct mptsub *mpts, struct mptsub *curbest, int *currtt)
753
0
{
754
0
  struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
755
756
  /*
757
   * Lower RTT? Take it if it's our first one, or if
758
   * it doesn't have any loss, or if the current one has
759
   * loss as well.
760
   */
761
0
  if (tp->t_srtt && *currtt > tp->t_srtt &&
762
0
      (curbest == NULL || tp->t_rxtshift == 0 ||
763
0
      sototcpcb(curbest->mpts_socket)->t_rxtshift)) {
764
0
    *currtt = tp->t_srtt;
765
0
    return mpts;
766
0
  }
767
768
  /*
769
   * If we find a subflow without loss, take it always!
770
   */
771
0
  if (curbest &&
772
0
      sototcpcb(curbest->mpts_socket)->t_rxtshift &&
773
0
      tp->t_rxtshift == 0) {
774
0
    *currtt = tp->t_srtt;
775
0
    return mpts;
776
0
  }
777
778
0
  return curbest != NULL ? curbest : mpts;
779
0
}
780
781
static struct mptsub *
782
mptcp_return_subflow(struct mptsub *mpts)
783
0
{
784
0
  if (mpts && mptcp_subflow_cwnd_space(mpts->mpts_socket) <= 0) {
785
0
    return NULL;
786
0
  }
787
788
0
  return mpts;
789
0
}
790
791
static boolean_t
792
mptcp_subflow_is_slow(struct mptses *mpte, struct mptsub *mpts)
793
0
{
794
0
  struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
795
0
  int fail_thresh = mptcp_fail_thresh;
796
797
0
  if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
798
0
    fail_thresh *= 2;
799
0
  }
800
801
0
  return tp->t_rxtshift >= fail_thresh &&
802
0
         (mptetoso(mpte)->so_snd.sb_cc || mpte->mpte_reinjectq);
803
0
}
804
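With the default mptcp_fail_thresh of 1, a subflow is considered slow after a single retransmission timeout (t_rxtshift >= 1) provided there is still data to send; in the handover service types the threshold is doubled, so two timeouts are required.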
805
/*
806
 * Return the most eligible subflow to be used for sending data.
807
 */
808
struct mptsub *
809
mptcp_get_subflow(struct mptses *mpte, struct mptsub **preferred)
810
0
{
811
0
  struct tcpcb *besttp, *secondtp;
812
0
  struct inpcb *bestinp, *secondinp;
813
0
  struct mptsub *mpts;
814
0
  struct mptsub *best = NULL;
815
0
  struct mptsub *second_best = NULL;
816
0
  int exp_rtt = INT_MAX, cheap_rtt = INT_MAX;
817
818
  /*
819
   * First Step:
820
   * Choose the best subflow for cellular and non-cellular interfaces.
821
   */
822
823
0
  TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
824
0
    struct socket *so = mpts->mpts_socket;
825
0
    struct tcpcb *tp = sototcpcb(so);
826
0
    struct inpcb *inp = sotoinpcb(so);
827
828
0
    mptcplog((LOG_DEBUG, "%s mpts %u mpts_flags %#x, suspended %u sostate %#x tpstate %u cellular %d rtt %u rxtshift %u cheap %u exp %u cwnd %d\n",
829
0
        __func__, mpts->mpts_connid, mpts->mpts_flags,
830
0
        INP_WAIT_FOR_IF_FEEDBACK(inp), so->so_state, tp->t_state,
831
0
        inp->inp_last_outifp ? IFNET_IS_CELLULAR(inp->inp_last_outifp) : -1,
832
0
        tp->t_srtt, tp->t_rxtshift, cheap_rtt, exp_rtt,
833
0
        mptcp_subflow_cwnd_space(so)),
834
0
        MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
835
836
    /*
837
     * First, the hard conditions to reject subflows
838
     * (e.g., not connected,...)
839
     */
840
0
    if (inp->inp_last_outifp == NULL) {
841
0
      continue;
842
0
    }
843
844
0
    if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
845
0
      continue;
846
0
    }
847
848
    /* There can only be one subflow in degraded state */
849
0
    if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
850
0
      best = mpts;
851
0
      break;
852
0
    }
853
854
    /*
855
     * If this subflow is waiting to finally send, do it!
856
     */
857
0
    if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
858
0
      return mptcp_return_subflow(mpts);
859
0
    }
860
861
    /*
862
     * Only send if the subflow is MP_CAPABLE. The exceptions to
863
     * this rule (degraded or TFO) have been taken care of above.
864
     */
865
0
    if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE)) {
866
0
      continue;
867
0
    }
868
869
0
    if ((so->so_state & SS_ISDISCONNECTED) ||
870
0
        !(so->so_state & SS_ISCONNECTED) ||
871
0
        !TCPS_HAVEESTABLISHED(tp->t_state) ||
872
0
        tp->t_state > TCPS_CLOSE_WAIT) {
873
0
      continue;
874
0
    }
875
876
    /*
877
     * Second, the soft conditions to find the subflow with best
878
     * conditions for each set (aka cellular vs non-cellular)
879
     */
880
0
    if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
881
0
      second_best = mptcp_choose_subflow(mpts, second_best,
882
0
          &exp_rtt);
883
0
    } else {
884
0
      best = mptcp_choose_subflow(mpts, best, &cheap_rtt);
885
0
    }
886
0
  }
887
888
  /*
889
   * If there is no preferred or backup subflow, and there is no active
890
   * subflow, use the last usable subflow.
891
   */
892
0
  if (best == NULL) {
893
0
    return mptcp_return_subflow(second_best);
894
0
  }
895
896
0
  if (second_best == NULL) {
897
0
    return mptcp_return_subflow(best);
898
0
  }
899
900
0
  besttp = sototcpcb(best->mpts_socket);
901
0
  bestinp = sotoinpcb(best->mpts_socket);
902
0
  secondtp = sototcpcb(second_best->mpts_socket);
903
0
  secondinp = sotoinpcb(second_best->mpts_socket);
904
905
0
  if (preferred != NULL) {
906
0
    *preferred = mptcp_return_subflow(best);
907
0
  }
908
909
  /*
910
   * Second Step: Among best and second_best. Choose the one that is
911
   * most appropriate for this particular service-type.
912
   */
913
0
  if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
914
0
    return mptcp_return_subflow(best);
915
0
  } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
916
    /*
917
     * Only handover if Symptoms tells us to do so.
918
     */
919
0
    if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
920
0
        mptcp_is_wifi_unusable_for_session(mpte) != 0 && mptcp_subflow_is_slow(mpte, best)) {
921
0
      return mptcp_return_subflow(second_best);
922
0
    }
923
924
0
    return mptcp_return_subflow(best);
925
0
  } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_INTERACTIVE) {
926
0
    int rtt_thresh = mptcp_rtthist_rtthresh << TCP_RTT_SHIFT;
927
0
    int rto_thresh = mptcp_rtothresh;
928
929
    /* Adjust with symptoms information */
930
0
    if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
931
0
        mptcp_is_wifi_unusable_for_session(mpte) != 0) {
932
0
      rtt_thresh /= 2;
933
0
      rto_thresh /= 2;
934
0
    }
935
936
0
    if (besttp->t_srtt && secondtp->t_srtt &&
937
0
        besttp->t_srtt >= rtt_thresh &&
938
0
        secondtp->t_srtt < rtt_thresh) {
939
0
      tcpstat.tcps_mp_sel_rtt++;
940
0
      mptcplog((LOG_DEBUG, "%s: best cid %d at rtt %d,  second cid %d at rtt %d\n", __func__,
941
0
          best->mpts_connid, besttp->t_srtt >> TCP_RTT_SHIFT,
942
0
          second_best->mpts_connid,
943
0
          secondtp->t_srtt >> TCP_RTT_SHIFT),
944
0
          MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
945
0
      return mptcp_return_subflow(second_best);
946
0
    }
947
948
0
    if (mptcp_subflow_is_slow(mpte, best) &&
949
0
        secondtp->t_rxtshift == 0) {
950
0
      return mptcp_return_subflow(second_best);
951
0
    }
952
953
    /* Compare RTOs, select second_best if best's rto exceeds rtothresh */
954
0
    if (besttp->t_rxtcur && secondtp->t_rxtcur &&
955
0
        besttp->t_rxtcur >= rto_thresh &&
956
0
        secondtp->t_rxtcur < rto_thresh) {
957
0
      tcpstat.tcps_mp_sel_rto++;
958
0
      mptcplog((LOG_DEBUG, "%s: best cid %d at rto %d, second cid %d at rto %d\n", __func__,
959
0
          best->mpts_connid, besttp->t_rxtcur,
960
0
          second_best->mpts_connid, secondtp->t_rxtcur),
961
0
          MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
962
963
0
      return mptcp_return_subflow(second_best);
964
0
    }
965
966
    /*
967
     * None of the above conditions for sending on the secondary
968
     * were true. So, let's schedule on the best one, if it still
969
     * has some space in the congestion window.
970
     */
971
0
    return mptcp_return_subflow(best);
972
0
  } else if (mpte->mpte_svctype >= MPTCP_SVCTYPE_AGGREGATE) {
973
0
    struct mptsub *tmp;
974
975
    /*
976
     * We only care about RTT when aggregating
977
     */
978
0
    if (besttp->t_srtt > secondtp->t_srtt) {
979
0
      tmp = best;
980
0
      best = second_best;
981
0
      besttp = secondtp;
982
0
      bestinp = secondinp;
983
984
0
      second_best = tmp;
985
0
      secondtp = sototcpcb(second_best->mpts_socket);
986
0
      secondinp = sotoinpcb(second_best->mpts_socket);
987
0
    }
988
989
    /* Is there still space in the congestion window? */
990
0
    if (mptcp_subflow_cwnd_space(bestinp->inp_socket) <= 0) {
991
0
      return mptcp_return_subflow(second_best);
992
0
    }
993
994
0
    return mptcp_return_subflow(best);
995
0
  } else {
996
0
    panic("Unknown service-type configured for MPTCP");
997
0
  }
998
999
0
  return NULL;
1000
0
}
1001
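To make the two-step selection concrete: in MPTCP_SVCTYPE_HANDOVER mode with a Wi-Fi subflow as best and a cellular one as second_best, traffic stays on Wi-Fi until Symptoms reports Wi-Fi as unusable for the session and the Wi-Fi subflow is also measured to be slow by mptcp_subflow_is_slow(); only then is the cellular subflow returned, and even then only if it still has congestion-window space (mptcp_return_subflow()).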
1002
static const char *
1003
mptcp_event_to_str(uint32_t event)
1004
0
{
1005
0
  const char *c = "UNDEFINED";
1006
0
  switch (event) {
1007
0
  case MPCE_CLOSE:
1008
0
    c = "MPCE_CLOSE";
1009
0
    break;
1010
0
  case MPCE_RECV_DATA_ACK:
1011
0
    c = "MPCE_RECV_DATA_ACK";
1012
0
    break;
1013
0
  case MPCE_RECV_DATA_FIN:
1014
0
    c = "MPCE_RECV_DATA_FIN";
1015
0
    break;
1016
0
  }
1017
0
  return c;
1018
0
}
1019
1020
static const char *
1021
mptcp_state_to_str(mptcp_state_t state)
1022
0
{
1023
0
  const char *c = "UNDEFINED";
1024
0
  switch (state) {
1025
0
  case MPTCPS_CLOSED:
1026
0
    c = "MPTCPS_CLOSED";
1027
0
    break;
1028
0
  case MPTCPS_LISTEN:
1029
0
    c = "MPTCPS_LISTEN";
1030
0
    break;
1031
0
  case MPTCPS_ESTABLISHED:
1032
0
    c = "MPTCPS_ESTABLISHED";
1033
0
    break;
1034
0
  case MPTCPS_CLOSE_WAIT:
1035
0
    c = "MPTCPS_CLOSE_WAIT";
1036
0
    break;
1037
0
  case MPTCPS_FIN_WAIT_1:
1038
0
    c = "MPTCPS_FIN_WAIT_1";
1039
0
    break;
1040
0
  case MPTCPS_CLOSING:
1041
0
    c = "MPTCPS_CLOSING";
1042
0
    break;
1043
0
  case MPTCPS_LAST_ACK:
1044
0
    c = "MPTCPS_LAST_ACK";
1045
0
    break;
1046
0
  case MPTCPS_FIN_WAIT_2:
1047
0
    c = "MPTCPS_FIN_WAIT_2";
1048
0
    break;
1049
0
  case MPTCPS_TIME_WAIT:
1050
0
    c = "MPTCPS_TIME_WAIT";
1051
0
    break;
1052
0
  case MPTCPS_TERMINATE:
1053
0
    c = "MPTCPS_TERMINATE";
1054
0
    break;
1055
0
  }
1056
0
  return c;
1057
0
}
1058
1059
void
1060
mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event)
1061
0
{
1062
0
  struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);
1063
1064
0
  socket_lock_assert_owned(mp_so);
1065
1066
0
  mptcp_state_t old_state = mp_tp->mpt_state;
1067
1068
0
  DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
1069
0
      uint32_t, event);
1070
1071
0
  switch (mp_tp->mpt_state) {
1072
0
  case MPTCPS_CLOSED:
1073
0
  case MPTCPS_LISTEN:
1074
0
    mp_tp->mpt_state = MPTCPS_TERMINATE;
1075
0
    break;
1076
1077
0
  case MPTCPS_ESTABLISHED:
1078
0
    if (event == MPCE_CLOSE) {
1079
0
      mp_tp->mpt_state = MPTCPS_FIN_WAIT_1;
1080
0
      mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
1081
0
    } else if (event == MPCE_RECV_DATA_FIN) {
1082
0
      mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1083
0
      mp_tp->mpt_state = MPTCPS_CLOSE_WAIT;
1084
0
    }
1085
0
    break;
1086
1087
0
  case MPTCPS_CLOSE_WAIT:
1088
0
    if (event == MPCE_CLOSE) {
1089
0
      mp_tp->mpt_state = MPTCPS_LAST_ACK;
1090
0
      mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
1091
0
    }
1092
0
    break;
1093
1094
0
  case MPTCPS_FIN_WAIT_1:
1095
0
    if (event == MPCE_RECV_DATA_ACK) {
1096
0
      mp_tp->mpt_state = MPTCPS_FIN_WAIT_2;
1097
0
    } else if (event == MPCE_RECV_DATA_FIN) {
1098
0
      mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1099
0
      mp_tp->mpt_state = MPTCPS_CLOSING;
1100
0
    }
1101
0
    break;
1102
1103
0
  case MPTCPS_CLOSING:
1104
0
    if (event == MPCE_RECV_DATA_ACK) {
1105
0
      mp_tp->mpt_state = MPTCPS_TIME_WAIT;
1106
0
    }
1107
0
    break;
1108
1109
0
  case MPTCPS_LAST_ACK:
1110
0
    if (event == MPCE_RECV_DATA_ACK) {
1111
0
      mptcp_close(mp_tp->mpt_mpte, mp_tp);
1112
0
    }
1113
0
    break;
1114
1115
0
  case MPTCPS_FIN_WAIT_2:
1116
0
    if (event == MPCE_RECV_DATA_FIN) {
1117
0
      mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
1118
0
      mp_tp->mpt_state = MPTCPS_TIME_WAIT;
1119
0
    }
1120
0
    break;
1121
1122
0
  case MPTCPS_TIME_WAIT:
1123
0
  case MPTCPS_TERMINATE:
1124
0
    break;
1125
1126
0
  default:
1127
0
    VERIFY(0);
1128
    /* NOTREACHED */
1129
0
  }
1130
0
  DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
1131
0
      uint32_t, event);
1132
0
  mptcplog((LOG_INFO, "%s: %s to %s on event %s\n", __func__,
1133
0
      mptcp_state_to_str(old_state),
1134
0
      mptcp_state_to_str(mp_tp->mpt_state),
1135
0
      mptcp_event_to_str(event)),
1136
0
      MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
1137
0
}
1138
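Taken together, the switch above is the connection-level close state machine. For example, a local close whose DATA_FIN is acknowledged and then answered by the peer's DATA_FIN walks MPTCPS_ESTABLISHED -> MPTCPS_FIN_WAIT_1 (on MPCE_CLOSE, mpt_sndmax += 1) -> MPTCPS_FIN_WAIT_2 (on MPCE_RECV_DATA_ACK) -> MPTCPS_TIME_WAIT (on MPCE_RECV_DATA_FIN, mpt_rcvnxt += 1).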
1139
/* If you change this function, match up mptcp_update_rcv_state_f */
1140
void
1141
mptcp_update_dss_rcv_state(struct mptcp_dsn_opt *dss_info, struct tcpcb *tp,
1142
    uint16_t csum)
1143
0
{
1144
0
  struct mptcb *mp_tp = tptomptp(tp);
1145
0
  u_int64_t full_dsn = 0;
1146
1147
0
  NTOHL(dss_info->mdss_dsn);
1148
0
  NTOHL(dss_info->mdss_subflow_seqn);
1149
0
  NTOHS(dss_info->mdss_data_len);
1150
1151
  /* XXX for autosndbuf grow sb here */
1152
0
  MPTCP_EXTEND_DSN(mp_tp->mpt_rcvnxt, dss_info->mdss_dsn, full_dsn);
1153
0
  mptcp_update_rcv_state_meat(mp_tp, tp,
1154
0
      full_dsn, dss_info->mdss_subflow_seqn, dss_info->mdss_data_len,
1155
0
      csum);
1156
0
}
1157
1158
void
1159
mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp,
1160
    u_int64_t full_dsn, u_int32_t seqn, u_int16_t mdss_data_len,
1161
    uint16_t csum)
1162
0
{
1163
0
  if (mdss_data_len == 0) {
1164
0
    os_log_error(mptcp_log_handle, "%s - %lx: Infinite Mapping.\n",
1165
0
        __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte));
1166
1167
0
    if ((mp_tp->mpt_flags & MPTCPF_CHECKSUM) && (csum != 0)) {
1168
0
      os_log_error(mptcp_log_handle, "%s - %lx: Bad checksum %x \n",
1169
0
          __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte), csum);
1170
0
    }
1171
0
    mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
1172
0
    return;
1173
0
  }
1174
1175
0
  mptcp_notify_mpready(tp->t_inpcb->inp_socket);
1176
1177
0
  tp->t_rcv_map.mpt_dsn = full_dsn;
1178
0
  tp->t_rcv_map.mpt_sseq = seqn;
1179
0
  tp->t_rcv_map.mpt_len = mdss_data_len;
1180
0
  tp->t_rcv_map.mpt_csum = csum;
1181
0
  tp->t_mpflags |= TMPF_EMBED_DSN;
1182
0
}
1183
1184
1185
static int
1186
mptcp_validate_dss_map(struct socket *so, struct tcpcb *tp, struct mbuf *m,
1187
    int hdrlen)
1188
0
{
1189
0
  u_int32_t datalen;
1190
1191
0
  if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1192
0
    return 0;
1193
0
  }
1194
1195
0
  datalen = m->m_pkthdr.mp_rlen;
1196
1197
  /* unacceptable DSS option, fallback to TCP */
1198
0
  if (m->m_pkthdr.len > ((int) datalen + hdrlen)) {
1199
0
    os_log_error(mptcp_log_handle, "%s - %lx: mbuf len %d, MPTCP expected %d",
1200
0
        __func__, (unsigned long)VM_KERNEL_ADDRPERM(tptomptp(tp)->mpt_mpte), m->m_pkthdr.len, datalen);
1201
0
  } else {
1202
0
    return 0;
1203
0
  }
1204
0
  tp->t_mpflags |= TMPF_SND_MPFAIL;
1205
0
  mptcp_notify_mpfail(so);
1206
0
  m_freem(m);
1207
0
  return -1;
1208
0
}
1209
1210
int
1211
mptcp_input_preproc(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
1212
    int drop_hdrlen)
1213
0
{
1214
0
  mptcp_insert_rmap(tp, m, th);
1215
0
  if (mptcp_validate_dss_map(tp->t_inpcb->inp_socket, tp, m,
1216
0
      drop_hdrlen) != 0) {
1217
0
    return -1;
1218
0
  }
1219
0
  return 0;
1220
0
}
1221
1222
static uint16_t
1223
mptcp_input_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn, uint32_t sseq,
1224
    uint16_t dlen, uint16_t csum, int dfin)
1225
0
{
1226
0
  struct mptcb *mp_tp = tptomptp(tp);
1227
0
  int real_len = dlen - dfin;
1228
0
  uint32_t sum = 0;
1229
1230
0
  VERIFY(real_len >= 0);
1231
1232
0
  if (mp_tp == NULL) {
1233
0
    return 0;
1234
0
  }
1235
1236
0
  if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
1237
0
    return 0;
1238
0
  }
1239
1240
0
  if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
1241
0
    return 0;
1242
0
  }
1243
1244
  /*
1245
   * The remote side may send a packet with fewer bytes than the
1246
   * claimed DSS checksum length.
1247
   */
1248
0
  if ((int)m_length2(m, NULL) < real_len) {
1249
0
    return 0xffff;
1250
0
  }
1251
1252
0
  if (real_len != 0) {
1253
0
    sum = m_sum16(m, 0, real_len);
1254
0
  }
1255
1256
0
  sum += in_pseudo64(htonll(dsn), htonl(sseq), htons(dlen) + csum);
1257
0
  ADDCARRY(sum);
1258
1259
0
  DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
1260
0
      uint32_t, sum);
1261
1262
0
  return ~sum & 0xffff;
1263
0
}
1264
1265
/*
1266
 * MPTCP Checksum support
1267
 * The checksum is calculated whenever the MPTCP DSS option is included
1268
 * in the TCP packet. The checksum includes the sum of the MPTCP pseudo
1269
 * header and the actual data indicated by the length specified in the
1270
 * DSS option.
1271
 */
1272
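As a standalone illustration of the computation described above (and implemented by mptcp_input_csum()/mptcp_output_csum() below), the following userland sketch folds the data and the DSN pseudo header with one's-complement arithmetic. It is a portable reimplementation under the stated layout assumptions (DSN 8 bytes, subflow sequence number 4 bytes, length 2 bytes, all big-endian), not the kernel's mbuf-based code, and is not guaranteed bit-identical to it:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* One's-complement sum over a byte buffer, big-endian 16-bit words
 * (the same style of accumulation m_sum16() performs over an mbuf). */
static uint32_t
csum_bytes(const uint8_t *p, size_t len, uint32_t sum)
{
	while (len > 1) {
		sum += (uint32_t)((p[0] << 8) | p[1]);
		p += 2;
		len -= 2;
	}
	if (len)
		sum += (uint32_t)(p[0] << 8);
	return sum;
}

static uint16_t
dss_csum(const uint8_t *data, uint16_t dlen, uint64_t dsn, uint32_t sseq)
{
	uint8_t ph[14]; /* pseudo header: DSN(8) | subflow seq(4) | length(2) */
	uint32_t sum;
	int i;

	for (i = 0; i < 8; i++)
		ph[i] = (uint8_t)(dsn >> (56 - 8 * i));
	for (i = 0; i < 4; i++)
		ph[8 + i] = (uint8_t)(sseq >> (24 - 8 * i));
	ph[12] = (uint8_t)(dlen >> 8);
	ph[13] = (uint8_t)dlen;

	sum = csum_bytes(data, dlen, 0);
	sum = csum_bytes(ph, sizeof(ph), sum);
	while (sum >> 16)	/* fold carries, like ADDCARRY() */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)(~sum & 0xffff);
}

int
main(void)
{
	const uint8_t payload[5] = { 'h', 'e', 'l', 'l', 'o' };

	printf("dss csum = 0x%04x\n", dss_csum(payload, 5, 1ULL, 1U));
	return 0;
}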
1273
int
1274
mptcp_validate_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn,
1275
    uint32_t sseq, uint16_t dlen, uint16_t csum, int dfin)
1276
0
{
1277
0
  uint16_t mptcp_csum;
1278
1279
0
  mptcp_csum = mptcp_input_csum(tp, m, dsn, sseq, dlen, csum, dfin);
1280
0
  if (mptcp_csum) {
1281
0
    tp->t_mpflags |= TMPF_SND_MPFAIL;
1282
0
    mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
1283
0
    m_freem(m);
1284
0
    tcpstat.tcps_mp_badcsum++;
1285
0
    return -1;
1286
0
  }
1287
0
  return 0;
1288
0
}
1289
1290
uint16_t
1291
mptcp_output_csum(struct mbuf *m, uint64_t dss_val, uint32_t sseq, uint16_t dlen)
1292
0
{
1293
0
  uint32_t sum = 0;
1294
1295
0
  if (dlen) {
1296
0
    sum = m_sum16(m, 0, dlen);
1297
0
  }
1298
1299
0
  dss_val = mptcp_hton64(dss_val);
1300
0
  sseq = htonl(sseq);
1301
0
  dlen = htons(dlen);
1302
0
  sum += in_pseudo64(dss_val, sseq, dlen);
1303
1304
0
  ADDCARRY(sum);
1305
0
  sum = ~sum & 0xffff;
1306
0
  DTRACE_MPTCP2(checksum__result, struct mbuf *, m, uint32_t, sum);
1307
0
  mptcplog((LOG_DEBUG, "%s: sum = %x \n", __func__, sum),
1308
0
      MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
1309
1310
0
  return (uint16_t)sum;
1311
0
}
1312
1313
/*
1314
 * When the WiFi signal starts fading, there's more loss and RTT spikes.
1315
 * Check if there has been a large spike by comparing against
1316
 * a tolerable RTT spike threshold.
1317
 */
1318
boolean_t
1319
mptcp_no_rto_spike(struct socket *so)
1320
0
{
1321
0
  struct tcpcb *tp = intotcpcb(sotoinpcb(so));
1322
0
  int32_t spike = 0;
1323
1324
0
  if (tp->t_rxtcur > mptcp_rtothresh) {
1325
0
    spike = tp->t_rxtcur - mptcp_rtothresh;
1326
1327
0
    mptcplog((LOG_DEBUG, "%s: spike = %d rto = %d best = %d cur = %d\n",
1328
0
        __func__, spike,
1329
0
        tp->t_rxtcur, tp->t_rttbest >> TCP_RTT_SHIFT,
1330
0
        tp->t_rttcur),
1331
0
        (MPTCP_SOCKET_DBG | MPTCP_SENDER_DBG), MPTCP_LOGLVL_LOG);
1332
0
  }
1333
1334
0
  if (spike > 0) {
1335
0
    return FALSE;
1336
0
  } else {
1337
0
    return TRUE;
1338
0
  }
1339
0
}
1340
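For instance, with the default mptcp_rtothresh of 1500 (in the same units as t_rxtcur, milliseconds on this codebase), a subflow whose current RTO has grown to 2000 yields spike = 500 and mptcp_no_rto_spike() returns FALSE, signalling that a spike is present.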
1341
void
1342
mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag)
1343
0
{
1344
0
  VERIFY(mpp->mpp_flags & flag);
1345
0
  mpp->mpp_flags &= ~flag;
1346
1347
0
  if (mptcp_should_defer_upcall(mpp)) {
1348
0
    return;
1349
0
  }
1350
1351
0
  if (mpp->mpp_flags & MPP_SHOULD_WORKLOOP) {
1352
0
    mpp->mpp_flags &= ~MPP_SHOULD_WORKLOOP;
1353
1354
0
    mptcp_subflow_workloop(mpp->mpp_pcbe);
1355
0
  }
1356
1357
0
  if (mpp->mpp_flags & MPP_SHOULD_RWAKEUP) {
1358
0
    mpp->mpp_flags &= ~MPP_SHOULD_RWAKEUP;
1359
1360
0
    sorwakeup(mpp->mpp_socket);
1361
0
  }
1362
1363
0
  if (mpp->mpp_flags & MPP_SHOULD_WWAKEUP) {
1364
0
    mpp->mpp_flags &= ~MPP_SHOULD_WWAKEUP;
1365
1366
0
    sowwakeup(mpp->mpp_socket);
1367
0
  }
1368
0
}
1369
1370
static void
1371
mptcp_reset_itfinfo(struct mpt_itf_info *info)
1372
0
{
1373
0
  memset(info, 0, sizeof(*info));
1374
0
}
1375
1376
void
1377
mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index,
1378
    uint32_t necp_flags, __unused bool *viable)
1379
0
{
1380
0
  boolean_t has_v4 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4);
1381
0
  boolean_t has_v6 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV6);
1382
0
  boolean_t has_nat64 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_NAT64);
1383
0
  boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
1384
0
  struct mppcb *mp = (struct mppcb *)handle;
1385
0
  struct mptses *mpte = mptompte(mp);
1386
0
  struct socket *mp_so;
1387
0
  struct mptcb *mp_tp;
1388
0
  uint32_t i, ifindex;
1389
0
  struct ifnet *ifp;
1390
0
  int locked = 0;
1391
1392
0
  ifindex = interface_index;
1393
0
  VERIFY(ifindex != IFSCOPE_NONE);
1394
1395
  /* About to be garbage-collected (see note about MPTCP/NECP interactions) */
1396
0
  if (mp->mpp_socket->so_usecount == 0) {
1397
0
    return;
1398
0
  }
1399
1400
0
  mp_so = mptetoso(mpte);
1401
1402
0
  if (action != NECP_CLIENT_CBACTION_INITIAL) {
1403
0
    socket_lock(mp_so, 1);
1404
0
    locked = 1;
1405
1406
    /* Check again, because it might have changed while waiting */
1407
0
    if (mp->mpp_socket->so_usecount == 0) {
1408
0
      goto out;
1409
0
    }
1410
0
  }
1411
1412
0
  socket_lock_assert_owned(mp_so);
1413
1414
0
  mp_tp = mpte->mpte_mptcb;
1415
1416
0
  ifnet_head_lock_shared();
1417
0
  ifp = ifindex2ifnet[ifindex];
1418
0
  ifnet_head_done();
1419
1420
0
  os_log(mptcp_log_handle, "%s - %lx: action: %u ifindex %u delegated to %u usecount %u mpt_flags %#x state %u v4 %u v6 %u nat64 %u power %u\n",
1421
0
      __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), action, ifindex,
1422
0
      ifp && ifp->if_delegated.ifp ? ifp->if_delegated.ifp->if_index : IFSCOPE_NONE,
1423
0
      mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state,
1424
0
      has_v4, has_v6, has_nat64, low_power);
1425
1426
  /* No need on sockets that have fallen back to TCP */
1427
0
  if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
1428
0
    goto out;
1429
0
  }
1430
1431
  /*
1432
   * When the interface goes in low-power mode we don't want to establish
1433
   * new subflows on it. Thus, mark it internally as non-viable.
1434
   */
1435
0
  if (low_power) {
1436
0
    action = NECP_CLIENT_CBACTION_NONVIABLE;
1437
0
  }
1438
1439
0
  if (action == NECP_CLIENT_CBACTION_NONVIABLE) {
1440
0
    for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1441
0
      if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
1442
0
        continue;
1443
0
      }
1444
1445
0
      if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
1446
0
        mptcp_reset_itfinfo(&mpte->mpte_itfinfo[i]);
1447
0
      }
1448
0
    }
1449
1450
0
    mptcp_sched_create_subflows(mpte);
1451
0
  } else if (action == NECP_CLIENT_CBACTION_VIABLE ||
1452
0
      action == NECP_CLIENT_CBACTION_INITIAL) {
1453
0
    int found_slot = 0, slot_index = -1;
1454
0
    struct sockaddr *dst;
1455
1456
0
    if (ifp == NULL) {
1457
0
      goto out;
1458
0
    }
1459
1460
0
    if (IFNET_IS_COMPANION_LINK(ifp)) {
1461
0
      goto out;
1462
0
    }
1463
1464
0
    if (IFNET_IS_EXPENSIVE(ifp) &&
1465
0
        (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
1466
0
      goto out;
1467
0
    }
1468
1469
0
    if (IFNET_IS_CONSTRAINED(ifp) &&
1470
0
        (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
1471
0
      goto out;
1472
0
    }
1473
1474
0
    if (IFNET_IS_CELLULAR(ifp) &&
1475
0
        (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
1476
0
      goto out;
1477
0
    }
1478
1479
0
    if (IS_INTF_CLAT46(ifp)) {
1480
0
      has_v4 = FALSE;
1481
0
    }
1482
1483
    /* Look for the slot in which to store/update the interface-info. */
1484
0
    for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1485
      /* Found a potential empty slot where we can put it */
1486
0
      if (mpte->mpte_itfinfo[i].ifindex == 0) {
1487
0
        found_slot = 1;
1488
0
        slot_index = i;
1489
0
      }
1490
1491
      /*
1492
       * The interface is already in our array. Check if we
1493
       * need to update it.
1494
       */
1495
0
      if (mpte->mpte_itfinfo[i].ifindex == ifindex &&
1496
0
          (mpte->mpte_itfinfo[i].has_v4_conn != has_v4 ||
1497
0
          mpte->mpte_itfinfo[i].has_v6_conn != has_v6 ||
1498
0
          mpte->mpte_itfinfo[i].has_nat64_conn != has_nat64)) {
1499
0
        found_slot = 1;
1500
0
        slot_index = i;
1501
0
        break;
1502
0
      }
1503
1504
0
      if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
1505
        /*
1506
         * Ok, it's already there and we don't need
1507
         * to update it
1508
         */
1509
0
        goto out;
1510
0
      }
1511
0
    }
1512
1513
0
    dst = mptcp_get_session_dst(mpte, has_v6, has_v4);
1514
0
    if (dst && dst->sa_family == AF_INET &&
1515
0
        has_v6 && !has_nat64 && !has_v4) {
1516
0
      if (found_slot) {
1517
0
        mpte->mpte_itfinfo[slot_index].ifindex = ifindex;
1518
0
        mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
1519
0
        mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
1520
0
        mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
1521
0
      }
1522
0
      goto out;
1523
0
    }
1524
1525
0
    if (found_slot == 0) {
1526
0
      int new_size = mpte->mpte_itfinfo_size * 2;
1527
0
      struct mpt_itf_info *info = _MALLOC(sizeof(*info) * new_size, M_TEMP, M_ZERO);
1528
1529
0
      if (info == NULL) {
1530
0
        os_log_error(mptcp_log_handle, "%s - %lx: malloc failed for %u\n",
1531
0
            __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), new_size);
1532
0
        goto out;
1533
0
      }
1534
1535
0
      memcpy(info, mpte->mpte_itfinfo, mpte->mpte_itfinfo_size * sizeof(*info));
1536
1537
0
      if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
1538
0
        _FREE(mpte->mpte_itfinfo, M_TEMP);
1539
0
      }
1540
1541
      /* We allocated a new, larger array, thus the first new slot must be empty */
1542
0
      slot_index = mpte->mpte_itfinfo_size;
1543
1544
0
      mpte->mpte_itfinfo = info;
1545
0
      mpte->mpte_itfinfo_size = new_size;
1546
0
    }
1547
1548
0
    VERIFY(slot_index >= 0 && slot_index < (int)mpte->mpte_itfinfo_size);
1549
0
    mpte->mpte_itfinfo[slot_index].ifindex = ifindex;
1550
0
    mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
1551
0
    mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
1552
0
    mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
1553
1554
0
    mptcp_sched_create_subflows(mpte);
1555
0
  }
1556
1557
0
out:
1558
0
  if (locked) {
1559
0
    socket_unlock(mp_so, 1);
1560
0
  }
1561
0
}
1562
1563
void
1564
mptcp_set_restrictions(struct socket *mp_so)
1565
0
{
1566
0
  struct mptses *mpte = mpsotompte(mp_so);
1567
0
  uint32_t i;
1568
1569
0
  socket_lock_assert_owned(mp_so);
1570
1571
0
  ifnet_head_lock_shared();
1572
1573
0
  for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1574
0
    struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
1575
0
    uint32_t ifindex = info->ifindex;
1576
0
    struct ifnet *ifp;
1577
1578
0
    if (ifindex == IFSCOPE_NONE) {
1579
0
      continue;
1580
0
    }
1581
1582
0
    ifp = ifindex2ifnet[ifindex];
1583
0
    if (ifp == NULL) {
1584
0
      continue;
1585
0
    }
1586
1587
0
    if (IFNET_IS_EXPENSIVE(ifp) &&
1588
0
        (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
1589
0
      info->ifindex = IFSCOPE_NONE;
1590
0
    }
1591
1592
0
    if (IFNET_IS_CONSTRAINED(ifp) &&
1593
0
        (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
1594
0
      info->ifindex = IFSCOPE_NONE;
1595
0
    }
1596
1597
0
    if (IFNET_IS_CELLULAR(ifp) &&
1598
0
        (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
1599
0
      info->ifindex = IFSCOPE_NONE;
1600
0
    }
1601
0
  }
1602
1603
0
  ifnet_head_done();
1604
0
}