/src/openthread/third_party/tcplp/bsdtcp/tcp_input.c
Line | Count | Source |
1 | | /*- |
2 | | * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 |
3 | | * The Regents of the University of California. All rights reserved. |
4 | | * Copyright (c) 2007-2008,2010 |
5 | | * Swinburne University of Technology, Melbourne, Australia. |
6 | | * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> |
7 | | * Copyright (c) 2010 The FreeBSD Foundation |
8 | | * Copyright (c) 2010-2011 Juniper Networks, Inc. |
9 | | * All rights reserved. |
10 | | * |
11 | | * Portions of this software were developed at the Centre for Advanced Internet |
12 | | * Architectures, Swinburne University of Technology, by Lawrence Stewart, |
13 | | * James Healy and David Hayes, made possible in part by a grant from the Cisco |
14 | | * University Research Program Fund at Community Foundation Silicon Valley. |
15 | | * |
16 | | * Portions of this software were developed at the Centre for Advanced |
17 | | * Internet Architectures, Swinburne University of Technology, Melbourne, |
18 | | * Australia by David Hayes under sponsorship from the FreeBSD Foundation. |
19 | | * |
20 | | * Portions of this software were developed by Robert N. M. Watson under |
21 | | * contract to Juniper Networks, Inc. |
22 | | * |
23 | | * Redistribution and use in source and binary forms, with or without |
24 | | * modification, are permitted provided that the following conditions |
25 | | * are met: |
26 | | * 1. Redistributions of source code must retain the above copyright |
27 | | * notice, this list of conditions and the following disclaimer. |
28 | | * 2. Redistributions in binary form must reproduce the above copyright |
29 | | * notice, this list of conditions and the following disclaimer in the |
30 | | * documentation and/or other materials provided with the distribution. |
31 | | * 4. Neither the name of the University nor the names of its contributors |
32 | | * may be used to endorse or promote products derived from this software |
33 | | * without specific prior written permission. |
34 | | * |
35 | | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
36 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
37 | | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
38 | | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
39 | | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
40 | | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
41 | | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
42 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
43 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
44 | | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
45 | | * SUCH DAMAGE. |
46 | | * |
47 | | * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 |
48 | | */ |
49 | | |
50 | | |
51 | | /* |
52 | | * Determine a reasonable value for maxseg size. |
53 | | * If the route is known, check route for mtu. |
54 | | * If none, use an mss that can be handled on the outgoing interface |
55 | | * without forcing IP to fragment. If no route is found, route has no mtu, |
56 | | * or the destination isn't local, use a default, hopefully conservative |
57 | | * size (usually 512 or the default IP max size, but no more than the mtu |
58 | | * of the interface), as we can't discover anything about intervening |
59 | | * gateways or networks. We also initialize the congestion/slow start |
60 | | * window to be a single segment if the destination isn't local. |
61 | | * While looking at the routing entry, we also initialize other path-dependent |
62 | | * parameters from pre-set or cached values in the routing entry. |
63 | | * |
64 | | * Also take into account the space needed for options that we |
65 | | * send regularly. Make maxseg shorter by that amount to assure |
66 | | * that we can send maxseg amount of data even when the options |
67 | | * are present. Store the upper limit of the length of options plus |
68 | | * data in maxopd. |
69 | | * |
70 | | * NOTE that this routine is only called when we process an incoming |
71 | | * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS |
72 | | * settings are handled in tcp_mssopt(). |
73 | | */ |
74 | | |
75 | | #include <errno.h> |
76 | | #include <string.h> |
77 | | #include <strings.h> |
78 | | |
79 | | #include "tcp.h" |
80 | | #include "tcp_fsm.h" |
81 | | #include "tcp_seq.h" |
82 | | #include "tcp_timer.h" |
83 | | #include "tcp_var.h" |
84 | | #include "tcp_fastopen.h" |
85 | | #include "../lib/bitmap.h" |
86 | | #include "../lib/cbuf.h" |
87 | | #include "icmp_var.h" |
88 | | #include "ip.h" |
89 | | #include "ip6.h" |
90 | | #include "sys/queue.h" |
91 | | |
92 | | #include "tcp_const.h" |
93 | | |
94 | | /* samkumar: Copied from in.h */ |
95 | 0 | #define IPPROTO_DONE 267 |
96 | | |
97 | | /* samkumar: Copied from sys/libkern.h */ |
/* Return the larger of the two signed integers a and b. */
static int imax(int a, int b)
{
	if (a > b) {
		return a;
	}
	return b;
}
/* Return the smaller of the two signed integers a and b. */
static int imin(int a, int b)
{
	if (a < b) {
		return a;
	}
	return b;
}
100 | | |
/* Signed-integer min(); a thin alias that forwards to imin(). */
static int min(int a, int b)
{
	const int smaller = imin(a, b);

	return smaller;
}
102 | | |
103 | | static void tcp_dooptions(struct tcpopt *, uint8_t *, int, int); |
104 | | static void |
105 | | tcp_do_segment(struct ip6_hdr* ip6, struct tcphdr *th, otMessage* msg, |
106 | | struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, |
107 | | struct tcplp_signals* sig); |
108 | | static void tcp_xmit_timer(struct tcpcb *, int); |
109 | | void tcp_hc_get(/*struct in_conninfo *inc*/ struct tcpcb* tp, struct hc_metrics_lite *hc_metrics_lite); |
110 | | static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); |
111 | | |
112 | | /* |
113 | | * CC wrapper hook functions |
114 | | */ |
115 | | static inline void |
116 | | cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type) |
117 | 0 | { |
118 | 0 | tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); |
119 | 0 | if (tp->snd_cwnd <= tp->snd_wnd) |
120 | 0 | tp->ccv->flags |= CCF_CWND_LIMITED; |
121 | 0 | else |
122 | 0 | tp->ccv->flags &= ~CCF_CWND_LIMITED; |
123 | |
|
124 | 0 | if (type == CC_ACK) { |
125 | 0 | if (tp->snd_cwnd > tp->snd_ssthresh) { |
126 | 0 | tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, |
127 | 0 | V_tcp_abc_l_var * tp->t_maxseg); |
128 | 0 | if (tp->t_bytes_acked >= tp->snd_cwnd) { |
129 | 0 | tp->t_bytes_acked -= tp->snd_cwnd; |
130 | 0 | tp->ccv->flags |= CCF_ABC_SENTAWND; |
131 | 0 | } |
132 | 0 | } else { |
133 | 0 | tp->ccv->flags &= ~CCF_ABC_SENTAWND; |
134 | 0 | tp->t_bytes_acked = 0; |
135 | 0 | } |
136 | 0 | } |
137 | |
|
138 | 0 | if (CC_ALGO(tp)->ack_received != NULL) { |
139 | | /* XXXLAS: Find a way to live without this */ |
140 | 0 | tp->ccv->curack = th->th_ack; |
141 | 0 | CC_ALGO(tp)->ack_received(tp->ccv, type); |
142 | 0 | } |
143 | 0 | } |
144 | | |
145 | | static inline void |
146 | | cc_conn_init(struct tcpcb *tp) |
147 | 0 | { |
148 | 0 | struct hc_metrics_lite metrics; |
149 | 0 | int rtt; |
150 | | |
151 | | /* |
152 | | * samkumar: remove locks, inpcb, and stats. |
153 | | */ |
154 | | |
155 | | /* samkumar: Used to take &inp->inp_inc as an argument. */ |
156 | 0 | tcp_hc_get(tp, &metrics); |
157 | |
|
158 | 0 | if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { |
159 | 0 | tp->t_srtt = rtt; |
160 | 0 | tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; |
161 | 0 | if (metrics.rmx_rttvar) { |
162 | 0 | tp->t_rttvar = metrics.rmx_rttvar; |
163 | 0 | } else { |
164 | | /* default variation is +- 1 rtt */ |
165 | 0 | tp->t_rttvar = |
166 | 0 | tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; |
167 | 0 | } |
168 | 0 | TCPT_RANGESET(tp->t_rxtcur, |
169 | 0 | ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, |
170 | 0 | tp->t_rttmin, TCPTV_REXMTMAX); |
171 | 0 | } |
172 | 0 | if (metrics.rmx_ssthresh) { |
173 | | /* |
174 | | * There's some sort of gateway or interface |
175 | | * buffer limit on the path. Use this to set |
176 | | * the slow start threshhold, but set the |
177 | | * threshold to no less than 2*mss. |
178 | | */ |
179 | 0 | tp->snd_ssthresh = max(2 * tp->t_maxseg, metrics.rmx_ssthresh); |
180 | 0 | } |
181 | | |
182 | | /* |
183 | | * Set the initial slow-start flight size. |
184 | | * |
185 | | * RFC5681 Section 3.1 specifies the default conservative values. |
186 | | * RFC3390 specifies slightly more aggressive values. |
187 | | * RFC6928 increases it to ten segments. |
188 | | * Support for user specified value for initial flight size. |
189 | | * |
190 | | * If a SYN or SYN/ACK was lost and retransmitted, we have to |
191 | | * reduce the initial CWND to one segment as congestion is likely |
192 | | * requiring us to be cautious. |
193 | | */ |
194 | 0 | if (tp->snd_cwnd == 1) |
195 | 0 | tp->snd_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ |
196 | 0 | else if (V_tcp_initcwnd_segments) |
197 | 0 | tp->snd_cwnd = min(V_tcp_initcwnd_segments * tp->t_maxseg, |
198 | 0 | max(2 * tp->t_maxseg, V_tcp_initcwnd_segments * 1460)); |
199 | 0 | else if (V_tcp_do_rfc3390) |
200 | 0 | tp->snd_cwnd = min(4 * tp->t_maxseg, |
201 | 0 | max(2 * tp->t_maxseg, 4380)); |
202 | 0 | else { |
203 | | /* Per RFC5681 Section 3.1 */ |
204 | 0 | if (tp->t_maxseg > 2190) |
205 | 0 | tp->snd_cwnd = 2 * tp->t_maxseg; |
206 | 0 | else if (tp->t_maxseg > 1095) |
207 | 0 | tp->snd_cwnd = 3 * tp->t_maxseg; |
208 | 0 | else |
209 | 0 | tp->snd_cwnd = 4 * tp->t_maxseg; |
210 | 0 | } |
211 | |
|
212 | 0 | if (CC_ALGO(tp)->conn_init != NULL) |
213 | 0 | CC_ALGO(tp)->conn_init(tp->ccv); |
214 | | |
215 | | /* samkumar: print statement for debugging. Resurrect with DEBUG macro? */ |
216 | | #ifdef INSTRUMENT_TCP |
217 | | tcplp_sys_log("TCP CC_INIT %u %d %d", (unsigned int) tcplp_sys_get_millis(), (int) tp->snd_cwnd, (int) tp->snd_ssthresh); |
218 | | #endif |
219 | 0 | } |
220 | | |
221 | | inline void |
222 | | cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) |
223 | 0 | { |
224 | | /* samkumar: Remove locks and stats from this function. */ |
225 | |
|
226 | 0 | switch(type) { |
227 | 0 | case CC_NDUPACK: |
228 | 0 | if (!IN_FASTRECOVERY(tp->t_flags)) { |
229 | 0 | tp->snd_recover = tp->snd_max; |
230 | 0 | if (tp->t_flags & TF_ECN_PERMIT) |
231 | 0 | tp->t_flags |= TF_ECN_SND_CWR; |
232 | 0 | } |
233 | 0 | break; |
234 | 0 | case CC_ECN: |
235 | 0 | if (!IN_CONGRECOVERY(tp->t_flags)) { |
236 | 0 | tp->snd_recover = tp->snd_max; |
237 | 0 | if (tp->t_flags & TF_ECN_PERMIT) |
238 | 0 | tp->t_flags |= TF_ECN_SND_CWR; |
239 | 0 | } |
240 | 0 | break; |
241 | 0 | case CC_RTO: |
242 | 0 | tp->t_dupacks = 0; |
243 | 0 | tp->t_bytes_acked = 0; |
244 | 0 | EXIT_RECOVERY(tp->t_flags); |
245 | | /* |
246 | | * samkumar: I added the cast to uint64_t below to fix an OpenThread |
247 | | * code scanning alert relating to integer overflow in multiplication. |
248 | | */ |
249 | 0 | tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 / |
250 | 0 | tp->t_maxseg) * ((uint64_t) tp->t_maxseg); |
251 | 0 | tp->snd_cwnd = tp->t_maxseg; |
252 | | |
253 | | /* |
254 | | * samkumar: Stats for TCPlp: count the number of timeouts (RTOs). |
255 | | * I've commented this out (with #if 0) because it isn't part of TCP |
256 | | * functionality. At some point, we may want to bring it back to |
257 | | * measure performance. |
258 | | */ |
259 | | #if 0 |
260 | | tcplp_timeoutRexmitCnt++; |
261 | | #endif |
262 | | #ifdef INSTRUMENT_TCP |
263 | | tcplp_sys_log("TCP CC_RTO %u %d %d", (unsigned int) tcplp_sys_get_millis(), (int) tp->snd_cwnd, (int) tp->snd_ssthresh); |
264 | | #endif |
265 | 0 | break; |
266 | 0 | case CC_RTO_ERR: |
267 | | /* RTO was unnecessary, so reset everything. */ |
268 | 0 | tp->snd_cwnd = tp->snd_cwnd_prev; |
269 | 0 | tp->snd_ssthresh = tp->snd_ssthresh_prev; |
270 | 0 | tp->snd_recover = tp->snd_recover_prev; |
271 | 0 | if (tp->t_flags & TF_WASFRECOVERY) |
272 | 0 | ENTER_FASTRECOVERY(tp->t_flags); |
273 | 0 | if (tp->t_flags & TF_WASCRECOVERY) |
274 | 0 | ENTER_CONGRECOVERY(tp->t_flags); |
275 | 0 | tp->snd_nxt = tp->snd_max; |
276 | 0 | tp->t_flags &= ~TF_PREVVALID; |
277 | 0 | tp->t_badrxtwin = 0; |
278 | | #ifdef INSTRUMENT_TCP |
279 | | tcplp_sys_log("TCP CC_RTO_ERR %u %d %d", (unsigned int) tcplp_sys_get_millis(), (int) tp->snd_cwnd, (int) tp->snd_ssthresh); |
280 | | #endif |
281 | 0 | break; |
282 | 0 | } |
283 | | |
284 | 0 | if (CC_ALGO(tp)->cong_signal != NULL) { |
285 | 0 | if (th != NULL) |
286 | 0 | tp->ccv->curack = th->th_ack; |
287 | 0 | CC_ALGO(tp)->cong_signal(tp->ccv, type); |
288 | 0 | } |
289 | 0 | } |
290 | | |
291 | | static inline void |
292 | | cc_post_recovery(struct tcpcb *tp, struct tcphdr *th) |
293 | 0 | { |
294 | | /* samkumar: remove lock */ |
295 | | |
296 | | /* XXXLAS: KASSERT that we're in recovery? */ |
297 | 0 | if (CC_ALGO(tp)->post_recovery != NULL) { |
298 | 0 | tp->ccv->curack = th->th_ack; |
299 | 0 | CC_ALGO(tp)->post_recovery(tp->ccv); |
300 | 0 | } |
301 | | /* XXXLAS: EXIT_RECOVERY ? */ |
302 | 0 | tp->t_bytes_acked = 0; |
303 | 0 | } |
304 | | |
305 | | |
306 | | /* |
307 | | * Indicate whether this ack should be delayed. We can delay the ack if |
308 | | * following conditions are met: |
309 | | * - There is no delayed ack timer in progress. |
310 | | * - Our last ack wasn't a 0-sized window. We never want to delay |
311 | | * the ack that opens up a 0-sized window. |
312 | | * - LRO wasn't used for this segment. We make sure by checking that the |
313 | | * segment size is not larger than the MSS. |
314 | | * - Delayed acks are enabled or this is a half-synchronized T/TCP |
315 | | * connection. |
316 | | */ |
/*
 * Evaluates to non-zero when the ACK for a segment of length tlen may be
 * delayed on connection tp. Both macro arguments are parenthesized at every
 * use so that callers can pass arbitrary expressions (e.g. "&cb" or
 * "len & mask"). Note that tp is evaluated more than once, so it must not
 * have side effects.
 */
#define DELAY_ACK(tp, tlen)						\
	((!tcp_timer_active((tp), TT_DELACK) &&				\
	    ((tp)->t_flags & TF_RXWIN0SENT) == 0) &&			\
	    ((tlen) <= (tp)->t_maxopd) &&				\
	    (V_tcp_delack_enabled || ((tp)->t_flags & TF_NEEDSYN)))
322 | | |
323 | | static inline void |
324 | | cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos) |
325 | 0 | { |
326 | | /* samkumar: remove lock */ |
327 | |
|
328 | 0 | if (CC_ALGO(tp)->ecnpkt_handler != NULL) { |
329 | 0 | switch (iptos & IPTOS_ECN_MASK) { |
330 | 0 | case IPTOS_ECN_CE: |
331 | 0 | tp->ccv->flags |= CCF_IPHDR_CE; |
332 | 0 | break; |
333 | 0 | case IPTOS_ECN_ECT0: |
334 | 0 | tp->ccv->flags &= ~CCF_IPHDR_CE; |
335 | 0 | break; |
336 | 0 | case IPTOS_ECN_ECT1: |
337 | 0 | tp->ccv->flags &= ~CCF_IPHDR_CE; |
338 | 0 | break; |
339 | 0 | } |
340 | | |
341 | 0 | if (th->th_flags & TH_CWR) |
342 | 0 | tp->ccv->flags |= CCF_TCPHDR_CWR; |
343 | 0 | else |
344 | 0 | tp->ccv->flags &= ~CCF_TCPHDR_CWR; |
345 | |
|
346 | 0 | if (tp->t_flags & TF_DELACK) |
347 | 0 | tp->ccv->flags |= CCF_DELACK; |
348 | 0 | else |
349 | 0 | tp->ccv->flags &= ~CCF_DELACK; |
350 | |
|
351 | 0 | CC_ALGO(tp)->ecnpkt_handler(tp->ccv); |
352 | |
|
353 | 0 | if (tp->ccv->flags & CCF_ACKNOW) |
354 | 0 | tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); |
355 | 0 | } |
356 | 0 | } |
357 | | |
358 | | /* |
359 | | * External function: look up an entry in the hostcache and fill out the |
360 | | * supplied TCP metrics structure. Fills in NULL when no entry was found or |
361 | | * a value is not set. |
362 | | */ |
363 | | /* |
364 | | * samkumar: This function is taken from tcp_hostcache.c. We have no host cache |
365 | | * in TCPlp, so I changed this to always act as if there is a miss. I removed |
366 | | * the first argument, formerly "struct in_conninfo *inc". |
367 | | */ |
368 | | void |
369 | | tcp_hc_get(struct tcpcb* tp, struct hc_metrics_lite *hc_metrics_lite) |
370 | 0 | { |
371 | 0 | bzero(hc_metrics_lite, sizeof(*hc_metrics_lite)); |
372 | 0 | } |
373 | | |
374 | | /* |
375 | | * External function: look up an entry in the hostcache and return the |
376 | | * discovered path MTU. Returns NULL if no entry is found or value is not |
377 | | * set. |
378 | | */ |
379 | | /* |
380 | | * samkumar: This function is taken from tcp_hostcache.c. We have no host cache |
381 | | * in TCPlp, so I changed this to always act as if there is a miss. |
382 | | */ |
uint64_t
tcp_hc_getmtu(struct tcpcb* tp)
{
	/* With no host cache there is never a cached path MTU to report. */
	const uint64_t no_cached_mtu = 0;

	(void) tp;
	return no_cached_mtu;
}
388 | | |
389 | | |
390 | | /* |
391 | | * Issue RST and make ACK acceptable to originator of segment. |
392 | | * The mbuf must still include the original packet header. |
393 | | * tp may be NULL. |
394 | | */ |
395 | | /* |
396 | | * samkumar: Original signature was: |
397 | | * static void tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, |
398 | | * int tlen, int rstreason) |
399 | | */ |
400 | | void |
401 | | tcp_dropwithreset(struct ip6_hdr* ip6, struct tcphdr *th, struct tcpcb *tp, otInstance* instance, |
402 | | int tlen, int rstreason) |
403 | 40 | { |
404 | | /* |
405 | | * samkumar: I removed logic to skip this for broadcast or multicast |
406 | | * packets. In the FreeBSD version of this function, it would just |
407 | | * call m_freem(m), if m->m_flags has M_BCAST or M_MCAST set, and not |
408 | | * send a response packet. |
409 | | * I also removed bandwidth limiting. |
410 | | */ |
411 | 40 | if (th->th_flags & TH_RST) |
412 | 13 | return; |
413 | | |
414 | | /* tcp_respond consumes the mbuf chain. */ |
415 | 27 | if (th->th_flags & TH_ACK) { |
416 | 9 | tcp_respond(tp, instance, ip6, th, (tcp_seq) 0, th->th_ack, TH_RST); |
417 | 18 | } else { |
418 | 18 | if (th->th_flags & TH_SYN) |
419 | 9 | tlen++; |
420 | 18 | tcp_respond(tp, instance, ip6, th, th->th_seq + tlen, (tcp_seq) 0, TH_RST | TH_ACK); |
421 | 18 | } |
422 | 27 | return; |
423 | 40 | } |
424 | | |
425 | | /* |
426 | | * TCP input handling is split into multiple parts: |
427 | | * tcp6_input is a thin wrapper around tcplp_input for the extended |
428 | | * ip6_protox[] call format in ip6_input |
429 | | * tcplp_input handles primary segment validation, inpcb lookup and |
430 | | * SYN processing on listen sockets |
431 | | * tcp_do_segment processes the ACK and text of the segment for |
432 | | * establishing, established and closing connections |
433 | | */ |
434 | | /* samkumar: The signature of this function was originally: |
435 | | tcp_input(struct mbuf **mp, int *offp, int proto) */ |
436 | | /* NOTE: tcp_fields_to_host(th) must be called before this function is called. */ |
437 | | int |
438 | | tcplp_input(struct ip6_hdr* ip6, struct tcphdr* th, otMessage* msg, struct tcpcb* tp, struct tcpcb_listen* tpl, |
439 | | struct tcplp_signals* sig) |
440 | 0 | { |
441 | | /* |
442 | | * samkumar: I significantly modified this function, compared to the |
443 | | * FreeBSD version. This function used to be responsible for matching an |
444 | | * incoming TCP segment to its TCB. That functionality is now done by |
445 | | * TCPlp, and this function is only called once a match has been |
446 | | * identified. |
447 | | * |
448 | | * The tp and tpl arguments are used to indicate the match. Exactly one of |
449 | | * them must be NULL, and the other must be set. If tp is non-NULL, then |
450 | | * this function assumes that the packet was matched to an active socket |
451 | | * (connection endpoint). If tpl is non-NULL, then this function assumes |
452 | | * that this packet is a candidate match for a passive socket (listener) |
453 | | * and attempts to set up a new connection if the flags, sequence numbers, |
454 | | * etc. look OK. |
455 | | * |
456 | | * TCPlp assumes that the packets are IPv6, so I removed any logic specific |
457 | | * to IPv4. |
458 | | * |
459 | | * And of course, all code pertaining to locks and stats has been removed. |
460 | | */ |
461 | 0 | int tlen = 0, off; |
462 | 0 | int thflags; |
463 | 0 | uint8_t iptos = 0; |
464 | 0 | int drop_hdrlen; |
465 | 0 | int rstreason = 0; |
466 | 0 | struct tcpopt to; /* options in this segment */ |
467 | 0 | uint8_t* optp = NULL; |
468 | 0 | int optlen = 0; |
469 | 0 | to.to_flags = 0; |
470 | 0 | KASSERT(tp || tpl, ("One of tp and tpl must be positive")); |
471 | | |
472 | | /* |
473 | | * samkumar: Here, there used to be code that handled preprocessing: |
474 | | * calling m_pullup(m, sizeof(*ip6) + sizeof(*th)) to get the headers |
475 | | * contiguous in memory, setting the ip6 and th pointers, validating the |
476 | | * checksum, and dropping packets with unspecified source address. In |
477 | | * TCPlp, all of this is done for a packet before this function is called. |
478 | | */ |
479 | |
|
480 | 0 | tlen = ntohs(ip6->ip6_plen); // assume *off == sizeof(*ip6) |
481 | | |
482 | | /* |
483 | | * samkumar: Logic that handled IPv4 was deleted below. I won't add a |
484 | | * comment every time this is done, but I'm putting it here (one of the |
485 | | * first instances of this) for clarity. |
486 | | */ |
487 | 0 | iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; |
488 | | |
489 | | /* |
490 | | * Check that TCP offset makes sense, |
491 | | * pull out TCP options and adjust length. XXX |
492 | | */ |
493 | 0 | off = (th->th_off_x2 >> TH_OFF_SHIFT) << 2; |
494 | 0 | if (off < sizeof (struct tcphdr) || off > tlen) { |
495 | 0 | goto drop; |
496 | 0 | } |
497 | 0 | tlen -= off; /* tlen is used instead of ti->ti_len */ |
498 | | /* samkumar: now, tlen is the length of the data */ |
499 | |
|
500 | 0 | if (off > sizeof (struct tcphdr)) { |
501 | | /* |
502 | | * samkumar: I removed a call to IP6_EXTHDR_CHECK, which I believe |
503 | | * checks for IPv6 extension headers. In TCPlp, we assume that these |
504 | | * are handled elsewhere in the networking stack, before the incoming |
505 | | * packet is processed at the TCP layer. I also removed the followup |
506 | | * calls to reassign the ip6 and th pointers. |
507 | | */ |
508 | 0 | optlen = off - sizeof (struct tcphdr); |
509 | 0 | optp = (uint8_t *)(th + 1); |
510 | 0 | } |
511 | |
|
512 | 0 | thflags = th->th_flags; |
513 | | |
514 | | /* |
515 | | * samkumar: There used to be a call here to tcp_fields_to_host(th), which |
516 | | * changes the byte order of various fields to host format. I removed this |
517 | | * call from there and handle it in TCPlp, before calling this. The reason |
518 | | * is that it's possible for this function to be called twice by TCPlp's |
519 | | * logic (e.g., if the packet matches a TIME-WAIT socket this function |
520 | | * returns early, and the packet may then match a listening socket, at |
521 | | * which point this function will be called again). Thus, any operations |
522 | | * like this, which mutate the packet itself, need to happen before calling |
523 | | * this function. |
524 | | */ |
525 | | |
526 | | /* |
527 | | * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options. |
528 | | * |
529 | | * samkumar: My TCP header is in a different buffer from the IP header. |
530 | | * drop_hdrlen is only meaningful as an offset into the TCP buffer, |
531 | | * because it is used to determine how much of the packet to discard |
532 | | * before copying it into the receive buffer. Therefore, my offset does |
533 | | * not include the length of IP header and options, only the length of |
534 | | * the TCP header and options. |
535 | | */ |
536 | 0 | drop_hdrlen = /*off0 +*/ off; |
537 | | |
538 | | /* |
539 | | * Locate pcb for segment; if we're likely to add or remove a |
540 | | * connection then first acquire pcbinfo lock. There are three cases |
541 | | * where we might discover later we need a write lock despite the |
542 | | * flags: ACKs moving a connection out of the syncache, ACKs for a |
543 | | * connection in TIMEWAIT and SYNs not targeting a listening socket. |
544 | | */ |
545 | | |
546 | | /* |
547 | | * samkumar: Locking code is removed, invalidating most of the above |
548 | | * comment. |
549 | | */ |
550 | | |
551 | | /* |
552 | | * samkumar: The FreeBSD code had logic here to check m->m_flags for the |
553 | | * M_IP6_NEXTHOP flag, and search for the PACKET_TAG_IPFORWARD tag and |
554 | | * store it in fwd_tag if so. In TCPlp, we assume that the IPv6 layer of |
555 | | * the host network stack handles this kind of IPv6-related functionality, |
556 | | * so this logic has been removed. |
557 | | */ |
558 | | |
559 | | /* |
560 | | * samkumar: Here, there was code to match the packet to an inpcb and reply |
561 | | * with an RST segment if no match is found. This included taking the |
562 | | * fwd_tag into account, if set above (see the previous comment). I removed |
563 | | * this code because, in TCPlp, this is done before calling this function. |
564 | | */ |
565 | | |
566 | | /* |
567 | | * A previous connection in TIMEWAIT state is supposed to catch stray |
568 | | * or duplicate segments arriving late. If this segment was a |
569 | | * legitimate new connection attempt, the old INPCB gets removed and |
570 | | * we can try again to find a listening socket. |
571 | | * |
572 | | * At this point, due to earlier optimism, we may hold only an inpcb |
573 | | * lock, and not the inpcbinfo write lock. If so, we need to try to |
574 | | * acquire it, or if that fails, acquire a reference on the inpcb, |
575 | | * drop all locks, acquire a global write lock, and then re-acquire |
576 | | * the inpcb lock. We may at that point discover that another thread |
577 | | * has tried to free the inpcb, in which case we need to loop back |
578 | | * and try to find a new inpcb to deliver to. |
579 | | * |
580 | | * XXXRW: It may be time to rethink timewait locking. |
581 | | */ |
582 | | /* |
583 | | * samkumar: The original code checked inp->inp_flags & INP_TIMEWAIT. I |
584 | | * changed it to instead check tp->t_state, since we don't use inpcbs in |
585 | | * TCPlp. |
586 | | */ |
587 | 0 | if (tp && tp->t_state == TCP6S_TIME_WAIT) { |
588 | | /* |
589 | | * samkumar: There's nothing wrong with the call to tcp_dooptions call |
590 | | * that I've commented out below; it's just that the modified |
591 | | * "tcp_twcheck" function no longer needs the options structure, so |
592 | | * I figured that there's no longer a good reason to parse the options. |
593 | | * In fact, this call was probably unnecessary even in the original |
594 | | * FreeBSD TCP code, since tcp_twcheck, even without my modifications, |
595 | | * did not use the pointer to the options structure! |
596 | | */ |
597 | | //if (thflags & TH_SYN) |
598 | | //tcp_dooptions(&to, optp, optlen, TO_SYN); |
599 | | /* |
600 | | * samkumar: The original code would "goto findpcb;" if this branch is |
601 | | * taken. Matching with a TCB is done outside of this function in |
602 | | * TCPlp, so we instead return a special value so that the caller knows |
603 | | * to try re-matching this packet to a socket. |
604 | | */ |
605 | 0 | if (tcp_twcheck(tp,/*inp, &to,*/ th, /*m,*/ tlen)) |
606 | 0 | return (RELOOKUP_REQUIRED); |
607 | 0 | return (IPPROTO_DONE); |
608 | 0 | } |
609 | | /* |
610 | | * The TCPCB may no longer exist if the connection is winding |
611 | | * down or it is in the CLOSED state. Either way we drop the |
612 | | * segment and send an appropriate response. |
613 | | */ |
614 | | /* |
615 | | * samkumar: There used to be code here that grabs the tp from the inpcb |
616 | | * and drops with reset if the connection is in the closed state or if |
617 | | * the tp is NULL. In TCPlp, the equivalent logic is done before entering |
618 | | * this function. There was also code here to handle TCP offload, which |
619 | | * TCPlp does not handle. |
620 | | */ |
621 | | |
622 | | /* |
623 | | * We've identified a valid inpcb, but it could be that we need an |
624 | | * inpcbinfo write lock but don't hold it. In this case, attempt to |
625 | | * acquire using the same strategy as the TIMEWAIT case above. If we |
626 | | * relock, we have to jump back to 'relocked' as the connection might |
627 | | * now be in TIMEWAIT. |
628 | | */ |
629 | | /* |
630 | | * samkumar: There used to be some code here for synchronization, MAC |
631 | | * management, and debugging. |
632 | | */ |
633 | | |
634 | | /* |
635 | | * When the socket is accepting connections (the INPCB is in LISTEN |
636 | | * state) we look into the SYN cache if this is a new connection |
637 | | * attempt or the completion of a previous one. Instead of checking |
638 | | * so->so_options to check if the socket is listening, we rely on the |
639 | | * arguments passed to this function (if tp == NULL, then tpl is not NULL |
640 | | * and is the matching listen socket). |
641 | | */ |
642 | | |
643 | 0 | if (/*so->so_options & SO_ACCEPTCONN*/tp == NULL) { |
644 | 0 | int tfo_cookie_valid = 0; |
645 | 0 | uint64_t tfo_response_cookie; |
646 | | // int tfo_response_cookie_valid = 0; |
647 | | |
648 | | /* samkumar: NULL check isn't needed but prevents a compiler warning */ |
649 | 0 | KASSERT(tpl != NULL && tpl->t_state == TCP6S_LISTEN, ("listen socket must be in listening state!")); |
650 | | |
651 | | /* |
652 | | * samkumar: There used to be some code here that checks if the |
653 | | * received segment is an ACK, and if so, searches the SYN cache to |
654 | | * find an entry whose connection establishment handshake this segment |
655 | | * completes. If such an entry is found, then a socket is created and |
656 | | * then tcp_do_segment is called to actually run the code to mark the |
657 | | * connection as established. If the received segment is an RST, then |
658 | | * that is processed in the syncache as well. In TCPlp we do not use a |
659 | | * SYN cache, so I've removed that code. The actual connection |
660 | | * establishment/processing logic happens in tcp_do_segment anyway, |
661 | | * which is called at the bottom of this function, so there's no need |
662 | | * to rewrite this code with special-case logic for that. |
663 | | */ |
664 | | |
665 | | /* |
666 | | * We can't do anything without SYN. |
667 | | */ |
668 | 0 | if ((thflags & TH_SYN) == 0) { |
669 | | /* |
670 | | * samkumar: Here, and in several other instances, the FreeBSD |
671 | | * code would call tcp_log_addrs. Improving logging in these |
672 | | * edge cases in TCPlp is left for the future --- for now, I just |
673 | | * put "<addrs go here>" where the address string would go. |
674 | | */ |
675 | 0 | tcplp_sys_log("%s; %s: Listen socket: " |
676 | 0 | "SYN is missing, segment ignored", |
677 | 0 | "<addrs go here>", __func__); |
678 | 0 | goto dropunlock; |
679 | 0 | } |
680 | | /* |
681 | | * (SYN|ACK) is bogus on a listen socket. |
682 | | */ |
683 | 0 | if (thflags & TH_ACK) { |
684 | | /* samkumar: See above comment regarding tcp_log_addrs. */ |
685 | 0 | tcplp_sys_log("%s; %s: Listen socket: " |
686 | 0 | "SYN|ACK invalid, segment rejected", |
687 | 0 | "<addrs go here>", __func__); |
688 | | /* samkumar: Removed call to syncache_badack(&inc); */ |
689 | 0 | rstreason = BANDLIM_RST_OPENPORT; |
690 | 0 | goto dropwithreset; |
691 | 0 | } |
692 | | /* |
693 | | * If the drop_synfin option is enabled, drop all |
694 | | * segments with both the SYN and FIN bits set. |
695 | | * This prevents e.g. nmap from identifying the |
696 | | * TCP/IP stack. |
697 | | * XXX: Poor reasoning. nmap has other methods |
698 | | * and is constantly refining its stack detection |
699 | | * strategies. |
700 | | * XXX: This is a violation of the TCP specification |
701 | | * and was used by RFC1644. |
702 | | */ |
703 | 0 | if ((thflags & TH_FIN) && V_drop_synfin) { |
704 | | /* samkumar: See above comment regarding tcp_log_addrs. */ |
705 | 0 | tcplp_sys_log("%s; %s: Listen socket: " |
706 | 0 | "SYN|FIN segment ignored (based on " |
707 | 0 | "sysctl setting)", "<addrs go here>", __func__); |
708 | 0 | goto dropunlock; |
709 | 0 | } |
710 | | /* |
711 | | * Segment's flags are (SYN) or (SYN|FIN). |
712 | | * |
713 | | * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored |
714 | | * as they do not affect the state of the TCP FSM. |
715 | | * The data pointed to by TH_URG and th_urp is ignored. |
716 | | */ |
717 | 0 | KASSERT((thflags & (TH_RST|TH_ACK)) == 0, |
718 | 0 | ("%s: Listen socket: TH_RST or TH_ACK set", __func__)); |
719 | 0 | KASSERT(thflags & (TH_SYN), |
720 | 0 | ("%s: Listen socket: TH_SYN not set", __func__)); |
721 | | |
722 | | /* |
723 | | * samkumar: There used to be some code here to reject incoming |
724 | | * SYN packets for deprecated interface addresses unless |
725 | | * V_ip6_use_deprecated is true. Rejecting the packet, in this case, |
726 | | * means to "goto dropwithreset". I removed this functionality. |
727 | | */ |
728 | | |
729 | | /* |
730 | | * Basic sanity checks on incoming SYN requests: |
731 | | * Don't respond if the destination is a link layer |
732 | | * broadcast according to RFC1122 4.2.3.10, p. 104. |
733 | | * If it is from this socket it must be forged. |
734 | | * Don't respond if the source or destination is a |
735 | | * global or subnet broad- or multicast address. |
736 | | * Note that it is quite possible to receive unicast |
737 | | * link-layer packets with a broadcast IP address. Use |
738 | | * in_broadcast() to find them. |
739 | | */ |
740 | | |
741 | | /* |
742 | | * samkumar: There used to be a sanity check that drops (via |
743 | | * "goto dropunlock") any broadcast or multicast packets. This check is |
744 | | * done by checking m->m_flags for (M_BCAST|M_MCAST). The original |
745 | | * FreeBSD code for this has been removed (since checking m->m_flags |
746 | | * isn't really useful to us anyway). Note that other FreeBSD code that |
747 | | * checks for multicast source/destination addresses is retained below |
748 | | * (but only for the IPv6 case; the original FreeBSD code also handled |
749 | | * it for IPv4 addresses). |
750 | | */ |
751 | |
|
752 | 0 | if (th->th_dport == th->th_sport && |
753 | 0 | IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) { |
754 | | /* samkumar: See above comment regarding tcp_log_addrs. */ |
755 | 0 | tcplp_sys_log("%s; %s: Listen socket: " |
756 | 0 | "Connection attempt to/from self " |
757 | 0 | "ignored", "<addrs go here>", __func__); |
758 | 0 | goto dropunlock; |
759 | 0 | } |
760 | 0 | if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || |
761 | 0 | IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { |
762 | | /* samkumar: See above comment regarding tcp_log_addrs. */ |
763 | 0 | tcplp_sys_log("%s; %s: Listen socket: " |
764 | 0 | "Connection attempt from/to multicast " |
765 | 0 | "address ignored", "<addrs go here>", __func__); |
766 | 0 | goto dropunlock; |
767 | 0 | } |
768 | | |
769 | | /* |
770 | | * samkumar: The FreeBSD code would call |
771 | | * syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL); |
772 | | * to add an entry to the SYN cache at this point. TCPlp doesn't use a |
773 | | * syncache, so we initialize the new socket right away. The code to |
774 | | * initialize the socket is taken from the syncache_socket function. |
775 | | */ |
776 | | /* |
777 | | * samkumar: As of FreeBSD 10.3, the syncache_add function returns |
778 | | * a flag indicating if a "fast open" code path should be taken. |
779 | | * In that case, there is a "goto" statement to the removed logic |
780 | | * above that calls tcp_do_segment after expanding a syncache entry. |
781 | | * Analogous logic is implemented below. |
782 | | */ |
783 | 0 | tcp_dooptions(&to, optp, optlen, TO_SYN); |
784 | | |
785 | | /* |
786 | | * samkumar: TCP Fast Open logic taken from syncache_add in |
787 | | * FreeBSD 12.0. |
788 | | */ |
789 | 0 | if (V_tcp_fastopen_server_enable && /*IS_FASTOPEN(tp->t_flags) && |
790 | | (tp->t_tfo_pending != NULL) && */ |
791 | 0 | (to.to_flags & TOF_FASTOPEN)) { |
792 | | /* |
793 | | * Limit the number of pending TFO connections to |
794 | | * approximately half of the queue limit. This prevents TFO |
795 | | * SYN floods from starving the service by filling the |
796 | | * listen queue with bogus TFO connections. |
797 | | */ |
798 | | /* |
799 | | * samkumar: Since we let the application handle the listen |
800 | | * queue it doesn't make sense to limit the number of pending |
801 | | * TFO connections as above. Long term, I think the best fix |
802 | | * is to let applications know if an incoming connection is |
803 | | * TFO, so that they can handle the case appropriately (e.g., |
804 | | * by disabling TFO or by declining the connection). |
805 | | */ |
806 | 0 | int result = tcp_fastopen_check_cookie(NULL, |
807 | 0 | to.to_tfo_cookie, to.to_tfo_len, |
808 | 0 | &tfo_response_cookie); |
809 | 0 | tfo_cookie_valid = (result > 0); |
810 | | // tfo_response_cookie_valid = (result >= 0); |
811 | 0 | } |
812 | |
|
813 | 0 | tp = tcplp_sys_accept_ready(tpl, &ip6->ip6_src, th->th_sport); // Try to allocate an active socket to accept into |
814 | 0 | if (tp == NULL) { |
815 | | /* If we couldn't allocate, just ignore the SYN. */ |
816 | 0 | return IPPROTO_DONE; |
817 | 0 | } |
818 | 0 | if (tp == (struct tcpcb *) -1) { |
819 | 0 | rstreason = ECONNREFUSED; |
820 | 0 | tp = NULL; |
821 | 0 | goto dropwithreset; |
822 | 0 | } |
823 | 0 | sig->accepted_connection = tp; |
824 | 0 | tcp_state_change(tp, TCPS_SYN_RECEIVED); |
825 | 0 | tpmarkpassiveopen(tp); |
826 | 0 | tp->iss = tcp_new_isn(tp); |
827 | 0 | tp->irs = th->th_seq; |
828 | 0 | tcp_rcvseqinit(tp); |
829 | 0 | tcp_sendseqinit(tp); |
830 | 0 | tp->snd_wl1 = th->th_seq; |
831 | | /* |
832 | | * samkumar: We remove the "+ 1"s below since we use |
833 | | * tcplp_output to send the appropriate SYN-ACK. For |
834 | | * example, syncache_tfo_expand eliminates the "+ 1"s |
835 | | * too. My understanding is that syncache_socket has |
836 | | * the "+ 1"s because it's normally called once the |
837 | | * SYN-ACK has already been ACKed, which is not how |
838 | | * TCPlp operates. |
839 | | */ |
840 | 0 | tp->snd_max = tp->iss/* + 1*/; |
841 | 0 | tp->snd_nxt = tp->iss/* + 1*/; |
842 | 0 | tp->rcv_up = th->th_seq + 1; |
843 | 0 | tp->rcv_wnd = imin(imax(cbuf_free_space(&tp->recvbuf), 0), TCP_MAXWIN); |
844 | 0 | tp->rcv_adv += tp->rcv_wnd; |
845 | 0 | tp->last_ack_sent = tp->rcv_nxt; |
846 | 0 | memcpy(&tp->laddr, &ip6->ip6_dst, sizeof(tp->laddr)); |
847 | 0 | memcpy(&tp->faddr, &ip6->ip6_src, sizeof(tp->faddr)); |
848 | 0 | tp->fport = th->th_sport; |
849 | 0 | tp->lport = tpl->lport; |
850 | | |
851 | | /* |
852 | | * samkumar: Several of the checks below (taken from syncache_socket!) |
853 | | * check for flags in sc->sc_flags. They have been written to directly |
854 | | * check for the conditions on the TCP options structure or in the TCP |
855 | | * header that would ordinarily be used to set flags in sc->sc_flags |
856 | | * when adding an entry to the SYN cache. |
857 | | * |
858 | | * In effect, we combine the logic in syncache_add to set elements of |
859 | | * sc with the logic in syncache_socket to transfer state from sc |
860 | | * to the socket, but short-circuit the process to avoid ever storing |
861 | | * data in sc. Since this isn't just adding or deleting code, I decided |
862 | | * that it's better to keep comments indicating exactly how I composed |
863 | | * these two functions. |
864 | | */ |
865 | 0 | tp->t_flags = tp->t_flags & (TF_NOPUSH | TF_NODELAY | TF_NOOPT); |
866 | | // tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY); |
867 | | // if (sc->sc_flags & SCF_NOOPT) |
868 | | // tp->t_flags |= TF_NOOPT; |
869 | | // else { |
870 | 0 | if (!(tp->t_flags & TF_NOOPT) && V_tcp_do_rfc1323) { |
871 | 0 | if (/*sc->sc_flags & SCF_WINSCALE*/to.to_flags & TOF_SCALE) { |
872 | 0 | int wscale = 0; |
873 | | |
874 | | /* |
875 | | * Pick the smallest possible scaling factor that |
876 | | * will still allow us to scale up to sb_max, aka |
877 | | * kern.ipc.maxsockbuf. |
878 | | * |
879 | | * We do this because there are broken firewalls that |
880 | | * will corrupt the window scale option, leading to |
881 | | * the other endpoint believing that our advertised |
882 | | * window is unscaled. At scale factors larger than |
883 | | * 5 the unscaled window will drop below 1500 bytes, |
884 | | * leading to serious problems when traversing these |
885 | | * broken firewalls. |
886 | | * |
887 | | * With the default maxsockbuf of 256K, a scale factor |
888 | | * of 3 will be chosen by this algorithm. Those who |
889 | | * choose a larger maxsockbuf should watch out |
890 | | * for the compatibility problems mentioned above. |
891 | | * |
892 | | * RFC1323: The Window field in a SYN (i.e., a <SYN> |
893 | | * or <SYN,ACK>) segment itself is never scaled. |
894 | | */ |
895 | | |
896 | | /* |
897 | | * samkumar: The original logic, taken from syncache_add, is |
898 | | * listed below, commented out. In practice, we just use |
899 | | * wscale = 0 because in TCPlp we assume that the buffers |
900 | | * aren't big enough for window scaling to be all that useful. |
901 | | */ |
902 | | #if 0 |
903 | | while (wscale < TCP_MAX_WINSHIFT && |
904 | | (TCP_MAXWIN << wscale) < sb_max) |
905 | | wscale++; |
906 | | #endif |
907 | |
|
908 | 0 | tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; |
909 | 0 | tp->snd_scale = /*sc->sc_requested_s_scale*/to.to_wscale; |
910 | 0 | tp->request_r_scale = wscale; |
911 | 0 | } |
912 | 0 | if (/*sc->sc_flags & SCF_TIMESTAMP*/to.to_flags & TOF_TS) { |
913 | 0 | tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; |
914 | 0 | tp->ts_recent = /*sc->sc_tsreflect*/to.to_tsval; |
915 | 0 | tp->ts_recent_age = tcp_ts_getticks(); |
916 | 0 | tp->ts_offset = /*sc->sc_tsoff*/0; // No syncookies, so this should always be 0 |
917 | 0 | } |
918 | | |
919 | | /* |
920 | | * samkumar: there used to be code here that would set the |
921 | | * TF_SIGNATURE flag on tp->t_flags if SCF_SIGNATURE is set on |
922 | | * sc->sc_flags. I've left it in below, commented out. |
923 | | */ |
924 | | #if 0 |
925 | | #ifdef TCP_SIGNATURE |
926 | | if (sc->sc_flags & SCF_SIGNATURE) |
927 | | tp->t_flags |= TF_SIGNATURE; |
928 | | #endif |
929 | | #endif |
930 | 0 | if (/*sc->sc_flags & SCF_SACK*/ to.to_flags & TOF_SACKPERM) |
931 | 0 | tp->t_flags |= TF_SACK_PERMIT; |
932 | 0 | } |
933 | 0 | if (/*sc->sc_flags & SCF_ECN*/(th->th_flags & (TH_ECE|TH_CWR)) && V_tcp_do_ecn) |
934 | 0 | tp->t_flags |= TF_ECN_PERMIT; |
935 | | |
936 | | /* |
937 | | * Set up MSS and get cached values from tcp_hostcache. |
938 | | * This might overwrite some of the defaults we just set. |
939 | | */ |
940 | 0 | tcp_mss(tp, /*sc->sc_peer_mss*/(to.to_flags & TOF_MSS) ? to.to_mss : 0); |
941 | |
|
942 | 0 | if (tfo_cookie_valid) { |
943 | | /* |
944 | | * samkumar: The code below is taken from syncache_tfo_socket. |
945 | | * It calls syncache_socket (upon which the above code is based) |
946 | | * so it makes sense for this logic to go here. |
947 | | */ |
948 | 0 | tp->t_flags |= TF_FASTOPEN; |
949 | 0 | tp->t_tfo_cookie.server = tfo_response_cookie; |
950 | 0 | tp->snd_max = tp->iss; |
951 | 0 | tp->snd_nxt = tp->iss; |
952 | | // tp->tfo_pending = pending_counter; |
953 | | /* This would normally "goto" labeled code that calls tcp_do_segment. */ |
954 | 0 | tcp_do_segment(ip6, th, msg, tp, drop_hdrlen, tlen, iptos, sig); |
955 | |
|
956 | 0 | tp->accepted_from = tpl; |
957 | 0 | return (IPPROTO_DONE); |
958 | 0 | } else { |
959 | 0 | tp->t_flags |= TF_ACKNOW; // samkumar: my addition |
960 | 0 | } |
961 | | |
962 | 0 | tcplp_output(tp); // to send the SYN-ACK |
963 | |
|
964 | 0 | tp->accepted_from = tpl; |
965 | 0 | return (IPPROTO_DONE); |
966 | 0 | } else if (tp->t_state == TCPS_LISTEN) { |
967 | | /* |
968 | | * When a listen socket is torn down the SO_ACCEPTCONN |
969 | | * flag is removed first while connections are drained |
970 | | * from the accept queue in a unlock/lock cycle of the |
971 | | * ACCEPT_LOCK, opening a race condition allowing a SYN |
972 | | * attempt to go through unhandled. |
973 | | */ |
974 | 0 | goto dropunlock; |
975 | 0 | } |
976 | | |
977 | 0 | KASSERT(tp, ("tp is still NULL!")); |
978 | | |
979 | | /* |
980 | | * samkumar: There used to be code here to verify TCP signatures. We don't |
981 | | * support TCP signatures in TCPlp. |
982 | | */ |
983 | | |
984 | | /* |
985 | | * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later |
986 | | * state. tcp_do_segment() always consumes the mbuf chain, unlocks |
987 | | * the inpcb, and unlocks pcbinfo. |
988 | | */ |
989 | 0 | tcp_do_segment(ip6, th, msg, tp, drop_hdrlen, tlen, iptos, sig); |
990 | 0 | return (IPPROTO_DONE); |
991 | | |
992 | | /* |
993 | | * samkumar: Removed some locking and debugging code under all three of |
994 | | * these labels: dropwithreset, dropunlock, and drop. I also removed some |
995 | | * memory management code (e.g., calling m_freem(m) if m != NULL) since |
996 | | * the caller of this function will take care of that kind of memory |
997 | | * management in TCPlp. |
998 | | */ |
999 | 0 | dropwithreset: |
1000 | | |
1001 | | /* |
1002 | | * samkumar: The check against inp != NULL is now a check on tp != NULL. |
1003 | | */ |
1004 | 0 | if (tp != NULL) { |
1005 | 0 | tcp_dropwithreset(ip6, th, tp, tp->instance, tlen, rstreason); |
1006 | 0 | } else |
1007 | 0 | tcp_dropwithreset(ip6, th, NULL, tpl->instance, tlen, rstreason); |
1008 | 0 | goto drop; |
1009 | | |
1010 | 0 | dropunlock: |
1011 | 0 | drop: |
1012 | 0 | return (IPPROTO_DONE); |
1013 | 0 | } |
1014 | | |
1015 | | /* |
1016 | | * samkumar: Original signature |
1017 | | * static void |
1018 | | * tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, |
1019 | | * struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, |
1020 | | * int ti_locked) |
1021 | | */ |
1022 | | static void |
1023 | | tcp_do_segment(struct ip6_hdr* ip6, struct tcphdr *th, otMessage* msg, |
1024 | | struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, |
1025 | | struct tcplp_signals* sig) |
1026 | 0 | { |
1027 | | /* |
1028 | | * samkumar: All code pertaining to locks, stats, and debug has been |
1029 | | * removed from this function. |
1030 | | */ |
1031 | |
|
1032 | 0 | int thflags, acked, ourfinisacked, needoutput = 0; |
1033 | 0 | int rstreason, todrop, win; |
1034 | 0 | uint64_t tiwin; |
1035 | 0 | struct tcpopt to; |
1036 | 0 | int tfo_syn; |
1037 | 0 | uint32_t ticks = tcplp_sys_get_ticks(); |
1038 | 0 | otInstance* instance = tp->instance; |
1039 | 0 | thflags = th->th_flags; |
1040 | 0 | tp->sackhint.last_sack_ack = 0; |
1041 | | |
1042 | | /* |
1043 | | * If this is either a state-changing packet or current state isn't |
1044 | | * established, we require a write lock on tcbinfo. Otherwise, we |
1045 | | * allow the tcbinfo to be either locked or unlocked, as the |
1046 | | * caller may have unnecessarily acquired a write lock due to a race. |
1047 | | */ |
1048 | | |
1049 | | /* samkumar: There used to be synchronization code here. */ |
1050 | 0 | KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", |
1051 | 0 | __func__)); |
1052 | 0 | KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", |
1053 | 0 | __func__)); |
1054 | | |
1055 | | /* |
1056 | | * Segment received on connection. |
1057 | | * Reset idle time and keep-alive timer. |
1058 | | * XXX: This should be done after segment |
1059 | | * validation to ignore broken/spoofed segs. |
1060 | | */ |
1061 | 0 | tp->t_rcvtime = ticks; |
1062 | 0 | if (TCPS_HAVEESTABLISHED(tp->t_state)) |
1063 | 0 | tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); |
1064 | | |
1065 | | /* |
1066 | | * Scale up the window into a 32-bit value. |
1067 | | * For the SYN_SENT state the scale is zero. |
1068 | | */ |
1069 | 0 | tiwin = th->th_win << tp->snd_scale; |
1070 | | |
1071 | | /* |
1072 | | * TCP ECN processing. |
1073 | | */ |
1074 | | /* |
1075 | | * samkumar: I intentionally left the TCPSTAT_INC lines below commented |
1076 | | * out, to avoid altering the structure of the code too much by |
1077 | | * reorganizing the switch statement. |
1078 | | */ |
1079 | 0 | if (tp->t_flags & TF_ECN_PERMIT) { |
1080 | 0 | if (thflags & TH_CWR) |
1081 | 0 | tp->t_flags &= ~TF_ECN_SND_ECE; |
1082 | 0 | switch (iptos & IPTOS_ECN_MASK) { |
1083 | 0 | case IPTOS_ECN_CE: |
1084 | 0 | tp->t_flags |= TF_ECN_SND_ECE; |
1085 | | //TCPSTAT_INC(tcps_ecn_ce); |
1086 | 0 | break; |
1087 | 0 | case IPTOS_ECN_ECT0: |
1088 | | //TCPSTAT_INC(tcps_ecn_ect0); |
1089 | 0 | break; |
1090 | 0 | case IPTOS_ECN_ECT1: |
1091 | | //TCPSTAT_INC(tcps_ecn_ect1); |
1092 | 0 | break; |
1093 | 0 | } |
1094 | | |
1095 | | /* Process a packet differently from RFC3168. */ |
1096 | 0 | cc_ecnpkt_handler(tp, th, iptos); |
1097 | | |
1098 | | /* Congestion experienced. */ |
1099 | 0 | if (thflags & TH_ECE) { |
1100 | 0 | cc_cong_signal(tp, th, CC_ECN); |
1101 | 0 | } |
1102 | 0 | } |
1103 | | |
1104 | | /* |
1105 | | * Parse options on any incoming segment. |
1106 | | */ |
1107 | 0 | tcp_dooptions(&to, (uint8_t *)(th + 1), |
1108 | 0 | ((th->th_off_x2 >> TH_OFF_SHIFT) << 2) - sizeof(struct tcphdr), |
1109 | 0 | (thflags & TH_SYN) ? TO_SYN : 0); |
1110 | | |
1111 | | /* |
1112 | | * If echoed timestamp is later than the current time, |
1113 | | * fall back to non RFC1323 RTT calculation. Normalize |
1114 | | * timestamp if syncookies were used when this connection |
1115 | | * was established. |
1116 | | */ |
1117 | |
|
1118 | 0 | if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { |
1119 | 0 | to.to_tsecr -= tp->ts_offset; |
1120 | 0 | if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) |
1121 | 0 | to.to_tsecr = 0; |
1122 | 0 | } |
1123 | | /* |
1124 | | * If timestamps were negotiated during SYN/ACK they should |
1125 | | * appear on every segment during this session and vice versa. |
1126 | | */ |
1127 | 0 | if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { |
1128 | | /* samkumar: See above comment regarding tcp_log_addrs. */ |
1129 | 0 | tcplp_sys_log("%s; %s: Timestamp missing, " |
1130 | 0 | "no action", "<addrs go here>", __func__); |
1131 | 0 | } |
1132 | 0 | if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { |
1133 | | /* samkumar: See above comment regarding tcp_log_addrs. */ |
1134 | 0 | tcplp_sys_log("%s; %s: Timestamp not expected, " |
1135 | 0 | "no action", "<addrs go here>", __func__); |
1136 | 0 | } |
1137 | | |
1138 | | /* |
1139 | | * Process options only when we get SYN/ACK back. The SYN case |
1140 | | * for incoming connections is handled in tcp_syncache. |
1141 | | * According to RFC1323 the window field in a SYN (i.e., a <SYN> |
1142 | | * or <SYN,ACK>) segment itself is never scaled. |
1143 | | * XXX this is traditional behavior, may need to be cleaned up. |
1144 | | */ |
1145 | 0 | if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { |
1146 | 0 | if ((to.to_flags & TOF_SCALE) && |
1147 | 0 | (tp->t_flags & TF_REQ_SCALE)) { |
1148 | 0 | tp->t_flags |= TF_RCVD_SCALE; |
1149 | 0 | tp->snd_scale = to.to_wscale; |
1150 | 0 | } |
1151 | | /* |
1152 | | * Initial send window. It will be updated with |
1153 | | * the next incoming segment to the scaled value. |
1154 | | */ |
1155 | 0 | tp->snd_wnd = th->th_win; |
1156 | 0 | if (to.to_flags & TOF_TS) { |
1157 | 0 | tp->t_flags |= TF_RCVD_TSTMP; |
1158 | 0 | tp->ts_recent = to.to_tsval; |
1159 | 0 | tp->ts_recent_age = tcp_ts_getticks(); |
1160 | 0 | } |
1161 | 0 | if (to.to_flags & TOF_MSS) |
1162 | 0 | tcp_mss(tp, to.to_mss); |
1163 | 0 | if ((tp->t_flags & TF_SACK_PERMIT) && |
1164 | 0 | (to.to_flags & TOF_SACKPERM) == 0) |
1165 | 0 | tp->t_flags &= ~TF_SACK_PERMIT; |
1166 | | /* |
1167 | | * samkumar: TCP Fast Open logic from FreeBSD 12.0. |
1168 | | */ |
1169 | 0 | if (IS_FASTOPEN(tp->t_flags)) { |
1170 | 0 | if (to.to_flags & TOF_FASTOPEN) { |
1171 | 0 | uint16_t mss; |
1172 | |
|
1173 | 0 | if (to.to_flags & TOF_MSS) |
1174 | 0 | mss = to.to_mss; |
1175 | 0 | else |
1176 | | /* |
1177 | | * samkumar: The original code here would set |
1178 | | * mss to either TCP6_MAXSS or TCP_MAXSS depending |
1179 | | * on whether the INP_IPV6 flag is present in |
1180 | | * tp->t_inpcb->inp_vflag. In TCPlp, we always |
1181 | | * assume IPv6. |
1182 | | */ |
1183 | 0 | mss = TCP6_MAXSS; |
1184 | 0 | tcp_fastopen_update_cache(tp, mss, |
1185 | 0 | to.to_tfo_len, to.to_tfo_cookie); |
1186 | 0 | } else |
1187 | 0 | tcp_fastopen_disable_path(tp); |
1188 | 0 | } |
1189 | 0 | } |
1190 | | /* |
1191 | | * Header prediction: check for the two common cases |
1192 | | * of a uni-directional data xfer. If the packet has |
1193 | | * no control flags, is in-sequence, the window didn't |
1194 | | * change and we're not retransmitting, it's a |
1195 | | * candidate. If the length is zero and the ack moved |
1196 | | * forward, we're the sender side of the xfer. Just |
1197 | | * free the data acked & wake any higher level process |
1198 | | * that was blocked waiting for space. If the length |
1199 | | * is non-zero and the ack didn't move, we're the |
1200 | | * receiver side. If we're getting packets in-order |
1201 | | * (the reassembly queue is empty), add the data to |
1202 | | * the socket buffer and note that we need a delayed ack. |
1203 | | * Make sure that the hidden state-flags are also off. |
1204 | | * Since we check for TCPS_ESTABLISHED first, it can only |
1205 | | * be TF_NEEDSYN. |
1206 | | */ |
1207 | | /* |
1208 | | * samkumar: Replaced LIST_EMPTY(&tp->tsegq) with the call to bmp_isempty. |
1209 | | */ |
1210 | 0 | if (tp->t_state == TCPS_ESTABLISHED && |
1211 | 0 | th->th_seq == tp->rcv_nxt && |
1212 | 0 | (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && |
1213 | 0 | tp->snd_nxt == tp->snd_max && |
1214 | 0 | tiwin && tiwin == tp->snd_wnd && |
1215 | 0 | ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && |
1216 | 0 | bmp_isempty(tp->reassbmp, REASSBMP_SIZE(tp)) && |
1217 | 0 | ((to.to_flags & TOF_TS) == 0 || |
1218 | 0 | TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { |
1219 | | |
1220 | | /* |
1221 | | * If last ACK falls within this segment's sequence numbers, |
1222 | | * record the timestamp. |
1223 | | * NOTE that the test is modified according to the latest |
1224 | | * proposal of the tcplw@cray.com list (Braden 1993/04/26). |
1225 | | */ |
1226 | 0 | if ((to.to_flags & TOF_TS) != 0 && |
1227 | 0 | SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { |
1228 | 0 | tp->ts_recent_age = tcp_ts_getticks(); |
1229 | 0 | tp->ts_recent = to.to_tsval; |
1230 | 0 | } |
1231 | |
|
1232 | 0 | if (tlen == 0) { |
1233 | 0 | if (SEQ_GT(th->th_ack, tp->snd_una) && |
1234 | 0 | SEQ_LEQ(th->th_ack, tp->snd_max) && |
1235 | 0 | !IN_RECOVERY(tp->t_flags) && |
1236 | 0 | (to.to_flags & TOF_SACK) == 0 && |
1237 | 0 | TAILQ_EMPTY(&tp->snd_holes)) { |
1238 | | /* |
1239 | | * This is a pure ack for outstanding data. |
1240 | | */ |
1241 | | |
1242 | | /* |
1243 | | * "bad retransmit" recovery. |
1244 | | */ |
1245 | 0 | if (tp->t_rxtshift == 1 && |
1246 | 0 | tp->t_flags & TF_PREVVALID && |
1247 | 0 | (int)(ticks - tp->t_badrxtwin) < 0) { |
1248 | 0 | cc_cong_signal(tp, th, CC_RTO_ERR); |
1249 | 0 | } |
1250 | | |
1251 | | /* |
1252 | | * Recalculate the transmit timer / rtt. |
1253 | | * |
1254 | | * Some boxes send broken timestamp replies |
1255 | | * during the SYN+ACK phase, ignore |
1256 | | * timestamps of 0 or we could calculate a |
1257 | | * huge RTT and blow up the retransmit timer. |
1258 | | */ |
1259 | |
|
1260 | 0 | if ((to.to_flags & TOF_TS) != 0 && |
1261 | 0 | to.to_tsecr) { |
1262 | 0 | uint32_t t; |
1263 | |
|
1264 | 0 | t = tcp_ts_getticks() - to.to_tsecr; |
1265 | 0 | if (!tp->t_rttlow || tp->t_rttlow > t) |
1266 | 0 | tp->t_rttlow = t; |
1267 | 0 | tcp_xmit_timer(tp, |
1268 | 0 | TCP_TS_TO_TICKS(t) + 1); |
1269 | 0 | } else if (tp->t_rtttime && |
1270 | 0 | SEQ_GT(th->th_ack, tp->t_rtseq)) { |
1271 | 0 | if (!tp->t_rttlow || |
1272 | 0 | tp->t_rttlow > ticks - tp->t_rtttime) |
1273 | 0 | tp->t_rttlow = ticks - tp->t_rtttime; |
1274 | 0 | tcp_xmit_timer(tp, |
1275 | 0 | ticks - tp->t_rtttime); |
1276 | 0 | } |
1277 | |
|
1278 | 0 | acked = BYTES_THIS_ACK(tp, th); |
1279 | | |
1280 | | /* |
1281 | | * samkumar: Replaced sbdrop(&so->so_snd, acked) with this call |
1282 | | * to lbuf_pop. |
1283 | | */ |
1284 | 0 | { |
1285 | 0 | uint32_t poppedbytes = lbuf_pop(&tp->sendbuf, acked, &sig->links_popped); |
1286 | 0 | KASSERT(poppedbytes == acked, ("More bytes were acked than are in the send buffer")); |
1287 | 0 | sig->bytes_acked += poppedbytes; |
1288 | 0 | } |
1289 | 0 | if (SEQ_GT(tp->snd_una, tp->snd_recover) && |
1290 | 0 | SEQ_LEQ(th->th_ack, tp->snd_recover)) |
1291 | 0 | tp->snd_recover = th->th_ack - 1; |
1292 | | |
1293 | | /* |
1294 | | * Let the congestion control algorithm update |
1295 | | * congestion control related information. This |
1296 | | * typically means increasing the congestion |
1297 | | * window. |
1298 | | */ |
1299 | 0 | cc_ack_received(tp, th, CC_ACK); |
1300 | |
|
1301 | 0 | tp->snd_una = th->th_ack; |
1302 | | /* |
1303 | | * Pull snd_wl2 up to prevent seq wrap relative |
1304 | | * to th_ack. |
1305 | | */ |
1306 | 0 | tp->snd_wl2 = th->th_ack; |
1307 | 0 | tp->t_dupacks = 0; |
1308 | | |
1309 | | /* |
1310 | | * If all outstanding data are acked, stop |
1311 | | * retransmit timer, otherwise restart timer |
1312 | | * using current (possibly backed-off) value. |
1313 | | * If process is waiting for space, |
1314 | | * wakeup/selwakeup/signal. If data |
1315 | | * are ready to send, let tcplp_output |
1316 | | * decide between more output or persist. |
1317 | | */ |
1318 | |
|
1319 | 0 | if (tp->snd_una == tp->snd_max) |
1320 | 0 | tcp_timer_activate(tp, TT_REXMT, 0); |
1321 | 0 | else if (!tcp_timer_active(tp, TT_PERSIST)) |
1322 | 0 | tcp_timer_activate(tp, TT_REXMT, |
1323 | 0 | tp->t_rxtcur); |
1324 | | |
1325 | | /* |
1326 | | * samkumar: There used to be a call to sowwakeup(so); here, |
1327 | | * which wakes up any threads waiting for the socket to |
1328 | | * become ready for writing. TCPlp handles its send buffer |
1329 | | * differently so we do not need to replace this call with |
1330 | | * specialized code to handle this. |
1331 | | */ |
1332 | | |
1333 | | /* |
1334 | | * samkumar: Replaced sbavail(&so->so_snd) with this call to |
1335 | | * lbuf_used_space. |
1336 | | */ |
1337 | 0 | if (lbuf_used_space(&tp->sendbuf)) |
1338 | 0 | (void) tcplp_output(tp); |
1339 | 0 | goto check_delack; |
1340 | 0 | } |
1341 | 0 | } else if (th->th_ack == tp->snd_una && |
1342 | | /* |
1343 | | * samkumar: Replaced sbspace(&so->so_rcv) with this call to |
1344 | | * cbuf_free_space. |
1345 | | */ |
1346 | 0 | tlen <= cbuf_free_space(&tp->recvbuf)) { |
1347 | | |
1348 | | /* |
1349 | | * This is a pure, in-sequence data packet with |
1350 | | * nothing on the reassembly queue and we have enough |
1351 | | * buffer space to take it. |
1352 | | */ |
1353 | | /* Clean receiver SACK report if present */ |
1354 | 0 | if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) |
1355 | 0 | tcp_clean_sackreport(tp); |
1356 | |
|
1357 | 0 | tp->rcv_nxt += tlen; |
1358 | | /* |
1359 | | * Pull snd_wl1 up to prevent seq wrap relative to |
1360 | | * th_seq. |
1361 | | */ |
1362 | 0 | tp->snd_wl1 = th->th_seq; |
1363 | | /* |
1364 | | * Pull rcv_up up to prevent seq wrap relative to |
1365 | | * rcv_nxt. |
1366 | | */ |
1367 | 0 | tp->rcv_up = tp->rcv_nxt; |
1368 | | |
1369 | | /* |
1370 | | * Automatic sizing of receive socket buffer. Often the send |
1371 | | * buffer size is not optimally adjusted to the actual network |
1372 | | * conditions at hand (delay bandwidth product). Setting the |
1373 | | * buffer size too small limits throughput on links with high |
1374 | | * bandwidth and high delay (eg. trans-continental/oceanic links). |
1375 | | * |
1376 | | * On the receive side the socket buffer memory is only rarely |
1377 | | * used to any significant extent. This allows us to be much |
1378 | | * more aggressive in scaling the receive socket buffer. For |
1379 | | * the case that the buffer space is actually used to a large |
1380 | | * extent and we run out of kernel memory we can simply drop |
1381 | | * the new segments; TCP on the sender will just retransmit it |
1382 | | * later. Setting the buffer size too big may only consume too |
1383 | | * much kernel memory if the application doesn't read() from |
1384 | | * the socket or packet loss or reordering makes use of the |
1385 | | * reassembly queue. |
1386 | | * |
1387 | | * The criteria to step up the receive buffer one notch are: |
1388 | | * 1. Application has not set receive buffer size with |
1389 | | * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE. |
1390 | | * 2. the number of bytes received during the time it takes |
1391 | | * one timestamp to be reflected back to us (the RTT); |
1392 | | * 3. received bytes per RTT is within seven eighth of the |
1393 | | * current socket buffer size; |
1394 | | * 4. receive buffer size has not hit maximal automatic size; |
1395 | | * |
1396 | | * This algorithm does one step per RTT at most and only if |
1397 | | * we receive a bulk stream w/o packet losses or reorderings. |
1398 | | * Shrinking the buffer during idle times is not necessary as |
1399 | | * it doesn't consume any memory when idle. |
1400 | | * |
1401 | | * TODO: Only step up if the application is actually serving |
1402 | | * the buffer to better manage the socket buffer resources. |
1403 | | */ |
1404 | | |
1405 | | /* |
1406 | | * samkumar: There used to be code here to dynamically size the |
1407 | | * receive buffer (tp->rfbuf_ts, tp->rfbuf_cnt, and the local |
1408 | | * newsize variable). In TCPlp, we don't support this, as the user |
1409 | | * allocates the receive buffer and its size can't be changed here. |
1410 | | * Therefore, I removed the code that does this. Note that the |
1411 | | * actual resizing of the buffer is done using sbreserve_locked, |
1412 | | * whose call comes later (not exactly where this comment is). |
1413 | | */ |
1414 | | |
1415 | | /* Add data to socket buffer. */ |
1416 | | |
1417 | | /* |
1418 | | * samkumar: The code that was here would just free the mbuf |
1419 | | * (with m_freem(m)) if SBS_CANTRCVMORE is set in |
1420 | | * so->so_rcv.sb_state. Otherwise, it would cut drop_hdrlen bytes |
1421 | | * from the mbuf (using m_adj(m, drop_hdrlen)) to discard the |
1422 | | * headers and then append the mbuf to the receive buffer using |
1423 | | * sbappendstream_locked(&so->so_rcv, m, 0). I've rewritten this |
1424 | | * to work the TCPlp way. The check to so->so_rcv.sb_state is |
1425 | | * replaced by a tcpiscantrcv call, and we copy bytes into |
1426 | | * TCPlp's circular buffer (since we designed it to avoid |
1427 | | * having dynamically-allocated memory for the receive buffer). |
1428 | | */ |
1429 | |
|
1430 | 0 | if (!tpiscantrcv(tp)) { |
1431 | 0 | cbuf_write(&tp->recvbuf, msg, otMessageGetOffset(msg) + drop_hdrlen, tlen, cbuf_copy_from_message); |
1432 | 0 | if (tlen > 0) { |
1433 | 0 | sig->recvbuf_added = true; |
1434 | 0 | } |
1435 | 0 | } else { |
1436 | | /* |
1437 | | * samkumar: We already know tlen != 0, so if we got here, then |
1438 | | * it means that we got data after we called SHUT_RD, or after |
1439 | | * receiving a FIN. I'm going to drop the connection in this |
1440 | | * case. I think FreeBSD might have just dropped the packet |
1441 | | * silently, but Linux handles it this way; this seems to be |
1442 | | * the right approach to me. |
1443 | | */ |
1444 | 0 | tcp_drop(tp, ECONNABORTED); |
1445 | 0 | goto drop; |
1446 | 0 | } |
1447 | | /* NB: sorwakeup_locked() does an implicit unlock. */ |
1448 | | /* |
1449 | | * samkumar: There used to be a call to sorwakeup_locked(so); here, |
1450 | | * which wakes up any threads waiting for the socket to become |
1451 | | * ready for reading. TCPlp handles its buffering |
1452 | | * differently so we do not need to replace this call with |
1453 | | * specialized code to handle this. |
1454 | | */ |
1455 | 0 | if (DELAY_ACK(tp, tlen)) { |
1456 | 0 | tp->t_flags |= TF_DELACK; |
1457 | 0 | } else { |
1458 | 0 | tp->t_flags |= TF_ACKNOW; |
1459 | 0 | tcplp_output(tp); |
1460 | 0 | } |
1461 | 0 | goto check_delack; |
1462 | 0 | } |
1463 | 0 | } |
1464 | | |
1465 | | /* |
1466 | | * Calculate amount of space in receive window, |
1467 | | * and then do TCP input processing. |
1468 | | * Receive window is amount of space in rcv queue, |
1469 | | * but not less than advertised window. |
1470 | | */ |
1471 | | /* samkumar: Replaced sbspace(&so->so_rcv) with call to cbuf_free_space. */ |
1472 | 0 | win = cbuf_free_space(&tp->recvbuf); |
1473 | 0 | if (win < 0) |
1474 | 0 | win = 0; |
1475 | 0 | tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); |
1476 | | |
1477 | | /* Reset receive buffer auto scaling when not in bulk receive mode. */ |
1478 | | /* samkumar: Removed this receive buffer autoscaling code. */ |
1479 | |
|
1480 | 0 | switch (tp->t_state) { |
1481 | | |
1482 | | /* |
1483 | | * If the state is SYN_RECEIVED: |
1484 | | * if seg contains an ACK, but not for our SYN/ACK, send a RST. |
1485 | | * (Added by Sam) if seg is resending the original SYN, resend the SYN/ACK |
1486 | | */ |
1487 | | /* |
1488 | | * samkumar: If we receive a retransmission of the original SYN, then |
1489 | | * resend the SYN/ACK segment. This case was probably handled by the |
1490 | | * SYN cache. Because TCPlp does not use a SYN cache, we need to write |
1491 | | * custom logic for it. It is handled in the "else if" clause below. |
1492 | | */ |
1493 | 0 | case TCPS_SYN_RECEIVED: |
1494 | 0 | if ((thflags & TH_ACK) && |
1495 | 0 | (SEQ_LEQ(th->th_ack, tp->snd_una) || |
1496 | 0 | SEQ_GT(th->th_ack, tp->snd_max))) { |
1497 | 0 | rstreason = BANDLIM_RST_OPENPORT; |
1498 | 0 | goto dropwithreset; |
1499 | 0 | } else if (!IS_FASTOPEN(tp->t_flags) && (thflags & TH_SYN) && !(thflags & TH_ACK) && (th->th_seq == tp->irs)) { |
1500 | 0 | tp->t_flags |= TF_ACKNOW; |
1501 | 0 | } |
1502 | | /* |
1503 | | * samkumar: TCP Fast Open Logic from FreeBSD 12.0. |
1504 | | */ |
1505 | 0 | if (IS_FASTOPEN(tp->t_flags)) { |
1506 | | /* |
1507 | | * When a TFO connection is in SYN_RECEIVED, the |
1508 | | * only valid packets are the initial SYN, a |
1509 | | * retransmit/copy of the initial SYN (possibly with |
1510 | | * a subset of the original data), a valid ACK, a |
1511 | | * FIN, or a RST. |
1512 | | */ |
1513 | 0 | if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) { |
1514 | 0 | rstreason = BANDLIM_RST_OPENPORT; |
1515 | 0 | goto dropwithreset; |
1516 | 0 | } else if (thflags & TH_SYN) { |
1517 | | /* non-initial SYN is ignored */ |
1518 | 0 | if ((tcp_timer_active(tp, TT_DELACK) || |
1519 | 0 | tcp_timer_active(tp, TT_REXMT))) |
1520 | 0 | goto drop; |
1521 | 0 | } else if (!(thflags & (TH_ACK|TH_FIN|TH_RST))) { |
1522 | 0 | goto drop; |
1523 | 0 | } |
1524 | 0 | } |
1525 | 0 | break; |
1526 | | |
1527 | | /* |
1528 | | * If the state is SYN_SENT: |
1529 | | * if seg contains an ACK, but not for our SYN, drop the input. |
1530 | | * if seg contains a RST, then drop the connection. |
1531 | | * if seg does not contain SYN, then drop it. |
1532 | | * Otherwise this is an acceptable SYN segment |
1533 | | * initialize tp->rcv_nxt and tp->irs |
1534 | | * if seg contains ack then advance tp->snd_una |
1535 | | * if seg contains an ECE and ECN support is enabled, the stream |
1536 | | * is ECN capable. |
1537 | | * if SYN has been acked change to ESTABLISHED else SYN_RCVD state |
1538 | | * arrange for segment to be acked (eventually) |
1539 | | * continue processing rest of data/controls, beginning with URG |
1540 | | */ |
1541 | 0 | case TCPS_SYN_SENT: |
1542 | 0 | if ((thflags & TH_ACK) && |
1543 | 0 | (SEQ_LEQ(th->th_ack, tp->iss) || |
1544 | 0 | SEQ_GT(th->th_ack, tp->snd_max))) { |
1545 | 0 | rstreason = BANDLIM_UNLIMITED; |
1546 | 0 | goto dropwithreset; |
1547 | 0 | } |
1548 | 0 | if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) { |
1549 | 0 | tp = tcp_drop(tp, ECONNREFUSED); |
1550 | 0 | } |
1551 | 0 | if (thflags & TH_RST) |
1552 | 0 | goto drop; |
1553 | 0 | if (!(thflags & TH_SYN)) |
1554 | 0 | goto drop; |
1555 | | |
1556 | 0 | tp->irs = th->th_seq; |
1557 | 0 | tcp_rcvseqinit(tp); |
1558 | 0 | if (thflags & TH_ACK) { |
1559 | 0 | int tfo_partial_ack = 0; |
1560 | | |
1561 | | /* |
1562 | | * samkumar: Removed call to soisconnected(so), since TCPlp has its |
1563 | | * own buffering. |
1564 | | */ |
1565 | | |
1566 | | /* Do window scaling on this connection? */ |
1567 | 0 | if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == |
1568 | 0 | (TF_RCVD_SCALE|TF_REQ_SCALE)) { |
1569 | 0 | tp->rcv_scale = tp->request_r_scale; |
1570 | 0 | } |
1571 | 0 | tp->rcv_adv += imin(tp->rcv_wnd, |
1572 | 0 | TCP_MAXWIN << tp->rcv_scale); |
1573 | 0 | tp->snd_una++; /* SYN is acked */ |
1574 | | /* |
1575 | | * If not all the data that was sent in the TFO SYN |
1576 | | * has been acked, resend the remainder right away. |
1577 | | */ |
1578 | 0 | if (IS_FASTOPEN(tp->t_flags) && |
1579 | 0 | (tp->snd_una != tp->snd_max)) { |
1580 | 0 | tp->snd_nxt = th->th_ack; |
1581 | 0 | tfo_partial_ack = 1; |
1582 | 0 | } |
1583 | | /* |
1584 | | * If there's data, delay ACK; if there's also a FIN |
1585 | | * ACKNOW will be turned on later. |
1586 | | */ |
1587 | 0 | if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial_ack) |
1588 | 0 | tcp_timer_activate(tp, TT_DELACK, |
1589 | 0 | tcp_delacktime); |
1590 | 0 | else |
1591 | 0 | tp->t_flags |= TF_ACKNOW; |
1592 | |
|
1593 | 0 | if ((thflags & TH_ECE) && V_tcp_do_ecn) { |
1594 | 0 | tp->t_flags |= TF_ECN_PERMIT; |
1595 | 0 | } |
1596 | | |
1597 | | /* |
1598 | | * Received <SYN,ACK> in SYN_SENT[*] state. |
1599 | | * Transitions: |
1600 | | * SYN_SENT --> ESTABLISHED |
1601 | | * SYN_SENT* --> FIN_WAIT_1 |
1602 | | */ |
1603 | 0 | tp->t_starttime = ticks; |
1604 | 0 | if (tp->t_flags & TF_NEEDFIN) { |
1605 | 0 | tcp_state_change(tp, TCPS_FIN_WAIT_1); |
1606 | 0 | tp->t_flags &= ~TF_NEEDFIN; |
1607 | 0 | thflags &= ~TH_SYN; |
1608 | 0 | } else { |
1609 | 0 | tcp_state_change(tp, TCPS_ESTABLISHED); |
1610 | | /* samkumar: Set conn_established signal for TCPlp. */ |
1611 | 0 | sig->conn_established = true; |
1612 | 0 | cc_conn_init(tp); |
1613 | 0 | tcp_timer_activate(tp, TT_KEEP, |
1614 | 0 | TP_KEEPIDLE(tp)); |
1615 | 0 | } |
1616 | 0 | } else { |
1617 | | /* |
1618 | | * Received initial SYN in SYN-SENT[*] state => |
1619 | | * simultaneous open. |
1620 | | * If it succeeds, connection is * half-synchronized. |
1621 | | * Otherwise, do 3-way handshake: |
1622 | | * SYN-SENT -> SYN-RECEIVED |
1623 | | * SYN-SENT* -> SYN-RECEIVED* |
1624 | | */ |
1625 | 0 | tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); |
1626 | 0 | tcp_timer_activate(tp, TT_REXMT, 0); |
1627 | 0 | tcp_state_change(tp, TCPS_SYN_RECEIVED); |
1628 | | /* |
1629 | | * samkumar: We would have incremented snd_nxt in tcplp_output when |
1630 | | * we sent the original SYN, so decrement it here. (Another |
1631 | | * consequence of removing the SYN cache.) |
1632 | | */ |
1633 | 0 | tp->snd_nxt--; |
1634 | 0 | } |
1635 | | |
1636 | | /* |
1637 | | * Advance th->th_seq to correspond to first data byte. |
1638 | | * If data, trim to stay within window, |
1639 | | * dropping FIN if necessary. |
1640 | | */ |
1641 | 0 | th->th_seq++; |
1642 | 0 | if (tlen > tp->rcv_wnd) { |
1643 | 0 | todrop = tlen - tp->rcv_wnd; |
1644 | | /* |
1645 | | * samkumar: I removed a call to m_adj(m, -todrop), which intends |
1646 | | * to trim the data so it fits in the window. We can just read less |
1647 | | * when copying into the receive buffer in TCPlp, so we don't need |
1648 | | * to do this. |
1649 | | */ |
1650 | 0 | (void) todrop; /* samkumar: Prevent a compiler warning */ |
1651 | 0 | tlen = tp->rcv_wnd; |
1652 | 0 | thflags &= ~TH_FIN; |
1653 | 0 | } |
1654 | 0 | tp->snd_wl1 = th->th_seq - 1; |
1655 | 0 | tp->rcv_up = th->th_seq; |
1656 | | /* |
1657 | | * Client side of transaction: already sent SYN and data. |
1658 | | * If the remote host used T/TCP to validate the SYN, |
1659 | | * our data will be ACK'd; if so, enter normal data segment |
1660 | | * processing in the middle of step 5, ack processing. |
1661 | | * Otherwise, goto step 6. |
1662 | | */ |
1663 | 0 | if (thflags & TH_ACK) |
1664 | 0 | goto process_ACK; |
1665 | | |
1666 | 0 | goto step6; |
1667 | | |
1668 | | /* |
1669 | | * If the state is LAST_ACK or CLOSING or TIME_WAIT: |
1670 | | * do normal processing. |
1671 | | * |
1672 | | * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. |
1673 | | */ |
1674 | 0 | case TCPS_LAST_ACK: |
1675 | 0 | case TCPS_CLOSING: |
1676 | 0 | break; /* continue normal processing */ |
1677 | 0 | } |
1678 | | |
1679 | | /* |
1680 | | * States other than LISTEN or SYN_SENT. |
1681 | | * First check the RST flag and sequence number since reset segments |
1682 | | * are exempt from the timestamp and connection count tests. This |
1683 | | * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix |
1684 | | * below which allowed reset segments in half the sequence space |
1685 | | * to fall through and be processed (which gives forged reset |
1686 | | * segments with a random sequence number a 50 percent chance of |
1687 | | * killing a connection). |
1688 | | * Then check timestamp, if present. |
1689 | | * Then check the connection count, if present. |
1690 | | * Then check that at least some bytes of segment are within |
1691 | | * receive window. If segment begins before rcv_nxt, |
1692 | | * drop leading data (and SYN); if nothing left, just ack. |
1693 | | */ |
1694 | 0 | if (thflags & TH_RST) { |
1695 | | /* |
1696 | | * RFC5961 Section 3.2 |
1697 | | * |
1698 | | * - RST drops connection only if SEG.SEQ == RCV.NXT. |
1699 | | * - If RST is in window, we send challenge ACK. |
1700 | | * |
1701 | | * Note: to take into account delayed ACKs, we should |
1702 | | * test against last_ack_sent instead of rcv_nxt. |
1703 | | * Note 2: we handle special case of closed window, not |
1704 | | * covered by the RFC. |
1705 | | */ |
1706 | 0 | if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && |
1707 | 0 | SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || |
1708 | 0 | (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { |
1709 | | |
1710 | | /* |
1711 | | * samkumar: This if statement used to also be prefaced with |
1712 | | * "V_tcp_insecure_rst ||". But I removed it, since there's no |
1713 | | * reason to support an insecure option in TCPlp (my guess is that |
1714 | | * FreeBSD supported it for legacy reasons). |
1715 | | */ |
1716 | 0 | if (tp->last_ack_sent == th->th_seq) { |
1717 | | /* |
1718 | | * samkumar: Normally, the error number would be stored in |
1719 | | * so->so_error. Instead, we put it in this "droperror" local |
1720 | | * variable and then pass it to tcplp_sys_connection_lost. |
1721 | | */ |
1722 | 0 | int droperror = 0; |
1723 | | /* Drop the connection. */ |
1724 | 0 | switch (tp->t_state) { |
1725 | 0 | case TCPS_SYN_RECEIVED: |
1726 | 0 | droperror = ECONNREFUSED; |
1727 | 0 | goto close; |
1728 | 0 | case TCPS_ESTABLISHED: |
1729 | 0 | case TCPS_FIN_WAIT_1: |
1730 | 0 | case TCPS_FIN_WAIT_2: |
1731 | 0 | case TCPS_CLOSE_WAIT: |
1732 | 0 | droperror = ECONNRESET; |
1733 | 0 | close: |
1734 | 0 | tcp_state_change(tp, TCPS_CLOSED); |
1735 | | /* FALLTHROUGH */ |
1736 | 0 | default: |
1737 | 0 | tp = tcp_close_tcb(tp); |
1738 | 0 | tcplp_sys_connection_lost(tp, droperror); |
1739 | 0 | } |
1740 | 0 | } else { |
1741 | | /* Send challenge ACK. */ |
1742 | 0 | tcp_respond(tp, tp->instance, ip6, th, tp->rcv_nxt, tp->snd_nxt, TH_ACK); |
1743 | 0 | tp->last_ack_sent = tp->rcv_nxt; |
1744 | 0 | } |
1745 | 0 | } |
1746 | 0 | goto drop; |
1747 | 0 | } |
1748 | | |
1749 | | /* |
1750 | | * RFC5961 Section 4.2 |
1751 | | * Send challenge ACK for any SYN in synchronized state. |
1752 | | */ |
1753 | | /* |
1754 | | * samkumar: I added the check for the SYN-RECEIVED state in this if |
1755 | | * statement (another consequence of removing the SYN cache). |
1756 | | */ |
1757 | 0 | if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT && tp->t_state != TCP6S_SYN_RECEIVED) { |
1758 | | /* |
1759 | | * samkumar: The modern way to handle this is to send a Challenge ACK. |
1760 | | * FreeBSD supports this, but it also has this V_tcp_insecure_syn |
1761 | | * option that will cause it to drop the connection if the SYN falls |
1762 | | * in the receive window. In TCPlp we *only* support Challenge ACKs |
1763 | | * (the secure way of doing it), so I've removed code for the insecure |
1764 | | * way. (Presumably the reason why FreeBSD supports the insecure way is |
1765 | | * for legacy code, which we don't really care about in TCPlp). |
1766 | | */ |
1767 | | /* Send challenge ACK. */ |
1768 | 0 | tcplp_sys_log("Sending challenge ACK"); |
1769 | 0 | tcp_respond(tp, tp->instance, ip6, th, tp->rcv_nxt, tp->snd_nxt, TH_ACK); |
1770 | 0 | tp->last_ack_sent = tp->rcv_nxt; |
1771 | 0 | goto drop; |
1772 | 0 | } |
1773 | | |
1774 | | /* |
1775 | | * RFC 1323 PAWS: If we have a timestamp reply on this segment |
1776 | | * and it's less than ts_recent, drop it. |
1777 | | */ |
1778 | 0 | if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && |
1779 | 0 | TSTMP_LT(to.to_tsval, tp->ts_recent)) { |
1780 | | |
1781 | | /* Check to see if ts_recent is over 24 days old. */ |
1782 | 0 | if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { |
1783 | | /* |
1784 | | * Invalidate ts_recent. If this segment updates |
1785 | | * ts_recent, the age will be reset later and ts_recent |
1786 | | * will get a valid value. If it does not, setting |
1787 | | * ts_recent to zero will at least satisfy the |
1788 | | * requirement that zero be placed in the timestamp |
1789 | | * echo reply when ts_recent isn't valid. The |
1790 | | * age isn't reset until we get a valid ts_recent |
1791 | | * because we don't want out-of-order segments to be |
1792 | | * dropped when ts_recent is old. |
1793 | | */ |
1794 | 0 | tp->ts_recent = 0; |
1795 | 0 | } else { |
1796 | 0 | if (tlen) |
1797 | 0 | goto dropafterack; |
1798 | 0 | goto drop; |
1799 | 0 | } |
1800 | 0 | } |
1801 | | |
1802 | | /* |
1803 | | * In the SYN-RECEIVED state, validate that the packet belongs to |
1804 | | * this connection before trimming the data to fit the receive |
1805 | | * window. Check the sequence number versus IRS since we know |
1806 | | * the sequence numbers haven't wrapped. This is a partial fix |
1807 | | * for the "LAND" DoS attack. |
1808 | | */ |
1809 | 0 | if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { |
1810 | 0 | rstreason = BANDLIM_RST_OPENPORT; |
1811 | 0 | goto dropwithreset; |
1812 | 0 | } |
1813 | | |
1814 | 0 | todrop = tp->rcv_nxt - th->th_seq; |
1815 | 0 | if (todrop > 0) { |
1816 | 0 | if (thflags & TH_SYN) { |
1817 | 0 | thflags &= ~TH_SYN; |
1818 | 0 | th->th_seq++; |
1819 | 0 | if (th->th_urp > 1) |
1820 | 0 | th->th_urp--; |
1821 | 0 | else |
1822 | 0 | thflags &= ~TH_URG; |
1823 | 0 | todrop--; |
1824 | 0 | } |
1825 | | /* |
1826 | | * Following if statement from Stevens, vol. 2, p. 960. |
1827 | | */ |
1828 | 0 | if (todrop > tlen |
1829 | 0 | || (todrop == tlen && (thflags & TH_FIN) == 0)) { |
1830 | | /* |
1831 | | * Any valid FIN must be to the left of the window. |
1832 | | * At this point the FIN must be a duplicate or out |
1833 | | * of sequence; drop it. |
1834 | | */ |
1835 | 0 | thflags &= ~TH_FIN; |
1836 | | |
1837 | | /* |
1838 | | * Send an ACK to resynchronize and drop any data. |
1839 | | * But keep on processing for RST or ACK. |
1840 | | */ |
1841 | 0 | tp->t_flags |= TF_ACKNOW; |
1842 | 0 | todrop = tlen; |
1843 | 0 | } |
1844 | | /* samkumar: There was an else case that only collected stats. */ |
1845 | 0 | drop_hdrlen += todrop; /* drop from the top afterwards */ |
1846 | 0 | th->th_seq += todrop; |
1847 | 0 | tlen -= todrop; |
1848 | 0 | if (th->th_urp > todrop) |
1849 | 0 | th->th_urp -= todrop; |
1850 | 0 | else { |
1851 | 0 | thflags &= ~TH_URG; |
1852 | 0 | th->th_urp = 0; |
1853 | 0 | } |
1854 | 0 | } |
1855 | | |
1856 | | /* |
1857 | | * If new data are received on a connection after the |
1858 | | * user processes are gone, then RST the other end. |
1859 | | */ |
1860 | | /* |
1861 | | * samkumar: TCPlp is designed for embedded systems where there is no |
1862 | | * concept of a "process" that has allocated a TCP socket. Therefore, we |
1863 | | * do not implement the functionality in the above comment (the code for |
1864 | | * it used to be here, and I removed it). |
1865 | | */ |
1866 | | /* |
1867 | | * If segment ends after window, drop trailing data |
1868 | | * (and PUSH and FIN); if nothing left, just ACK. |
1869 | | */ |
1870 | 0 | todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); |
1871 | 0 | if (todrop > 0) { |
1872 | 0 | if (todrop >= tlen) { |
1873 | | /* |
1874 | | * If window is closed can only take segments at |
1875 | | * window edge, and have to drop data and PUSH from |
1876 | | * incoming segments. Continue processing, but |
1877 | | * remember to ack. Otherwise, drop segment |
1878 | | * and ack. |
1879 | | */ |
1880 | 0 | if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { |
1881 | 0 | tp->t_flags |= TF_ACKNOW; |
1882 | 0 | } else |
1883 | 0 | goto dropafterack; |
1884 | 0 | } |
1885 | | /* |
1886 | | * samkumar: I removed a call to m_adj(m, -todrop), which intends |
1887 | | * to trim the data so it fits in the window. We can just read less |
1888 | | * when copying into the receive buffer in TCPlp, so we don't need |
1889 | | * to do this. Subtracting it from tlen gives us enough information to |
1890 | | * do this later. In FreeBSD, this isn't possible because the mbuf |
1891 | | * itself becomes part of the receive buffer, so the mbuf has to be |
1892 | | * trimmed in order for this to work out. |
1893 | | */ |
1894 | 0 | tlen -= todrop; |
1895 | 0 | thflags &= ~(TH_PUSH|TH_FIN); |
1896 | 0 | } |
1897 | | |
1898 | | /* |
1899 | | * If last ACK falls within this segment's sequence numbers, |
1900 | | * record its timestamp. |
1901 | | * NOTE: |
1902 | | * 1) That the test incorporates suggestions from the latest |
1903 | | * proposal of the tcplw@cray.com list (Braden 1993/04/26). |
1904 | | * 2) That updating only on newer timestamps interferes with |
1905 | | * our earlier PAWS tests, so this check should be solely |
1906 | | * predicated on the sequence space of this segment. |
1907 | | * 3) That we modify the segment boundary check to be |
1908 | | * Last.ACK.Sent <= SEG.SEQ + SEG.Len |
1909 | | * instead of RFC1323's |
1910 | | * Last.ACK.Sent < SEG.SEQ + SEG.Len, |
1911 | | * This modified check allows us to overcome RFC1323's |
1912 | | * limitations as described in Stevens TCP/IP Illustrated |
1913 | | * Vol. 2 p.869. In such cases, we can still calculate the |
1914 | | * RTT correctly when RCV.NXT == Last.ACK.Sent. |
1915 | | */ |
1916 | | |
1917 | 0 | if ((to.to_flags & TOF_TS) != 0 && |
1918 | 0 | SEQ_LEQ(th->th_seq, tp->last_ack_sent) && |
1919 | 0 | SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + |
1920 | 0 | ((thflags & (TH_SYN|TH_FIN)) != 0))) { |
1921 | 0 | tp->ts_recent_age = tcp_ts_getticks(); |
1922 | 0 | tp->ts_recent = to.to_tsval; |
1923 | 0 | } |
1924 | | |
1925 | | /* |
1926 | | * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN |
1927 | | * flag is on (half-synchronized state), then queue data for |
1928 | | * later processing; else drop segment and return. |
1929 | | */ |
1930 | 0 | if ((thflags & TH_ACK) == 0) { |
1931 | 0 | if (tp->t_state == TCPS_SYN_RECEIVED || |
1932 | 0 | (tp->t_flags & TF_NEEDSYN)) { |
1933 | 0 | if (tp->t_state == TCPS_SYN_RECEIVED && |
1934 | 0 | IS_FASTOPEN(tp->t_flags)) { |
1935 | 0 | tp->snd_wnd = tiwin; |
1936 | 0 | cc_conn_init(tp); |
1937 | 0 | } |
1938 | 0 | goto step6; |
1939 | 0 | } else if (tp->t_flags & TF_ACKNOW) |
1940 | 0 | goto dropafterack; |
1941 | 0 | else |
1942 | 0 | goto drop; |
1943 | 0 | } |
1944 | | |
1945 | 0 | tcplp_sys_log("Processing ACK"); |
1946 | | |
1947 | | /* |
1948 | | * Ack processing. |
1949 | | */ |
1950 | 0 | switch (tp->t_state) { |
1951 | | |
1952 | | /* |
1953 | | * In SYN_RECEIVED state, the ack ACKs our SYN, so enter |
1954 | | * ESTABLISHED state and continue processing. |
1955 | | * The ACK was checked above. |
1956 | | */ |
1957 | 0 | case TCPS_SYN_RECEIVED: |
1958 | | /* |
1959 | | * samkumar: Removed call to soisconnected(so), since TCPlp has its |
1960 | | * own buffering. |
1961 | | */ |
1962 | | /* Do window scaling? */ |
1963 | 0 | if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == |
1964 | 0 | (TF_RCVD_SCALE|TF_REQ_SCALE)) { |
1965 | 0 | tp->rcv_scale = tp->request_r_scale; |
1966 | 0 | tp->snd_wnd = tiwin; |
1967 | 0 | } |
1968 | | /* |
1969 | | * Make transitions: |
1970 | | * SYN-RECEIVED -> ESTABLISHED |
1971 | | * SYN-RECEIVED* -> FIN-WAIT-1 |
1972 | | */ |
1973 | 0 | tp->t_starttime = ticks; |
1974 | | /* |
1975 | | * samkumar: I'm eliminating the TFO pending counter. |
1976 | | */ |
1977 | 0 | if (IS_FASTOPEN(tp->t_flags)/* && tp->t_tfo_pending */) {\ |
1978 | | /* |
1979 | | tcp_fastopen_decrement_counter(tp->t_tfo_pending); |
1980 | | tp->t_tfo_pending = NULL; |
1981 | | */ |
1982 | | |
1983 | | /* |
1984 | | * Account for the ACK of our SYN prior to |
1985 | | * regular ACK processing below. |
1986 | | */ |
1987 | 0 | tp->snd_una++; |
1988 | 0 | } |
1989 | 0 | if (tp->t_flags & TF_NEEDFIN) { |
1990 | 0 | tcp_state_change(tp, TCPS_FIN_WAIT_1); |
1991 | 0 | tp->t_flags &= ~TF_NEEDFIN; |
1992 | 0 | } else { |
1993 | 0 | tcp_state_change(tp, TCPS_ESTABLISHED); |
1994 | | /* samkumar: Set conn_established signal for TCPlp. */ |
1995 | 0 | sig->conn_established = true; |
1996 | | /* |
1997 | | * TFO connections call cc_conn_init() during SYN |
1998 | | * processing. Calling it again here for such |
1999 | | * connections is not harmless as it would undo the |
2000 | | * snd_cwnd reduction that occurs when a TFO SYN|ACK |
2001 | | * is retransmitted. |
2002 | | */ |
2003 | 0 | if (!IS_FASTOPEN(tp->t_flags)) |
2004 | 0 | cc_conn_init(tp); |
2005 | 0 | tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); |
2006 | | /* |
2007 | | * samkumar: I added this check to account for simultaneous open. |
2008 | | * If this socket was opened actively, then the fact that we are |
2009 | | * in SYN-RECEIVED indicates that we are in simultaneous open. |
2010 | | * Therefore, don't ACK the SYN-ACK (unless it contains data or |
2011 | | * something, which will be processed later). |
2012 | | */ |
2013 | 0 | if (!tpispassiveopen(tp)) { |
2014 | 0 | tp->t_flags &= ~TF_ACKNOW; |
2015 | 0 | } else { |
2016 | | /* |
2017 | | * samkumar: Otherwise, we entered the ESTABLISHED state by |
2018 | | * accepting a connection, so call the appropriate callback in |
2019 | | * TCPlp. TODO: consider using signals to handle this? |
2020 | | */ |
2021 | 0 | bool accepted = tcplp_sys_accepted_connection(tp->accepted_from, tp, &ip6->ip6_src, th->th_sport); |
2022 | 0 | if (!accepted) { |
2023 | 0 | rstreason = ECONNREFUSED; |
2024 | 0 | goto dropwithreset; |
2025 | 0 | } |
2026 | 0 | } |
2027 | 0 | } |
2028 | | /* |
2029 | | * If segment contains data or ACK, will call tcp_reass() |
2030 | | * later; if not, do so now to pass queued data to user. |
2031 | | */ |
2032 | 0 | if (tlen == 0 && (thflags & TH_FIN) == 0) |
2033 | 0 | (void) tcp_reass(tp, (struct tcphdr *)0, 0, |
2034 | 0 | (otMessage*)0, 0, sig); |
2035 | |
|
2036 | 0 | tp->snd_wl1 = th->th_seq - 1; |
2037 | | /* FALLTHROUGH */ |
2038 | | |
2039 | | /* |
2040 | | * In ESTABLISHED state: drop duplicate ACKs; ACK out of range |
2041 | | * ACKs. If the ack is in the range |
2042 | | * tp->snd_una < th->th_ack <= tp->snd_max |
2043 | | * then advance tp->snd_una to th->th_ack and drop |
2044 | | * data from the retransmission queue. If this ACK reflects |
2045 | | * more up to date window information we update our window information. |
2046 | | */ |
2047 | 0 | case TCPS_ESTABLISHED: |
2048 | 0 | case TCPS_FIN_WAIT_1: |
2049 | 0 | case TCPS_FIN_WAIT_2: |
2050 | 0 | case TCPS_CLOSE_WAIT: |
2051 | 0 | case TCPS_CLOSING: |
2052 | 0 | case TCPS_LAST_ACK: |
2053 | 0 | if (SEQ_GT(th->th_ack, tp->snd_max)) { |
2054 | 0 | goto dropafterack; |
2055 | 0 | } |
2056 | | |
2057 | 0 | if ((tp->t_flags & TF_SACK_PERMIT) && |
2058 | 0 | ((to.to_flags & TOF_SACK) || |
2059 | 0 | !TAILQ_EMPTY(&tp->snd_holes))) |
2060 | 0 | tcp_sack_doack(tp, &to, th->th_ack); |
2061 | |
|
2062 | 0 | if (SEQ_LEQ(th->th_ack, tp->snd_una)) { |
2063 | 0 | if (tlen == 0 && tiwin == tp->snd_wnd) { |
2064 | | /* |
2065 | | * If this is the first time we've seen a |
2066 | | * FIN from the remote, this is not a |
2067 | | * duplicate and it needs to be processed |
2068 | | * normally. This happens during a |
2069 | | * simultaneous close. |
2070 | | */ |
2071 | 0 | if ((thflags & TH_FIN) && |
2072 | 0 | (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { |
2073 | 0 | tp->t_dupacks = 0; |
2074 | 0 | break; |
2075 | 0 | } |
2076 | | /* |
2077 | | * If we have outstanding data (other than |
2078 | | * a window probe), this is a completely |
2079 | | * duplicate ack (ie, window info didn't |
2080 | | * change and FIN isn't set), |
2081 | | * the ack is the biggest we've |
2082 | | * seen and we've seen exactly our rexmt |
2083 | | * threshold of them, assume a packet |
2084 | | * has been dropped and retransmit it. |
2085 | | * Kludge snd_nxt & the congestion |
2086 | | * window so we send only this one |
2087 | | * packet. |
2088 | | * |
2089 | | * We know we're losing at the current |
2090 | | * window size so do congestion avoidance |
2091 | | * (set ssthresh to half the current window |
2092 | | * and pull our congestion window back to |
2093 | | * the new ssthresh). |
2094 | | * |
2095 | | * Dup acks mean that packets have left the |
2096 | | * network (they're now cached at the receiver) |
2097 | | * so bump cwnd by the amount in the receiver |
2098 | | * to keep a constant cwnd packets in the |
2099 | | * network. |
2100 | | * |
2101 | | * When using TCP ECN, notify the peer that |
2102 | | * we reduced the cwnd. |
2103 | | */ |
2104 | 0 | if (!tcp_timer_active(tp, TT_REXMT) || |
2105 | 0 | th->th_ack != tp->snd_una) |
2106 | 0 | tp->t_dupacks = 0; |
2107 | 0 | else if (++tp->t_dupacks > tcprexmtthresh || |
2108 | 0 | IN_FASTRECOVERY(tp->t_flags)) { |
2109 | 0 | cc_ack_received(tp, th, CC_DUPACK); |
2110 | 0 | if ((tp->t_flags & TF_SACK_PERMIT) && |
2111 | 0 | IN_FASTRECOVERY(tp->t_flags)) { |
2112 | 0 | int awnd; |
2113 | | |
2114 | | /* |
2115 | | * Compute the amount of data in flight first. |
2116 | | * We can inject new data into the pipe iff |
2117 | | * we have less than 1/2 the original window's |
2118 | | * worth of data in flight. |
2119 | | */ |
2120 | 0 | awnd = (tp->snd_nxt - tp->snd_fack) + |
2121 | 0 | tp->sackhint.sack_bytes_rexmit; |
2122 | 0 | if (awnd < tp->snd_ssthresh) { |
2123 | 0 | tp->snd_cwnd += tp->t_maxseg; |
2124 | 0 | if (tp->snd_cwnd > tp->snd_ssthresh) |
2125 | 0 | tp->snd_cwnd = tp->snd_ssthresh; |
2126 | 0 | } |
2127 | 0 | } else |
2128 | 0 | tp->snd_cwnd += tp->t_maxseg; |
2129 | | #ifdef INSTRUMENT_TCP |
2130 | | tcplp_sys_log("TCP DUPACK"); |
2131 | | #endif |
2132 | 0 | (void) tcplp_output(tp); |
2133 | 0 | goto drop; |
2134 | 0 | } else if (tp->t_dupacks == tcprexmtthresh) { |
2135 | 0 | tcp_seq onxt = tp->snd_nxt; |
2136 | | |
2137 | | /* |
2138 | | * If we're doing sack, check to |
2139 | | * see if we're already in sack |
2140 | | * recovery. If we're not doing sack, |
2141 | | * check to see if we're in newreno |
2142 | | * recovery. |
2143 | | */ |
2144 | 0 | if (tp->t_flags & TF_SACK_PERMIT) { |
2145 | 0 | if (IN_FASTRECOVERY(tp->t_flags)) { |
2146 | 0 | tp->t_dupacks = 0; |
2147 | 0 | break; |
2148 | 0 | } |
2149 | 0 | } else { |
2150 | 0 | if (SEQ_LEQ(th->th_ack, |
2151 | 0 | tp->snd_recover)) { |
2152 | 0 | tp->t_dupacks = 0; |
2153 | 0 | break; |
2154 | 0 | } |
2155 | 0 | } |
2156 | | /* Congestion signal before ack. */ |
2157 | 0 | cc_cong_signal(tp, th, CC_NDUPACK); |
2158 | 0 | cc_ack_received(tp, th, CC_DUPACK); |
2159 | 0 | tcp_timer_activate(tp, TT_REXMT, 0); |
2160 | 0 | tp->t_rtttime = 0; |
2161 | |
|
2162 | | #ifdef INSTRUMENT_TCP |
2163 | | tcplp_sys_log("TCP DUPACK_THRESH"); |
2164 | | #endif |
2165 | 0 | if (tp->t_flags & TF_SACK_PERMIT) { |
2166 | 0 | tp->sack_newdata = tp->snd_nxt; |
2167 | 0 | tp->snd_cwnd = tp->t_maxseg; |
2168 | 0 | (void) tcplp_output(tp); |
2169 | 0 | goto drop; |
2170 | 0 | } |
2171 | | |
2172 | 0 | tp->snd_nxt = th->th_ack; |
2173 | 0 | tp->snd_cwnd = tp->t_maxseg; |
2174 | 0 | (void) tcplp_output(tp); |
2175 | | /* |
2176 | | * samkumar: I added casts to uint64_t below to |
2177 | | * fix an OpenThread code scanning alert relating |
2178 | | * to integer overflow in multiplication. |
2179 | | */ |
2180 | 0 | tp->snd_cwnd = tp->snd_ssthresh + |
2181 | 0 | ((uint64_t) tp->t_maxseg) * |
2182 | 0 | ((uint64_t) (tp->t_dupacks - tp->snd_limited)); |
2183 | | #ifdef INSTRUMENT_TCP |
2184 | | tcplp_sys_log("TCP SET_cwnd %d", (int) tp->snd_cwnd); |
2185 | | #endif |
2186 | 0 | if (SEQ_GT(onxt, tp->snd_nxt)) |
2187 | 0 | tp->snd_nxt = onxt; |
2188 | 0 | goto drop; |
2189 | 0 | } else if (V_tcp_do_rfc3042) { |
2190 | | /* |
2191 | | * Process first and second duplicate |
2192 | | * ACKs. Each indicates a segment |
2193 | | * leaving the network, creating room |
2194 | | * for more. Make sure we can send a |
2195 | | * packet on reception of each duplicate |
2196 | | * ACK by increasing snd_cwnd by one |
2197 | | * segment. Restore the original |
2198 | | * snd_cwnd after packet transmission. |
2199 | | */ |
2200 | 0 | uint64_t oldcwnd; |
2201 | 0 | tcp_seq oldsndmax; |
2202 | 0 | uint32_t sent; |
2203 | 0 | int avail; |
2204 | 0 | cc_ack_received(tp, th, CC_DUPACK); |
2205 | 0 | oldcwnd = tp->snd_cwnd; |
2206 | 0 | oldsndmax = tp->snd_max; |
2207 | |
|
2208 | | #ifdef INSTRUMENT_TCP |
2209 | | tcplp_sys_log("TCP LIM_TRANS"); |
2210 | | #endif |
2211 | |
|
2212 | 0 | KASSERT(tp->t_dupacks == 1 || |
2213 | 0 | tp->t_dupacks == 2, |
2214 | 0 | ("%s: dupacks not 1 or 2", |
2215 | 0 | __func__)); |
2216 | 0 | if (tp->t_dupacks == 1) |
2217 | 0 | tp->snd_limited = 0; |
2218 | 0 | tp->snd_cwnd = |
2219 | 0 | (tp->snd_nxt - tp->snd_una) + |
2220 | 0 | (tp->t_dupacks - tp->snd_limited) * |
2221 | 0 | tp->t_maxseg; |
2222 | | /* |
2223 | | * Only call tcplp_output when there |
2224 | | * is new data available to be sent. |
2225 | | * Otherwise we would send pure ACKs. |
2226 | | */ |
2227 | | /* |
2228 | | * samkumar: Replace sbavail(&so->so_snd) with the call to |
2229 | | * lbuf_used_space. |
2230 | | */ |
2231 | 0 | avail = lbuf_used_space(&tp->sendbuf) - |
2232 | 0 | (tp->snd_nxt - tp->snd_una); |
2233 | 0 | if (avail > 0) |
2234 | 0 | (void) tcplp_output(tp); |
2235 | 0 | sent = tp->snd_max - oldsndmax; |
2236 | 0 | if (sent > tp->t_maxseg) { |
2237 | 0 | KASSERT((tp->t_dupacks == 2 && |
2238 | 0 | tp->snd_limited == 0) || |
2239 | 0 | (sent == tp->t_maxseg + 1 && |
2240 | 0 | tp->t_flags & TF_SENTFIN), |
2241 | 0 | ("%s: sent too much", |
2242 | 0 | __func__)); |
2243 | 0 | tp->snd_limited = 2; |
2244 | 0 | } else if (sent > 0) |
2245 | 0 | ++tp->snd_limited; |
2246 | 0 | tp->snd_cwnd = oldcwnd; |
2247 | | #ifdef INSTRUMENT_TCP |
2248 | | tcplp_sys_log("TCP RESET_cwnd %d", (int) tp->snd_cwnd); |
2249 | | #endif |
2250 | 0 | goto drop; |
2251 | 0 | } |
2252 | 0 | } else |
2253 | 0 | tp->t_dupacks = 0; |
2254 | 0 | break; |
2255 | 0 | } |
2256 | | |
2257 | 0 | KASSERT(SEQ_GT(th->th_ack, tp->snd_una), |
2258 | 0 | ("%s: th_ack <= snd_una", __func__)); |
2259 | | |
2260 | | /* |
2261 | | * If the congestion window was inflated to account |
2262 | | * for the other side's cached packets, retract it. |
2263 | | */ |
2264 | 0 | if (IN_FASTRECOVERY(tp->t_flags)) { |
2265 | 0 | if (SEQ_LT(th->th_ack, tp->snd_recover)) { |
2266 | 0 | if (tp->t_flags & TF_SACK_PERMIT) |
2267 | 0 | tcp_sack_partialack(tp, th); |
2268 | 0 | else |
2269 | 0 | tcp_newreno_partial_ack(tp, th); |
2270 | 0 | } else |
2271 | 0 | cc_post_recovery(tp, th); |
2272 | 0 | } |
2273 | |
|
2274 | 0 | tp->t_dupacks = 0; |
2275 | | /* |
2276 | | * If we reach this point, ACK is not a duplicate, |
2277 | | * i.e., it ACKs something we sent. |
2278 | | */ |
2279 | 0 | if (tp->t_flags & TF_NEEDSYN) { |
2280 | | /* |
2281 | | * T/TCP: Connection was half-synchronized, and our |
2282 | | * SYN has been ACK'd (so connection is now fully |
2283 | | * synchronized). Go to non-starred state, |
2284 | | * increment snd_una for ACK of SYN, and check if |
2285 | | * we can do window scaling. |
2286 | | */ |
2287 | 0 | tp->t_flags &= ~TF_NEEDSYN; |
2288 | 0 | tp->snd_una++; |
2289 | | /* Do window scaling? */ |
2290 | 0 | if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == |
2291 | 0 | (TF_RCVD_SCALE|TF_REQ_SCALE)) { |
2292 | 0 | tp->rcv_scale = tp->request_r_scale; |
2293 | | /* Send window already scaled. */ |
2294 | 0 | } |
2295 | 0 | } |
2296 | |
|
2297 | 0 | process_ACK: |
2298 | 0 | acked = BYTES_THIS_ACK(tp, th); |
2299 | |
|
2300 | 0 | tcplp_sys_log("Bytes acked: %d", acked); |
2301 | | /* |
2302 | | * If we just performed our first retransmit, and the ACK |
2303 | | * arrives within our recovery window, then it was a mistake |
2304 | | * to do the retransmit in the first place. Recover our |
2305 | | * original cwnd and ssthresh, and proceed to transmit where |
2306 | | * we left off. |
2307 | | */ |
2308 | 0 | if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && |
2309 | 0 | (int)(ticks - tp->t_badrxtwin) < 0) |
2310 | 0 | cc_cong_signal(tp, th, CC_RTO_ERR); |
2311 | | |
2312 | | /* |
2313 | | * If we have a timestamp reply, update smoothed |
2314 | | * round trip time. If no timestamp is present but |
2315 | | * transmit timer is running and timed sequence |
2316 | | * number was acked, update smoothed round trip time. |
2317 | | * Since we now have an rtt measurement, cancel the |
2318 | | * timer backoff (cf., Phil Karn's retransmit alg.). |
2319 | | * Recompute the initial retransmit timer. |
2320 | | * |
2321 | | * Some boxes send broken timestamp replies |
2322 | | * during the SYN+ACK phase, ignore |
2323 | | * timestamps of 0 or we could calculate a |
2324 | | * huge RTT and blow up the retransmit timer. |
2325 | | */ |
2326 | |
|
2327 | 0 | if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { |
2328 | 0 | uint32_t t; |
2329 | |
|
2330 | 0 | t = tcp_ts_getticks() - to.to_tsecr; |
2331 | 0 | if (!tp->t_rttlow || tp->t_rttlow > t) |
2332 | 0 | tp->t_rttlow = t; |
2333 | 0 | tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); |
2334 | 0 | } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { |
2335 | 0 | if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) |
2336 | 0 | tp->t_rttlow = ticks - tp->t_rtttime; |
2337 | 0 | tcp_xmit_timer(tp, ticks - tp->t_rtttime); |
2338 | 0 | } |
2339 | | |
2340 | | /* |
2341 | | * If all outstanding data is acked, stop retransmit |
2342 | | * timer and remember to restart (more output or persist). |
2343 | | * If there is more data to be acked, restart retransmit |
2344 | | * timer, using current (possibly backed-off) value. |
2345 | | */ |
2346 | 0 | if (th->th_ack == tp->snd_max) { |
2347 | 0 | tcp_timer_activate(tp, TT_REXMT, 0); |
2348 | 0 | needoutput = 1; |
2349 | 0 | } else if (!tcp_timer_active(tp, TT_PERSIST)) { |
2350 | 0 | tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); |
2351 | 0 | } |
2352 | | |
2353 | | /* |
2354 | | * If no data (only SYN) was ACK'd, |
2355 | | * skip rest of ACK processing. |
2356 | | */ |
2357 | 0 | if (acked == 0) |
2358 | 0 | goto step6; |
2359 | | |
2360 | | /* |
2361 | | * Let the congestion control algorithm update congestion |
2362 | | * control related information. This typically means increasing |
2363 | | * the congestion window. |
2364 | | */ |
2365 | 0 | cc_ack_received(tp, th, CC_ACK); |
2366 | | |
2367 | | /* |
2368 | | * samkumar: I replaced the calls to sbavail(&so->so_snd) with new |
2369 | | * calls to lbuf_used_space, and then I modified the code to actually |
2370 | | * remove data from the send buffer, formerly done via |
2371 | | * sbcut_locked(&so->so_snd, (int)sbavail(&so->so_snd)) in the if case |
2372 | | * and sbcut_locked(&so->so_snd, acked) in the else case, to use the |
2373 | | * data structures for TCPlp's data buffering. |
2374 | | */ |
2375 | 0 | if (acked > lbuf_used_space(&tp->sendbuf)) { |
2376 | 0 | uint32_t poppedbytes; |
2377 | 0 | uint32_t usedspace = lbuf_used_space(&tp->sendbuf); |
2378 | 0 | tp->snd_wnd -= usedspace; |
2379 | 0 | poppedbytes = lbuf_pop(&tp->sendbuf, usedspace, &sig->links_popped); |
2380 | 0 | KASSERT(poppedbytes == usedspace, ("Could not fully empty send buffer")); |
2381 | 0 | sig->bytes_acked += poppedbytes; |
2382 | 0 | ourfinisacked = 1; |
2383 | 0 | } else { |
2384 | 0 | uint32_t poppedbytes = lbuf_pop(&tp->sendbuf, acked, &sig->links_popped); |
2385 | 0 | KASSERT(poppedbytes == acked, ("Could not remove acked bytes from send buffer")); |
2386 | 0 | sig->bytes_acked += poppedbytes; |
2387 | 0 | tp->snd_wnd -= acked; |
2388 | 0 | ourfinisacked = 0; |
2389 | 0 | } |
2390 | | /* NB: sowwakeup_locked() does an implicit unlock. */ |
2391 | | /* |
2392 | | * samkumar: There used to be a call to sowwakeup(so); here, |
2393 | | * which wakes up any threads waiting for the socket to |
2394 | | * become ready for writing. TCPlp handles its send buffer |
2395 | | * differently so we do not need to replace this call with |
2396 | | * specialized code to handle this. |
2397 | | */ |
2398 | | /* Detect una wraparound. */ |
2399 | 0 | if (!IN_RECOVERY(tp->t_flags) && |
2400 | 0 | SEQ_GT(tp->snd_una, tp->snd_recover) && |
2401 | 0 | SEQ_LEQ(th->th_ack, tp->snd_recover)) |
2402 | 0 | tp->snd_recover = th->th_ack - 1; |
2403 | | /* XXXLAS: Can this be moved up into cc_post_recovery? */ |
2404 | 0 | if (IN_RECOVERY(tp->t_flags) && |
2405 | 0 | SEQ_GEQ(th->th_ack, tp->snd_recover)) { |
2406 | 0 | EXIT_RECOVERY(tp->t_flags); |
2407 | 0 | } |
2408 | 0 | tp->snd_una = th->th_ack; |
2409 | 0 | if (tp->t_flags & TF_SACK_PERMIT) { |
2410 | 0 | if (SEQ_GT(tp->snd_una, tp->snd_recover)) |
2411 | 0 | tp->snd_recover = tp->snd_una; |
2412 | 0 | } |
2413 | 0 | if (SEQ_LT(tp->snd_nxt, tp->snd_una)) |
2414 | 0 | tp->snd_nxt = tp->snd_una; |
2415 | |
|
2416 | 0 | switch (tp->t_state) { |
2417 | | |
2418 | | /* |
2419 | | * In FIN_WAIT_1 STATE in addition to the processing |
2420 | | * for the ESTABLISHED state if our FIN is now acknowledged |
2421 | | * then enter FIN_WAIT_2. |
2422 | | */ |
2423 | 0 | case TCPS_FIN_WAIT_1: |
2424 | 0 | if (ourfinisacked) { |
2425 | | /* |
2426 | | * If we can't receive any more |
2427 | | * data, then closing user can proceed. |
2428 | | * Starting the timer is contrary to the |
2429 | | * specification, but if we don't get a FIN |
2430 | | * we'll hang forever. |
2431 | | * |
2432 | | * XXXjl: |
2433 | | * we should release the tp also, and use a |
2434 | | * compressed state. |
2435 | | */ |
2436 | | /* |
2437 | | * samkumar: I replaced a check for the SBS_CANTRCVMORE flag |
2438 | | * in so->so_rcv.sb_state with a call to tpiscantrcv. |
2439 | | */ |
2440 | 0 | if (tpiscantrcv(tp)) { |
2441 | | /* samkumar: Removed a call to soisdisconnected(so). */ |
2442 | 0 | tcp_timer_activate(tp, TT_2MSL, |
2443 | 0 | (tcp_fast_finwait2_recycle ? |
2444 | 0 | tcp_finwait2_timeout : |
2445 | 0 | TP_MAXIDLE(tp))); |
2446 | 0 | } |
2447 | 0 | tcp_state_change(tp, TCPS_FIN_WAIT_2); |
2448 | 0 | } |
2449 | 0 | break; |
2450 | | |
2451 | | /* |
2452 | | * In CLOSING STATE in addition to the processing for |
2453 | | * the ESTABLISHED state if the ACK acknowledges our FIN |
2454 | | * then enter the TIME-WAIT state, otherwise ignore |
2455 | | * the segment. |
2456 | | */ |
2457 | 0 | case TCPS_CLOSING: |
2458 | 0 | if (ourfinisacked) { |
2459 | | /* |
2460 | | * samkumar: I added the line below. We need to avoid sending |
2461 | | * an ACK in the TIME-WAIT state, since we don't want to |
2462 | | * ACK ACKs. This edge case appears because TCPlp, unlike the |
2463 | | * original FreeBSD code, uses tcpcbs for connections in the |
2464 | | * TIME-WAIT state (FreeBSD uses a different, smaller |
2465 | | * structure). |
2466 | | */ |
2467 | 0 | tp->t_flags &= ~TF_ACKNOW; |
2468 | 0 | tcp_twstart(tp); |
2469 | 0 | return; |
2470 | 0 | } |
2471 | 0 | break; |
2472 | | |
2473 | | /* |
2474 | | * In LAST_ACK, we may still be waiting for data to drain |
2475 | | * and/or to be acked, as well as for the ack of our FIN. |
2476 | | * If our FIN is now acknowledged, delete the TCB, |
2477 | | * enter the closed state and return. |
2478 | | */ |
2479 | 0 | case TCPS_LAST_ACK: |
2480 | 0 | if (ourfinisacked) { |
2481 | 0 | tp = tcp_close_tcb(tp); |
2482 | 0 | tcplp_sys_connection_lost(tp, CONN_LOST_NORMAL); |
2483 | 0 | goto drop; |
2484 | 0 | } |
2485 | 0 | break; |
2486 | 0 | } |
2487 | 0 | } |
2488 | | |
2489 | 0 | step6: |
2490 | | |
2491 | | /* |
2492 | | * Update window information. |
2493 | | * Don't look at window if no ACK: TAC's send garbage on first SYN. |
2494 | | */ |
2495 | 0 | if ((thflags & TH_ACK) && |
2496 | 0 | (SEQ_LT(tp->snd_wl1, th->th_seq) || |
2497 | 0 | (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || |
2498 | 0 | (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { |
2499 | | /* keep track of pure window updates */ |
2500 | | /* |
2501 | | * samkumar: There used to be an if statement here that would check if |
2502 | | * this is a "pure" window update (tlen == 0 && |
2503 | | * tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) and keep |
2504 | | * statistics for how often that happens. |
2505 | | */ |
2506 | 0 | tp->snd_wnd = tiwin; |
2507 | 0 | tp->snd_wl1 = th->th_seq; |
2508 | 0 | tp->snd_wl2 = th->th_ack; |
2509 | 0 | if (tp->snd_wnd > tp->max_sndwnd) |
2510 | 0 | tp->max_sndwnd = tp->snd_wnd; |
2511 | 0 | needoutput = 1; |
2512 | 0 | } |
2513 | | |
2514 | | /* |
2515 | | * Process segments with URG. |
2516 | | */ |
2517 | | /* |
2518 | | * samkumar: TCPlp does not support the urgent pointer, so we omit all |
2519 | | * urgent-pointer-related processing and buffering. The code below is the |
2520 | | * code that was in the "else" case that handles no valid urgent data in |
2521 | | * the received packet. |
2522 | | */ |
2523 | 0 | { |
2524 | | /* |
2525 | | * If no out of band data is expected, |
2526 | | * pull receive urgent pointer along |
2527 | | * with the receive window. |
2528 | | */ |
2529 | 0 | if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) |
2530 | 0 | tp->rcv_up = tp->rcv_nxt; |
2531 | 0 | } |
2532 | | |
2533 | | /* |
2534 | | * Process the segment text, merging it into the TCP sequencing queue, |
2535 | | * and arranging for acknowledgment of receipt if necessary. |
2536 | | * This process logically involves adjusting tp->rcv_wnd as data |
2537 | | * is presented to the user (this happens in tcp_usrreq.c, |
2538 | | * case PRU_RCVD). If a FIN has already been received on this |
2539 | | * connection then we just ignore the text. |
2540 | | */ |
2541 | 0 | tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && |
2542 | 0 | IS_FASTOPEN(tp->t_flags)); |
2543 | 0 | if ((tlen || (thflags & TH_FIN) || tfo_syn) && |
2544 | 0 | TCPS_HAVERCVDFIN(tp->t_state) == 0) { |
2545 | 0 | tcp_seq save_start = th->th_seq; |
2546 | | /* |
2547 | | * samkumar: I removed a call to m_adj(m, drop_hdrlen), which intends |
2548 | | * to drop data from the mbuf so it can be chained into the receive |
2549 | | * header. This is not necessary for TCPlp because we copy the data |
2550 | | * anyway; we just add the offset when copying data into the receive |
2551 | | * buffer. |
2552 | | */ |
2553 | | /* |
2554 | | * Insert segment which includes th into TCP reassembly queue |
2555 | | * with control block tp. Set thflags to whether reassembly now |
2556 | | * includes a segment with FIN. This handles the common case |
2557 | | * inline (segment is the next to be received on an established |
2558 | | * connection, and the queue is empty), avoiding linkage into |
2559 | | * and removal from the queue and repetition of various |
2560 | | * conversions. |
2561 | | * Set DELACK for segments received in order, but ack |
2562 | | * immediately when segments are out of order (so |
2563 | | * fast retransmit can work). |
2564 | | */ |
2565 | | /* |
2566 | | * samkumar: I replaced LIST_EMPTY(&tp->t_segq) with the calls to |
2567 | | * tpiscantrcv and bmp_isempty on the second line below. |
2568 | | */ |
2569 | 0 | if (th->th_seq == tp->rcv_nxt && |
2570 | 0 | (tpiscantrcv(tp) || bmp_isempty(tp->reassbmp, REASSBMP_SIZE(tp))) && |
2571 | 0 | (TCPS_HAVEESTABLISHED(tp->t_state) || |
2572 | 0 | tfo_syn)) { |
2573 | 0 | if (DELAY_ACK(tp, tlen) || tfo_syn) |
2574 | 0 | tp->t_flags |= TF_DELACK; |
2575 | 0 | else |
2576 | 0 | tp->t_flags |= TF_ACKNOW; |
2577 | 0 | tp->rcv_nxt += tlen; |
2578 | 0 | thflags = th->th_flags & TH_FIN; |
2579 | | |
2580 | | /* |
2581 | | * samkumar: I replaced the code that used to be here (which would |
2582 | | * free the mbuf with m_freem(m) if the SBS_CANTRCVMORE flag is set |
2583 | | * on so->so_rcv.sb_state, and otherwise call |
2584 | | * sbappendstream_locked(&so->so_rcv, m, 0);). |
2585 | | */ |
2586 | 0 | if (!tpiscantrcv(tp)) { |
2587 | 0 | cbuf_write(&tp->recvbuf, msg, otMessageGetOffset(msg) + drop_hdrlen, tlen, cbuf_copy_from_message); |
2588 | 0 | if (tlen > 0) { |
2589 | 0 | sig->recvbuf_added = true; |
2590 | 0 | } |
2591 | 0 | } else if (tlen > 0) { |
2592 | | /* |
2593 | | * samkumar: We already know tlen != 0, so if we got here, then |
2594 | | * it means that we got data after we called SHUT_RD, or after |
2595 | | * receiving a FIN. I'm going to drop the connection in this |
2596 | | * case. I think FreeBSD might have just dropped the packet |
2597 | | * silently, but Linux handles it this way; this seems to be |
2598 | | * the right approach to me. |
2599 | | */ |
2600 | 0 | tcp_drop(tp, ECONNABORTED); |
2601 | 0 | goto drop; |
2602 | 0 | } |
2603 | | /* NB: sorwakeup_locked() does an implicit unlock. */ |
2604 | | /* |
2605 | | * samkumar: There used to be a call to sorwakeup_locked(so); here, |
2606 | | * which wakes up any threads waiting for the socket to |
2607 | | * become ready for reading. TCPlp handles its buffering |
2608 | | * differently so we do not need to replace this call with |
2609 | | * specialized code to handle this. |
2610 | | */ |
2611 | 0 | } else if (tpiscantrcv(tp)) { |
2612 | | /* |
2613 | | * samkumar: We will reach this point if we get out-of-order data |
2614 | | * on a socket which was shut down with SHUT_RD, or where we |
2615 | | * already received a FIN. My response here is to drop the segment |
2616 | | * and send an RST. |
2617 | | */ |
2618 | 0 | tcp_drop(tp, ECONNABORTED); |
2619 | 0 | goto drop; |
2620 | 0 | } else { |
2621 | | /* |
2622 | | * XXX: Due to the header drop above "th" is |
2623 | | * theoretically invalid by now. Fortunately |
2624 | | * m_adj() doesn't actually free any mbufs |
2625 | | * when trimming from the head. |
2626 | | */ |
2627 | 0 | thflags = tcp_reass(tp, th, &tlen, msg, otMessageGetOffset(msg) + drop_hdrlen, sig); |
2628 | 0 | tp->t_flags |= TF_ACKNOW; |
2629 | 0 | } |
2630 | | // Only place tlen is used after the call to tcp_reass is below |
2631 | 0 | if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) |
2632 | 0 | tcp_update_sack_list(tp, save_start, save_start + tlen); |
2633 | | /* |
2634 | | * samkumar: This is not me commenting things out; this was already |
2635 | | * commented out in the FreeBSD code. |
2636 | | */ |
2637 | | #if 0 |
2638 | | /* |
2639 | | * Note the amount of data that peer has sent into |
2640 | | * our window, in order to estimate the sender's |
2641 | | * buffer size. |
2642 | | * XXX: Unused. |
2643 | | */ |
2644 | | if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) |
2645 | | len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); |
2646 | | else |
2647 | | len = so->so_rcv.sb_hiwat; |
2648 | | #endif |
2649 | 0 | } else { |
2650 | 0 | thflags &= ~TH_FIN; |
2651 | 0 | } |
2652 | | |
2653 | | /* |
2654 | | * If FIN is received ACK the FIN and let the user know |
2655 | | * that the connection is closing. |
2656 | | */ |
2657 | 0 | if (thflags & TH_FIN) { |
2658 | 0 | tcplp_sys_log("FIN Processing start"); |
2659 | 0 | if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { |
2660 | | /* samkumar: replace socantrcvmore with tpcantrcvmore */ |
2661 | 0 | tpcantrcvmore(tp); |
2662 | | /* |
2663 | | * If connection is half-synchronized |
2664 | | * (ie NEEDSYN flag on) then delay ACK, |
2665 | | * so it may be piggybacked when SYN is sent. |
2666 | | * Otherwise, since we received a FIN then no |
2667 | | * more input can be expected, send ACK now. |
2668 | | */ |
2669 | 0 | if (tp->t_flags & TF_NEEDSYN) |
2670 | 0 | tp->t_flags |= TF_DELACK; |
2671 | 0 | else |
2672 | 0 | tp->t_flags |= TF_ACKNOW; |
2673 | 0 | tp->rcv_nxt++; |
2674 | 0 | } |
2675 | | /* |
2676 | | * samkumar: This -2 state is added by me, so that we do not consider |
2677 | | * any more FINs in reassembly. |
2678 | | */ |
2679 | 0 | if (tp->reass_fin_index != -2) { |
2680 | 0 | sig->rcvd_fin = true; |
2681 | 0 | tp->reass_fin_index = -2; |
2682 | 0 | } |
2683 | 0 | switch (tp->t_state) { |
2684 | | |
2685 | | /* |
2686 | | * In SYN_RECEIVED and ESTABLISHED STATES |
2687 | | * enter the CLOSE_WAIT state. |
2688 | | */ |
2689 | 0 | case TCPS_SYN_RECEIVED: |
2690 | 0 | tp->t_starttime = ticks; |
2691 | | /* FALLTHROUGH */ |
2692 | 0 | case TCPS_ESTABLISHED: |
2693 | 0 | tcp_state_change(tp, TCPS_CLOSE_WAIT); |
2694 | 0 | break; |
2695 | | |
2696 | | /* |
2697 | | * If still in FIN_WAIT_1 STATE FIN has not been acked so |
2698 | | * enter the CLOSING state. |
2699 | | */ |
2700 | 0 | case TCPS_FIN_WAIT_1: |
2701 | 0 | tcp_state_change(tp, TCPS_CLOSING); |
2702 | 0 | break; |
2703 | | |
2704 | | /* |
2705 | | * In FIN_WAIT_2 state enter the TIME_WAIT state, |
2706 | | * starting the time-wait timer, turning off the other |
2707 | | * standard timers. |
2708 | | */ |
2709 | 0 | case TCPS_FIN_WAIT_2: |
2710 | 0 | tcp_twstart(tp); |
2711 | 0 | return; |
2712 | 0 | } |
2713 | 0 | } |
2714 | | |
2715 | | /* |
2716 | | * samkumar: Remove code for synchronization and debugging, here and in |
2717 | | * the labels below. I also removed the line to free the mbuf if it hasn't |
2718 | | * been freed already (the line was "m_freem(m)"). |
2719 | | */ |
2720 | | /* |
2721 | | * Return any desired output. |
2722 | | */ |
2723 | 0 | if (needoutput || (tp->t_flags & TF_ACKNOW)) |
2724 | 0 | (void) tcplp_output(tp); |
2725 | |
|
2726 | 0 | check_delack: |
2727 | 0 | if (tp->t_flags & TF_DELACK) { |
2728 | 0 | tp->t_flags &= ~TF_DELACK; |
2729 | 0 | tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); |
2730 | 0 | } |
2731 | 0 | return; |
2732 | | |
2733 | 0 | dropafterack: |
2734 | | /* |
2735 | | * Generate an ACK dropping incoming segment if it occupies |
2736 | | * sequence space, where the ACK reflects our state. |
2737 | | * |
2738 | | * We can now skip the test for the RST flag since all |
2739 | | * paths to this code happen after packets containing |
2740 | | * RST have been dropped. |
2741 | | * |
2742 | | * In the SYN-RECEIVED state, don't send an ACK unless the |
2743 | | * segment we received passes the SYN-RECEIVED ACK test. |
2744 | | * If it fails send a RST. This breaks the loop in the |
2745 | | * "LAND" DoS attack, and also prevents an ACK storm |
2746 | | * between two listening ports that have been sent forged |
2747 | | * SYN segments, each with the source address of the other. |
2748 | | */ |
2749 | 0 | if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && |
2750 | 0 | (SEQ_GT(tp->snd_una, th->th_ack) || |
2751 | 0 | SEQ_GT(th->th_ack, tp->snd_max)) ) { |
2752 | 0 | rstreason = BANDLIM_RST_OPENPORT; |
2753 | 0 | goto dropwithreset; |
2754 | 0 | } |
2755 | | |
2756 | 0 | tp->t_flags |= TF_ACKNOW; |
2757 | 0 | (void) tcplp_output(tp); |
2758 | 0 | return; |
2759 | | |
2760 | 0 | dropwithreset: |
2761 | 0 | if (tp != NULL) { |
2762 | 0 | tcp_dropwithreset(ip6, th, tp, instance, tlen, rstreason); |
2763 | 0 | } else |
2764 | 0 | tcp_dropwithreset(ip6, th, NULL, instance, tlen, rstreason); |
2765 | 0 | return; |
2766 | | |
2767 | 0 | drop: |
2768 | 0 | return; |
2769 | 0 | } |
2770 | | |
2771 | | /* |
2772 | | * Parse TCP options and place in tcpopt. |
2773 | | */ |
2774 | | static void |
2775 | | tcp_dooptions(struct tcpopt *to, uint8_t *cp, int cnt, int flags) |
2776 | 0 | { |
2777 | 0 | int opt, optlen; |
2778 | |
|
2779 | 0 | to->to_flags = 0; |
2780 | 0 | for (; cnt > 0; cnt -= optlen, cp += optlen) { |
2781 | 0 | opt = cp[0]; |
2782 | 0 | if (opt == TCPOPT_EOL) |
2783 | 0 | break; |
2784 | 0 | if (opt == TCPOPT_NOP) |
2785 | 0 | optlen = 1; |
2786 | 0 | else { |
2787 | 0 | if (cnt < 2) |
2788 | 0 | break; |
2789 | 0 | optlen = cp[1]; |
2790 | 0 | if (optlen < 2 || optlen > cnt) |
2791 | 0 | break; |
2792 | 0 | } |
2793 | 0 | switch (opt) { |
2794 | 0 | case TCPOPT_MAXSEG: |
2795 | 0 | if (optlen != TCPOLEN_MAXSEG) |
2796 | 0 | continue; |
2797 | 0 | if (!(flags & TO_SYN)) |
2798 | 0 | continue; |
2799 | 0 | to->to_flags |= TOF_MSS; |
2800 | 0 | bcopy((char *)cp + 2, |
2801 | 0 | (char *)&to->to_mss, sizeof(to->to_mss)); |
2802 | 0 | to->to_mss = ntohs(to->to_mss); |
2803 | 0 | break; |
2804 | 0 | case TCPOPT_WINDOW: |
2805 | 0 | if (optlen != TCPOLEN_WINDOW) |
2806 | 0 | continue; |
2807 | 0 | if (!(flags & TO_SYN)) |
2808 | 0 | continue; |
2809 | 0 | to->to_flags |= TOF_SCALE; |
2810 | 0 | to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT); |
2811 | 0 | break; |
2812 | 0 | case TCPOPT_TIMESTAMP: |
2813 | 0 | if (optlen != TCPOLEN_TIMESTAMP) |
2814 | 0 | continue; |
2815 | 0 | to->to_flags |= TOF_TS; |
2816 | 0 | bcopy((char *)cp + 2, |
2817 | 0 | (char *)&to->to_tsval, sizeof(to->to_tsval)); |
2818 | 0 | to->to_tsval = ntohl(to->to_tsval); |
2819 | 0 | bcopy((char *)cp + 6, |
2820 | 0 | (char *)&to->to_tsecr, sizeof(to->to_tsecr)); |
2821 | 0 | to->to_tsecr = ntohl(to->to_tsecr); |
2822 | 0 | break; |
2823 | | #ifdef TCP_SIGNATURE |
2824 | | /* |
2825 | | * XXX In order to reply to a host which has set the |
2826 | | * TCP_SIGNATURE option in its initial SYN, we have to |
2827 | | * record the fact that the option was observed here |
2828 | | * for the syncache code to perform the correct response. |
2829 | | */ |
2830 | | case TCPOPT_SIGNATURE: |
2831 | | if (optlen != TCPOLEN_SIGNATURE) |
2832 | | continue; |
2833 | | to->to_flags |= TOF_SIGNATURE; |
2834 | | to->to_signature = cp + 2; |
2835 | | break; |
2836 | | #endif |
2837 | 0 | case TCPOPT_SACK_PERMITTED: |
2838 | 0 | if (optlen != TCPOLEN_SACK_PERMITTED) |
2839 | 0 | continue; |
2840 | 0 | if (!(flags & TO_SYN)) |
2841 | 0 | continue; |
2842 | 0 | if (!V_tcp_do_sack) |
2843 | 0 | continue; |
2844 | 0 | to->to_flags |= TOF_SACKPERM; |
2845 | 0 | break; |
2846 | 0 | case TCPOPT_SACK: |
2847 | 0 | if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) |
2848 | 0 | continue; |
2849 | 0 | if (flags & TO_SYN) |
2850 | 0 | continue; |
2851 | 0 | to->to_flags |= TOF_SACK; |
2852 | 0 | to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; |
2853 | 0 | to->to_sacks = cp + 2; |
2854 | 0 | break; |
2855 | 0 | case TCPOPT_FAST_OPEN: |
2856 | | /* |
2857 | | * Cookie length validation is performed by the |
2858 | | * server side cookie checking code or the client |
2859 | | * side cookie cache update code. |
2860 | | */ |
2861 | 0 | if (!(flags & TO_SYN)) |
2862 | 0 | continue; |
2863 | 0 | if (!V_tcp_fastopen_client_enable && |
2864 | 0 | !V_tcp_fastopen_server_enable) |
2865 | 0 | continue; |
2866 | 0 | to->to_flags |= TOF_FASTOPEN; |
2867 | 0 | to->to_tfo_len = optlen - 2; |
2868 | 0 | to->to_tfo_cookie = to->to_tfo_len ? cp + 2 : NULL; |
2869 | 0 | break; |
2870 | 0 | default: |
2871 | 0 | continue; |
2872 | 0 | } |
2873 | 0 | } |
2874 | 0 | } |
2875 | | |
2876 | | |
2877 | | /* |
2878 | | * Collect new round-trip time estimate |
2879 | | * and update averages and current timeout. |
2880 | | */ |
2881 | | static void |
2882 | | tcp_xmit_timer(struct tcpcb *tp, int rtt) |
2883 | 0 | { |
2884 | 0 | int delta; |
2885 | |
|
2886 | 0 | tp->t_rttupdated++; |
2887 | 0 | if (tp->t_srtt != 0) { |
2888 | | /* |
2889 | | * srtt is stored as fixed point with 5 bits after the |
2890 | | * binary point (i.e., scaled by 8). The following magic |
2891 | | * is equivalent to the smoothing algorithm in rfc793 with |
2892 | | * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed |
2893 | | * point). Adjust rtt to origin 0. |
2894 | | */ |
2895 | 0 | delta = ((rtt - 1) << TCP_DELTA_SHIFT) |
2896 | 0 | - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); |
2897 | |
|
2898 | 0 | if ((tp->t_srtt += delta) <= 0) |
2899 | 0 | tp->t_srtt = 1; |
2900 | | |
2901 | | /* |
2902 | | * We accumulate a smoothed rtt variance (actually, a |
2903 | | * smoothed mean difference), then set the retransmit |
2904 | | * timer to smoothed rtt + 4 times the smoothed variance. |
2905 | | * rttvar is stored as fixed point with 4 bits after the |
2906 | | * binary point (scaled by 16). The following is |
2907 | | * equivalent to rfc793 smoothing with an alpha of .75 |
2908 | | * (rttvar = rttvar*3/4 + |delta| / 4). This replaces |
2909 | | * rfc793's wired-in beta. |
2910 | | */ |
2911 | 0 | if (delta < 0) |
2912 | 0 | delta = -delta; |
2913 | 0 | delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); |
2914 | 0 | if ((tp->t_rttvar += delta) <= 0) |
2915 | 0 | tp->t_rttvar = 1; |
2916 | 0 | if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) |
2917 | 0 | tp->t_rttbest = tp->t_srtt + tp->t_rttvar; |
2918 | 0 | } else { |
2919 | | /* |
2920 | | * No rtt measurement yet - use the unsmoothed rtt. |
2921 | | * Set the variance to half the rtt (so our first |
2922 | | * retransmit happens at 3*rtt). |
2923 | | */ |
2924 | 0 | tp->t_srtt = rtt << TCP_RTT_SHIFT; |
2925 | 0 | tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); |
2926 | 0 | tp->t_rttbest = tp->t_srtt + tp->t_rttvar; |
2927 | 0 | } |
2928 | 0 | tp->t_rtttime = 0; |
2929 | 0 | tp->t_rxtshift = 0; |
2930 | | |
2931 | | /* |
2932 | | * the retransmit should happen at rtt + 4 * rttvar. |
2933 | | * Because of the way we do the smoothing, srtt and rttvar |
2934 | | * will each average +1/2 tick of bias. When we compute |
2935 | | * the retransmit timer, we want 1/2 tick of rounding and |
2936 | | * 1 extra tick because of +-1/2 tick uncertainty in the |
2937 | | * firing of the timer. The bias will give us exactly the |
2938 | | * 1.5 tick we need. But, because the bias is |
2939 | | * statistical, we have to test that we don't drop below |
2940 | | * the minimum feasible timer (which is 2 ticks). |
2941 | | */ |
2942 | 0 | TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), |
2943 | 0 | max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); |
2944 | |
|
2945 | | #ifdef INSTRUMENT_TCP |
2946 | | tcplp_sys_log("TCP timer %u %d %d %d", (unsigned int) tcplp_sys_get_millis(), rtt, (int) tp->t_srtt, (int) tp->t_rttvar); |
2947 | | #endif |
2948 | | |
2949 | | |
2950 | | /* |
2951 | | * We received an ack for a packet that wasn't retransmitted; |
2952 | | * it is probably safe to discard any error indications we've |
2953 | | * received recently. This isn't quite right, but close enough |
2954 | | * for now (a route might have failed after we sent a segment, |
2955 | | * and the return path might not be symmetrical). |
2956 | | */ |
2957 | 0 | tp->t_softerror = 0; |
2958 | 0 | } |
2959 | | |
/*
 * samkumar: Taken from netinet6/in6.c.
 *
 * Hint for whether the given IPv6 address belongs to this host.  The
 * original also consulted the addresses assigned to interfaces; that part
 * was removed because it is difficult for us to determine, so only the
 * obviously-local cases remain: loopback and link-local.
 *
 * The result is used only as a hint, as the MSS is clamped at
 * V_tcp_v6mssdflt for connections to non-local addresses.  When we answer
 * 0 but should have answered 1, we may conservatively clamp the MTU, which
 * should be OK for TCPlp.  In fact, the constants are set such that we get
 * the right answer whether we clamp or not, so this shouldn't really
 * matter at all.
 *
 * Returns 1 for a loopback or link-local address, 0 otherwise.
 */
int
in6_localaddr(struct in6_addr *in6)
{
	int obviously_local;

	obviously_local = IN6_IS_ADDR_LOOPBACK(in6) ||
	    IN6_IS_ADDR_LINKLOCAL(in6);
	return (obviously_local);
}
2982 | | |
2983 | | /* |
2984 | | * Determine a reasonable value for maxseg size. |
2985 | | * If the route is known, check route for mtu. |
2986 | | * If none, use an mss that can be handled on the outgoing interface |
2987 | | * without forcing IP to fragment. If no route is found, route has no mtu, |
2988 | | * or the destination isn't local, use a default, hopefully conservative |
2989 | | * size (usually 512 or the default IP max size, but no more than the mtu |
2990 | | * of the interface), as we can't discover anything about intervening |
2991 | | * gateways or networks. We also initialize the congestion/slow start |
2992 | | * window to be a single segment if the destination isn't local. |
2993 | | * While looking at the routing entry, we also initialize other path-dependent |
2994 | | * parameters from pre-set or cached values in the routing entry. |
2995 | | * |
2996 | | * Also take into account the space needed for options that we |
2997 | | * send regularly. Make maxseg shorter by that amount to assure |
2998 | | * that we can send maxseg amount of data even when the options |
2999 | | * are present. Store the upper limit of the length of options plus |
3000 | | * data in maxopd. |
3001 | | * |
3002 | | * NOTE that this routine is only called when we process an incoming |
3003 | | * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS |
3004 | | * settings are handled in tcp_mssopt(). |
3005 | | */ |
3006 | | /* |
3007 | | * samkumar: Using struct tcpcb instead of the inpcb. |
3008 | | */ |
3009 | | void |
3010 | | tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, |
3011 | | struct hc_metrics_lite *metricptr, struct tcp_ifcap *cap) |
3012 | 0 | { |
3013 | | /* |
3014 | | * samkumar: I removed all IPv4-specific logic and cases, including logic |
3015 | | * to check for IPv4 vs. IPv6, as well as all locking and debugging code. |
3016 | | */ |
3017 | 0 | int mss = 0; |
3018 | 0 | uint64_t maxmtu = 0; |
3019 | 0 | struct hc_metrics_lite metrics; |
3020 | 0 | int origoffer; |
3021 | 0 | size_t min_protoh = IP6HDR_SIZE + sizeof (struct tcphdr); |
3022 | |
|
3023 | 0 | if (mtuoffer != -1) { |
3024 | 0 | KASSERT(offer == -1, ("%s: conflict", __func__)); |
3025 | 0 | offer = mtuoffer - min_protoh; |
3026 | 0 | } |
3027 | 0 | origoffer = offer; |
3028 | |
|
3029 | 0 | maxmtu = tcp_maxmtu6(tp, cap); |
3030 | 0 | tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt; |
3031 | | |
3032 | | /* |
3033 | | * No route to sender, stay with default mss and return. |
3034 | | */ |
3035 | 0 | if (maxmtu == 0) { |
3036 | | /* |
3037 | | * In case we return early we need to initialize metrics |
3038 | | * to a defined state as tcp_hc_get() would do for us |
3039 | | * if there was no cache hit. |
3040 | | */ |
3041 | 0 | if (metricptr != NULL) |
3042 | 0 | bzero(metricptr, sizeof(struct hc_metrics_lite)); |
3043 | 0 | return; |
3044 | 0 | } |
3045 | | |
3046 | | /* What have we got? */ |
3047 | 0 | switch (offer) { |
3048 | 0 | case 0: |
3049 | | /* |
3050 | | * Offer == 0 means that there was no MSS on the SYN |
3051 | | * segment, in this case we use tcp_mssdflt as |
3052 | | * already assigned to t_maxopd above. |
3053 | | */ |
3054 | 0 | offer = tp->t_maxopd; |
3055 | 0 | break; |
3056 | | |
3057 | 0 | case -1: |
3058 | | /* |
3059 | | * Offer == -1 means that we didn't receive SYN yet. |
3060 | | */ |
3061 | | /* FALLTHROUGH */ |
3062 | |
|
3063 | 0 | default: |
3064 | | /* |
3065 | | * Prevent DoS attack with too small MSS. Round up |
3066 | | * to at least minmss. |
3067 | | */ |
3068 | 0 | offer = max(offer, V_tcp_minmss); |
3069 | 0 | } |
3070 | | |
3071 | | /* |
3072 | | * rmx information is now retrieved from tcp_hostcache. |
3073 | | */ |
3074 | 0 | tcp_hc_get(tp, &metrics); |
3075 | 0 | if (metricptr != NULL) |
3076 | 0 | bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite)); |
3077 | | |
3078 | | /* |
3079 | | * If there's a discovered mtu in tcp hostcache, use it. |
3080 | | * Else, use the link mtu. |
3081 | | */ |
3082 | 0 | if (metrics.rmx_mtu) |
3083 | 0 | mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; |
3084 | 0 | else { |
3085 | 0 | mss = maxmtu - min_protoh; |
3086 | 0 | if (!V_path_mtu_discovery && |
3087 | 0 | !in6_localaddr(&tp->faddr)) |
3088 | 0 | mss = min(mss, V_tcp_v6mssdflt); |
3089 | | /* |
3090 | | * XXX - The above conditional (mss = maxmtu - min_protoh) |
3091 | | * probably violates the TCP spec. |
3092 | | * The problem is that, since we don't know the |
3093 | | * other end's MSS, we are supposed to use a conservative |
3094 | | * default. But, if we do that, then MTU discovery will |
3095 | | * never actually take place, because the conservative |
3096 | | * default is much less than the MTUs typically seen |
3097 | | * on the Internet today. For the moment, we'll sweep |
3098 | | * this under the carpet. |
3099 | | * |
3100 | | * The conservative default might not actually be a problem |
3101 | | * if the only case this occurs is when sending an initial |
3102 | | * SYN with options and data to a host we've never talked |
3103 | | * to before. Then, they will reply with an MSS value which |
3104 | | * will get recorded and the new parameters should get |
3105 | | * recomputed. For Further Study. |
3106 | | */ |
3107 | 0 | } |
3108 | 0 | mss = min(mss, offer); |
3109 | | |
3110 | | /* |
3111 | | * Sanity check: make sure that maxopd will be large |
3112 | | * enough to allow some data on segments even if |
3113 | | * all the option space is used (40 bytes). Otherwise |
3114 | | * funny things may happen in tcplp_output. |
3115 | | */ |
3116 | | /* |
3117 | | * samkumar: When I was experimenting with different MSS values, I had |
3118 | | * changed this to "mss = max(mss, TCP_MAXOLEN + 1);" but I am changing it |
3119 | | * back for the version that will be merged into OpenThread. |
3120 | | */ |
3121 | 0 | mss = max(mss, 64); |
3122 | | |
3123 | | /* |
3124 | | * maxopd stores the maximum length of data AND options |
3125 | | * in a segment; maxseg is the amount of data in a normal |
3126 | | * segment. We need to store this value (maxopd) apart |
3127 | | * from maxseg, because now every segment carries options |
3128 | | * and thus we normally have somewhat less data in segments. |
3129 | | */ |
3130 | 0 | tp->t_maxopd = mss; |
3131 | | |
3132 | | /* |
3133 | | * origoffer==-1 indicates that no segments were received yet. |
3134 | | * In this case we just guess. |
3135 | | */ |
3136 | 0 | if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && |
3137 | 0 | (origoffer == -1 || |
3138 | 0 | (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) |
3139 | 0 | mss -= TCPOLEN_TSTAMP_APPA; |
3140 | |
|
3141 | 0 | tp->t_maxseg = mss; |
3142 | 0 | } |
3143 | | |
3144 | | void |
3145 | | tcp_mss(struct tcpcb *tp, int offer) |
3146 | 0 | { |
3147 | 0 | struct hc_metrics_lite metrics; |
3148 | 0 | struct tcp_ifcap cap; |
3149 | |
|
3150 | 0 | KASSERT(tp != NULL, ("%s: tp == NULL", __func__)); |
3151 | |
|
3152 | 0 | bzero(&cap, sizeof(cap)); |
3153 | 0 | tcp_mss_update(tp, offer, -1, &metrics, &cap); |
3154 | | |
3155 | | /* |
3156 | | * samkumar: There used to be code below that might modify the MSS, but I |
3157 | | * removed all of it (see the comments below for the reason). It used to |
3158 | | * read tp->t_maxseg into the local variable mss, modify mss, and then |
3159 | | * reassign tp->t_maxseg to mss. I've kept the assignments, commented out, |
3160 | | * for clarity. |
3161 | | */ |
3162 | | //mss = tp->t_maxseg; |
3163 | | |
3164 | | /* |
3165 | | * If there's a pipesize, change the socket buffer to that size, |
3166 | | * don't change if sb_hiwat is different than default (then it |
3167 | | * has been changed on purpose with setsockopt). |
3168 | | * Make the socket buffers an integral number of mss units; |
3169 | | * if the mss is larger than the socket buffer, decrease the mss. |
3170 | | */ |
3171 | | |
3172 | | /* |
3173 | | * samkumar: There used to be code here would would limit the MSS to at |
3174 | | * most the size of the send buffer, and then round up the send buffer to |
3175 | | * a multiple of the MSS using |
3176 | | * "sbreserve_locked(&so->so_snd, bufsize, so, NULL);". With TCPlp, we do |
3177 | | * not do this, because the linked buffer used at the send buffer doesn't |
3178 | | * have a real limit. Had we used a circular buffer, then limiting the MSS |
3179 | | * to the buffer size would have made sense, but we still would not be able |
3180 | | * to resize the send buffer because it is not allocated by TCPlp. |
3181 | | */ |
3182 | | |
3183 | | /* |
3184 | | * samkumar: See the comment above about me removing code that modifies |
3185 | | * the MSS, making this assignment and the one above both unnecessary. |
3186 | | */ |
3187 | | //tp->t_maxseg = mss; |
3188 | | |
3189 | | /* |
3190 | | * samkumar: There used to be code here that would round up the receive |
3191 | | * buffer size to a multiple of the MSS, assuming that the receive buffer |
3192 | | * size is bigger than the MSS. The new buffer size is set using |
3193 | | * "sbreserve_locked(&so->so_rcv, bufsize, so, NULL);". In TCPlp, the |
3194 | | * buffer is not allocated by TCPlp so I removed the code for this. |
3195 | | */ |
3196 | | /* |
3197 | | * samkumar: There used to be code here to handle TCP Segmentation |
3198 | | * Offloading (TSO); I removed it becuase we don't support that in TCPlp. |
3199 | | */ |
3200 | 0 | } |
3201 | | |
3202 | | /* |
3203 | | * Determine the MSS option to send on an outgoing SYN. |
3204 | | */ |
3205 | | /* |
3206 | | * samkumar: In the signature, changed "struct in_conninfo *inc" to |
3207 | | * "struct tcpcb* tp". |
3208 | | */ |
3209 | | int |
3210 | | tcp_mssopt(struct tcpcb* tp) |
3211 | 0 | { |
3212 | | /* |
3213 | | * samkumar: I removed all processing code specific to IPv4, or to decide |
3214 | | * between IPv4 and IPv6. This is OK because TCPlp assumes IPv6. |
3215 | | */ |
3216 | 0 | int mss = 0; |
3217 | 0 | uint64_t maxmtu = 0; |
3218 | 0 | uint64_t thcmtu = 0; |
3219 | 0 | size_t min_protoh; |
3220 | |
|
3221 | 0 | KASSERT(tp != NULL, ("tcp_mssopt with NULL tcpcb pointer")); |
3222 | |
|
3223 | 0 | mss = V_tcp_v6mssdflt; |
3224 | 0 | maxmtu = tcp_maxmtu6(tp, NULL); |
3225 | 0 | min_protoh = IP6HDR_SIZE + sizeof(struct tcphdr); |
3226 | |
|
3227 | 0 | thcmtu = tcp_hc_getmtu(tp); /* IPv4 and IPv6 */ |
3228 | |
|
3229 | 0 | if (maxmtu && thcmtu) |
3230 | 0 | mss = min(maxmtu, thcmtu) - min_protoh; |
3231 | 0 | else if (maxmtu || thcmtu) |
3232 | 0 | mss = max(maxmtu, thcmtu) - min_protoh; |
3233 | |
|
3234 | 0 | return (mss); |
3235 | 0 | } |
3236 | | |
3237 | | /* |
3238 | | * On a partial ack arrives, force the retransmission of the |
3239 | | * next unacknowledged segment. Do not clear tp->t_dupacks. |
3240 | | * By setting snd_nxt to ti_ack, this forces retransmission timer to |
3241 | | * be started again. |
3242 | | */ |
3243 | | static void |
3244 | | tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) |
3245 | 0 | { |
3246 | 0 | tcp_seq onxt = tp->snd_nxt; |
3247 | 0 | uint64_t ocwnd = tp->snd_cwnd; |
3248 | |
|
3249 | 0 | tcp_timer_activate(tp, TT_REXMT, 0); |
3250 | 0 | tp->t_rtttime = 0; |
3251 | 0 | tp->snd_nxt = th->th_ack; |
3252 | | /* |
3253 | | * Set snd_cwnd to one segment beyond acknowledged offset. |
3254 | | * (tp->snd_una has not yet been updated when this function is called.) |
3255 | | */ |
3256 | 0 | tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th); |
3257 | 0 | tp->t_flags |= TF_ACKNOW; |
3258 | | #ifdef INSTRUMENT_TCP |
3259 | | tcplp_sys_log("TCP Partial_ACK"); |
3260 | | #endif |
3261 | 0 | (void) tcplp_output(tp); |
3262 | 0 | tp->snd_cwnd = ocwnd; |
3263 | 0 | if (SEQ_GT(onxt, tp->snd_nxt)) |
3264 | 0 | tp->snd_nxt = onxt; |
3265 | | /* |
3266 | | * Partial window deflation. Relies on fact that tp->snd_una |
3267 | | * not updated yet. |
3268 | | */ |
3269 | 0 | if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th)) |
3270 | 0 | tp->snd_cwnd -= BYTES_THIS_ACK(tp, th); |
3271 | 0 | else |
3272 | 0 | tp->snd_cwnd = 0; |
3273 | 0 | tp->snd_cwnd += tp->t_maxseg; |
3274 | | #ifdef INSTRUMENT_TCP |
3275 | | tcplp_sys_log("TCP Partial_ACK_final %d", (int) tp->snd_cwnd); |
3276 | | #endif |
3277 | 0 | } |