/src/openthread/third_party/tcplp/bsdtcp/tcp_output.c
Line  | Count  | Source  | 
1  |  | /*-  | 
2  |  |  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995  | 
3  |  |  *  The Regents of the University of California.  All rights reserved.  | 
4  |  |  *  | 
5  |  |  * Redistribution and use in source and binary forms, with or without  | 
6  |  |  * modification, are permitted provided that the following conditions  | 
7  |  |  * are met:  | 
8  |  |  * 1. Redistributions of source code must retain the above copyright  | 
9  |  |  *    notice, this list of conditions and the following disclaimer.  | 
10  |  |  * 2. Redistributions in binary form must reproduce the above copyright  | 
11  |  |  *    notice, this list of conditions and the following disclaimer in the  | 
12  |  |  *    documentation and/or other materials provided with the distribution.  | 
13  |  |  * 4. Neither the name of the University nor the names of its contributors  | 
14  |  |  *    may be used to endorse or promote products derived from this software  | 
15  |  |  *    without specific prior written permission.  | 
16  |  |  *  | 
17  |  |  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND  | 
18  |  |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE  | 
19  |  |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE  | 
20  |  |  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE  | 
21  |  |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL  | 
22  |  |  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS  | 
23  |  |  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)  | 
24  |  |  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT  | 
25  |  |  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY  | 
26  |  |  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF  | 
27  |  |  * SUCH DAMAGE.  | 
28  |  |  *  | 
29  |  |  *  @(#)tcp_output.c  8.4 (Berkeley) 5/24/95  | 
30  |  |  */  | 
31  |  |  | 
32  |  | #include <errno.h>  | 
33  |  | #include <string.h>  | 
34  |  |  | 
35  |  | #include "../tcplp.h"  | 
36  |  | #include "tcp.h"  | 
37  |  | #include "tcp_fastopen.h"  | 
38  |  | #include "tcp_fsm.h"  | 
39  |  | #include "tcp_var.h"  | 
40  |  | #include "tcp_seq.h"  | 
41  |  | #include "tcp_timer.h"  | 
42  |  | #include "ip.h"  | 
43  |  | #include "../lib/cbuf.h"  | 
44  |  |  | 
45  |  | #include "tcp_const.h"  | 
46  |  |  | 
47  |  | #include <openthread/ip6.h>  | 
48  |  | #include <openthread/message.h>  | 
49  |  | #include <openthread/tcp.h>  | 
50  |  |  | 
51  |  | static inline void  | 
52  |  | cc_after_idle(struct tcpcb *tp)  | 
53  | 0  | { | 
54  |  |   /* samkumar: Removed synchronization. */  | 
55  | 0  |   if (CC_ALGO(tp)->after_idle != NULL)  | 
56  | 0  |     CC_ALGO(tp)->after_idle(tp->ccv);  | 
57  | 0  | }  | 
58  |  |  | 
/*
 * Return the smaller of two signed long values.
 * When the arguments compare equal, b is returned.
 */
long min(long a, long b) {
  return (a < b) ? a : b;
}
66  |  |  | 
/*
 * Return the smaller of two unsigned long values.
 * When the arguments compare equal, b is returned.
 */
unsigned long ulmin(unsigned long a, unsigned long b) {
  return (a < b) ? a : b;
}
74  |  |  | 
/* lmin: minimum of two long values; simply forwards to min(). */
#define lmin(a, b) min((a), (b))
76  |  |  | 
77  |  | void  | 
78  |  | tcp_setpersist(struct tcpcb *tp)  | 
79  | 0  | { | 
80  | 0  |   int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;  | 
81  | 0  |   int tt;  | 
82  |  | 
  | 
83  | 0  |   tp->t_flags &= ~TF_PREVVALID;  | 
84  | 0  |   if (tcp_timer_active(tp, TT_REXMT))  | 
85  | 0  |     tcplp_sys_panic("PANIC: tcp_setpersist: retransmit pending"); | 
86  |  |   /*  | 
87  |  |    * Start/restart persistance timer.  | 
88  |  |    */  | 
89  | 0  |   TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],  | 
90  | 0  |           TCPTV_PERSMIN, TCPTV_PERSMAX);  | 
91  | 0  |   tcp_timer_activate(tp, TT_PERSIST, tt);  | 
92  | 0  |   if (tp->t_rxtshift < TCP_MAXRXTSHIFT)  | 
93  | 0  |     tp->t_rxtshift++;  | 
94  | 0  | }  | 
95  |  |  | 
96  |  | /*  | 
97  |  |  * Tcp output routine: figure out what should be sent and send it.  | 
98  |  |  */  | 
99  |  | int  | 
100  |  | tcplp_output(struct tcpcb *tp)  | 
101  | 0  | { | 
102  |  |   /*  | 
103  |  |    * samkumar: The biggest change in this function is in how outgoing  | 
104  |  |    * segments are built and sent out. That code has been updated to account  | 
105  |  |    * for TCPlp's buffering, and using otMessages rather than mbufs to  | 
106  |  |    * construct the outgoing segments.  | 
107  |  |    *  | 
108  |  |    * And, of course, all code corresponding to locks, stats, and debugging  | 
109  |  |    * has been removed, and all code specific to IPv4 or to decide between  | 
110  |  |    * IPv6 and IPv4 handling has been removed.  | 
111  |  |    */  | 
112  |  | 
  | 
113  | 0  |   struct tcphdr* th = NULL;  | 
114  | 0  |   int idle;  | 
115  | 0  |   long len, recwin, sendwin;  | 
116  | 0  |   int off, flags, error = 0;  /* Keep compiler happy */  | 
117  | 0  |   int sendalot, mtu;  | 
118  | 0  |   int sack_rxmit, sack_bytes_rxmt;  | 
119  | 0  |   struct sackhole* p;  | 
120  | 0  |   unsigned ipoptlen, optlen, hdrlen;  | 
121  | 0  |   struct tcpopt to;  | 
122  | 0  |   unsigned int wanted_cookie = 0;  | 
123  | 0  |   unsigned int dont_sendalot = 0;  | 
124  | 0  |   uint8_t opt[TCP_MAXOLEN];  | 
125  | 0  |   uint32_t ticks = tcplp_sys_get_ticks();  | 
126  |  |  | 
127  |  |   /* samkumar: Code for TCP offload has been removed. */  | 
128  |  |  | 
129  |  |   /*  | 
130  |  |    * For TFO connections in SYN_SENT or SYN_RECEIVED,  | 
131  |  |    * only allow the initial SYN or SYN|ACK and those sent  | 
132  |  |    * by the retransmit timer.  | 
133  |  |    */  | 
134  | 0  |   if (IS_FASTOPEN(tp->t_flags) &&  | 
135  | 0  |       ((tp->t_state == TCPS_SYN_SENT) ||  | 
136  | 0  |        (tp->t_state == TCPS_SYN_RECEIVED)) &&  | 
137  | 0  |       SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */  | 
138  | 0  |       (tp->snd_nxt != tp->snd_una))       /* not a retransmit */  | 
139  | 0  |     return (0);  | 
140  |  |  | 
141  |  |   /*  | 
142  |  |    * Determine length of data that should be transmitted,  | 
143  |  |    * and flags that will be used.  | 
144  |  |    * If there is some data or critical controls (SYN, RST)  | 
145  |  |    * to send, then transmit; otherwise, investigate further.  | 
146  |  |    */  | 
147  | 0  |   idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);  | 
148  | 0  |   if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur)  | 
149  | 0  |     cc_after_idle(tp);  | 
150  |  | 
  | 
151  | 0  |   tp->t_flags &= ~TF_LASTIDLE;  | 
152  | 0  |   if (idle) { | 
153  | 0  |     if (tp->t_flags & TF_MORETOCOME) { | 
154  | 0  |       tp->t_flags |= TF_LASTIDLE;  | 
155  | 0  |       idle = 0;  | 
156  | 0  |     }  | 
157  | 0  |   }  | 
158  |  |   /* samkumar: This would be printed once per _window_ that is transmitted. */  | 
159  |  | #ifdef INSTRUMENT_TCP  | 
160  |  |   tcplp_sys_log("TCP output %u %d %d", (unsigned int) tcplp_sys_get_millis(), (int) tp->snd_wnd, (int) tp->snd_cwnd); | 
161  |  | #endif  | 
162  |  | 
  | 
163  | 0  | again:  | 
164  |  |   /*  | 
165  |  |    * If we've recently taken a timeout, snd_max will be greater than  | 
166  |  |    * snd_nxt.  There may be SACK information that allows us to avoid  | 
167  |  |    * resending already delivered data.  Adjust snd_nxt accordingly.  | 
168  |  |    */  | 
169  | 0  |   if ((tp->t_flags & TF_SACK_PERMIT) &&  | 
170  | 0  |       SEQ_LT(tp->snd_nxt, tp->snd_max))  | 
171  | 0  |     tcp_sack_adjust(tp);  | 
172  | 0  |   sendalot = 0;  | 
173  |  |   /* samkumar: Removed code for supporting TSO. */  | 
174  | 0  |   mtu = 0;  | 
175  | 0  |   off = tp->snd_nxt - tp->snd_una;  | 
176  | 0  |   sendwin = min(tp->snd_wnd, tp->snd_cwnd);  | 
177  |  | 
  | 
178  | 0  |   flags = tcp_outflags[tp->t_state];  | 
179  |  |   /*  | 
180  |  |    * Send any SACK-generated retransmissions.  If we're explicitly trying  | 
181  |  |    * to send out new data (when sendalot is 1), bypass this function.  | 
182  |  |    * If we retransmit in fast recovery mode, decrement snd_cwnd, since  | 
183  |  |    * we're replacing a (future) new transmission with a retransmission  | 
184  |  |    * now, and we previously incremented snd_cwnd in tcplp_input().  | 
185  |  |    */  | 
186  |  |   /*  | 
187  |  |    * Still in sack recovery , reset rxmit flag to zero.  | 
188  |  |    */  | 
189  | 0  |   sack_rxmit = 0;  | 
190  | 0  |   sack_bytes_rxmt = 0;  | 
191  | 0  |   len = 0;  | 
192  | 0  |   p = NULL;  | 
193  | 0  |   if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) &&  | 
194  | 0  |       (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { | 
195  | 0  |     long cwin;  | 
196  |  | 
  | 
197  | 0  |     cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;  | 
198  | 0  |     if (cwin < 0)  | 
199  | 0  |       cwin = 0;  | 
200  |  |     /* Do not retransmit SACK segments beyond snd_recover */  | 
201  | 0  |     if (SEQ_GT(p->end, tp->snd_recover)) { | 
202  |  |       /*  | 
203  |  |        * (At least) part of sack hole extends beyond  | 
204  |  |        * snd_recover. Check to see if we can rexmit data  | 
205  |  |        * for this hole.  | 
206  |  |        */  | 
207  | 0  |       if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { | 
208  |  |         /*  | 
209  |  |          * Can't rexmit any more data for this hole.  | 
210  |  |          * That data will be rexmitted in the next  | 
211  |  |          * sack recovery episode, when snd_recover  | 
212  |  |          * moves past p->rxmit.  | 
213  |  |          */  | 
214  | 0  |         p = NULL;  | 
215  | 0  |         goto after_sack_rexmit;  | 
216  | 0  |       } else  | 
217  |  |         /* Can rexmit part of the current hole */  | 
218  | 0  |         len = ((long)ulmin(cwin,  | 
219  | 0  |                tp->snd_recover - p->rxmit));  | 
220  | 0  |     } else  | 
221  | 0  |       len = ((long)ulmin(cwin, p->end - p->rxmit));  | 
222  | 0  |     off = p->rxmit - tp->snd_una;  | 
223  | 0  |     KASSERT(off >= 0,("%s: sack block to the left of una : %d", | 
224  | 0  |         __func__, off));  | 
225  | 0  |     if (len > 0) { | 
226  | 0  |       sack_rxmit = 1;  | 
227  | 0  |       sendalot = 1;  | 
228  | 0  |     }  | 
229  | 0  |   }  | 
230  | 0  | after_sack_rexmit:  | 
231  |  |   /*  | 
232  |  |    * Get standard flags, and add SYN or FIN if requested by 'hidden'  | 
233  |  |    * state flags.  | 
234  |  |    */  | 
235  | 0  |   if (tp->t_flags & TF_NEEDFIN)  | 
236  | 0  |     flags |= TH_FIN;  | 
237  | 0  |   if (tp->t_flags & TF_NEEDSYN)  | 
238  | 0  |     flags |= TH_SYN;  | 
239  |  |  | 
240  |  |   /*  | 
241  |  |    * If in persist timeout with window of 0, send 1 byte.  | 
242  |  |    * Otherwise, if window is small but nonzero  | 
243  |  |    * and timer expired, we will send what we can  | 
244  |  |    * and go to transmit state.  | 
245  |  |    */  | 
246  | 0  |   if (tp->t_flags & TF_FORCEDATA) { | 
247  | 0  |     if (sendwin == 0) { | 
248  |  |       /*  | 
249  |  |        * If we still have some data to send, then  | 
250  |  |        * clear the FIN bit.  Usually this would  | 
251  |  |        * happen below when it realizes that we  | 
252  |  |        * aren't sending all the data.  However,  | 
253  |  |        * if we have exactly 1 byte of unsent data,  | 
254  |  |        * then it won't clear the FIN bit below,  | 
255  |  |        * and if we are in persist state, we wind  | 
256  |  |        * up sending the packet without recording  | 
257  |  |        * that we sent the FIN bit.  | 
258  |  |        *  | 
259  |  |        * We can't just blindly clear the FIN bit,  | 
260  |  |        * because if we don't have any more data  | 
261  |  |        * to send then the probe will be the FIN  | 
262  |  |        * itself.  | 
263  |  |        */  | 
264  |  |       /*  | 
265  |  |        * samkumar: Replaced call to sbused(&so->so_snd) with the call to  | 
266  |  |        * lbuf_used_space below.  | 
267  |  |        */  | 
268  | 0  |       if (off < lbuf_used_space(&tp->sendbuf))  | 
269  | 0  |         flags &= ~TH_FIN;  | 
270  | 0  |       sendwin = 1;  | 
271  | 0  |     } else { | 
272  | 0  |       tcp_timer_activate(tp, TT_PERSIST, 0);  | 
273  | 0  |       tp->t_rxtshift = 0;  | 
274  | 0  |     }  | 
275  | 0  |   }  | 
276  |  |  | 
277  |  |   /*  | 
278  |  |    * If snd_nxt == snd_max and we have transmitted a FIN, the  | 
279  |  |    * offset will be > 0 even if so_snd.sb_cc is 0, resulting in  | 
280  |  |    * a negative length.  This can also occur when TCP opens up  | 
281  |  |    * its congestion window while receiving additional duplicate  | 
282  |  |    * acks after fast-retransmit because TCP will reset snd_nxt  | 
283  |  |    * to snd_max after the fast-retransmit.  | 
284  |  |    *  | 
285  |  |    * In the normal retransmit-FIN-only case, however, snd_nxt will  | 
286  |  |    * be set to snd_una, the offset will be 0, and the length may  | 
287  |  |    * wind up 0.  | 
288  |  |    *  | 
289  |  |    * If sack_rxmit is true we are retransmitting from the scoreboard  | 
290  |  |    * in which case len is already set.  | 
291  |  |    */  | 
292  | 0  |   if (sack_rxmit == 0) { | 
293  | 0  |     if (sack_bytes_rxmt == 0)  | 
294  |  |       /*  | 
295  |  |        * samkumar: Replaced sbavail(&so->so_snd) with this call to  | 
296  |  |        * lbuf_used_space.  | 
297  |  |        */  | 
298  | 0  |       len = ((long)ulmin(lbuf_used_space(&tp->sendbuf), sendwin) -  | 
299  | 0  |           off);  | 
300  | 0  |     else { | 
301  | 0  |       long cwin;  | 
302  |  |  | 
303  |  |       /*  | 
304  |  |        * We are inside of a SACK recovery episode and are  | 
305  |  |        * sending new data, having retransmitted all the  | 
306  |  |        * data possible in the scoreboard.  | 
307  |  |        */  | 
308  |  |       /*  | 
309  |  |        * samkumar: Replaced sbavail(&so->so_snd) with this call to  | 
310  |  |        * lbuf_used_space.  | 
311  |  |        */  | 
312  | 0  |       len = ((long)ulmin(lbuf_used_space(&tp->sendbuf), tp->snd_wnd) -  | 
313  | 0  |           off);  | 
314  |  |       /*  | 
315  |  |        * Don't remove this (len > 0) check !  | 
316  |  |        * We explicitly check for len > 0 here (although it  | 
317  |  |        * isn't really necessary), to work around a gcc  | 
318  |  |        * optimization issue - to force gcc to compute  | 
319  |  |        * len above. Without this check, the computation  | 
320  |  |        * of len is bungled by the optimizer.  | 
321  |  |        */  | 
322  | 0  |       if (len > 0) { | 
323  | 0  |         cwin = tp->snd_cwnd -  | 
324  | 0  |           (tp->snd_nxt - tp->sack_newdata) -  | 
325  | 0  |           sack_bytes_rxmt;  | 
326  | 0  |         if (cwin < 0)  | 
327  | 0  |           cwin = 0;  | 
328  | 0  |         len = lmin(len, cwin);  | 
329  | 0  |       }  | 
330  | 0  |     }  | 
331  | 0  |   }  | 
332  |  |  | 
333  |  |   /*  | 
334  |  |    * Lop off SYN bit if it has already been sent.  However, if this  | 
335  |  |    * is SYN-SENT state and if segment contains data and if we don't  | 
336  |  |    * know that foreign host supports TAO, suppress sending segment.  | 
337  |  |    */  | 
338  | 0  |   if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { | 
339  | 0  |     if (tp->t_state != TCPS_SYN_RECEIVED)  | 
340  | 0  |       flags &= ~TH_SYN;  | 
341  |  |     /*  | 
342  |  |      * When sending additional segments following a TFO SYN|ACK,  | 
343  |  |      * do not include the SYN bit.  | 
344  |  |      */  | 
345  | 0  |     if (IS_FASTOPEN(tp->t_flags) &&  | 
346  | 0  |         (tp->t_state == TCPS_SYN_RECEIVED))  | 
347  | 0  |       flags &= ~TH_SYN;  | 
348  | 0  |     off--, len++;  | 
349  | 0  |   }  | 
350  |  |  | 
351  |  |   /*  | 
352  |  |    * Be careful not to send data and/or FIN on SYN segments.  | 
353  |  |    * This measure is needed to prevent interoperability problems  | 
354  |  |    * with not fully conformant TCP implementations.  | 
355  |  |    */  | 
356  | 0  |   if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { | 
357  | 0  |     len = 0;  | 
358  | 0  |     flags &= ~TH_FIN;  | 
359  | 0  |   }  | 
360  |  |  | 
361  |  |   /*  | 
362  |  |    * On TFO sockets, ensure no data is sent in the following cases:  | 
363  |  |    *  | 
364  |  |    *  - When retransmitting SYN|ACK on a passively-created socket  | 
365  |  |    *  | 
366  |  |    *  - When retransmitting SYN on an actively created socket  | 
367  |  |    *  | 
368  |  |    *  - When sending a zero-length cookie (cookie request) on an  | 
369  |  |    *    actively created socket  | 
370  |  |    *  | 
371  |  |    *  - When the socket is in the CLOSED state (RST is being sent)  | 
372  |  |    */  | 
373  |  |   /*  | 
374  |  |    * samkumar: I commented out the check to ensure no data is sent  | 
375  |  |    * on a TFO cookie request. As far as I am aware, this is still  | 
376  |  |    * compliant with the RFC.  | 
377  |  |    */  | 
378  | 0  |   if (IS_FASTOPEN(tp->t_flags) &&  | 
379  | 0  |       (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||  | 
380  |  |        /*((tp->t_state == TCPS_SYN_SENT) &&  | 
381  |  |         (tp->t_tfo_client_cookie_len == 0)) ||*/  | 
382  | 0  |        (flags & TH_RST)))  | 
383  | 0  |     len = 0;  | 
384  | 0  |   if (len <= 0) { | 
385  |  |     /*  | 
386  |  |      * If FIN has been sent but not acked,  | 
387  |  |      * but we haven't been called to retransmit,  | 
388  |  |      * len will be < 0.  Otherwise, window shrank  | 
389  |  |      * after we sent into it.  If window shrank to 0,  | 
390  |  |      * cancel pending retransmit, pull snd_nxt back  | 
391  |  |      * to (closed) window, and set the persist timer  | 
392  |  |      * if it isn't already going.  If the window didn't  | 
393  |  |      * close completely, just wait for an ACK.  | 
394  |  |      *  | 
395  |  |      * We also do a general check here to ensure that  | 
396  |  |      * we will set the persist timer when we have data  | 
397  |  |      * to send, but a 0-byte window. This makes sure  | 
398  |  |      * the persist timer is set even if the packet  | 
399  |  |      * hits one of the "goto send" lines below.  | 
400  |  |      */  | 
401  | 0  |     len = 0;  | 
402  |  |     /*  | 
403  |  |      * samkumar: Replaced sbavail(&so->so_snd) with this call to  | 
404  |  |      * lbuf_used_space.  | 
405  |  |      */  | 
406  | 0  |     if ((sendwin == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) &&  | 
407  | 0  |       (off < (int) lbuf_used_space(&tp->sendbuf))) { | 
408  | 0  |       tcp_timer_activate(tp, TT_REXMT, 0);  | 
409  | 0  |       tp->t_rxtshift = 0;  | 
410  | 0  |       tp->snd_nxt = tp->snd_una;  | 
411  | 0  |       if (!tcp_timer_active(tp, TT_PERSIST)) { | 
412  | 0  |         tcp_setpersist(tp);  | 
413  | 0  |       }  | 
414  | 0  |     }  | 
415  | 0  |   }  | 
416  |  |  | 
417  |  |  | 
418  |  |   /* len will be >= 0 after this point. */  | 
419  | 0  |   KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); | 
420  |  |  | 
421  |  |   /*  | 
422  |  |    * Automatic sizing of send socket buffer.  Often the send buffer  | 
423  |  |    * size is not optimally adjusted to the actual network conditions  | 
424  |  |    * at hand (delay bandwidth product).  Setting the buffer size too  | 
425  |  |    * small limits throughput on links with high bandwidth and high  | 
426  |  |    * delay (eg. trans-continental/oceanic links).  Setting the  | 
427  |  |    * buffer size too big consumes too much real kernel memory,  | 
428  |  |    * especially with many connections on busy servers.  | 
429  |  |    *  | 
430  |  |    * The criteria to step up the send buffer one notch are:  | 
431  |  |    *  1. receive window of remote host is larger than send buffer  | 
432  |  |    *     (with a fudge factor of 5/4th);  | 
433  |  |    *  2. send buffer is filled to 7/8th with data (so we actually  | 
434  |  |    *     have data to make use of it);  | 
435  |  |    *  3. send buffer fill has not hit maximal automatic size;  | 
436  |  |    *  4. our send window (slow start and congestion controlled) is  | 
437  |  |    *     larger than sent but unacknowledged data in send buffer.  | 
438  |  |    *  | 
439  |  |    * The remote host receive window scaling factor may limit the  | 
440  |  |    * growing of the send buffer before it reaches its allowed  | 
441  |  |    * maximum.  | 
442  |  |    *  | 
443  |  |    * It scales directly with slow start or congestion window  | 
444  |  |    * and does at most one step per received ACK.  This fast  | 
445  |  |    * scaling has the drawback of growing the send buffer beyond  | 
446  |  |    * what is strictly necessary to make full use of a given  | 
447  |  |    * delay*bandwidth product.  However testing has shown this not  | 
448  |  |    * to be much of a problem.  At worst we are trading wasting  | 
449  |  |    * of available bandwidth (the non-use of it) for wasting some  | 
450  |  |    * socket buffer memory.  | 
451  |  |    *  | 
452  |  |    * TODO: Shrink send buffer during idle periods together  | 
453  |  |    * with congestion window.  Requires another timer.  Has to  | 
454  |  |    * wait for upcoming tcp timer rewrite.  | 
455  |  |    *  | 
456  |  |    * XXXGL: should there be used sbused() or sbavail()?  | 
457  |  |    */  | 
458  |  |    /*  | 
459  |  |    * samkumar: There used to be code here to dynamically size the  | 
460  |  |    * send buffer (by calling sbreserve_locked). In TCPlp, we don't support  | 
461  |  |    * this, as the send buffer doesn't have a well-defined size (and even if  | 
462  |  |    * we were to use a circular buffer, it would be a fixed-size buffer  | 
463  |  |    * allocated by the application). Therefore, I removed the code that does  | 
464  |  |    * this.  | 
465  |  |    */  | 
466  |  |  | 
467  |  |    /*  | 
468  |  |    * samkumar: There used to be code here to handle TCP Segmentation  | 
469  |  |    * Offloading (TSO); I removed it because we don't support that in TCPlp.  | 
470  |  |    */  | 
471  |  | 
  | 
472  | 0  |   if (sack_rxmit) { | 
473  |  |     /*  | 
474  |  |      * samkumar: Replaced sbused(&so->so_snd) with this call to  | 
475  |  |      * lbuf_used_space.  | 
476  |  |      */  | 
477  | 0  |     if (SEQ_LT(p->rxmit + len, tp->snd_una + lbuf_used_space(&tp->sendbuf)))  | 
478  | 0  |       flags &= ~TH_FIN;  | 
479  | 0  |   } else { | 
480  | 0  |     if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +  | 
481  |  |       /*  | 
482  |  |        * samkumar: Replaced sbused(&so->so_snd) with this call to  | 
483  |  |        * lbuf_used_space.  | 
484  |  |        */  | 
485  | 0  |       lbuf_used_space(&tp->sendbuf)))  | 
486  | 0  |       flags &= ~TH_FIN;  | 
487  | 0  |   }  | 
488  |  |  | 
489  |  |   /*  | 
490  |  |    * samkumar: Replaced sbspace(&so->so_rcv) with this call to  | 
491  |  |    * cbuf_free_space.  | 
492  |  |    */  | 
493  | 0  |   recwin = cbuf_free_space(&tp->recvbuf);  | 
494  |  |  | 
495  |  |   /*  | 
496  |  |    * Sender silly window avoidance.   We transmit under the following  | 
497  |  |    * conditions when len is non-zero:  | 
498  |  |    *  | 
499  |  |    *  - We have a full segment (or more with TSO)  | 
500  |  |    *  - This is the last buffer in a write()/send() and we are  | 
501  |  |    *    either idle or running NODELAY  | 
502  |  |    *  - we've timed out (e.g. persist timer)  | 
503  |  |    *  - we have more than 1/2 the maximum send window's worth of  | 
504  |  |    *    data (receiver may be limited the window size)  | 
505  |  |    *  - we need to retransmit  | 
506  |  |    */  | 
507  | 0  |   if (len) { | 
508  | 0  |     if (len >= tp->t_maxseg)  | 
509  | 0  |       goto send;  | 
510  |  |     /*  | 
511  |  |      * NOTE! on localhost connections an 'ack' from the remote  | 
512  |  |      * end may occur synchronously with the output and cause  | 
513  |  |      * us to flush a buffer queued with moretocome.  XXX  | 
514  |  |      *  | 
515  |  |      * note: the len + off check is almost certainly unnecessary.  | 
516  |  |      */  | 
517  |  |     /*  | 
518  |  |      * samkumar: Replaced sbavail(&so->so_snd) with this call to  | 
519  |  |      * lbuf_used_space.  | 
520  |  |      */  | 
521  | 0  |     if (!(tp->t_flags & TF_MORETOCOME) &&  /* normal case */  | 
522  | 0  |         (idle || (tp->t_flags & TF_NODELAY)) &&  | 
523  | 0  |         len + off >= lbuf_used_space(&tp->sendbuf) &&  | 
524  | 0  |         (tp->t_flags & TF_NOPUSH) == 0) { | 
525  | 0  |       goto send;  | 
526  | 0  |     }  | 
527  | 0  |     if (tp->t_flags & TF_FORCEDATA)   /* typ. timeout case */  | 
528  | 0  |       goto send;  | 
529  | 0  |     if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)  | 
530  | 0  |       goto send;  | 
531  | 0  |     if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */  | 
532  | 0  |       goto send;  | 
533  | 0  |     if (sack_rxmit)  | 
534  | 0  |       goto send;  | 
535  | 0  |   }  | 
536  |  |  | 
537  |  |   /*  | 
538  |  |    * Sending of standalone window updates.  | 
539  |  |    *  | 
540  |  |    * Window updates are important when we close our window due to a  | 
541  |  |    * full socket buffer and are opening it again after the application  | 
542  |  |    * reads data from it.  Once the window has opened again and the  | 
543  |  |    * remote end starts to send again the ACK clock takes over and  | 
544  |  |    * provides the most current window information.  | 
545  |  |    *  | 
546  |  |    * We must avoid the silly window syndrome whereas every read  | 
547  |  |    * from the receive buffer, no matter how small, causes a window  | 
548  |  |    * update to be sent.  We also should avoid sending a flurry of  | 
549  |  |    * window updates when the socket buffer had queued a lot of data  | 
550  |  |    * and the application is doing small reads.  | 
551  |  |    *  | 
552  |  |    * Prevent a flurry of pointless window updates by only sending  | 
553  |  |    * an update when we can increase the advertized window by more  | 
554  |  |    * than 1/4th of the socket buffer capacity.  When the buffer is  | 
555  |  |    * getting full or is very small be more aggressive and send an  | 
556  |  |    * update whenever we can increase by two mss sized segments.  | 
557  |  |    * In all other situations the ACK's to new incoming data will  | 
558  |  |    * carry further window increases.  | 
559  |  |    *  | 
560  |  |    * Don't send an independent window update if a delayed  | 
561  |  |    * ACK is pending (it will get piggy-backed on it) or the  | 
562  |  |    * remote side already has done a half-close and won't send  | 
563  |  |    * more data.  Skip this if the connection is in T/TCP  | 
564  |  |    * half-open state.  | 
565  |  |    */  | 
566  | 0  |   if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&  | 
567  | 0  |       !(tp->t_flags & TF_DELACK) &&  | 
568  | 0  |       !TCPS_HAVERCVDFIN(tp->t_state)) { | 
569  |  |     /*  | 
570  |  |      * "adv" is the amount we could increase the window,  | 
571  |  |      * taking into account that we are limited by  | 
572  |  |      * TCP_MAXWIN << tp->rcv_scale.  | 
573  |  |      */  | 
574  | 0  |     long adv;  | 
575  | 0  |     int oldwin;  | 
576  |  | 
  | 
577  | 0  |     adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale);  | 
578  | 0  |     if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { | 
579  | 0  |       oldwin = (tp->rcv_adv - tp->rcv_nxt);  | 
580  | 0  |       adv -= oldwin;  | 
581  | 0  |     } else  | 
582  | 0  |       oldwin = 0;  | 
583  |  |  | 
584  |  |     /*  | 
585  |  |      * If the new window size ends up being the same as the old  | 
586  |  |      * size when it is scaled, then don't force a window update.  | 
587  |  |      */  | 
588  | 0  |     if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)  | 
589  | 0  |       goto dontupdate;  | 
590  |  |  | 
591  |  |     /*  | 
592  |  |      * samkumar: Here, FreeBSD has some heuristics to decide whether or  | 
593  |  |      * not to send a window update. The code for the original heuristics  | 
594  |  |      * is commented out, using #if 0. These heuristics compare "adv,"  | 
595  |  |      * the size of the window update, with the size of the local receive  | 
596  |  |      * buffer. The FreeBSD heuristics aren't applicable because they are  | 
597  |  |      * orders of magnitude off from what we see in TCPlp. For example,  | 
598  |  |      * FreeBSD only sends a window update if it is at least two segments  | 
599  |  |      * big. Note that, in the experiments I did, the second case did not  | 
600  |  |      * filter window updates further because, in the experiments, the  | 
601  |  |      * receive buffer was smaller than 8 segments.  | 
602  |  |      *  | 
603  |  |      * I replaced these heuristics with a simpler version, which you can  | 
604  |  |      * see below. For the experiments I did, the first condition  | 
605  |  |      * (checking if adv >= (long)(2 * tp->t_maxseg)) wasn't included; this  | 
606  |  |      * did not matter because the receive buffer was smaller than 8  | 
607  |  |      * segments, so any condition that would have triggered the first  | 
608  |  |      * condition would have triggered the second one anyway. I've included  | 
609  |  |      * the first condition in this version in an effort to be more robust,  | 
610  |  |      * in case someone does try to run TCPlp with a large receive buffer.  | 
611  |  |      *  | 
612  |  |      * It may be worth studying this more and revisiting the heuristic to  | 
613  |  |      * use here. In case we try to resurrect the old FreeBSD heuristics,  | 
614  |  |      * note that so->so_rcv.sb_hiwat in FreeBSD corresponds roughly to  | 
615  |  |      * cbuf_size(&tp->recvbuf) in TCPlp.  | 
616  |  |      */  | 
617  |  | #if 0  | 
618  |  |     if (adv >= (long)(2 * tp->t_maxseg) &&  | 
619  |  |         (adv >= (long)(so->so_rcv.sb_hiwat / 4) ||  | 
620  |  |          recwin <= (long)(so->so_rcv.sb_hiwat / 8) ||  | 
621  |  |          so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg))  | 
622  |  |       goto send;  | 
623  |  | #endif  | 
624  | 0  |     if (adv >= (long)(2 * tp->t_maxseg) ||  | 
625  | 0  |         adv >= (long)cbuf_size(&tp->recvbuf) / 4)  | 
626  | 0  |       goto send;  | 
627  | 0  |   }  | 
628  | 0  | dontupdate:  | 
629  |  |  | 
630  |  |   /*  | 
631  |  |    * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW  | 
632  |  |    * is also a catch-all for the retransmit timer timeout case.  | 
633  |  |    */  | 
634  | 0  |   if (tp->t_flags & TF_ACKNOW) { | 
635  | 0  |     goto send;  | 
636  | 0  |   }  | 
637  | 0  |   if ((flags & TH_RST) ||  | 
638  | 0  |       ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))  | 
639  | 0  |     goto send;  | 
640  | 0  |   if (SEQ_GT(tp->snd_up, tp->snd_una))  | 
641  | 0  |     goto send;  | 
642  |  |   /*  | 
643  |  |    * If our state indicates that FIN should be sent  | 
644  |  |    * and we have not yet done so, then we need to send.  | 
645  |  |    */  | 
646  | 0  |   if (flags & TH_FIN &&  | 
647  | 0  |       ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))  | 
648  | 0  |     goto send;  | 
649  |  |   /*  | 
650  |  |    * In SACK, it is possible for tcplp_output to fail to send a segment  | 
651  |  |    * after the retransmission timer has been turned off.  Make sure  | 
652  |  |    * that the retransmission timer is set.  | 
653  |  |    */  | 
654  | 0  |   if ((tp->t_flags & TF_SACK_PERMIT) &&  | 
655  | 0  |       SEQ_GT(tp->snd_max, tp->snd_una) &&  | 
656  | 0  |       !tcp_timer_active(tp, TT_REXMT) &&  | 
657  | 0  |       !tcp_timer_active(tp, TT_PERSIST)) { | 
658  | 0  |     tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);  | 
659  | 0  |     goto just_return;  | 
660  | 0  |   }  | 
661  |  |  | 
662  |  |   /*  | 
663  |  |    * TCP window updates are not reliable, rather a polling protocol  | 
664  |  |    * using ``persist'' packets is used to insure receipt of window  | 
665  |  |    * updates.  The three ``states'' for the output side are:  | 
666  |  |    *  idle      not doing retransmits or persists  | 
667  |  |    *  persisting    to move a small or zero window  | 
668  |  |    *  (re)transmitting  and thereby not persisting  | 
669  |  |    *  | 
670  |  |    * tcp_timer_active(tp, TT_PERSIST)  | 
671  |  |    *  is true when we are in persist state.  | 
672  |  |    * (tp->t_flags & TF_FORCEDATA)  | 
673  |  |    *  is set when we are called to send a persist packet.  | 
674  |  |    * tcp_timer_active(tp, TT_REXMT)  | 
675  |  |    *  is set when we are retransmitting  | 
676  |  |    * The output side is idle when both timers are zero.  | 
677  |  |    *  | 
678  |  |    * If send window is too small, there is data to transmit, and no  | 
679  |  |    * retransmit or persist is pending, then go to persist state.  | 
680  |  |    * If nothing happens soon, send when timer expires:  | 
681  |  |    * if window is nonzero, transmit what we can,  | 
682  |  |    * otherwise force out a byte.  | 
683  |  |    */  | 
684  |  |   /*  | 
685  |  |    * samkumar: Replaced sbavail(&so->so_snd) with this call to  | 
686  |  |    * lbuf_used_space.  | 
687  |  |    */  | 
688  | 0  |   if (lbuf_used_space(&tp->sendbuf) && !tcp_timer_active(tp, TT_REXMT) &&  | 
689  | 0  |       !tcp_timer_active(tp, TT_PERSIST)) { | 
690  | 0  |     tp->t_rxtshift = 0;  | 
691  | 0  |     tcp_setpersist(tp);  | 
692  | 0  |   }  | 
693  |  |  | 
694  |  |   /*  | 
695  |  |    * No reason to send a segment, just return.  | 
696  |  |    */  | 
697  | 0  | just_return:  | 
698  | 0  |   return (0);  | 
699  |  |  | 
700  | 0  | send:  | 
701  | 0  |   if (len > 0) { | 
702  | 0  |     if (len >= tp->t_maxseg)  | 
703  | 0  |       tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;  | 
704  | 0  |     else  | 
705  | 0  |       tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;  | 
706  | 0  |   }  | 
707  |  |   /*  | 
708  |  |    * Before ESTABLISHED, force sending of initial options  | 
709  |  |    * unless TCP set not to do any options.  | 
710  |  |    * NOTE: we assume that the IP/TCP header plus TCP options  | 
711  |  |    * always fit in a single mbuf, leaving room for a maximum  | 
712  |  |    * link header, i.e.  | 
713  |  |    *  max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES  | 
714  |  |    */  | 
715  | 0  |   optlen = 0;  | 
716  | 0  |   hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);  | 
717  |  |  | 
718  |  |   /*  | 
719  |  |    * Compute options for segment.  | 
720  |  |    * We only have to care about SYN and established connection  | 
721  |  |    * segments.  Options for SYN-ACK segments are handled in TCP  | 
722  |  |    * syncache.  | 
723  |  |    */  | 
724  |  |   /*  | 
725  |  |    * samkumar: I've done away with the syncache. However, it  | 
726  |  |    * seems that the existing logic works fine for SYN-ACK as  | 
727  |  |    * well.  | 
728  |  |    */  | 
729  | 0  |   to.to_flags = 0;  | 
730  | 0  |   if ((tp->t_flags & TF_NOOPT) == 0) { | 
731  |  |     /* Maximum segment size. */  | 
732  | 0  |     if (flags & TH_SYN) { | 
733  | 0  |       tp->snd_nxt = tp->iss;  | 
734  | 0  |       to.to_mss = tcp_mssopt(tp);  | 
735  | 0  |       to.to_flags |= TOF_MSS;  | 
736  |  |  | 
737  |  |       /*  | 
738  |  |        * On SYN or SYN|ACK transmits on TFO connections,  | 
739  |  |        * only include the TFO option if it is not a  | 
740  |  |        * retransmit, as the presence of the TFO option may  | 
741  |  |        * have caused the original SYN or SYN|ACK to have  | 
742  |  |        * been dropped by a middlebox.  | 
743  |  |        */  | 
744  | 0  |       if (IS_FASTOPEN(tp->t_flags) &&  | 
745  | 0  |           (tp->t_rxtshift == 0)) { | 
746  | 0  |         if (tp->t_state == TCPS_SYN_RECEIVED) { | 
747  | 0  |           to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;  | 
748  | 0  |           to.to_tfo_cookie =  | 
749  | 0  |               (u_int8_t *)&tp->t_tfo_cookie.server;  | 
750  | 0  |           to.to_flags |= TOF_FASTOPEN;  | 
751  | 0  |           wanted_cookie = 1;  | 
752  | 0  |         } else if (tp->t_state == TCPS_SYN_SENT) { | 
753  | 0  |           to.to_tfo_len =  | 
754  | 0  |               tp->t_tfo_client_cookie_len;  | 
755  | 0  |           to.to_tfo_cookie =  | 
756  | 0  |               tp->t_tfo_cookie.client;  | 
757  | 0  |           to.to_flags |= TOF_FASTOPEN;  | 
758  | 0  |           wanted_cookie = 1;  | 
759  |  |           /*  | 
760  |  |            * If we wind up having more data to  | 
761  |  |            * send with the SYN than can fit in  | 
762  |  |            * one segment, don't send any more  | 
763  |  |            * until the SYN|ACK comes back from  | 
764  |  |            * the other end.  | 
765  |  |            */  | 
766  | 0  |           dont_sendalot = 1;  | 
767  | 0  |         }  | 
768  | 0  |       }  | 
769  | 0  |     }  | 
770  |  |     /* Window scaling. */  | 
771  | 0  |     if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { | 
772  | 0  |       to.to_wscale = tp->request_r_scale;  | 
773  | 0  |       to.to_flags |= TOF_SCALE;  | 
774  | 0  |     }  | 
775  |  |     /* Timestamps. */  | 
776  | 0  |     if ((tp->t_flags & TF_RCVD_TSTMP) ||  | 
777  | 0  |         ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { | 
778  | 0  |       to.to_tsval = tcp_ts_getticks() + tp->ts_offset;  | 
779  | 0  |       to.to_tsecr = tp->ts_recent;  | 
780  | 0  |       to.to_flags |= TOF_TS;  | 
781  |  |       /*  | 
782  |  |        * samkumar: I removed the code to set the timestamp tp->rfbuf_ts  | 
783  |  |        * for receive buffer autosizing, since we don't do autosizing on  | 
784  |  |        * the receive buffer in TCPlp.  | 
785  |  |        */  | 
786  | 0  |     }  | 
787  |  |  | 
788  |  |     /* Selective ACK's. */  | 
789  | 0  |     if (tp->t_flags & TF_SACK_PERMIT) { | 
790  | 0  |       if (flags & TH_SYN)  | 
791  | 0  |         to.to_flags |= TOF_SACKPERM;  | 
792  | 0  |       else if (TCPS_HAVEESTABLISHED(tp->t_state) &&  | 
793  | 0  |           (tp->t_flags & TF_SACK_PERMIT) &&  | 
794  | 0  |           tp->rcv_numsacks > 0) { | 
795  | 0  |         to.to_flags |= TOF_SACK;  | 
796  | 0  |         to.to_nsacks = tp->rcv_numsacks;  | 
797  | 0  |         to.to_sacks = (uint8_t *)tp->sackblks;  | 
798  | 0  |       }  | 
799  | 0  |     }  | 
800  |  |  | 
801  |  |     /*  | 
802  |  |      * samkumar: Remove logic to set TOF_SIGNATURE flag in to.to_flags,  | 
803  |  |      * since TCPlp does not support TCP signatures.  | 
804  |  |      */  | 
805  |  |  | 
806  |  |     /* Processing the options. */  | 
807  | 0  |     hdrlen += optlen = tcp_addoptions(&to, opt);  | 
808  |  |     /*  | 
809  |  |      * If we wanted a TFO option to be added, but it was unable  | 
810  |  |      * to fit, ensure no data is sent.  | 
811  |  |      */  | 
812  | 0  |     if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&  | 
813  | 0  |         !(to.to_flags & TOF_FASTOPEN))  | 
814  | 0  |       len = 0;  | 
815  | 0  |   }  | 
816  |  |   /*  | 
817  |  |    * samkumar: This used to be set to ip6_optlen(tp->t_inpcb), instead of 0,  | 
818  |  |    * along with some additional code to handle IPSEC. In TCPlp we don't set  | 
819  |  |    * IPv6 options here; we expect those to be set by the host network stack.  | 
820  |  |    * Of course, code that supports IPv4 has been removed as well.  | 
821  |  |    */  | 
822  | 0  |   ipoptlen = 0;  | 
823  |  |  | 
824  |  |   /*  | 
825  |  |    * Adjust data length if insertion of options will  | 
826  |  |    * bump the packet length beyond the t_maxopd length.  | 
827  |  |    * Clear the FIN bit because we cut off the tail of  | 
828  |  |    * the segment.  | 
829  |  |    */  | 
830  | 0  |   if (len + optlen + ipoptlen > tp->t_maxopd) { | 
831  | 0  |     flags &= ~TH_FIN;  | 
832  |  |     /*  | 
833  |  |      * samkumar: Remove code for TCP segmentation offloading.  | 
834  |  |      */  | 
835  | 0  |     len = tp->t_maxopd - optlen - ipoptlen;  | 
836  | 0  |     sendalot = 1;  | 
837  | 0  |     if (dont_sendalot)  | 
838  | 0  |         sendalot = 0;  | 
839  | 0  |   }  | 
840  |  |   /*  | 
841  |  |    * samkumar: The else case of the above "if" statement would set tso to 0.  | 
842  |  |    * Removing this since we no longer need a tso variable.  | 
843  |  |    */  | 
844  | 0  |   KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,  | 
845  | 0  |       ("%s: len > IP_MAXPACKET", __func__)); | 
846  |  |  | 
847  |  |   /*  | 
848  |  |    * This KASSERT is here to catch edge cases at a well defined place.  | 
849  |  |    * Before, those had triggered (random) panic conditions further down.  | 
850  |  |    */  | 
851  | 0  |   KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); | 
852  |  |  | 
853  |  |   /*  | 
854  |  |    * Grab a header mbuf, attaching a copy of data to  | 
855  |  |    * be transmitted, and initialize the header from  | 
856  |  |    * the template for sends on this connection.  | 
857  |  |    */  | 
858  |  |  | 
859  |  |   /*  | 
860  |  |    * samkumar: The code to allocate, build, and send outgoing segments has  | 
861  |  |    * been rewritten. I've left the original code to build the output mbuf  | 
862  |  |    * here in a comment, for reference. The new code is below.  | 
863  |  |    */  | 
864  |  | #if 0  | 
865  |  |   if (len) { | 
866  |  |     struct mbuf *mb;  | 
867  |  |     uint32_t moff;  | 
868  |  |  | 
869  |  |     if ((tp->t_flags & TF_FORCEDATA) && len == 1)  | 
870  |  |       TCPSTAT_INC(tcps_sndprobe);  | 
871  |  |     else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { | 
872  |  |       tp->t_sndrexmitpack++;  | 
873  |  |       TCPSTAT_INC(tcps_sndrexmitpack);  | 
874  |  |       TCPSTAT_ADD(tcps_sndrexmitbyte, len);  | 
875  |  |     } else { | 
876  |  |       TCPSTAT_INC(tcps_sndpack);  | 
877  |  |       TCPSTAT_ADD(tcps_sndbyte, len);  | 
878  |  |     }  | 
879  |  | #ifdef INET6  | 
880  |  |     if (MHLEN < hdrlen + max_linkhdr)  | 
881  |  |       m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);  | 
882  |  |     else  | 
883  |  | #endif  | 
884  |  |       m = m_gethdr(M_NOWAIT, MT_DATA);  | 
885  |  |  | 
886  |  |     if (m == NULL) { | 
887  |  |       SOCKBUF_UNLOCK(&so->so_snd);  | 
888  |  |       error = ENOBUFS;  | 
889  |  |       sack_rxmit = 0;  | 
890  |  |       goto out;  | 
891  |  |     }  | 
892  |  |  | 
893  |  |     m->m_data += max_linkhdr;  | 
894  |  |     m->m_len = hdrlen;  | 
895  |  |  | 
896  |  |     /*  | 
897  |  |      * Start the m_copy functions from the closest mbuf  | 
898  |  |      * to the offset in the socket buffer chain.  | 
899  |  |      */  | 
900  |  |     mb = sbsndptr(&so->so_snd, off, len, &moff);  | 
901  |  |  | 
902  |  |     if (len <= MHLEN - hdrlen - max_linkhdr) { | 
903  |  |       m_copydata(mb, moff, (int)len,  | 
904  |  |           mtod(m, caddr_t) + hdrlen);  | 
905  |  |       m->m_len += len;  | 
906  |  |     } else { | 
907  |  |       m->m_next = m_copy(mb, moff, (int)len);  | 
908  |  |       if (m->m_next == NULL) { | 
909  |  |         SOCKBUF_UNLOCK(&so->so_snd);  | 
910  |  |         (void) m_free(m);  | 
911  |  |         error = ENOBUFS;  | 
912  |  |         sack_rxmit = 0;  | 
913  |  |         goto out;  | 
914  |  |       }  | 
915  |  |     }  | 
916  |  |  | 
917  |  |     /*  | 
918  |  |      * If we're sending everything we've got, set PUSH.  | 
919  |  |      * (This will keep happy those implementations which only  | 
920  |  |      * give data to the user when a buffer fills or  | 
921  |  |      * a PUSH comes in.)  | 
922  |  |      */  | 
923  |  |     if (off + len == sbused(&so->so_snd))  | 
924  |  |       flags |= TH_PUSH;  | 
925  |  |     SOCKBUF_UNLOCK(&so->so_snd);  | 
926  |  |   } else { | 
927  |  |     SOCKBUF_UNLOCK(&so->so_snd);  | 
928  |  |     if (tp->t_flags & TF_ACKNOW)  | 
929  |  |       TCPSTAT_INC(tcps_sndacks);  | 
930  |  |     else if (flags & (TH_SYN|TH_FIN|TH_RST))  | 
931  |  |       TCPSTAT_INC(tcps_sndctrl);  | 
932  |  |     else if (SEQ_GT(tp->snd_up, tp->snd_una))  | 
933  |  |       TCPSTAT_INC(tcps_sndurg);  | 
934  |  |     else  | 
935  |  |       TCPSTAT_INC(tcps_sndwinup);  | 
936  |  |  | 
937  |  |     m = m_gethdr(M_NOWAIT, MT_DATA);  | 
938  |  |     if (m == NULL) { | 
939  |  |       error = ENOBUFS;  | 
940  |  |       sack_rxmit = 0;  | 
941  |  |       goto out;  | 
942  |  |     }  | 
943  |  | #ifdef INET6  | 
944  |  |     if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&  | 
945  |  |         MHLEN >= hdrlen) { | 
946  |  |       M_ALIGN(m, hdrlen);  | 
947  |  |     } else  | 
948  |  | #endif  | 
949  |  |     m->m_data += max_linkhdr;  | 
950  |  |     m->m_len = hdrlen;  | 
951  |  |   }  | 
952  |  | #endif  | 
953  |  | 
  | 
954  | 0  |   KASSERT(ipoptlen == 0, ("No IP options supported")); // samkumar | 
955  |  | 
  | 
956  | 0  |   otMessage* message = tcplp_sys_new_message(tp->instance);  | 
957  | 0  |   if (message == NULL) { | 
958  | 0  |     error = ENOBUFS;  | 
959  | 0  |     sack_rxmit = 0;  | 
960  | 0  |     goto out;  | 
961  | 0  |   }  | 
962  | 0  |   if (otMessageSetLength(message, sizeof(struct tcphdr) + optlen + len) != OT_ERROR_NONE) { | 
963  | 0  |     tcplp_sys_free_message(tp->instance, message);  | 
964  | 0  |     error = ENOBUFS;  | 
965  | 0  |     sack_rxmit = 0;  | 
966  | 0  |     goto out;  | 
967  | 0  |   }  | 
968  | 0  |   if (len) { | 
969  | 0  |       uint32_t used_space = lbuf_used_space(&tp->sendbuf);  | 
970  |  |  | 
971  |  |     /*  | 
972  |  |      * The TinyOS version has a way to avoid the copying we have to do here.  | 
973  |  |      * Because it is possible to send iovecs directly in the BLIP stack, and  | 
974  |  |      * an lbuf is made of iovecs, we could just "save" the starting and ending  | 
975  |  |      * iovecs, modify them to get exactly the slice we want, call "send" on  | 
976  |  |      * the resulting chain, and then restore the starting and ending iovecs  | 
977  |  |      * once "send" returns.  | 
978  |  |      *  | 
979  |  |      * In RIOT, pktsnips have additional behavior regarding memory management  | 
980  |  |      * that precludes this optimization. But, now that we have moved to  | 
981  |  |      * cbufs, this is not relevant anymore.  | 
982  |  |      */  | 
983  | 0  |     { | 
984  | 0  |       otLinkedBuffer* start;  | 
985  | 0  |       size_t start_offset;  | 
986  | 0  |       otLinkedBuffer* end;  | 
987  | 0  |       size_t end_offset;  | 
988  | 0  |       otLinkedBuffer* curr;  | 
989  | 0  |       int rv = lbuf_getrange(&tp->sendbuf, off, len, &start, &start_offset, &end, &end_offset);  | 
990  | 0  |       size_t message_offset = otMessageGetOffset(message) + sizeof(struct tcphdr) + optlen;  | 
991  | 0  |       KASSERT(rv == 0, ("Reading send buffer out of range!")); | 
992  | 0  |       for (curr = start; curr != end->mNext; curr = curr->mNext) { | 
993  | 0  |         const uint8_t* data_to_copy = curr->mData;  | 
994  | 0  |         size_t length_to_copy = curr->mLength;  | 
995  | 0  |         if (curr == start) { | 
996  | 0  |           data_to_copy += start_offset;  | 
997  | 0  |           length_to_copy -= start_offset;  | 
998  | 0  |         }  | 
999  | 0  |         if (curr == end) { | 
1000  | 0  |           length_to_copy -= end_offset;  | 
1001  | 0  |         }  | 
1002  | 0  |         otMessageWrite(message, message_offset, data_to_copy, length_to_copy);  | 
1003  | 0  |         message_offset += length_to_copy;  | 
1004  | 0  |       }  | 
1005  | 0  |     }  | 
1006  |  |  | 
1007  |  |     /*  | 
1008  |  |      * If we're sending everything we've got, set PUSH.  | 
1009  |  |      * (This will keep happy those implementations which only  | 
1010  |  |      * give data to the user when a buffer fills or  | 
1011  |  |      * a PUSH comes in.)  | 
1012  |  |      */  | 
1013  |  |     /* samkumar: Replaced call to sbused(&so->so_snd) with used_space. */  | 
1014  | 0  |     if (off + len == used_space)  | 
1015  | 0  |       flags |= TH_PUSH;  | 
1016  | 0  |   }  | 
1017  |  | 
  | 
1018  | 0  |   char outbuf[sizeof(struct tcphdr) + TCP_MAXOLEN];  | 
1019  | 0  |   th = (struct tcphdr*) (&outbuf[0]);  | 
1020  |  |  | 
1021  |  |   /*  | 
1022  |  |    * samkumar: I replaced the original call to tcpip_fillheaders with the  | 
1023  |  |    * one below.  | 
1024  |  |    */  | 
1025  | 0  |   otMessageInfo ip6info;  | 
1026  | 0  |   tcpip_fillheaders(tp, &ip6info, th);  | 
1027  |  |  | 
1028  |  |   /*  | 
1029  |  |    * Fill in fields, remembering maximum advertised  | 
1030  |  |    * window for use in delaying messages about window sizes.  | 
1031  |  |    * If resending a FIN, be sure not to use a new sequence number.  | 
1032  |  |    */  | 
1033  | 0  |   if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&  | 
1034  | 0  |       tp->snd_nxt == tp->snd_max)  | 
1035  | 0  |     tp->snd_nxt--;  | 
1036  |  |   /*  | 
1037  |  |    * If we are starting a connection, send ECN setup  | 
1038  |  |    * SYN packet. If we are on a retransmit, we may  | 
1039  |  |    * resend those bits a number of times as per  | 
1040  |  |    * RFC 3168.  | 
1041  |  |    */  | 
1042  | 0  |   if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { | 
1043  | 0  |     if (tp->t_rxtshift >= 1) { | 
1044  | 0  |       if (tp->t_rxtshift <= V_tcp_ecn_maxretries)  | 
1045  | 0  |         flags |= TH_ECE|TH_CWR;  | 
1046  | 0  |     } else  | 
1047  | 0  |       flags |= TH_ECE|TH_CWR;  | 
1048  | 0  |   }  | 
1049  |  |  | 
1050  |  |   /*  | 
1051  |  |    * samkumar: Make tcplp_output reply with ECE flag in the SYN-ACK for  | 
1052  |  |    * ECN-enabled connections. The existing code in FreeBSD didn't have to do  | 
1053  |  |    * this, because it didn't use tcplp_output to send the SYN-ACK; it  | 
1054  |  |    * constructed the SYN-ACK segment manually. Yet another consequence of  | 
1055  |  |    * removing the SYN cache...  | 
1056  |  |    */  | 
1057  | 0  |   if (tp->t_state == TCPS_SYN_RECEIVED && tp->t_flags & TF_ECN_PERMIT &&  | 
1058  | 0  |     V_tcp_do_ecn) { | 
1059  | 0  |     flags |= TH_ECE;  | 
1060  | 0  |   }  | 
1061  |  | 
  | 
1062  | 0  |   if (tp->t_state == TCPS_ESTABLISHED &&  | 
1063  | 0  |       (tp->t_flags & TF_ECN_PERMIT)) { | 
1064  |  |     /*  | 
1065  |  |      * If the peer has ECN, mark data packets with  | 
1066  |  |      * ECN capable transmission (ECT).  | 
1067  |  |      * Ignore pure ack packets, retransmissions and window probes.  | 
1068  |  |      */  | 
1069  | 0  |     if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&  | 
1070  | 0  |         !((tp->t_flags & TF_FORCEDATA) && len == 1)) { | 
1071  |  |       /*  | 
1072  |  |        * samkumar: Replaced ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);  | 
1073  |  |        * with the following code, which will cause OpenThread to set the  | 
1074  |  |        * ECT0 bit in the header.  | 
1075  |  |        */  | 
1076  | 0  |       ip6info.mEcn = OT_ECN_CAPABLE_0;  | 
1077  | 0  |     }  | 
1078  |  |  | 
1079  |  |     /*  | 
1080  |  |      * Reply with proper ECN notifications.  | 
1081  |  |      */  | 
1082  | 0  |     if (tp->t_flags & TF_ECN_SND_CWR) { | 
1083  | 0  |       flags |= TH_CWR;  | 
1084  | 0  |       tp->t_flags &= ~TF_ECN_SND_CWR;  | 
1085  | 0  |     }  | 
1086  | 0  |     if (tp->t_flags & TF_ECN_SND_ECE)  | 
1087  | 0  |       flags |= TH_ECE;  | 
1088  | 0  |   }  | 
1089  |  |  | 
1090  |  |   /*  | 
1091  |  |    * If we are doing retransmissions, then snd_nxt will  | 
1092  |  |    * not reflect the first unsent octet.  For ACK only  | 
1093  |  |    * packets, we do not want the sequence number of the  | 
1094  |  |    * retransmitted packet, we want the sequence number  | 
1095  |  |    * of the next unsent octet.  So, if there is no data  | 
1096  |  |    * (and no SYN or FIN), use snd_max instead of snd_nxt  | 
1097  |  |    * when filling in ti_seq.  But if we are in persist  | 
1098  |  |    * state, snd_max might reflect one byte beyond the  | 
1099  |  |    * right edge of the window, so use snd_nxt in that  | 
1100  |  |    * case, since we know we aren't doing a retransmission.  | 
1101  |  |    * (retransmit and persist are mutually exclusive...)  | 
1102  |  |    */  | 
1103  | 0  |   if (sack_rxmit == 0) { | 
1104  | 0  |     if (len || (flags & (TH_SYN|TH_FIN)) ||  | 
1105  | 0  |         tcp_timer_active(tp, TT_PERSIST))  | 
1106  | 0  |       th->th_seq = htonl(tp->snd_nxt);  | 
1107  | 0  |     else  | 
1108  | 0  |       th->th_seq = htonl(tp->snd_max);  | 
1109  | 0  |   } else { | 
1110  | 0  |     th->th_seq = htonl(p->rxmit);  | 
1111  | 0  |     p->rxmit += len;  | 
1112  | 0  |     tp->sackhint.sack_bytes_rexmit += len;  | 
1113  | 0  |   }  | 
1114  |  |  | 
1115  |  |   /*  | 
1116  |  |    * samkumar: Check if this is a retransmission (added as part of TCPlp).  | 
1117  |  |    * This kind of stats collection is useful but not necessary for TCP, so  | 
1118  |  |    * I've left it as a comment in case we want to bring this back to measure  | 
1119  |  |    * performance.  | 
1120  |  |    */  | 
1121  |  | #if 0  | 
1122  |  |   if (len > 0 && !tcp_timer_active(tp, TT_PERSIST) && SEQ_LT(ntohl(th->th_seq), tp->snd_max)) { | 
1123  |  |     tcplp_totalRexmitCnt++;  | 
1124  |  |   }  | 
1125  |  | #endif  | 
1126  |  | 
  | 
1127  | 0  |   th->th_ack = htonl(tp->rcv_nxt);  | 
1128  | 0  |   if (optlen) { | 
1129  | 0  |     bcopy(opt, th + 1, optlen);  | 
1130  | 0  |     th->th_off_x2 = ((sizeof (struct tcphdr) + optlen) >> 2) << TH_OFF_SHIFT;  | 
1131  | 0  |   }  | 
1132  | 0  |   th->th_flags = flags;  | 
1133  |  |   /*  | 
1134  |  |    * Calculate receive window.  Don't shrink window,  | 
1135  |  |    * but avoid silly window syndrome.  | 
1136  |  |    */  | 
1137  |  |   /* samkumar: Replaced so->so_rcv.sb_hiwat with this call to cbuf_size. */  | 
1138  | 0  |   if (recwin < (long)(cbuf_size(&tp->recvbuf) / 4) &&  | 
1139  | 0  |       recwin < (long)tp->t_maxseg)  | 
1140  | 0  |     recwin = 0;  | 
1141  | 0  |   if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&  | 
1142  | 0  |       recwin < (long)(tp->rcv_adv - tp->rcv_nxt))  | 
1143  | 0  |     recwin = (long)(tp->rcv_adv - tp->rcv_nxt);  | 
1144  | 0  |   if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)  | 
1145  | 0  |     recwin = (long)TCP_MAXWIN << tp->rcv_scale;  | 
1146  |  |  | 
1147  |  |   /*  | 
1148  |  |    * According to RFC1323 the window field in a SYN (i.e., a <SYN>  | 
1149  |  |    * or <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK>  | 
1150  |  |    * case is handled in syncache.  | 
1151  |  |    */  | 
1152  | 0  |   if (flags & TH_SYN)  | 
1153  | 0  |     th->th_win = htons((uint16_t)  | 
1154  | 0  |         (min(cbuf_size(&tp->recvbuf), TCP_MAXWIN)));  | 
1155  | 0  |   else  | 
1156  | 0  |     th->th_win = htons((uint16_t)(recwin >> tp->rcv_scale));  | 
1157  |  |  | 
1158  |  |   /*  | 
1159  |  |    * Adjust the RXWIN0SENT flag - indicate that we have advertised  | 
1160  |  |    * a 0 window.  This may cause the remote transmitter to stall.  This  | 
1161  |  |    * flag tells soreceive() to disable delayed acknowledgements when  | 
1162  |  |    * draining the buffer.  This can occur if the receiver is attempting  | 
1163  |  |    * to read more data than can be buffered prior to transmitting on  | 
1164  |  |    * the connection.  | 
1165  |  |    */  | 
1166  | 0  |   if (th->th_win == 0) { | 
1167  | 0  |     tp->t_flags |= TF_RXWIN0SENT;  | 
1168  | 0  |   } else  | 
1169  | 0  |     tp->t_flags &= ~TF_RXWIN0SENT;  | 
1170  | 0  |   if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { | 
1171  | 0  |     th->th_urp = htons((uint16_t)(tp->snd_up - tp->snd_nxt));  | 
1172  | 0  |     th->th_flags |= TH_URG;  | 
1173  | 0  |   } else  | 
1174  |  |     /*  | 
1175  |  |      * If no urgent pointer to send, then we pull  | 
1176  |  |      * the urgent pointer to the left edge of the send window  | 
1177  |  |      * so that it doesn't drift into the send window on sequence  | 
1178  |  |      * number wraparound.  | 
1179  |  |      */  | 
1180  | 0  |     tp->snd_up = tp->snd_una;   /* drag it along */  | 
1181  |  |  | 
1182  |  |   /*  | 
1183  |  |    * samkumar: Removed code for TCP signatures.  | 
1184  |  |    */  | 
1185  |  |   /*  | 
1186  |  |    * Put TCP length in extended header, and then  | 
1187  |  |    * checksum extended header and data.  | 
1188  |  |    */  | 
1189  |  |   /*  | 
1190  |  |    * samkumar: The code to implement the above comment isn't relevant to us.  | 
1191  |  |    * Checksum computation is not handled using FreeBSD code, so we don't need  | 
1192  |  |    * to build an extended header.  | 
1193  |  |    */  | 
1194  |  |   /*  | 
1195  |  |    * samkumar: Removed code for TCP Segmentation Offloading.  | 
1196  |  |    */  | 
1197  |  |   /* samkumar: Removed mbuf-specific assertions and debug code. */  | 
1198  |  |   /*  | 
1199  |  |    * Fill in IP length and desired time to live and  | 
1200  |  |    * send to IP level.  There should be a better way  | 
1201  |  |    * to handle ttl and tos; we could keep them in  | 
1202  |  |    * the template, but need a way to checksum without them.  | 
1203  |  |    */  | 
1204  |  |   /*  | 
1205  |  |    * m->m_pkthdr.len should have been set before checksum calculation,  | 
1206  |  |    * because in6_cksum() need it.  | 
1207  |  |    */  | 
1208  |  |   /*  | 
1209  |  |    * samkumar: The IPv6 packet length and hop limit are handled by the host  | 
1210  |  |    * network stack, not by TCPlp. I've also removed code for Path MTU  | 
1211  |  |    * discovery. And of course, I've removed debug code as well.  | 
1212  |  |    */  | 
1213  |  |   /* samkumar: I've replaced the call to ip6_output with the following. */  | 
1214  | 0  |   otMessageWrite(message, 0, outbuf, sizeof(struct tcphdr) + optlen);  | 
1215  | 0  |   tcplp_sys_send_message(tp->instance, message, &ip6info);  | 
1216  |  | 
  | 
1217  | 0  | out:  | 
1218  |  |   /*  | 
1219  |  |    * In transmit state, time the transmission and arrange for  | 
1220  |  |    * the retransmit.  In persist state, just set snd_max.  | 
1221  |  |    */  | 
1222  | 0  |   if ((tp->t_flags & TF_FORCEDATA) == 0 ||  | 
1223  | 0  |       !tcp_timer_active(tp, TT_PERSIST)) { | 
1224  | 0  |     tcp_seq startseq = tp->snd_nxt;  | 
1225  |  |  | 
1226  |  |     /*  | 
1227  |  |      * Advance snd_nxt over sequence space of this segment.  | 
1228  |  |      */  | 
1229  | 0  |     if (flags & (TH_SYN|TH_FIN)) { | 
1230  | 0  |       if (flags & TH_SYN)  | 
1231  | 0  |         tp->snd_nxt++;  | 
1232  | 0  |       if (flags & TH_FIN) { | 
1233  | 0  |         tp->snd_nxt++;  | 
1234  | 0  |         tp->t_flags |= TF_SENTFIN;  | 
1235  | 0  |       }  | 
1236  | 0  |     }  | 
1237  | 0  |     if (sack_rxmit)  | 
1238  | 0  |       goto timer;  | 
1239  | 0  |     tp->snd_nxt += len;  | 
1240  | 0  |     if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { | 
1241  | 0  |       tp->snd_max = tp->snd_nxt;  | 
1242  |  |       /*  | 
1243  |  |        * Time this transmission if not a retransmission and  | 
1244  |  |        * not currently timing anything.  | 
1245  |  |        */  | 
1246  | 0  |       if (tp->t_rtttime == 0) { | 
1247  | 0  |         tp->t_rtttime = ticks;  | 
1248  | 0  |         tp->t_rtseq = startseq;  | 
1249  | 0  |       }  | 
1250  | 0  |     }  | 
1251  |  |  | 
1252  |  |     /*  | 
1253  |  |      * Set retransmit timer if not currently set,  | 
1254  |  |      * and not doing a pure ack or a keep-alive probe.  | 
1255  |  |      * Initial value for retransmit timer is smoothed  | 
1256  |  |      * round-trip time + 2 * round-trip time variance.  | 
1257  |  |      * Initialize shift counter which is used for backoff  | 
1258  |  |      * of retransmit time.  | 
1259  |  |      */  | 
1260  | 0  | timer:  | 
1261  | 0  |     if (!tcp_timer_active(tp, TT_REXMT) &&  | 
1262  | 0  |         ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||  | 
1263  | 0  |          (tp->snd_nxt != tp->snd_una))) { | 
1264  | 0  |       if (tcp_timer_active(tp, TT_PERSIST)) { | 
1265  | 0  |         tcp_timer_activate(tp, TT_PERSIST, 0);  | 
1266  | 0  |         tp->t_rxtshift = 0;  | 
1267  | 0  |       }  | 
1268  | 0  |       tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);  | 
1269  |  |       /*  | 
1270  |  |        * samkumar: Replaced sbavail(&so->so_snd) with this call to  | 
1271  |  |        * lbuf_used_space.  | 
1272  |  |        */  | 
1273  | 0  |     } else if (len == 0 && lbuf_used_space(&tp->sendbuf) &&  | 
1274  | 0  |         !tcp_timer_active(tp, TT_REXMT) &&  | 
1275  | 0  |         !tcp_timer_active(tp, TT_PERSIST)) { | 
1276  |  |       /*  | 
1277  |  |        * Avoid a situation where we do not set persist timer  | 
1278  |  |        * after a zero window condition. For example:  | 
1279  |  |        * 1) A -> B: packet with enough data to fill the window  | 
1280  |  |        * 2) B -> A: ACK for #1 + new data (0 window  | 
1281  |  |        *    advertisement)  | 
1282  |  |        * 3) A -> B: ACK for #2, 0 len packet  | 
1283  |  |        *  | 
1284  |  |        * In this case, A will not activate the persist timer,  | 
1285  |  |        * because it chose to send a packet. Unless tcplp_output  | 
1286  |  |        * is called for some other reason (delayed ack timer,  | 
1287  |  |        * another input packet from B, socket syscall), A will  | 
1288  |  |        * not send zero window probes.  | 
1289  |  |        *  | 
1290  |  |        * So, if you send a 0-length packet, but there is data  | 
1291  |  |        * in the socket buffer, and neither the rexmt or  | 
1292  |  |        * persist timer is already set, then activate the  | 
1293  |  |        * persist timer.  | 
1294  |  |        */  | 
1295  | 0  |       tp->t_rxtshift = 0;  | 
1296  | 0  |       tcp_setpersist(tp);  | 
1297  | 0  |     }  | 
1298  | 0  |   } else { | 
1299  |  |     /*  | 
1300  |  |      * Persist case, update snd_max but since we are in  | 
1301  |  |      * persist mode (no window) we do not update snd_nxt.  | 
1302  |  |      */  | 
1303  | 0  |     int xlen = len;  | 
1304  | 0  |     if (flags & TH_SYN)  | 
1305  | 0  |       ++xlen;  | 
1306  | 0  |     if (flags & TH_FIN) { | 
1307  | 0  |       ++xlen;  | 
1308  | 0  |       tp->t_flags |= TF_SENTFIN;  | 
1309  | 0  |     }  | 
1310  | 0  |     if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))  | 
1311  | 0  |       tp->snd_max = tp->snd_nxt + len;  | 
1312  | 0  |   }  | 
1313  |  |  | 
1314  | 0  |   if (error) { | 
1315  |  |  | 
1316  |  |     /*  | 
1317  |  |      * We know that the packet was lost, so back out the  | 
1318  |  |      * sequence number advance, if any.  | 
1319  |  |      *  | 
1320  |  |      * If the error is EPERM the packet got blocked by the  | 
1321  |  |      * local firewall.  Normally we should terminate the  | 
1322  |  |      * connection but the blocking may have been spurious  | 
1323  |  |      * due to a firewall reconfiguration cycle.  So we treat  | 
1324  |  |      * it like a packet loss and let the retransmit timer and  | 
1325  |  |      * timeouts do their work over time.  | 
1326  |  |      * XXX: It is a POLA question whether calling tcp_drop right  | 
1327  |  |      * away would be the really correct behavior instead.  | 
1328  |  |      */  | 
1329  | 0  |     if (((tp->t_flags & TF_FORCEDATA) == 0 ||  | 
1330  | 0  |         !tcp_timer_active(tp, TT_PERSIST)) &&  | 
1331  | 0  |         ((flags & TH_SYN) == 0) &&  | 
1332  | 0  |         (error != EPERM)) { | 
1333  | 0  |       if (sack_rxmit) { | 
1334  | 0  |         p->rxmit -= len;  | 
1335  | 0  |         tp->sackhint.sack_bytes_rexmit -= len;  | 
1336  | 0  |         KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,  | 
1337  | 0  |             ("sackhint bytes rtx >= 0")); | 
1338  | 0  |       } else  | 
1339  | 0  |         tp->snd_nxt -= len;  | 
1340  | 0  |     }  | 
1341  | 0  |     switch (error) { | 
1342  | 0  |     case EPERM:  | 
1343  | 0  |       tp->t_softerror = error;  | 
1344  | 0  |       return (error);  | 
1345  | 0  |     case ENOBUFS:  | 
1346  | 0  |                   if (!tcp_timer_active(tp, TT_REXMT) &&  | 
1347  | 0  |           !tcp_timer_active(tp, TT_PERSIST))  | 
1348  | 0  |                           tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);  | 
1349  | 0  |       tp->snd_cwnd = tp->t_maxseg;  | 
1350  |  | #ifdef INSTRUMENT_TCP  | 
1351  |  |       tcplp_sys_log("TCP ALLOCFAIL %u %d", (unsigned int) tcplp_sys_get_millis(), (int) tp->snd_cwnd); | 
1352  |  | #endif  | 
1353  | 0  |       return (0);  | 
1354  | 0  |     case EMSGSIZE:  | 
1355  |  |       /*  | 
1356  |  |        * For some reason the interface we used initially  | 
1357  |  |        * to send segments changed to another or lowered  | 
1358  |  |        * its MTU.  | 
1359  |  |        * If TSO was active we either got an interface  | 
1360  |  |        * without TSO capabilities or TSO was turned off.  | 
1361  |  |        * If we obtained mtu from ip_output() then update  | 
1362  |  |        * it and try again.  | 
1363  |  |        */  | 
1364  |  |       /* samkumar: Removed code for TCP Segmentation Offloading. */  | 
1365  | 0  |       if (mtu != 0) { | 
1366  | 0  |         tcp_mss_update(tp, -1, mtu, NULL, NULL);  | 
1367  | 0  |         goto again;  | 
1368  | 0  |       }  | 
1369  | 0  |       return (error);  | 
1370  | 0  |     case EHOSTDOWN:  | 
1371  | 0  |     case EHOSTUNREACH:  | 
1372  | 0  |     case ENETDOWN:  | 
1373  | 0  |     case ENETUNREACH:  | 
1374  | 0  |       if (TCPS_HAVERCVDSYN(tp->t_state)) { | 
1375  | 0  |         tp->t_softerror = error;  | 
1376  | 0  |         return (0);  | 
1377  | 0  |       }  | 
1378  |  |       /* FALLTHROUGH */  | 
1379  | 0  |     default:  | 
1380  | 0  |       return (error);  | 
1381  | 0  |     }  | 
1382  | 0  |   }  | 
1383  |  |  | 
1384  |  |   /*  | 
1385  |  |    * Data sent (as far as we can tell).  | 
1386  |  |    * If this advertises a larger window than any other segment,  | 
1387  |  |    * then remember the size of the advertised window.  | 
1388  |  |    * Any pending ACK has now been sent.  | 
1389  |  |    */  | 
1390  | 0  |   if (recwin >= 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))  | 
1391  | 0  |     tp->rcv_adv = tp->rcv_nxt + recwin;  | 
1392  | 0  |   tp->last_ack_sent = tp->rcv_nxt;  | 
1393  | 0  |   tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);  | 
1394  | 0  |   if (tcp_timer_active(tp, TT_DELACK))  | 
1395  | 0  |     tcp_timer_activate(tp, TT_DELACK, 0);  | 
1396  |  |  | 
1397  |  |   /*  | 
1398  |  |    * samkumar: This was already commented out (using #if 0) in the original  | 
1399  |  |    * FreeBSD code.  | 
1400  |  |    */  | 
1401  |  | #if 0  | 
1402  |  |   /*  | 
1403  |  |    * This completely breaks TCP if newreno is turned on.  What happens  | 
1404  |  |    * is that if delayed-acks are turned on on the receiver, this code  | 
1405  |  |    * on the transmitter effectively destroys the TCP window, forcing  | 
1406  |  |    * it to four packets (1.5Kx4 = 6K window).  | 
1407  |  |    */  | 
1408  |  |   if (sendalot && --maxburst)  | 
1409  |  |     goto again;  | 
1410  |  | #endif  | 
1411  | 0  |   if (sendalot)  | 
1412  | 0  |     goto again;  | 
1413  | 0  |   return (0);  | 
1414  | 0  | }  | 
1415  |  |  | 
1416  |  | /*  | 
1417  |  |  * Insert TCP options according to the supplied parameters to the place  | 
1418  |  |  * optp in a consistent way.  Can handle unaligned destinations.  | 
1419  |  |  *  | 
1420  |  |  * The order of the option processing is crucial for optimal packing and  | 
1421  |  |  * alignment for the scarce option space.  | 
1422  |  |  *  | 
1423  |  |  * The optimal order for a SYN/SYN-ACK segment is:  | 
1424  |  |  *   MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) +  | 
1425  |  |  *   Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40.  | 
1426  |  |  *  | 
1427  |  |  * The SACK options should be last.  SACK blocks consume 8*n+2 bytes.  | 
1428  |  |  * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks).  | 
1429  |  |  * At minimum we need 10 bytes (to generate 1 SACK block).  If both  | 
1430  |  |  * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present,  | 
1431  |  |  * we only have 10 bytes for SACK options (40 - (12 + 18)).  | 
1432  |  |  */  | 
1433  |  | int  | 
1434  |  | tcp_addoptions(struct tcpopt *to, uint8_t *optp)  | 
1435  | 0  | { | 
1436  | 0  |   uint32_t mask, optlen = 0;  | 
1437  |  | 
  | 
1438  | 0  |   for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) { | 
1439  | 0  |     if ((to->to_flags & mask) != mask)  | 
1440  | 0  |       continue;  | 
1441  | 0  |     if (optlen == TCP_MAXOLEN)  | 
1442  | 0  |       break;  | 
1443  | 0  |     switch (to->to_flags & mask) { | 
1444  | 0  |     case TOF_MSS:  | 
1445  | 0  |       while (optlen % 4) { | 
1446  | 0  |         optlen += TCPOLEN_NOP;  | 
1447  | 0  |         *optp++ = TCPOPT_NOP;  | 
1448  | 0  |       }  | 
1449  | 0  |       if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG)  | 
1450  | 0  |         continue;  | 
1451  | 0  |       optlen += TCPOLEN_MAXSEG;  | 
1452  | 0  |       *optp++ = TCPOPT_MAXSEG;  | 
1453  | 0  |       *optp++ = TCPOLEN_MAXSEG;  | 
1454  | 0  |       to->to_mss = htons(to->to_mss);  | 
1455  | 0  |       bcopy((uint8_t *)&to->to_mss, optp, sizeof(to->to_mss));  | 
1456  | 0  |       optp += sizeof(to->to_mss);  | 
1457  | 0  |       break;  | 
1458  | 0  |     case TOF_SCALE:  | 
1459  | 0  |       while (!optlen || optlen % 2 != 1) { | 
1460  | 0  |         optlen += TCPOLEN_NOP;  | 
1461  | 0  |         *optp++ = TCPOPT_NOP;  | 
1462  | 0  |       }  | 
1463  | 0  |       if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW)  | 
1464  | 0  |         continue;  | 
1465  | 0  |       optlen += TCPOLEN_WINDOW;  | 
1466  | 0  |       *optp++ = TCPOPT_WINDOW;  | 
1467  | 0  |       *optp++ = TCPOLEN_WINDOW;  | 
1468  | 0  |       *optp++ = to->to_wscale;  | 
1469  | 0  |       break;  | 
1470  | 0  |     case TOF_SACKPERM:  | 
1471  | 0  |       while (optlen % 2) { | 
1472  | 0  |         optlen += TCPOLEN_NOP;  | 
1473  | 0  |         *optp++ = TCPOPT_NOP;  | 
1474  | 0  |       }  | 
1475  | 0  |       if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED)  | 
1476  | 0  |         continue;  | 
1477  | 0  |       optlen += TCPOLEN_SACK_PERMITTED;  | 
1478  | 0  |       *optp++ = TCPOPT_SACK_PERMITTED;  | 
1479  | 0  |       *optp++ = TCPOLEN_SACK_PERMITTED;  | 
1480  | 0  |       break;  | 
1481  | 0  |     case TOF_TS:  | 
1482  | 0  |       while (!optlen || optlen % 4 != 2) { | 
1483  | 0  |         optlen += TCPOLEN_NOP;  | 
1484  | 0  |         *optp++ = TCPOPT_NOP;  | 
1485  | 0  |       }  | 
1486  | 0  |       if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP)  | 
1487  | 0  |         continue;  | 
1488  | 0  |       optlen += TCPOLEN_TIMESTAMP;  | 
1489  | 0  |       *optp++ = TCPOPT_TIMESTAMP;  | 
1490  | 0  |       *optp++ = TCPOLEN_TIMESTAMP;  | 
1491  | 0  |       to->to_tsval = htonl(to->to_tsval);  | 
1492  | 0  |       to->to_tsecr = htonl(to->to_tsecr);  | 
1493  | 0  |       bcopy((uint8_t *)&to->to_tsval, optp, sizeof(to->to_tsval));  | 
1494  | 0  |       optp += sizeof(to->to_tsval);  | 
1495  | 0  |       bcopy((uint8_t *)&to->to_tsecr, optp, sizeof(to->to_tsecr));  | 
1496  | 0  |       optp += sizeof(to->to_tsecr);  | 
1497  | 0  |       break;  | 
1498  | 0  |     case TOF_SIGNATURE:  | 
1499  | 0  |       { | 
1500  | 0  |       int siglen = TCPOLEN_SIGNATURE - 2;  | 
1501  |  | 
  | 
1502  | 0  |       while (!optlen || optlen % 4 != 2) { | 
1503  | 0  |         optlen += TCPOLEN_NOP;  | 
1504  | 0  |         *optp++ = TCPOPT_NOP;  | 
1505  | 0  |       }  | 
1506  | 0  |       if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE)  | 
1507  | 0  |         continue;  | 
1508  | 0  |       optlen += TCPOLEN_SIGNATURE;  | 
1509  | 0  |       *optp++ = TCPOPT_SIGNATURE;  | 
1510  | 0  |       *optp++ = TCPOLEN_SIGNATURE;  | 
1511  | 0  |       to->to_signature = optp;  | 
1512  | 0  |       while (siglen--)  | 
1513  | 0  |          *optp++ = 0;  | 
1514  | 0  |       break;  | 
1515  | 0  |       }  | 
1516  | 0  |     case TOF_SACK:  | 
1517  | 0  |       { | 
1518  | 0  |       int sackblks = 0;  | 
1519  | 0  |       struct sackblk *sack = (struct sackblk *)to->to_sacks;  | 
1520  | 0  |       tcp_seq sack_seq;  | 
1521  |  | 
  | 
1522  | 0  |       while (!optlen || optlen % 4 != 2) { | 
1523  | 0  |         optlen += TCPOLEN_NOP;  | 
1524  | 0  |         *optp++ = TCPOPT_NOP;  | 
1525  | 0  |       }  | 
1526  | 0  |       if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK)  | 
1527  | 0  |         continue;  | 
1528  | 0  |       optlen += TCPOLEN_SACKHDR;  | 
1529  | 0  |       *optp++ = TCPOPT_SACK;  | 
1530  | 0  |       sackblks = min(to->to_nsacks,  | 
1531  | 0  |           (TCP_MAXOLEN - optlen) / TCPOLEN_SACK);  | 
1532  | 0  |       *optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK;  | 
1533  | 0  |       while (sackblks--) { | 
1534  | 0  |         sack_seq = htonl(sack->start);  | 
1535  | 0  |         bcopy((uint8_t *)&sack_seq, optp, sizeof(sack_seq));  | 
1536  | 0  |         optp += sizeof(sack_seq);  | 
1537  | 0  |         sack_seq = htonl(sack->end);  | 
1538  | 0  |         bcopy((uint8_t *)&sack_seq, optp, sizeof(sack_seq));  | 
1539  | 0  |         optp += sizeof(sack_seq);  | 
1540  | 0  |         optlen += TCPOLEN_SACK;  | 
1541  | 0  |         sack++;  | 
1542  | 0  |       }  | 
1543  |  |       /* samkumar: Removed TCPSTAT_INC(tcps_sack_send_blocks); */  | 
1544  | 0  |       break;  | 
1545  | 0  |       }  | 
1546  | 0  |     case TOF_FASTOPEN:  | 
1547  | 0  |       { | 
1548  | 0  |       int total_len;  | 
1549  |  |  | 
1550  |  |       /* XXX is there any point to aligning this option? */  | 
1551  | 0  |       total_len = TCPOLEN_FAST_OPEN_EMPTY + to->to_tfo_len;  | 
1552  | 0  |       if (TCP_MAXOLEN - optlen < total_len) { | 
1553  | 0  |         to->to_flags &= ~TOF_FASTOPEN;  | 
1554  | 0  |         continue;  | 
1555  | 0  |       }  | 
1556  | 0  |       *optp++ = TCPOPT_FAST_OPEN;  | 
1557  | 0  |       *optp++ = total_len;  | 
1558  | 0  |       if (to->to_tfo_len > 0) { | 
1559  | 0  |         bcopy(to->to_tfo_cookie, optp, to->to_tfo_len);  | 
1560  | 0  |         optp += to->to_tfo_len;  | 
1561  | 0  |       }  | 
1562  | 0  |       optlen += total_len;  | 
1563  | 0  |       break;  | 
1564  | 0  |       }  | 
1565  | 0  |     default:  | 
1566  | 0  |       tcplp_sys_panic("PANIC: %s: unknown TCP option type", __func__); | 
1567  | 0  |       break;  | 
1568  | 0  |     }  | 
1569  | 0  |   }  | 
1570  |  |  | 
1571  |  |   /* Terminate and pad TCP options to a 4 byte boundary. */  | 
1572  | 0  |   if (optlen % 4) { | 
1573  | 0  |     optlen += TCPOLEN_EOL;  | 
1574  | 0  |     *optp++ = TCPOPT_EOL;  | 
1575  | 0  |   }  | 
1576  |  |   /*  | 
1577  |  |    * According to RFC 793 (STD0007):  | 
1578  |  |    *   "The content of the header beyond the End-of-Option option  | 
1579  |  |    *    must be header padding (i.e., zero)."  | 
1580  |  |    *   and later: "The padding is composed of zeros."  | 
1581  |  |    */  | 
1582  | 0  |   while (optlen % 4) { | 
1583  | 0  |     optlen += TCPOLEN_PAD;  | 
1584  | 0  |     *optp++ = TCPOPT_PAD;  | 
1585  | 0  |   }  | 
1586  |  | 
  | 
1587  | 0  |   KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__)); | 
1588  | 0  |   return (optlen);  | 
1589  | 0  | }  |