~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

Linux Cross Reference
Linux/net/ipv4/tcp_output.c

Version: ~ [ 2.4.0 ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /*
  2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
  3  *              operating system.  INET is implemented using the  BSD Socket
  4  *              interface as the means of communication with the user level.
  5  *
  6  *              Implementation of the Transmission Control Protocol(TCP).
  7  *
  8  * Version:     $Id: tcp_output.c,v 1.129 2000/11/28 17:04:10 davem Exp $
  9  *
 10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
 13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
 14  *              Florian La Roche, <flla@stud.uni-sb.de>
 15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
 17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
 19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 20  *              Jorge Cwik, <jorge@laser.satlink.net>
 21  */
 22 
 23 /*
 24  * Changes:     Pedro Roque     :       Retransmit queue handled by TCP.
 25  *                              :       Fragmentation on mtu decrease
 26  *                              :       Segment collapse on retransmit
 27  *                              :       AF independence
 28  *
 29  *              Linus Torvalds  :       send_delayed_ack
 30  *              David S. Miller :       Charge memory using the right skb
 31  *                                      during syn/ack processing.
 32  *              David S. Miller :       Output engine completely rewritten.
 33  *              Andrea Arcangeli:       SYNACK carry ts_recent in tsecr.
 34  *              Cacophonix Gaul :       draft-minshall-nagle-01
 35  *              J Hadi Salim    :       ECN support
 36  *
 37  */
 38 
 39 #include <net/tcp.h>
 40 
 41 #include <linux/smp_lock.h>
 42 
 43 /* People can turn this off for buggy TCP's found in printers etc.
     * Non-zero (the default) lets tcp_retransmit_skb() attempt
     * tcp_retrans_try_collapse() on small retransmitted segments;
     * zero disables that attempt.
     */
 44 int sysctl_tcp_retrans_collapse = 1;
 45 
 46 static __inline__
 47 void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
 48 {
 49         tp->send_head = skb->next;
 50         if (tp->send_head == (struct sk_buff *) &sk->write_queue)
 51                 tp->send_head = NULL;
 52         tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
 53         if (tp->packets_out++ == 0)
 54                 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
 55 }
 56 
 57 /* SND.NXT, if window was not shrunk.
 58  * If window has been shrunk, what should we make? It is not clear at all.
 59  * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
 60  * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
 61  * invalid. OK, let's make this for now:
 62  */
 63 static __inline__ __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_opt *tp)
 64 {
 65         if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
 66                 return tp->snd_nxt;
 67         else
 68                 return tp->snd_una+tp->snd_wnd;
 69 }
 70 
 71 /* Calculate mss to advertise in SYN segment.
 72  * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
 73  *
 74  * 1. It is independent of path mtu.
 75  * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
 76  * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
 77  *    attached devices, because some buggy hosts are confused by
 78  *    large MSS.
 79  * 4. We do not make 3, we advertise MSS, calculated from first
 80  *    hop device mtu, but allow to raise it to ip_rt_min_advmss.
 81  *    This may be overriden via information stored in routing table.
 82  * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
 83  *    probably even Jumbo".
 84  */
 85 static __u16 tcp_advertise_mss(struct sock *sk)
 86 {
 87         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 88         struct dst_entry *dst = __sk_dst_get(sk);
 89         int mss = tp->advmss;
 90 
 91         if (dst && dst->advmss < mss) {
 92                 mss = dst->advmss;
 93                 tp->advmss = mss;
 94         }
 95 
 96         return (__u16)mss;
 97 }
 98 
 99 /* RFC2861. Reset CWND after idle period longer RTO to "restart window".
100  * This is the first part of cwnd validation mechanism. */
101 static void tcp_cwnd_restart(struct tcp_opt *tp)
102 {
103         s32 delta = tcp_time_stamp - tp->lsndtime;
104         u32 restart_cwnd = tcp_init_cwnd(tp);
105         u32 cwnd = tp->snd_cwnd;
106 
107         tp->snd_ssthresh = tcp_current_ssthresh(tp);
108         restart_cwnd = min(restart_cwnd, cwnd);
109 
110         while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
111                 cwnd >>= 1;
112         tp->snd_cwnd = max(cwnd, restart_cwnd);
113         tp->snd_cwnd_stamp = tcp_time_stamp;
114         tp->snd_cwnd_used = 0;
115 }
116 
117 static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb)
118 {
119         u32 now = tcp_time_stamp;
120 
121         if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
122                 tcp_cwnd_restart(tp);
123 
124         tp->lsndtime = now;
125 
126         /* If it is a reply for ato after last received
127          * packet, enter pingpong mode.
128          */
129         if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
130                 tp->ack.pingpong = 1;
131 }
132 
133 static __inline__ void tcp_event_ack_sent(struct sock *sk)
134 {
135         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
136 
137         tcp_dec_quickack_mode(tp);
138         tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
139 }
140 
141 /* Chose a new window to advertise, update state in tcp_opt for the
142  * socket, and return result with RFC1323 scaling applied.  The return
143  * value can be stuffed directly into th->window for an outgoing
144  * frame.
145  */
146 static __inline__ u16 tcp_select_window(struct sock *sk)
147 {
148         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
149         u32 cur_win = tcp_receive_window(tp);
150         u32 new_win = __tcp_select_window(sk);
151 
152         /* Never shrink the offered window */
153         if(new_win < cur_win) {
154                 /* Danger Will Robinson!
155                  * Don't update rcv_wup/rcv_wnd here or else
156                  * we will not be able to advertise a zero
157                  * window in time.  --DaveM
158                  *
159                  * Relax Will Robinson.
160                  */
161                 new_win = cur_win;
162         }
163         tp->rcv_wnd = new_win;
164         tp->rcv_wup = tp->rcv_nxt;
165 
166         /* RFC1323 scaling applied */
167         new_win >>= tp->rcv_wscale;
168 
169 #ifdef TCP_FORMAL_WINDOW
170         if (new_win == 0) {
171                 /* If we advertise zero window, disable fast path. */
172                 tp->pred_flags = 0;
173         } else if (cur_win == 0 && tp->pred_flags == 0 &&
174                    skb_queue_len(&tp->out_of_order_queue) == 0 &&
175                    !tp->urg_data) {
176                 /* If we open zero window, enable fast path.
177                    Without this it will be open by the first data packet,
178                    it is too late to merge checksumming to copy.
179                  */
180                 tcp_fast_path_on(tp);
181         }
182 #endif
183 
184         return new_win;
185 }
186 
187 
188 /* This routine actually transmits TCP packets queued in by
189  * tcp_do_sendmsg().  This is used by both the initial
190  * transmission and possible later retransmissions.
191  * All SKB's seen here are completely headerless.  It is our
192  * job to build the TCP header, and pass the packet down to
193  * IP so it can do the same plus pass the packet off to the
194  * device.
195  *
196  * We are working here with either a clone of the original
197  * SKB, or a fresh unique copy made by the retransmit engine.
198  */
199 int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
200 {
201         if(skb != NULL) {
202                 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
203                 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
204                 int tcp_header_size = tp->tcp_header_len;
205                 struct tcphdr *th;
206                 int sysctl_flags;
207                 int err;
208 
209 #define SYSCTL_FLAG_TSTAMPS     0x1
210 #define SYSCTL_FLAG_WSCALE      0x2
211 #define SYSCTL_FLAG_SACK        0x4
212 
213                 sysctl_flags = 0;
214                 if (tcb->flags & TCPCB_FLAG_SYN) {
215                         tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
216                         if(sysctl_tcp_timestamps) {
217                                 tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
218                                 sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
219                         }
220                         if(sysctl_tcp_window_scaling) {
221                                 tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
222                                 sysctl_flags |= SYSCTL_FLAG_WSCALE;
223                         }
224                         if(sysctl_tcp_sack) {
225                                 sysctl_flags |= SYSCTL_FLAG_SACK;
226                                 if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
227                                         tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
228                         }
229                 } else if (tp->eff_sacks) {
230                         /* A SACK is 2 pad bytes, a 2 byte header, plus
231                          * 2 32-bit sequence numbers for each SACK block.
232                          */
233                         tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
234                                             (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
235                 }
236                 th = (struct tcphdr *) skb_push(skb, tcp_header_size);
237                 skb->h.th = th;
238                 skb_set_owner_w(skb, sk);
239 
240                 /* Build TCP header and checksum it. */
241                 th->source              = sk->sport;
242                 th->dest                = sk->dport;
243                 th->seq                 = htonl(tcb->seq);
244                 th->ack_seq             = htonl(tp->rcv_nxt);
245                 *(((__u16 *)th) + 6)    = htons(((tcp_header_size >> 2) << 12) | tcb->flags);
246                 if (tcb->flags & TCPCB_FLAG_SYN) {
247                         /* RFC1323: The window in SYN & SYN/ACK segments
248                          * is never scaled.
249                          */
250                         th->window      = htons(tp->rcv_wnd);
251                 } else {
252                         th->window      = htons(tcp_select_window(sk));
253                 }
254                 th->check               = 0;
255                 th->urg_ptr             = 0;
256 
257                 if (tp->urg_mode &&
258                     between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
259                         th->urg_ptr             = htons(tp->snd_up-tcb->seq);
260                         th->urg                 = 1;
261                 }
262 
263                 if (tcb->flags & TCPCB_FLAG_SYN) {
264                         tcp_syn_build_options((__u32 *)(th + 1),
265                                               tcp_advertise_mss(sk),
266                                               (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
267                                               (sysctl_flags & SYSCTL_FLAG_SACK),
268                                               (sysctl_flags & SYSCTL_FLAG_WSCALE),
269                                               tp->rcv_wscale,
270                                               tcb->when,
271                                               tp->ts_recent);
272                 } else {
273                         tcp_build_and_update_options((__u32 *)(th + 1),
274                                                      tp, tcb->when);
275 
276                         TCP_ECN_send(sk, tp, skb, tcp_header_size);
277                 }
278                 tp->af_specific->send_check(sk, th, skb->len, skb);
279 
280                 if (tcb->flags & TCPCB_FLAG_ACK)
281                         tcp_event_ack_sent(sk);
282 
283                 if (skb->len != tcp_header_size)
284                         tcp_event_data_sent(tp, skb);
285 
286                 TCP_INC_STATS(TcpOutSegs);
287 
288                 err = tp->af_specific->queue_xmit(skb);
289                 if (err <= 0)
290                         return err;
291 
292                 tcp_enter_cwr(tp);
293 
294                 /* NET_XMIT_CN is special. It does not guarantee,
295                  * that this packet is lost. It tells that device
296                  * is about to start to drop packets or already
297                  * drops some packets of the same priority and
298                  * invokes us to send less aggressively.
299                  */
300                 return err == NET_XMIT_CN ? 0 : err;
301         }
302         return -ENOBUFS;
303 #undef SYSCTL_FLAG_TSTAMPS
304 #undef SYSCTL_FLAG_WSCALE
305 #undef SYSCTL_FLAG_SACK
306 }
307 
308 
309 /* This is the main buffer sending routine. We queue the buffer
310  * and decide whether to queue or transmit now.
311  *
312  * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
313  * otherwise socket can stall.
314  */
315 void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigned cur_mss)
316 {
317         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
318 
319         /* Advance write_seq and place onto the write_queue. */
320         tp->write_seq = TCP_SKB_CB(skb)->end_seq;
321         __skb_queue_tail(&sk->write_queue, skb);
322         tcp_charge_skb(sk, skb);
323 
324         if (!force_queue && tp->send_head == NULL && tcp_snd_test(tp, skb, cur_mss, tp->nonagle)) {
325                 /* Send it out now. */
326                 TCP_SKB_CB(skb)->when = tcp_time_stamp;
327                 if (tcp_transmit_skb(sk, skb_clone(skb, sk->allocation)) == 0) {
328                         tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
329                         tcp_minshall_update(tp, cur_mss, skb);
330                         if (tp->packets_out++ == 0)
331                                 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
332                         return;
333                 }
334         }
335         /* Queue it, remembering where we must start sending. */
336         if (tp->send_head == NULL)
337                 tp->send_head = skb;
338 }
339 
340 /* Function to create two new TCP segments.  Shrinks the given segment
341  * to the specified size and appends a new segment with the rest of the
342  * packet to the list.  This won't be called frequently, I hope. 
343  * Remember, these are still headerless SKBs at this point.
344  */
345 static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
346 {
347         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
348         struct sk_buff *buff;
349         int nsize = skb->len - len;
350         u16 flags;
351 
352         /* Get a new skb... force flag on. */
353         buff = tcp_alloc_skb(sk, nsize + MAX_TCP_HEADER, GFP_ATOMIC);
354         if (buff == NULL)
355                 return -ENOMEM; /* We'll just try again later. */
356         tcp_charge_skb(sk, buff);
357 
358         /* Reserve space for headers. */
359         skb_reserve(buff, MAX_TCP_HEADER);
360                 
361         /* Correct the sequence numbers. */
362         TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
363         TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
364         
365         /* PSH and FIN should only be set in the second packet. */
366         flags = TCP_SKB_CB(skb)->flags;
367         TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
368         TCP_SKB_CB(buff)->flags = flags;
369         TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
370         if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
371                 tp->lost_out++;
372                 tp->left_out++;
373         }
374         TCP_SKB_CB(buff)->sacked &= ~TCPCB_AT_TAIL;
375 
376         /* Copy and checksum data tail into the new buffer. */
377         buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
378                                                nsize, 0);
379 
380         /* This takes care of the FIN sequence number too. */
381         TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
382         skb_trim(skb, len);
383 
384         /* Rechecksum original buffer. */
385         skb->csum = csum_partial(skb->data, skb->len, 0);
386 
387         /* Looks stupid, but our code really uses when of
388          * skbs, which it never sent before. --ANK
389          */
390         TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
391 
392         /* Link BUFF into the send queue. */
393         __skb_append(skb, buff);
394 
395         return 0;
396 }
397 
398 /* This function synchronize snd mss to current pmtu/exthdr set.
399 
400    tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
401    for TCP options, but includes only bare TCP header.
402 
403    tp->mss_clamp is mss negotiated at connection setup.
404    It is minumum of user_mss and mss received with SYN.
405    It also does not include TCP options.
406 
407    tp->pmtu_cookie is last pmtu, seen by this function.
408 
409    tp->mss_cache is current effective sending mss, including
410    all tcp options except for SACKs. It is evaluated,
411    taking into account current pmtu, but never exceeds
412    tp->mss_clamp.
413 
414    NOTE1. rfc1122 clearly states that advertised MSS
415    DOES NOT include either tcp or ip options.
416 
417    NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
418    this function.                       --ANK (980731)
419  */
420 
421 int tcp_sync_mss(struct sock *sk, u32 pmtu)
422 {
423         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
424         int mss_now;
425 
426         /* Calculate base mss without TCP options:
427            It is MMS_S - sizeof(tcphdr) of rfc1122
428          */
429 
430         mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
431 
432         /* Clamp it (mss_clamp does not include tcp options) */
433         if (mss_now > tp->mss_clamp)
434                 mss_now = tp->mss_clamp;
435 
436         /* Now subtract optional transport overhead */
437         mss_now -= tp->ext_header_len;
438 
439         /* Then reserve room for full set of TCP options and 8 bytes of data */
440         if (mss_now < 48)
441                 mss_now = 48;
442 
443         /* Now subtract TCP options size, not including SACKs */
444         mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
445 
446         /* Bound mss with half of window */
447         if (tp->max_window && mss_now > (tp->max_window>>1))
448                 mss_now = max((tp->max_window>>1), 68 - tp->tcp_header_len);
449 
450         /* And store cached results */
451         tp->pmtu_cookie = pmtu;
452         tp->mss_cache = mss_now;
453         return mss_now;
454 }
455 
456 
457 /* This routine writes packets to the network.  It advances the
458  * send_head.  This happens as incoming acks open up the remote
459  * window for us.
460  *
461  * Returns 1, if no segments are in flight and we have queued segments, but
462  * cannot send anything now because of SWS or another problem.
463  */
464 int tcp_write_xmit(struct sock *sk)
465 {
466         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
467         unsigned int mss_now;
468 
469         /* If we are closed, the bytes will have to remain here.
470          * In time closedown will finish, we empty the write queue and all
471          * will be happy.
472          */
473         if(sk->state != TCP_CLOSE) {
474                 struct sk_buff *skb;
475                 int sent_pkts = 0;
476 
477                 /* Account for SACKS, we may need to fragment due to this.
478                  * It is just like the real MSS changing on us midstream.
479                  * We also handle things correctly when the user adds some
480                  * IP options mid-stream.  Silly to do, but cover it.
481                  */
482                 mss_now = tcp_current_mss(sk); 
483 
484                 while((skb = tp->send_head) &&
485                       tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb) ? tp->nonagle : 1)) {
486                         if (skb->len > mss_now) {
487                                 if (tcp_fragment(sk, skb, mss_now))
488                                         break;
489                         }
490 
491                         TCP_SKB_CB(skb)->when = tcp_time_stamp;
492                         if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
493                                 break;
494                         /* Advance the send_head.  This one is sent out. */
495                         update_send_head(sk, tp, skb);
496                         tcp_minshall_update(tp, mss_now, skb);
497                         sent_pkts = 1;
498                 }
499 
500                 if (sent_pkts) {
501                         tcp_cwnd_validate(sk, tp);
502                         return 0;
503                 }
504 
505                 return !tp->packets_out && tp->send_head;
506         }
507         return 0;
508 }
509 
510 /* This function returns the amount that we can raise the
511  * usable window based on the following constraints
512  *  
513  * 1. The window can never be shrunk once it is offered (RFC 793)
514  * 2. We limit memory per socket
515  *
516  * RFC 1122:
517  * "the suggested [SWS] avoidance algorithm for the receiver is to keep
518  *  RECV.NEXT + RCV.WIN fixed until:
519  *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
520  *
521  * i.e. don't raise the right edge of the window until you can raise
522  * it at least MSS bytes.
523  *
524  * Unfortunately, the recommended algorithm breaks header prediction,
525  * since header prediction assumes th->window stays fixed.
526  *
527  * Strictly speaking, keeping th->window fixed violates the receiver
528  * side SWS prevention criteria. The problem is that under this rule
529  * a stream of single byte packets will cause the right side of the
530  * window to always advance by a single byte.
531  * 
532  * Of course, if the sender implements sender side SWS prevention
533  * then this will not be a problem.
534  * 
535  * BSD seems to make the following compromise:
536  * 
537  *      If the free space is less than the 1/4 of the maximum
538  *      space available and the free space is less than 1/2 mss,
539  *      then set the window to 0.
540  *      [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
541  *      Otherwise, just prevent the window from shrinking
542  *      and from being larger than the largest representable value.
543  *
544  * This prevents incremental opening of the window in the regime
545  * where TCP is limited by the speed of the reader side taking
546  * data out of the TCP receive queue. It does nothing about
547  * those cases where the window is constrained on the sender side
548  * because the pipeline is full.
549  *
550  * BSD also seems to "accidentally" limit itself to windows that are a
551  * multiple of MSS, at least until the free space gets quite small.
552  * This would appear to be a side effect of the mbuf implementation.
553  * Combining these two algorithms results in the observed behavior
554  * of having a fixed window size at almost all times.
555  *
556  * Below we obtain similar behavior by forcing the offered window to
557  * a multiple of the mss when it is feasible to do so.
558  *
559  * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
560  * Regular options like TIMESTAMP are taken into account.
561  */
562 u32 __tcp_select_window(struct sock *sk)
563 {
564         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
565         /* MSS for the peer's data.  Previous verions used mss_clamp
566          * here.  I don't know if the value based on our guesses
567          * of peer's MSS is better for the performance.  It's more correct
568          * but may be worse for the performance because of rcv_mss
569          * fluctuations.  --SAW  1998/11/1
570          */
571         unsigned int mss = tp->ack.rcv_mss;
572         int free_space;
573         u32 window;
574 
575         /* Sometimes free_space can be < 0. */
576         free_space = tcp_space(sk); 
577         if (tp->window_clamp < mss)
578                 mss = tp->window_clamp; 
579 
580         if (free_space < (int)min(tp->window_clamp, tcp_full_space(sk)) / 2) {
581                 tp->ack.quick = 0;
582 
583                 if (tcp_memory_pressure)
584                         tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4*tp->advmss);
585 
586                 if (free_space < ((int)mss))
587                         return 0;
588         }
589 
590         if (free_space > tp->rcv_ssthresh)
591                 free_space = tp->rcv_ssthresh;
592 
593         /* Get the largest window that is a nice multiple of mss.
594          * Window clamp already applied above.
595          * If our current window offering is within 1 mss of the
596          * free space we just keep it. This prevents the divide
597          * and multiply from happening most of the time.
598          * We also don't do any window rounding when the free space
599          * is too small.
600          */
601         window = tp->rcv_wnd;
602         if ((((int) window) <= (free_space - ((int) mss))) ||
603             (((int) window) > free_space))
604                 window = (((unsigned int) free_space)/mss)*mss;
605 
606         return window;
607 }
608 
609 /* Attempt to collapse two adjacent SKB's during retransmission. */
610 static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
611 {
612         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
613         struct sk_buff *next_skb = skb->next;
614 
615         /* The first test we must make is that neither of these two
616          * SKB's are still referenced by someone else.
617          */
618         if(!skb_cloned(skb) && !skb_cloned(next_skb)) {
619                 int skb_size = skb->len, next_skb_size = next_skb->len;
620                 u16 flags = TCP_SKB_CB(skb)->flags;
621 
622                 /* Also punt if next skb has been SACK'd. */
623                 if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
624                         return;
625 
626                 /* Next skb is out of window. */
627                 if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
628                         return;
629 
630                 /* Punt if not enough space exists in the first SKB for
631                  * the data in the second, or the total combined payload
632                  * would exceed the MSS.
633                  */
634                 if ((next_skb_size > skb_tailroom(skb)) ||
635                     ((skb_size + next_skb_size) > mss_now))
636                         return;
637 
638                 /* Ok.  We will be able to collapse the packet. */
639                 __skb_unlink(next_skb, next_skb->list);
640 
641                 if(skb->len % 4) {
642                         /* Must copy and rechecksum all data. */
643                         memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
644                         skb->csum = csum_partial(skb->data, skb->len, 0);
645                 } else {
646                         /* Optimize, actually we could also combine next_skb->csum
647                          * to skb->csum using a single add w/carry operation too.
648                          */
649                         skb->csum = csum_partial_copy_nocheck(next_skb->data,
650                                                               skb_put(skb, next_skb_size),
651                                                               next_skb_size, skb->csum);
652                 }
653         
654                 /* Update sequence range on original skb. */
655                 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
656 
657                 /* Merge over control information. */
658                 flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
659                 TCP_SKB_CB(skb)->flags = flags;
660 
661                 /* All done, get rid of second SKB and account for it so
662                  * packet counting does not break.
663                  */
664                 TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
665                 if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
666                         tp->retrans_out--;
667                 if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
668                         tp->lost_out--;
669                         tp->left_out--;
670                 }
671                 if (!tp->sack_ok && tp->sacked_out) {
672                         /* Reno case is special. Sigh... */
673                         tp->sacked_out--;
674                         tp->left_out--;
675                 }
676                 /* Not quite right: it can be > snd.fack, but
677                  * it is better to underestimate fackets.
678                  */
679                 if (tp->fackets_out)
680                         tp->fackets_out--;
681                 tcp_free_skb(sk, next_skb);
682                 tp->packets_out--;
683         }
684 }
685 
686 /* Do a simple retransmit without using the backoff mechanisms in
687  * tcp_timer. This is used for path mtu discovery. 
688  * The socket is already locked here.
689  */ 
690 void tcp_simple_retransmit(struct sock *sk)
691 {
692         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
693         struct sk_buff *skb;
694         unsigned int mss = tcp_current_mss(sk);
695         int lost = 0;
696 
697         for_retrans_queue(skb, sk, tp) {
698                 if (skb->len > mss && 
699                     !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
700                         if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
701                                 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
702                                 tp->retrans_out--;
703                         }
704                         if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
705                                 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
706                                 tp->lost_out++;
707                                 lost = 1;
708                         }
709                 }
710         }
711 
712         if (!lost)
713                 return;
714 
715         tp->left_out = tp->sacked_out + tp->lost_out;
716 
717         /* Don't muck with the congestion window here.
718          * Reason is that we do not increase amount of _data_
719          * in network, but units changed and effective
720          * cwnd/ssthresh really reduced now.
721          */
722         if (tp->ca_state != TCP_CA_Loss) {
723                 tp->high_seq = tp->snd_nxt;
724                 tp->snd_ssthresh = tcp_current_ssthresh(tp);
725                 tp->prior_ssthresh = 0;
726                 tp->undo_marker = 0;
727                 tp->ca_state = TCP_CA_Loss;
728         }
729         tcp_xmit_retransmit_queue(sk);
730 }
731 
732 /* This retransmits one SKB.  Policy decisions and retransmit queue
733  * state updates are done by the caller.  Returns non-zero if an
734  * error occurred which prevented the send.
735  */
736 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
737 {
738         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
739         unsigned int cur_mss = tcp_current_mss(sk);
740         int err;
741 
742         /* Do not sent more than we queued. 1/4 is reserved for possible
743          * copying overhead: frgagmentation, tunneling, mangling etc.
744          */
745         if (atomic_read(&sk->wmem_alloc) > min(sk->wmem_queued+(sk->wmem_queued>>2),sk->sndbuf))
746                 return -EAGAIN;
747 
748         if(skb->len > cur_mss) {
749                 if(tcp_fragment(sk, skb, cur_mss))
750                         return -ENOMEM; /* We'll try again later. */
751 
752                 /* New SKB created, account for it. */
753                 tp->packets_out++;
754         }
755 
756         /* Collapse two adjacent packets if worthwhile and we can. */
757         if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
758            (skb->len < (cur_mss >> 1)) &&
759            (skb->next != tp->send_head) &&
760            (skb->next != (struct sk_buff *)&sk->write_queue) &&
761            (sysctl_tcp_retrans_collapse != 0))
762                 tcp_retrans_try_collapse(sk, skb, cur_mss);
763 
764         if(tp->af_specific->rebuild_header(sk))
765                 return -EHOSTUNREACH; /* Routing failure or similar. */
766 
767         /* Some Solaris stacks overoptimize and ignore the FIN on a
768          * retransmit when old data is attached.  So strip it off
769          * since it is cheap to do so and saves bytes on the network.
770          */
771         if(skb->len > 0 &&
772            (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
773            tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
774                 TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
775                 skb_trim(skb, 0);
776                 skb->csum = 0;
777         }
778 
779         /* Make a copy, if the first transmission SKB clone we made
780          * is still in somebody's hands, else make a clone.
781          */
782         TCP_SKB_CB(skb)->when = tcp_time_stamp;
783 
784         err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
785                                     skb_copy(skb, GFP_ATOMIC):
786                                     skb_clone(skb, GFP_ATOMIC)));
787 
788         if (err == 0) {
789                 /* Update global TCP statistics. */
790                 TCP_INC_STATS(TcpRetransSegs);
791 
792 #if FASTRETRANS_DEBUG > 0
793                 if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
794                         if (net_ratelimit())
795                                 printk(KERN_DEBUG "retrans_out leaked.\n");
796                 }
797 #endif
798                 TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
799                 tp->retrans_out++;
800 
801                 /* Save stamp of the first retransmit. */
802                 if (!tp->retrans_stamp)
803                         tp->retrans_stamp = TCP_SKB_CB(skb)->when;
804 
805                 tp->undo_retrans++;
806 
807                 /* snd_nxt is stored to detect loss of retransmitted segment,
808                  * see tcp_input.c tcp_sacktag_write_queue().
809                  */
810                 TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
811         }
812         return err;
813 }
814 
815 /* This gets called after a retransmit timeout, and the initially
816  * retransmitted data is acknowledged.  It tries to continue
817  * resending the rest of the retransmit queue, until either
818  * we've sent it all or the congestion window limit is reached.
819  * If doing SACK, the first ACK which comes back for a timeout
820  * based retransmit packet might feed us FACK information again.
821  * If so, we use it to avoid unnecessarily retransmissions.
822  */
823 void tcp_xmit_retransmit_queue(struct sock *sk)
824 {
825         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
826         struct sk_buff *skb;
827         int packet_cnt = tp->lost_out;
828 
829         /* First pass: retransmit lost packets. */
830         if (packet_cnt) {
831                 for_retrans_queue(skb, sk, tp) {
832                         __u8 sacked = TCP_SKB_CB(skb)->sacked;
833 
834                         if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
835                                 return;
836 
837                         if (sacked&TCPCB_LOST) {
838                                 if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
839                                         if (tcp_retransmit_skb(sk, skb))
840                                                 return;
841                                         if (tp->ca_state != TCP_CA_Loss)
842                                                 NET_INC_STATS_BH(TCPFastRetrans);
843                                         else
844                                                 NET_INC_STATS_BH(TCPSlowStartRetrans);
845 
846                                         if (skb == skb_peek(&sk->write_queue))
847                                                 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
848                                 }
849 
850                                 if (--packet_cnt <= 0)
851                                         break;
852                         }
853                 }
854         }
855 
856         /* OK, demanded retransmission is finished. */
857 
858         /* Forward retransmissions are possible only during Recovery. */
859         if (tp->ca_state != TCP_CA_Recovery)
860                 return;
861 
862         /* No forward retransmissions in Reno are possible. */
863         if (!tp->sack_ok)
864                 return;
865 
866         /* Yeah, we have to make difficult choice between forward transmission
867          * and retransmission... Both ways have their merits...
868          *
869          * For now we do not retrnamsit anything, while we have some new
870          * segments to send.
871          */
872 
873         if (tcp_may_send_now(sk, tp))
874                 return;
875 
876         packet_cnt = 0;
877 
878         for_retrans_queue(skb, sk, tp) {
879                 if(++packet_cnt > tp->fackets_out)
880                         break;
881 
882                 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
883                         break;
884 
885                 if(TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
886                         continue;
887 
888                 /* Ok, retransmit it. */
889                 if(tcp_retransmit_skb(sk, skb))
890                         break;
891 
892                 if (skb == skb_peek(&sk->write_queue))
893                         tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
894 
895                 NET_INC_STATS_BH(TCPForwardRetrans);
896         }
897 }
898 
899 
900 /* Send a fin.  The caller locks the socket for us.  This cannot be
901  * allowed to fail queueing a FIN frame under any circumstances.
902  */
903 void tcp_send_fin(struct sock *sk)
904 {
905         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);    
906         struct sk_buff *skb = skb_peek_tail(&sk->write_queue);
907         unsigned int mss_now;
908         
909         /* Optimization, tack on the FIN if we have a queue of
910          * unsent frames.  But be careful about outgoing SACKS
911          * and IP options.
912          */
913         mss_now = tcp_current_mss(sk); 
914 
915         /* Please, find seven differences of 2.3.33 and loook
916          * what I broke here. 8) --ANK
917          */
918 
919         if(tp->send_head != NULL) {
920                 /* tcp_write_xmit() takes care of the rest. */
921                 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
922                 TCP_SKB_CB(skb)->end_seq++;
923                 tp->write_seq++;
924 
925                 /* Special case to avoid Nagle bogosity.  If this
926                  * segment is the last segment, and it was queued
927                  * due to Nagle/SWS-avoidance, send it out now.
928                  */
929                 if(tp->send_head == skb &&
930                    !after(tp->write_seq, tp->snd_una + tp->snd_wnd)) {
931                         TCP_SKB_CB(skb)->when = tcp_time_stamp;
932                         if (!tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)))
933                                 update_send_head(sk, tp, skb);
934                         else
935                                 tcp_check_probe_timer(sk, tp);
936                 }
937         } else {
938                 /* Socket is locked, keep trying until memory is available. */
939                 for (;;) {
940                         skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
941                         if (skb)
942                                 break;
943                         current->policy |= SCHED_YIELD;
944                         schedule();
945                 }
946 
947                 /* Reserve space for headers and prepare control bits. */
948                 skb_reserve(skb, MAX_TCP_HEADER);
949                 skb->csum = 0;
950                 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
951                 TCP_SKB_CB(skb)->sacked = 0;
952 
953                 /* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */
954                 TCP_SKB_CB(skb)->seq = tp->write_seq;
955                 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
956                 tcp_send_skb(sk, skb, 0, mss_now);
957                 __tcp_push_pending_frames(sk, tp, mss_now, 1);
958         }
959 }
960 
961 /* We get here when a process closes a file descriptor (either due to
962  * an explicit close() or as a byproduct of exit()'ing) and there
963  * was unread data in the receive queue.  This behavior is recommended
964  * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
965  */
966 void tcp_send_active_reset(struct sock *sk, int priority)
967 {
968         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
969         struct sk_buff *skb;
970 
971         /* NOTE: No TCP options attached and we never retransmit this. */
972         skb = alloc_skb(MAX_TCP_HEADER, priority);
973         if (!skb) {
974                 NET_INC_STATS(TCPAbortFailed);
975                 return;
976         }
977 
978         /* Reserve space for headers and prepare control bits. */
979         skb_reserve(skb, MAX_TCP_HEADER);
980         skb->csum = 0;
981         TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
982         TCP_SKB_CB(skb)->sacked = 0;
983 
984         /* Send it off. */
985         TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
986         TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
987         TCP_SKB_CB(skb)->when = tcp_time_stamp;
988         if (tcp_transmit_skb(sk, skb))
989                 NET_INC_STATS(TCPAbortFailed);
990 }
991 
992 /* WARNING: This routine must only be called when we have already sent
993  * a SYN packet that crossed the incoming SYN that caused this routine
994  * to get called. If this assumption fails then the initial rcv_wnd
995  * and rcv_wscale values will not be correct.
996  */
997 int tcp_send_synack(struct sock *sk)
998 {
999         struct sk_buff* skb;
1000 
1001         skb = skb_peek(&sk->write_queue);
1002         if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
1003                 printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
1004                 return -EFAULT;
1005         }
1006         if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
1007                 if (skb_cloned(skb)) {
1008                         struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
1009                         if (nskb == NULL)
1010                                 return -ENOMEM;
1011                         __skb_unlink(skb, &sk->write_queue);
1012                         __skb_queue_head(&sk->write_queue, nskb);
1013                         tcp_free_skb(sk, skb);
1014                         tcp_charge_skb(sk, nskb);
1015                         skb = nskb;
1016                 }
1017 
1018                 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
1019                 TCP_ECN_send_synack(&sk->tp_pinfo.af_tcp, skb);
1020         }
1021         TCP_SKB_CB(skb)->when = tcp_time_stamp;
1022         return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1023 }
1024 
1025 /*
1026  * Prepare a SYN-ACK.
1027  */
1028 struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
1029                                  struct open_request *req)
1030 {
1031         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1032         struct tcphdr *th;
1033         int tcp_header_size;
1034         struct sk_buff *skb;
1035 
1036         skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
1037         if (skb == NULL)
1038                 return NULL;
1039 
1040         /* Reserve space for headers. */
1041         skb_reserve(skb, MAX_TCP_HEADER);
1042 
1043         skb->dst = dst_clone(dst);
1044 
1045         tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
1046                            (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
1047                            (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
1048                            /* SACK_PERM is in the place of NOP NOP of TS */
1049                            ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
1050         skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);
1051 
1052         memset(th, 0, sizeof(struct tcphdr));
1053         th->syn = 1;
1054         th->ack = 1;
1055         TCP_ECN_make_synack(req, th);
1056         th->source = sk->sport;
1057         th->dest = req->rmt_port;
1058         TCP_SKB_CB(skb)->seq = req->snt_isn;
1059         TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
1060         th->seq = htonl(TCP_SKB_CB(skb)->seq);
1061         th->ack_seq = htonl(req->rcv_isn + 1);
1062         if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
1063                 __u8 rcv_wscale; 
1064                 /* Set this up on the first call only */
1065                 req->window_clamp = tp->window_clamp ? : dst->window;
1066                 /* tcp_full_space because it is guaranteed to be the first packet */
1067                 tcp_select_initial_window(tcp_full_space(sk), 
1068                         dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
1069                         &req->rcv_wnd,
1070                         &req->window_clamp,
1071                         req->wscale_ok,
1072                         &rcv_wscale);
1073                 req->rcv_wscale = rcv_wscale; 
1074         }
1075 
1076         /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
1077         th->window = htons(req->rcv_wnd);
1078 
1079         TCP_SKB_CB(skb)->when = tcp_time_stamp;
1080         tcp_syn_build_options((__u32 *)(th + 1), dst->advmss, req->tstamp_ok,
1081                               req->sack_ok, req->wscale_ok, req->rcv_wscale,
1082                               TCP_SKB_CB(skb)->when,
1083                               req->ts_recent);
1084 
1085         skb->csum = 0;
1086         th->doff = (tcp_header_size >> 2);
1087         TCP_INC_STATS(TcpOutSegs);
1088         return skb;
1089 }
1090 
1091 int tcp_connect(struct sock *sk, struct sk_buff *buff)
1092 {
1093         struct dst_entry *dst = __sk_dst_get(sk);
1094         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1095 
1096         /* Reserve space for headers. */
1097         skb_reserve(buff, MAX_TCP_HEADER);
1098 
1099         /* We'll fix this up when we get a response from the other end.
1100          * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
1101          */
1102         tp->tcp_header_len = sizeof(struct tcphdr) +
1103                 (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
1104 
1105         /* If user gave his TCP_MAXSEG, record it to clamp */
1106         if (tp->user_mss)
1107                 tp->mss_clamp = tp->user_mss;
1108         tp->max_window = 0;
1109         tcp_sync_mss(sk, dst->pmtu);
1110 
1111         if (!tp->window_clamp)
1112                 tp->window_clamp = dst->window;
1113         tp->advmss = dst->advmss;
1114         tcp_initialize_rcv_mss(sk);
1115 
1116         tcp_select_initial_window(tcp_full_space(sk),
1117                                   tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
1118                                   &tp->rcv_wnd,
1119                                   &tp->window_clamp,
1120                                   sysctl_tcp_window_scaling,
1121                                   &tp->rcv_wscale);
1122 
1123         tp->rcv_ssthresh = tp->rcv_wnd;
1124 
1125         /* Socket identity change complete, no longer
1126          * in TCP_CLOSE, so enter ourselves into the
1127          * hash tables.
1128          */
1129         tcp_set_state(sk,TCP_SYN_SENT);
1130         if (tp->af_specific->hash_connecting(sk))
1131                 goto err_out;
1132 
1133         sk->err = 0;
1134         sk->done = 0;
1135         tp->snd_wnd = 0;
1136         tcp_init_wl(tp, tp->write_seq, 0);
1137         tp->snd_una = tp->write_seq;
1138         tp->snd_sml = tp->write_seq;
1139         tp->rcv_nxt = 0;
1140         tp->rcv_wup = 0;
1141         tp->copied_seq = 0;
1142 
1143         tp->rto = TCP_TIMEOUT_INIT;
1144         tp->retransmits = 0;
1145         tcp_clear_retrans(tp);
1146 
1147         TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
1148         TCP_ECN_send_syn(tp, buff);
1149         TCP_SKB_CB(buff)->sacked = 0;
1150         buff->csum = 0;
1151         TCP_SKB_CB(buff)->seq = tp->write_seq++;
1152         TCP_SKB_CB(buff)->end_seq = tp->write_seq;
1153         tp->snd_nxt = tp->write_seq;
1154         tp->pushed_seq = tp->write_seq;
1155 
1156         /* Send it off. */
1157         TCP_SKB_CB(buff)->when = tcp_time_stamp;
1158         tp->retrans_stamp = TCP_SKB_CB(buff)->when;
1159         __skb_queue_tail(&sk->write_queue, buff);
1160         tcp_charge_skb(sk, buff);
1161         tp->packets_out++;
1162         tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
1163         TCP_INC_STATS(TcpActiveOpens);
1164 
1165         /* Timer for repeating the SYN until an answer. */
1166         tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
1167         return 0;
1168 
1169 err_out:
1170         tcp_set_state(sk,TCP_CLOSE);
1171         kfree_skb(buff);
1172         return -EADDRNOTAVAIL;
1173 }
1174 
1175 /* Send out a delayed ack, the caller does the policy checking
1176  * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
1177  * for details.
1178  */
1179 void tcp_send_delayed_ack(struct sock *sk)
1180 {
1181         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1182         int ato = tp->ack.ato;
1183         unsigned long timeout;
1184 
1185         if (ato > TCP_DELACK_MIN) {
1186                 int max_ato = HZ/2;
1187 
1188                 if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
1189                         max_ato = TCP_DELACK_MAX;
1190 
1191                 /* Slow path, intersegment interval is "high". */
1192 
1193                 /* If some rtt estimate is known, use it to bound delayed ack.
1194                  * Do not use tp->rto here, use results of rtt measurements
1195                  * directly.
1196                  */
1197                 if (tp->srtt) {
1198                         int rtt = max(tp->srtt>>3, TCP_DELACK_MIN);
1199 
1200                         if (rtt < max_ato)
1201                                 max_ato = rtt;
1202                 }
1203 
1204                 ato = min(ato, max_ato);
1205         }
1206 
1207         /* Stay within the limit we were given */
1208         timeout = jiffies + ato;
1209 
1210         /* Use new timeout only if there wasn't a older one earlier. */
1211         if (tp->ack.pending&TCP_ACK_TIMER) {
1212                 /* If delack timer was blocked or is about to expire,
1213                  * send ACK now.
1214                  */
1215                 if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
1216                         tcp_send_ack(sk);
1217                         return;
1218                 }
1219 
1220                 if (!time_before(timeout, tp->ack.timeout))
1221                         timeout = tp->ack.timeout;
1222         }
1223         tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
1224         tp->ack.timeout = timeout;
1225         if (!mod_timer(&tp->delack_timer, timeout))
1226                 sock_hold(sk);
1227 
1228 #ifdef TCP_FORMAL_WINDOW
1229         /* Explanation. Header prediction path does not handle
1230          * case of zero window. If we send ACK immediately, pred_flags
1231          * are reset when sending ACK. If rcv_nxt is advanced and
1232          * ack is not sent, than delayed ack is scheduled.
1233          * Hence, it is the best place to check for zero window.
1234          */
1235         if (tp->pred_flags) {
1236                 if (tcp_receive_window(tp) == 0)
1237                         tp->pred_flags = 0;
1238         } else {
1239                 if (skb_queue_len(&tp->out_of_order_queue) == 0 &&
1240                     !tp->urg_data)
1241                         tcp_fast_path_on(tp);
1242         }
1243 #endif
1244 }
1245 
1246 /* This routine sends an ack and also updates the window. */
1247 void tcp_send_ack(struct sock *sk)
1248 {
1249         /* If we have been reset, we may not send again. */
1250         if(sk->state != TCP_CLOSE) {
1251                 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1252                 struct sk_buff *buff;
1253 
1254                 /* We are not putting this on the write queue, so
1255                  * tcp_transmit_skb() will set the ownership to this
1256                  * sock.
1257                  */
1258                 buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
1259                 if (buff == NULL) {
1260                         tcp_schedule_ack(tp);
1261                         tp->ack.ato = TCP_ATO_MIN;
1262                         tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
1263                         return;
1264                 }
1265 
1266                 /* Reserve space for headers and prepare control bits. */
1267                 skb_reserve(buff, MAX_TCP_HEADER);
1268                 buff->csum = 0;
1269                 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
1270                 TCP_SKB_CB(buff)->sacked = 0;
1271 
1272                 /* Send it off, this clears delayed acks for us. */
1273                 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
1274                 TCP_SKB_CB(buff)->when = tcp_time_stamp;
1275                 tcp_transmit_skb(sk, buff);
1276         }
1277 }
1278 
1279 /* This routine sends a packet with an out of date sequence
1280  * number. It assumes the other end will try to ack it.
1281  *
1282  * Question: what should we make while urgent mode?
1283  * 4.4BSD forces sending single byte of data. We cannot send
1284  * out of window data, because we have SND.NXT==SND.MAX...
1285  *
1286  * Current solution: to send TWO zero-length segments in urgent mode:
1287  * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
1288  * out-of-date with SND.UNA-1 to probe window.
1289  */
1290 static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
1291 {
1292         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1293         struct sk_buff *skb;
1294 
1295         /* We don't queue it, tcp_transmit_skb() sets ownership. */
1296         skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
1297         if (skb == NULL) 
1298                 return -1;
1299 
1300         /* Reserve space for headers and set control bits. */
1301         skb_reserve(skb, MAX_TCP_HEADER);
1302         skb->csum = 0;
1303         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
1304         TCP_SKB_CB(skb)->sacked = urgent;
1305 
1306         /* Use a previous sequence.  This should cause the other
1307          * end to send an ack.  Don't queue or clone SKB, just
1308          * send it.
1309          */
1310         TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
1311         TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
1312         TCP_SKB_CB(skb)->when = tcp_time_stamp;
1313         return tcp_transmit_skb(sk, skb);
1314 }
1315 
1316 int tcp_write_wakeup(struct sock *sk)
1317 {
1318         if (sk->state != TCP_CLOSE) {
1319                 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1320                 struct sk_buff *skb;
1321 
1322                 if ((skb = tp->send_head) != NULL &&
1323                     before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
1324                         int err;
1325                         int mss = tcp_current_mss(sk);
1326                         int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;
1327 
1328                         if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
1329                                 tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
1330 
1331                         /* We are probing the opening of a window
1332                          * but the window size is != 0
1333                          * must have been a result SWS avoidance ( sender )
1334                          */
1335                         if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
1336                             skb->len > mss) {
1337                                 seg_size = min(seg_size, mss);
1338                                 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1339                                 if (tcp_fragment(sk, skb, seg_size))
1340                                         return -1;
1341                         }
1342                         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1343                         TCP_SKB_CB(skb)->when = tcp_time_stamp;
1344                         err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1345                         if (!err) {
1346                                 update_send_head(sk, tp, skb);
1347                         }
1348                         return err;
1349                 } else {
1350                         if (tp->urg_mode &&
1351                             between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF))
1352                                 tcp_xmit_probe_skb(sk, TCPCB_URG);
1353                         return tcp_xmit_probe_skb(sk, 0);
1354                 }
1355         }
1356         return -1;
1357 }
1358 
1359 /* A window probe timeout has occurred.  If window is not closed send
1360  * a partial packet else a zero probe.
1361  */
1362 void tcp_send_probe0(struct sock *sk)
1363 {
1364         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1365         int err;
1366 
1367         err = tcp_write_wakeup(sk);
1368 
1369         if (tp->packets_out || !tp->send_head) {
1370                 /* Cancel probe timer, if it is not required. */
1371                 tp->probes_out = 0;
1372                 tp->backoff = 0;
1373                 return;
1374         }
1375 
1376         if (err <= 0) {
1377                 tp->backoff++;
1378                 tp->probes_out++;
1379                 tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, 
1380                                       min(tp->rto << tp->backoff, TCP_RTO_MAX));
1381         } else {
1382                 /* If packet was not sent due to local congestion,
1383                  * do not backoff and do not remember probes_out.
1384                  * Let local senders to fight for local resources.
1385                  *
1386                  * Use accumulated backoff yet.
1387                  */
1388                 if (!tp->probes_out)
1389                         tp->probes_out=1;
1390                 tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, 
1391                                       min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
1392         }
1393 }
1394 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.