1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp_timer.c,v 1.80 2000/10/03 07:29:01 anton Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
21 */
22
23 #include <net/tcp.h>
24
/*
 * Tunables read by the timer handlers.  All but the last start from
 * compile-time defaults; the sysctl_ prefix suggests they are exported
 * for run-time tuning, but the registration is not visible in this file.
 */

/* Retry limit used by tcp_write_timeout() in SYN_SENT/SYN_RECV when the
 * socket has no syn_retries value of its own. */
int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
/* SYN-ACK retransmission limit used by tcp_synack_timer() when the
 * listener has no syn_retries value of its own. */
int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
/* Not referenced directly in this file; presumably consumed by
 * keepalive_time_when() -- confirm against its definition. */
int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
/* Probe limit used by tcp_keepalive_timer() when tp->keepalive_probes
 * is zero. */
int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
/* Not referenced directly in this file; presumably consumed by
 * keepalive_intvl_when() -- confirm against its definition. */
int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
/* Retransmit count from which tcp_write_timeout() issues negative
 * routing advice and tcp_retransmit_timer() resets the cached route. */
int sysctl_tcp_retries1 = TCP_RETR1;
/* Retry/probe limit for non-orphaned sockets (tcp_write_timeout(),
 * tcp_probe_timer()). */
int sysctl_tcp_retries2 = TCP_RETR2;
/* Base retry limit for orphaned sockets, see tcp_orphan_retries().
 * No initializer: starts at zero. */
int sysctl_tcp_orphan_retries;
33
/* Timer callbacks defined later in this file.  Each is installed by
 * tcp_init_xmit_timers() and receives the owning socket pointer cast to
 * unsigned long. */
static void tcp_write_timer(unsigned long);
static void tcp_delack_timer(unsigned long);
static void tcp_keepalive_timer (unsigned long data);

/* Debug message text; not used inside this file -- presumably printed by
 * timer helpers defined elsewhere. */
const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
39
/*
 * Using different timers for retransmit, delayed acks and probes.
 * We may wish to use just one timer maintaining a list of expiry jiffies
 * to optimize.
 */
45
46 void tcp_init_xmit_timers(struct sock *sk)
47 {
48 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
49
50 init_timer(&tp->retransmit_timer);
51 tp->retransmit_timer.function=&tcp_write_timer;
52 tp->retransmit_timer.data = (unsigned long) sk;
53 tp->pending = 0;
54
55 init_timer(&tp->delack_timer);
56 tp->delack_timer.function=&tcp_delack_timer;
57 tp->delack_timer.data = (unsigned long) sk;
58 tp->ack.pending = 0;
59
60 init_timer(&sk->timer);
61 sk->timer.function=&tcp_keepalive_timer;
62 sk->timer.data = (unsigned long) sk;
63 }
64
65 void tcp_clear_xmit_timers(struct sock *sk)
66 {
67 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
68
69 tp->pending = 0;
70 if (timer_pending(&tp->retransmit_timer) &&
71 del_timer(&tp->retransmit_timer))
72 __sock_put(sk);
73
74 tp->ack.pending = 0;
75 tp->ack.blocked = 0;
76 if (timer_pending(&tp->delack_timer) &&
77 del_timer(&tp->delack_timer))
78 __sock_put(sk);
79
80 if(timer_pending(&sk->timer) && del_timer(&sk->timer))
81 __sock_put(sk);
82 }
83
84 static void tcp_write_err(struct sock *sk)
85 {
86 sk->err = sk->err_soft ? : ETIMEDOUT;
87 sk->error_report(sk);
88
89 tcp_done(sk);
90 NET_INC_STATS_BH(TCPAbortOnTimeout);
91 }
92
93 /* Do not allow orphaned sockets to eat all our resources.
94 * This is direct violation of TCP specs, but it is required
95 * to prevent DoS attacks. It is called when a retransmission timeout
96 * or zero probe timeout occurs on orphaned socket.
97 *
98 * Criterium is still not confirmed experimentally and may change.
99 * We kill the socket, if:
100 * 1. If number of orphaned sockets exceeds an administratively configured
101 * limit.
102 * 2. If we have strong memory pressure.
103 */
104 static int tcp_out_of_resources(struct sock *sk, int do_reset)
105 {
106 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
107 int orphans = atomic_read(&tcp_orphan_count);
108
109 /* If peer does not open window for long time, or did not transmit
110 * anything for long time, penalize it. */
111 if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
112 orphans <<= 1;
113
114 /* If some dubious ICMP arrived, penalize even more. */
115 if (sk->err_soft)
116 orphans <<= 1;
117
118 if (orphans >= sysctl_tcp_max_orphans ||
119 (sk->wmem_queued > SOCK_MIN_SNDBUF &&
120 atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
121 if (net_ratelimit())
122 printk(KERN_INFO "Out of socket memory\n");
123
124 /* Catch exceptional cases, when connection requires reset.
125 * 1. Last segment was sent recently. */
126 if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
127 /* 2. Window is closed. */
128 (!tp->snd_wnd && !tp->packets_out))
129 do_reset = 1;
130 if (do_reset)
131 tcp_send_active_reset(sk, GFP_ATOMIC);
132 tcp_done(sk);
133 NET_INC_STATS_BH(TCPAbortOnMemory);
134 return 1;
135 }
136 return 0;
137 }
138
139 /* Calculate maximal number or retries on an orphaned socket. */
140 static int tcp_orphan_retries(struct sock *sk, int alive)
141 {
142 int retries = sysctl_tcp_orphan_retries; /* May be zero. */
143
144 /* We know from an ICMP that something is wrong. */
145 if (sk->err_soft && !alive)
146 retries = 0;
147
148 /* However, if socket sent something recently, select some safe
149 * number of retries. 8 corresponds to >100 seconds with minimal
150 * RTO of 200msec. */
151 if (retries == 0 && alive)
152 retries = 8;
153 return retries;
154 }
155
156 /* A write timeout has occurred. Process the after effects. */
157 static int tcp_write_timeout(struct sock *sk)
158 {
159 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
160 int retry_until;
161
162 if ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
163 if (tp->retransmits)
164 dst_negative_advice(&sk->dst_cache);
165 retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
166 } else {
167 if (tp->retransmits >= sysctl_tcp_retries1) {
168 /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
169 hole detection. :-(
170
171 It is place to make it. It is not made. I do not want
172 to make it. It is disguisting. It does not work in any
173 case. Let me to cite the same draft, which requires for
174 us to implement this:
175
176 "The one security concern raised by this memo is that ICMP black holes
177 are often caused by over-zealous security administrators who block
178 all ICMP messages. It is vitally important that those who design and
179 deploy security systems understand the impact of strict filtering on
180 upper-layer protocols. The safest web site in the world is worthless
181 if most TCP implementations cannot transfer data from it. It would
182 be far nicer to have all of the black holes fixed rather than fixing
183 all of the TCP implementations."
184
185 Golden words :-).
186 */
187
188 dst_negative_advice(&sk->dst_cache);
189 }
190
191 retry_until = sysctl_tcp_retries2;
192 if (sk->dead) {
193 int alive = (tp->rto < TCP_RTO_MAX);
194
195 retry_until = tcp_orphan_retries(sk, alive);
196
197 if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until))
198 return 1;
199 }
200 }
201
202 if (tp->retransmits >= retry_until) {
203 /* Has it gone just too far? */
204 tcp_write_err(sk);
205 return 1;
206 }
207 return 0;
208 }
209
210 static void tcp_delack_timer(unsigned long data)
211 {
212 struct sock *sk = (struct sock*)data;
213 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
214
215 bh_lock_sock(sk);
216 if (sk->lock.users) {
217 /* Try again later. */
218 tp->ack.blocked = 1;
219 NET_INC_STATS_BH(DelayedACKLocked);
220 if (!mod_timer(&tp->delack_timer, jiffies + TCP_DELACK_MIN))
221 sock_hold(sk);
222 goto out_unlock;
223 }
224
225 tcp_mem_reclaim(sk);
226
227 if (sk->state == TCP_CLOSE || !(tp->ack.pending&TCP_ACK_TIMER))
228 goto out;
229
230 if ((long)(tp->ack.timeout - jiffies) > 0) {
231 if (!mod_timer(&tp->delack_timer, tp->ack.timeout))
232 sock_hold(sk);
233 goto out;
234 }
235 tp->ack.pending &= ~TCP_ACK_TIMER;
236
237 if (skb_queue_len(&tp->ucopy.prequeue)) {
238 struct sk_buff *skb;
239
240 net_statistics[smp_processor_id()*2].TCPSchedulerFailed += skb_queue_len(&tp->ucopy.prequeue);
241
242 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
243 sk->backlog_rcv(sk, skb);
244
245 tp->ucopy.memory = 0;
246 }
247
248 if (tcp_ack_scheduled(tp)) {
249 if (!tp->ack.pingpong) {
250 /* Delayed ACK missed: inflate ATO. */
251 tp->ack.ato = min(tp->ack.ato<<1, tp->rto);
252 } else {
253 /* Delayed ACK missed: leave pingpong mode and
254 * deflate ATO.
255 */
256 tp->ack.pingpong = 0;
257 tp->ack.ato = TCP_ATO_MIN;
258 }
259 tcp_send_ack(sk);
260 NET_INC_STATS_BH(DelayedACKs);
261 }
262 TCP_CHECK_TIMER(sk);
263
264 out:
265 if (tcp_memory_pressure)
266 tcp_mem_reclaim(sk);
267 out_unlock:
268 bh_unlock_sock(sk);
269 sock_put(sk);
270 }
271
272 static void tcp_probe_timer(struct sock *sk)
273 {
274 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
275 int max_probes;
276
277 if (tp->packets_out || !tp->send_head) {
278 tp->probes_out = 0;
279 return;
280 }
281
282 /* *WARNING* RFC 1122 forbids this
283 *
284 * It doesn't AFAIK, because we kill the retransmit timer -AK
285 *
286 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
287 * this behaviour in Solaris down as a bug fix. [AC]
288 *
289 * Let me to explain. probes_out is zeroed by incoming ACKs
290 * even if they advertise zero window. Hence, connection is killed only
291 * if we received no ACKs for normal connection timeout. It is not killed
292 * only because window stays zero for some time, window may be zero
293 * until armageddon and even later. We are in full accordance
294 * with RFCs, only probe timer combines both retransmission timeout
295 * and probe timeout in one bottle. --ANK
296 */
297 max_probes = sysctl_tcp_retries2;
298
299 if (sk->dead) {
300 int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX);
301
302 max_probes = tcp_orphan_retries(sk, alive);
303
304 if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
305 return;
306 }
307
308 if (tp->probes_out > max_probes) {
309 tcp_write_err(sk);
310 } else {
311 /* Only send another probe if we didn't close things up. */
312 tcp_send_probe0(sk);
313 }
314 }
315
316 /*
317 * The TCP retransmit timer.
318 */
319
320 static void tcp_retransmit_timer(struct sock *sk)
321 {
322 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
323
324 if (tp->packets_out == 0)
325 goto out;
326
327 BUG_TRAP(!skb_queue_empty(&sk->write_queue));
328
329 if (tcp_write_timeout(sk))
330 goto out;
331
332 if (tp->retransmits == 0) {
333 if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
334 if (tp->sack_ok) {
335 if (tp->ca_state == TCP_CA_Recovery)
336 NET_INC_STATS_BH(TCPSackRecoveryFail);
337 else
338 NET_INC_STATS_BH(TCPSackFailures);
339 } else {
340 if (tp->ca_state == TCP_CA_Recovery)
341 NET_INC_STATS_BH(TCPRenoRecoveryFail);
342 else
343 NET_INC_STATS_BH(TCPRenoFailures);
344 }
345 } else if (tp->ca_state == TCP_CA_Loss) {
346 NET_INC_STATS_BH(TCPLossFailures);
347 } else {
348 NET_INC_STATS_BH(TCPTimeouts);
349 }
350 }
351
352 tcp_enter_loss(sk, 0);
353
354 if (tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)) > 0) {
355 /* Retransmission failed because of local congestion,
356 * do not backoff.
357 */
358 if (!tp->retransmits)
359 tp->retransmits=1;
360 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
361 min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
362 goto out;
363 }
364
365 /* Increase the timeout each time we retransmit. Note that
366 * we do not increase the rtt estimate. rto is initialized
367 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
368 * that doubling rto each time is the least we can get away with.
369 * In KA9Q, Karn uses this for the first few times, and then
370 * goes to quadratic. netBSD doubles, but only goes up to *64,
371 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
372 * defined in the protocol as the maximum possible RTT. I guess
373 * we'll have to use something other than TCP to talk to the
374 * University of Mars.
375 *
376 * PAWS allows us longer timeouts and large windows, so once
377 * implemented ftp to mars will work nicely. We will have to fix
378 * the 120 second clamps though!
379 */
380 tp->backoff++;
381 tp->retransmits++;
382 tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
383 tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
384 if (tp->retransmits > sysctl_tcp_retries1)
385 __sk_dst_reset(sk);
386
387 out:;
388 }
389
390 static void tcp_write_timer(unsigned long data)
391 {
392 struct sock *sk = (struct sock*)data;
393 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
394 int event;
395
396 bh_lock_sock(sk);
397 if (sk->lock.users) {
398 /* Try again later */
399 if (!mod_timer(&tp->retransmit_timer, jiffies + (HZ/20)))
400 sock_hold(sk);
401 goto out_unlock;
402 }
403
404 if (sk->state == TCP_CLOSE || !tp->pending)
405 goto out;
406
407 if ((long)(tp->timeout - jiffies) > 0) {
408 if (!mod_timer(&tp->retransmit_timer, tp->timeout))
409 sock_hold(sk);
410 goto out;
411 }
412
413 event = tp->pending;
414 tp->pending = 0;
415
416 switch (event) {
417 case TCP_TIME_RETRANS:
418 tcp_retransmit_timer(sk);
419 break;
420 case TCP_TIME_PROBE0:
421 tcp_probe_timer(sk);
422 break;
423 }
424 TCP_CHECK_TIMER(sk);
425
426 out:
427 tcp_mem_reclaim(sk);
428 out_unlock:
429 bh_unlock_sock(sk);
430 sock_put(sk);
431 }
432
433 /*
434 * Timer for listening sockets
435 */
436
437 static void tcp_synack_timer(struct sock *sk)
438 {
439 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
440 struct tcp_listen_opt *lopt = tp->listen_opt;
441 int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
442 int thresh = max_retries;
443 unsigned long now = jiffies;
444 struct open_request **reqp, *req;
445 int i, budget;
446
447 if (lopt == NULL || lopt->qlen == 0)
448 return;
449
450 /* Normally all the openreqs are young and become mature
451 * (i.e. converted to established socket) for first timeout.
452 * If synack was not acknowledged for 3 seconds, it means
453 * one of the following things: synack was lost, ack was lost,
454 * rtt is high or nobody planned to ack (i.e. synflood).
455 * When server is a bit loaded, queue is populated with old
456 * open requests, reducing effective size of queue.
457 * When server is well loaded, queue size reduces to zero
458 * after several minutes of work. It is not synflood,
459 * it is normal operation. The solution is pruning
460 * too old entries overriding normal timeout, when
461 * situation becomes dangerous.
462 *
463 * Essentially, we reserve half of room for young
464 * embrions; and abort old ones without pity, if old
465 * ones are about to clog our table.
466 */
467 if (lopt->qlen>>(lopt->max_qlen_log-1)) {
468 int young = (lopt->qlen_young<<1);
469
470 while (thresh > 2) {
471 if (lopt->qlen < young)
472 break;
473 thresh--;
474 young <<= 1;
475 }
476 }
477
478 if (tp->defer_accept)
479 max_retries = tp->defer_accept;
480
481 budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
482 i = lopt->clock_hand;
483
484 do {
485 reqp=&lopt->syn_table[i];
486 while ((req = *reqp) != NULL) {
487 if ((long)(now - req->expires) >= 0) {
488 if ((req->retrans < thresh ||
489 (req->acked && req->retrans < max_retries))
490 && !req->class->rtx_syn_ack(sk, req, NULL)) {
491 unsigned long timeo;
492
493 if (req->retrans++ == 0)
494 lopt->qlen_young--;
495 timeo = min((TCP_TIMEOUT_INIT << req->retrans),
496 TCP_RTO_MAX);
497 req->expires = now + timeo;
498 reqp = &req->dl_next;
499 continue;
500 }
501
502 /* Drop this request */
503 write_lock(&tp->syn_wait_lock);
504 *reqp = req->dl_next;
505 write_unlock(&tp->syn_wait_lock);
506 lopt->qlen--;
507 if (req->retrans == 0)
508 lopt->qlen_young--;
509 tcp_openreq_free(req);
510 continue;
511 }
512 reqp = &req->dl_next;
513 }
514
515 i = (i+1)&(TCP_SYNQ_HSIZE-1);
516
517 } while (--budget > 0);
518
519 lopt->clock_hand = i;
520
521 if (lopt->qlen)
522 tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
523 }
524
525 void tcp_delete_keepalive_timer (struct sock *sk)
526 {
527 if (timer_pending(&sk->timer) && del_timer (&sk->timer))
528 __sock_put(sk);
529 }
530
531 void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len)
532 {
533 if (!mod_timer(&sk->timer, jiffies+len))
534 sock_hold(sk);
535 }
536
537 void tcp_set_keepalive(struct sock *sk, int val)
538 {
539 if ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))
540 return;
541
542 if (val && !sk->keepopen)
543 tcp_reset_keepalive_timer(sk, keepalive_time_when(&sk->tp_pinfo.af_tcp));
544 else if (!val)
545 tcp_delete_keepalive_timer(sk);
546 }
547
548
549 static void tcp_keepalive_timer (unsigned long data)
550 {
551 struct sock *sk = (struct sock *) data;
552 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
553 __u32 elapsed;
554
555 /* Only process if socket is not in use. */
556 bh_lock_sock(sk);
557 if (sk->lock.users) {
558 /* Try again later. */
559 tcp_reset_keepalive_timer (sk, HZ/20);
560 goto out;
561 }
562
563 if (sk->state == TCP_LISTEN) {
564 tcp_synack_timer(sk);
565 goto out;
566 }
567
568 if (sk->state == TCP_FIN_WAIT2 && sk->dead) {
569 if (tp->linger2 >= 0) {
570 int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;
571
572 if (tmo > 0) {
573 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
574 goto out;
575 }
576 }
577 tcp_send_active_reset(sk, GFP_ATOMIC);
578 goto death;
579 }
580
581 if (!sk->keepopen || sk->state == TCP_CLOSE)
582 goto out;
583
584 elapsed = keepalive_time_when(tp);
585
586 /* It is alive without keepalive 8) */
587 if (tp->packets_out || tp->send_head)
588 goto resched;
589
590 elapsed = tcp_time_stamp - tp->rcv_tstamp;
591
592 if (elapsed >= keepalive_time_when(tp)) {
593 if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
594 (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
595 tcp_send_active_reset(sk, GFP_ATOMIC);
596 tcp_write_err(sk);
597 goto out;
598 }
599 if (tcp_write_wakeup(sk) <= 0) {
600 tp->probes_out++;
601 elapsed = keepalive_intvl_when(tp);
602 } else {
603 /* If keepalive was lost due to local congestion,
604 * try harder.
605 */
606 elapsed = TCP_RESOURCE_PROBE_INTERVAL;
607 }
608 } else {
609 /* It is tp->rcv_tstamp + keepalive_time_when(tp) */
610 elapsed = keepalive_time_when(tp) - elapsed;
611 }
612
613 TCP_CHECK_TIMER(sk);
614 tcp_mem_reclaim(sk);
615
616 resched:
617 tcp_reset_keepalive_timer (sk, elapsed);
618 goto out;
619
620 death:
621 tcp_done(sk);
622
623 out:
624 bh_unlock_sock(sk);
625 sock_put(sk);
626 }
627
/* This page was automatically generated by the LXR engine.
 * Visit the LXR main site for more information. */