~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

Linux Cross Reference
Linux/net/ipv4/route.c

Version: ~ [ 2.4.0 ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /*
  2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
  3  *              operating system.  INET is implemented using the  BSD Socket
  4  *              interface as the means of communication with the user level.
  5  *
  6  *              ROUTE - implementation of the IP router.
  7  *
  8  * Version:     $Id: route.c,v 1.91 2000/10/03 07:29:00 anton Exp $
  9  *
 10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
 11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 15  *
 16  * Fixes:
 17  *              Alan Cox        :       Verify area fixes.
 18  *              Alan Cox        :       cli() protects routing changes
 19  *              Rui Oliveira    :       ICMP routing table updates
 20  *              (rco@di.uminho.pt)      Routing table insertion and update
 21  *              Linus Torvalds  :       Rewrote bits to be sensible
 22  *              Alan Cox        :       Added BSD route gw semantics
 23  *              Alan Cox        :       Super /proc >4K 
 24  *              Alan Cox        :       MTU in route table
 25  *              Alan Cox        :       MSS actually. Also added the window
 26  *                                      clamper.
 27  *              Sam Lantinga    :       Fixed route matching in rt_del()
 28  *              Alan Cox        :       Routing cache support.
 29  *              Alan Cox        :       Removed compatibility cruft.
 30  *              Alan Cox        :       RTF_REJECT support.
 31  *              Alan Cox        :       TCP irtt support.
 32  *              Jonathan Naylor :       Added Metric support.
 33  *      Miquel van Smoorenburg  :       BSD API fixes.
 34  *      Miquel van Smoorenburg  :       Metrics.
 35  *              Alan Cox        :       Use __u32 properly
 36  *              Alan Cox        :       Aligned routing errors more closely with BSD
 37  *                                      our system is still very different.
 38  *              Alan Cox        :       Faster /proc handling
 39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 40  *                                      routing caches and better behaviour.
 41  *              
 42  *              Olaf Erb        :       irtt wasn't being copied right.
 43  *              Bjorn Ekwall    :       Kerneld route support.
 44  *              Alan Cox        :       Multicast fixed (I hope)
 45  *              Pavel Krauz     :       Limited broadcast fixed
 46  *              Mike McLagan    :       Routing by source
 47  *      Alexey Kuznetsov        :       End of old history. Splitted to fib.c and
 48  *                                      route.c and rewritten from scratch.
 49  *              Andi Kleen      :       Load-limit warning messages.
 50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 54  *              Marc Boucher    :       routing by fwmark
 55  *
 56  *              This program is free software; you can redistribute it and/or
 57  *              modify it under the terms of the GNU General Public License
 58  *              as published by the Free Software Foundation; either version
 59  *              2 of the License, or (at your option) any later version.
 60  */
 61 
 62 #include <linux/config.h>
 63 #include <asm/uaccess.h>
 64 #include <asm/system.h>
 65 #include <asm/bitops.h>
 66 #include <linux/types.h>
 67 #include <linux/kernel.h>
 68 #include <linux/sched.h>
 69 #include <linux/mm.h>
 70 #include <linux/string.h>
 71 #include <linux/socket.h>
 72 #include <linux/sockios.h>
 73 #include <linux/errno.h>
 74 #include <linux/in.h>
 75 #include <linux/inet.h>
 76 #include <linux/netdevice.h>
 77 #include <linux/proc_fs.h>
 78 #include <linux/init.h>
 79 #include <linux/skbuff.h>
 80 #include <linux/rtnetlink.h>
 81 #include <linux/inetdevice.h>
 82 #include <linux/igmp.h>
 83 #include <linux/pkt_sched.h>
 84 #include <linux/mroute.h>
 85 #include <linux/netfilter_ipv4.h>
 86 #include <linux/random.h>
 87 #include <net/protocol.h>
 88 #include <net/ip.h>
 89 #include <net/route.h>
 90 #include <net/inetpeer.h>
 91 #include <net/sock.h>
 92 #include <net/ip_fib.h>
 93 #include <net/arp.h>
 94 #include <net/tcp.h>
 95 #include <net/icmp.h>
 96 #ifdef CONFIG_SYSCTL
 97 #include <linux/sysctl.h>
 98 #endif
 99 
100 #define IP_MAX_MTU      0xFFF0  /* 65520; not referenced in this part of the file */
101 
    /* Default for ip_rt_gc_timeout and initial value of the adaptive
     * "expire" timeout kept by rt_garbage_collect(). */
102 #define RT_GC_TIMEOUT (300*HZ)
103 
    /*
     * Run-time tunables.  Values expressed with HZ are in jiffies.
     * NOTE(review): presumably exported through sysctl (the file includes
     * <linux/sysctl.h> under CONFIG_SYSCTL) -- the table is not in this
     * part of the file; confirm.
     */
    /* Delay used by rt_cache_flush() when called with a negative delay. */
104 int ip_rt_min_delay = 2*HZ;
    /* rt_cache_flush() sets the flush deadline to now + this value. */
105 int ip_rt_max_delay = 10*HZ;
    /* Entry count compared against in rt_garbage_collect(); no initialiser
     * here, so it is zero until assigned elsewhere. */
106 int ip_rt_max_size;
    /* Age limit handed to rt_may_expire() by rt_check_expire(); also the
     * ceiling of the adaptive timeout in rt_garbage_collect(). */
107 int ip_rt_gc_timeout = RT_GC_TIMEOUT;
    /* Period with which rt_check_expire() re-arms rt_periodic_timer. */
108 int ip_rt_gc_interval = 60*HZ;
    /* Minimum spacing between two rt_garbage_collect() runs while the
     * cache holds fewer than ip_rt_max_size entries. */
109 int ip_rt_gc_min_interval = 5*HZ;
    /* The redirect_* and error_* values below are not used in this part of
     * the file. */
110 int ip_rt_redirect_number = 9;
111 int ip_rt_redirect_load = HZ/50;
112 int ip_rt_redirect_silence = ((HZ/50) << (9+1));
113 int ip_rt_error_cost = HZ;
114 int ip_rt_error_burst = 5*HZ;
    /* rt_garbage_collect() starts aggressive collection once the entry
     * count exceeds (ip_rt_gc_elasticity << rt_hash_log). */
115 int ip_rt_gc_elasticity = 8;
    /* The three values below are not used in this part of the file. */
116 int ip_rt_mtu_expires = 10*60*HZ;
117 int ip_rt_min_pmtu = 512+20+20;
118 int ip_rt_min_advmss = 536;
119 
/* Absolute time (jiffies) set by rt_cache_flush() as now + ip_rt_max_delay
 * when it is zero; reset to 0 by rt_run_flush().  Zero means "no deadline
 * recorded". */
120 static unsigned long rt_deadline;
121 
    /* printk() wrapper that prefixes the message with KERN_DEBUG. */
122 #define RTprint(a...)   printk(KERN_DEBUG a)
123 
    /* rt_flush_timer is armed/cancelled by rt_cache_flush();
     * rt_periodic_timer is re-armed by rt_check_expire(). */
124 static struct timer_list rt_flush_timer;
125 static struct timer_list rt_periodic_timer;
126 
127 /*
128  *      Interface to generic destination cache.
129  */
130 
    /* Forward declarations of the functions listed in the ipv4_dst_ops
     * initialiser. */
131 static struct dst_entry * ipv4_dst_check(struct dst_entry * dst, u32);
132 static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst,
133                                            struct sk_buff *);
134 static void               ipv4_dst_destroy(struct dst_entry * dst);
135 static struct dst_entry * ipv4_negative_advice(struct dst_entry *);
136 static void               ipv4_link_failure(struct sk_buff *skb);
137 static int rt_garbage_collect(void);
138 
139 
/*
 * Operations table for IPv4 route cache entries.
 *
 * The initialiser is positional, so each value is bound to a member purely
 * by the member order of struct dst_ops, which is declared outside this
 * file.  NOTE(review): the scalar 0 is presumably the initial gc_thresh
 * (rt_garbage_collect() reads ipv4_dst_ops.gc_thresh) -- confirm against
 * the struct declaration before reordering anything here.
 */
140 struct dst_ops ipv4_dst_ops =
141 {
142         AF_INET,
143         __constant_htons(ETH_P_IP),
144         0,
145 
146         rt_garbage_collect,
147         ipv4_dst_check,
148         ipv4_dst_reroute,
149         ipv4_dst_destroy,
150         ipv4_negative_advice,
151         ipv4_link_failure,
        /* Size of the object this table describes: a full struct rtable. */
152         sizeof(struct rtable),
153 };
154 
/*
 * ECN_OR_COST(class) expands to TC_PRIO_<class> when CONFIG_INET_ECN is
 * defined and to TC_PRIO_FILLER otherwise.
 */
155 #ifdef CONFIG_INET_ECN
156 #define ECN_OR_COST(class)      TC_PRIO_##class
157 #else
158 #define ECN_OR_COST(class)      TC_PRIO_FILLER
159 #endif
160 
    /*
     * 16-entry table of TC_PRIO_* values.  Even indexes carry a fixed
     * priority; odd indexes go through ECN_OR_COST() and therefore depend
     * on CONFIG_INET_ECN.
     * NOTE(review): presumably indexed by the IP TOS bits shifted right by
     * one -- the lookup is not in this part of the file; confirm.
     */
161 __u8 ip_tos2prio[16] = {
162         TC_PRIO_BESTEFFORT,
163         ECN_OR_COST(FILLER),
164         TC_PRIO_BESTEFFORT,
165         ECN_OR_COST(BESTEFFORT),
166         TC_PRIO_BULK,
167         ECN_OR_COST(BULK),
168         TC_PRIO_BULK,
169         ECN_OR_COST(BULK),
170         TC_PRIO_INTERACTIVE,
171         ECN_OR_COST(INTERACTIVE),
172         TC_PRIO_INTERACTIVE,
173         ECN_OR_COST(INTERACTIVE),
174         TC_PRIO_INTERACTIVE_BULK,
175         ECN_OR_COST(INTERACTIVE_BULK),
176         TC_PRIO_INTERACTIVE_BULK,
177         ECN_OR_COST(INTERACTIVE_BULK)
178 };
179 
180 
181 /*
182  * Route cache.
183  */
184 
185 /* The locking scheme is rather straight forward:
186  *
187  * 1) A BH protected rwlocks protect buckets of the central route hash.
188  * 2) Only writers remove entries, and they hold the lock
189  *    as they look at rtable reference counts.
190  * 3) Only readers acquire references to rtable entries,
191  *    they do so with atomic increments and with the
192  *    lock held.
193  */
194 
    /* One hash bucket: the head of a singly linked chain of cache entries
     * (linked through rtable.u.rt_next) and the rwlock guarding that chain. */
195 struct rt_hash_bucket {
196         struct rtable   *chain;
197         rwlock_t        lock;
198 } __attribute__((__aligned__(8)));
199 
    /* Bucket array; valid indexes are 0..rt_hash_mask (the code iterates
     * "for (i = rt_hash_mask; i >= 0; i--)" and masks hashes with it). */
200 static struct rt_hash_bucket    *rt_hash_table;
201 static unsigned                 rt_hash_mask;
    /* Used as a shift count when scaling per-bucket quantities, e.g.
     * ip_rt_gc_elasticity << rt_hash_log.  NOTE(review): presumably log2 of
     * the bucket count; its initialisation is not in this part of the file. */
202 static int                      rt_hash_log;
203 
204 static int rt_intern_hash(unsigned hash, struct rtable * rth, struct rtable ** res);
205 
206 static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
207 {
208         unsigned hash = ((daddr&0xF0F0F0F0)>>4)|((daddr&0x0F0F0F0F)<<4);
209         hash ^= saddr^tos;
210         hash ^= (hash>>16);
211         return (hash^(hash>>8)) & rt_hash_mask;
212 }
213 
214 static int rt_cache_get_info(char *buffer, char **start, off_t offset, int length)
215 {
216         int len=0;
217         off_t pos=0;
218         char temp[129];
219         struct rtable *r;
220         int i;
221 
222         pos = 128;
223 
224         if (offset<128) {
225                 sprintf(buffer,"%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\tHHUptod\tSpecDst");
226                 len = 128;
227         }
228         
229         for (i = rt_hash_mask; i>=0; i--) {
230                 read_lock_bh(&rt_hash_table[i].lock);
231                 for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) {
232                         /*
233                          *      Spin through entries until we are ready
234                          */
235                         pos += 128;
236 
237                         if (pos <= offset) {
238                                 len = 0;
239                                 continue;
240                         }
241                         sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
242                                 r->u.dst.dev ? r->u.dst.dev->name : "*",
243                                 (unsigned long)r->rt_dst,
244                                 (unsigned long)r->rt_gateway,
245                                 r->rt_flags,
246                                 atomic_read(&r->u.dst.__refcnt),
247                                 r->u.dst.__use,
248                                 0,
249                                 (unsigned long)r->rt_src, (int)r->u.dst.advmss + 40,
250                                 r->u.dst.window,
251                                 (int)((r->u.dst.rtt>>3) + r->u.dst.rttvar),
252                                 r->key.tos,
253                                 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
254                                 r->u.dst.hh ? (r->u.dst.hh->hh_output == dev_queue_xmit) : 0,
255                                 r->rt_spec_dst);
256                         sprintf(buffer+len,"%-127s\n",temp);
257                         len += 128;
258                         if (pos >= offset+length) {
259                                 read_unlock_bh(&rt_hash_table[i].lock);
260                                 goto done;
261                         }
262                 }
263                 read_unlock_bh(&rt_hash_table[i].lock);
264         }
265 
266 done:
267         *start = buffer+len-(pos-offset);
268         len = pos-offset;
269         if (len>length)
270                 len = length;
271         return len;
272 }
273   
274 static __inline__ void rt_free(struct rtable *rt)
275 {
276         dst_free(&rt->u.dst);
277 }
278 
279 static __inline__ void rt_drop(struct rtable *rt)
280 {
281         ip_rt_put(rt);
282         dst_free(&rt->u.dst);
283 }
284 
285 static __inline__ int rt_fast_clean(struct rtable *rth)
286 {
287         /* Kill broadcast/multicast entries very aggresively, if they
288            collide in hash table with more useful entries */
289         return ((rth->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST))
290                 && rth->key.iif && rth->u.rt_next);
291 }
292 
293 static __inline__ int rt_valuable(struct rtable *rth)
294 {
295         return ((rth->rt_flags&(RTCF_REDIRECTED|RTCF_NOTIFY))
296                 || rth->u.dst.expires);
297 }
298 
299 static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2)
300 {
301         int age;
302 
303         if (atomic_read(&rth->u.dst.__refcnt))
304                 return 0;
305 
306         if (rth->u.dst.expires && (long)(rth->u.dst.expires - jiffies) <= 0)
307                 return 1;
308 
309         age = jiffies - rth->u.dst.lastuse;
310         if (age <= tmo1 && !rt_fast_clean(rth))
311                 return 0;
312         if (age <= tmo2 && rt_valuable(rth))
313                 return 0;
314         return 1;
315 }
316 
317 /* This runs via a timer and thus is always in BH context. */
    /*
     * Periodic expiry scan of the route cache.
     *
     * Continues after the bucket index remembered in the static "rover",
     * and for each visited bucket unlinks and frees the entries that
     *   - have an explicit expiry time which has passed, or
     *   - have no explicit expiry time and are accepted by rt_may_expire().
     * The budget t starts at ip_rt_gc_interval << rt_hash_log and shrinks by
     * ip_rt_gc_timeout per bucket; the scan also stops as soon as jiffies
     * has advanced.  Finally rt_periodic_timer is re-armed for
     * now + ip_rt_gc_interval.  The "dummy" argument is not used.
     */
318 static void SMP_TIMER_NAME(rt_check_expire)(unsigned long dummy)
319 {
320         int i, t;
321         static int rover;
322         struct rtable *rth, **rthp;
323         unsigned long now = jiffies;
324 
325         i = rover;
326 
327         for (t=(ip_rt_gc_interval<<rt_hash_log); t>=0; t -= ip_rt_gc_timeout) {
                    /* First age limit; halved for every entry that is kept,
                     * so entries further down the chain face a shorter limit. */
328                 unsigned tmo = ip_rt_gc_timeout;
329 
330                 i = (i + 1) & rt_hash_mask;
331                 rthp = &rt_hash_table[i].chain;
332 
                    /* Plain write_lock(): no _bh variant is used here (see
                     * the BH-context remark above the function). */
333                 write_lock(&rt_hash_table[i].lock);
334                 while ((rth = *rthp) != NULL) {
335                         if (rth->u.dst.expires) {
336                                 /* Entry is expired even if it is in use */
                                    /* ...but this one has not passed its expiry
                                     * time yet (now <= expires): keep it. */
337                                 if ((long)(now - rth->u.dst.expires) <= 0) {
338                                         tmo >>= 1;
339                                         rthp = &rth->u.rt_next;
340                                         continue;
341                                 }
342                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
343                                 tmo >>= 1;
344                                 rthp = &rth->u.rt_next;
345                                 continue;
346                         }
347 
348                         /*
349                          * Cleanup aged off entries.
350                          */
                            /* Unlink; rthp is left in place so the successor
                             * is examined on the next iteration. */
351                         *rthp = rth->u.rt_next;
352                         rt_free(rth);
353                 }
354                 write_unlock(&rt_hash_table[i].lock);
355 
356                 /* Fallback loop breaker. */
357                 if ((jiffies - now) > 0)
358                         break;
359         }
360         rover = i;
361         mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
362 }
363 
364 SMP_TIMER_DEFINE(rt_check_expire, rt_gc_task);
365 
366 /* This can run from both BH and non-BH contexts, the latter
367  * in the case of a forced flush event.
368  */
369 static void SMP_TIMER_NAME(rt_run_flush)(unsigned long dummy)
370 {
371         int i;
372         struct rtable * rth, * next;
373 
374         rt_deadline = 0;
375 
376         for (i=rt_hash_mask; i>=0; i--) {
377                 write_lock_bh(&rt_hash_table[i].lock);
378                 rth = rt_hash_table[i].chain;
379                 if (rth)
380                         rt_hash_table[i].chain = NULL;
381                 write_unlock_bh(&rt_hash_table[i].lock);
382 
383                 for (; rth; rth=next) {
384                         next = rth->u.rt_next;
385                         rt_free(rth);
386                 }
387         }
388 }
389 
390 SMP_TIMER_DEFINE(rt_run_flush, rt_cache_flush_task);
391   
/* Serialises rt_cache_flush() callers while they manipulate
 * rt_flush_timer and rt_deadline. */
392 static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;
393 
    /*
     * Request a flush of the route cache.
     *
     *   delay < 0   use ip_rt_min_delay
     *   delay == 0  flush immediately
     *   delay > 0   flush after "delay" jiffies, but never later than the
     *               deadline recorded in rt_deadline
     *
     * The flush is performed either synchronously, by calling rt_run_flush()
     * directly, or later through rt_flush_timer.
     */
394 void rt_cache_flush(int delay)
395 {
396         unsigned long now = jiffies;
            /* Non-zero when not called from softirq context. */
397         int user_mode = !in_softirq();
398 
399         if (delay < 0)
400                 delay = ip_rt_min_delay;
401 
402         spin_lock_bh(&rt_flush_lock);
403 
            /* del_timer() is evaluated first, so a pending timer is cancelled
             * here regardless of the remaining conditions. */
404         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
405                 long tmo = (long)(rt_deadline - now);
406 
407                 /* If flush timer is already running
408                    and flush request is not immediate (delay > 0):
409 
410                    if deadline is not achieved, prolongate timer to "delay",
411                    otherwise fire it at deadline time.
412                  */
413 
                    /* Outside softirq context the remaining time is treated
                     * as zero once less than max_delay - min_delay is left,
                     * which forces the immediate flush below. */
414                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
415                         tmo = 0;
416                 
                    /* Never schedule past the deadline. */
417                 if (delay > tmo)
418                         delay = tmo;
419         }
420 
421         if (delay <= 0) {
                    /* The lock is dropped before flushing synchronously. */
422                 spin_unlock_bh(&rt_flush_lock);
423                 SMP_TIMER_NAME(rt_run_flush)(0);
424                 return;
425         }
426 
            /* First delayed request since the last flush: record the deadline. */
427         if (rt_deadline == 0)
428                 rt_deadline = now + ip_rt_max_delay;
429 
430         mod_timer(&rt_flush_timer, now+delay);
431         spin_unlock_bh(&rt_flush_lock);
432 }
433 
434 /*
435    Short description of GC goals.
436 
437    We want to build algorithm, which will keep routing cache
438    at some equilibrium point, when number of aged off entries
439    is kept approximately equal to newly generated ones.
440 
441    Current expiration strength is variable "expire".
442    We try to adjust it dynamically, so that if networking
443    is idle expires is large enough to keep enough of warm entries,
444    and when load increases it reduces to limit cache size.
445  */
446 
    /*
     * Route cache garbage collector (listed in the ipv4_dst_ops initialiser).
     *
     * Works out a "goal" -- the number of entries to remove -- and scans the
     * hash table, removing entries accepted by rt_may_expire(), until the
     * goal is met.  The adaptive timeout "expire" is halved whenever a full
     * pass over the table misses the goal and is raised again (by
     * ip_rt_gc_min_interval, capped at ip_rt_gc_timeout) when the goal is
     * reached.  State is kept across calls in function-local statics.
     *
     * Returns 0 in all cases except one: when the scan gives up while the
     * cache still holds at least ip_rt_max_size entries, it logs
     * "dst cache overflow" (rate limited) and returns 1.
     */
447 static int rt_garbage_collect(void)
448 {
449         static unsigned expire = RT_GC_TIMEOUT;
450         static unsigned long last_gc;
451         static int rover;
452         static int equilibrium;
453         struct rtable *rth, **rthp;
454         unsigned long now = jiffies;
455         int goal;
456 
457         /*
458          * Garbage collection is pretty expensive,
459          * do not make it too frequently.
460          */
461         if (now - last_gc < ip_rt_gc_min_interval &&
462             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
463                 return 0;
464 
465         /* Calculate number of entries, which we want to expire now. */
466         goal = atomic_read(&ipv4_dst_ops.entries) - (ip_rt_gc_elasticity<<rt_hash_log);
467         if (goal <= 0) {
                    /* Below the elasticity limit: aim for the equilibrium
                     * size, which is never below gc_thresh and is allowed to
                     * grow by half of the surplus (at most one per bucket). */
468                 if (equilibrium < ipv4_dst_ops.gc_thresh)
469                         equilibrium = ipv4_dst_ops.gc_thresh;
470                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
471                 if (goal > 0) {
472                         equilibrium += min(goal/2, rt_hash_mask+1);
473                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
474                 }
475         } else {
476                 /* We are in dangerous area. Try to reduce cache really
477                  * aggressively.
478                  */
479                 goal = max(goal/2, rt_hash_mask+1);
480                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
481         }
482 
483         if (now - last_gc >= ip_rt_gc_min_interval)
484                 last_gc = now;
485 
            /* Nothing to remove: fold the (non-positive) goal back into the
             * equilibrium and only relax the timeout. */
486         if (goal <= 0) {
487                 equilibrium += goal;
488                 goto work_done;
489         }
490 
491         do {
492                 int i, k;
493 
                    /* One pass over at most all buckets, starting after the
                     * bucket where the previous pass stopped. */
494                 for (i=rt_hash_mask, k=rover; i>=0; i--) {
                            /* Halved for every entry kept in this chain. */
495                         unsigned tmo = expire;
496 
497                         k = (k + 1) & rt_hash_mask;
498                         rthp = &rt_hash_table[k].chain;
499                         write_lock_bh(&rt_hash_table[k].lock);
500                         while ((rth = *rthp) != NULL) {
501                                 if (!rt_may_expire(rth, tmo, expire)) {
502                                         tmo >>= 1;
503                                         rthp = &rth->u.rt_next;
504                                         continue;
505                                 }
506                                 *rthp = rth->u.rt_next;
507                                 rt_free(rth);
508                                 goal--;
509                         }
510                         write_unlock_bh(&rt_hash_table[k].lock);
511                         if (goal <= 0)
512                                 break;
513                 }
514                 rover = k;
515 
516                 if (goal <= 0)
517                         goto work_done;
518 
519                 /* Goal is not achieved. We stop process if:
520 
521                    - if expire reduced to zero. Otherwise, expire is halfed.
522                    - if table is not full.
523                    - if we are called from interrupt.
524                    - jiffies check is just fallback/debug loop breaker.
525                      We will not spin here for long time in any case.
526                  */
527 
528                 if (expire == 0)
529                         break;
530 
531                 expire >>= 1;
532 #if RT_CACHE_DEBUG >= 2
533                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, i);
534 #endif
535 
536                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
537                         return 0;
538         } while (!in_softirq() && jiffies - now < 1);
539 
            /* Gave up without reaching the goal: only report an overflow if
             * the cache is really at its size limit. */
540         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
541                 return 0;
542         if (net_ratelimit())
543                 printk("dst cache overflow\n");
544         return 1;
545 
546 work_done:
            /* Goal reached: relax the timeout again, and reset it to the
             * configured value once it exceeds that value or the cache is
             * below gc_thresh. */
547         expire += ip_rt_gc_min_interval;
548         if (expire > ip_rt_gc_timeout ||
549             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
550                 expire = ip_rt_gc_timeout;
551 #if RT_CACHE_DEBUG >= 2
552         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, rover);
553 #endif
554         return 0;
555 }
556 
/*
 * Insert a freshly built cache entry into bucket "hash".
 *
 *   hash - bucket index
 *   rt   - new entry
 *   rp   - receives the entry the caller should use from now on
 *
 * If the chain already holds an entry with a byte-identical key, that
 * entry is moved to the head of the chain, its use counter, reference
 * count and lastuse are updated, "rt" is dropped and the existing entry is
 * stored in *rp.  Otherwise "rt" is linked in at the head of the chain and
 * stored in *rp; for unicast routes and for entries with key.iif == 0 a
 * neighbour is bound first.
 *
 * Returns 0 on success.  On failure returns the error from
 * arp_bind_neighbour() (-ENOBUFS after the optional retry); "rt" has then
 * been dropped and *rp is left untouched.
 */
557 static int rt_intern_hash(unsigned hash, struct rtable * rt, struct rtable ** rp)
558 {
559         struct rtable   *rth, **rthp;
560         unsigned long   now = jiffies;
            /* One retry after a forced GC, and only outside softirq context. */
561         int attempts = !in_softirq();
562 
563 restart:
564         rthp = &rt_hash_table[hash].chain;
565 
566         write_lock_bh(&rt_hash_table[hash].lock);
567         while ((rth = *rthp) != NULL) {
568                 if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
569                         /* Put it first */
570                         *rthp = rth->u.rt_next;
571                         rth->u.rt_next = rt_hash_table[hash].chain;
572                         rt_hash_table[hash].chain = rth;
573 
574                         rth->u.dst.__use++;
575                         dst_hold(&rth->u.dst);
576                         rth->u.dst.lastuse = now;
577                         write_unlock_bh(&rt_hash_table[hash].lock);
578 
                            /* The duplicate passed in by the caller is not needed. */
579                         rt_drop(rt);
580                         *rp = rth;
581                         return 0;
582                 }
583 
584                 rthp = &rth->u.rt_next;
585         }
586 
587         /* Try to bind route to arp only if it is output
588            route or unicast forwarding path.
589          */
590         if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
591                 int err = arp_bind_neighbour(&rt->u.dst);
592                 if (err) {
593                         write_unlock_bh(&rt_hash_table[hash].lock);
594 
595                         if (err != -ENOBUFS) {
596                                 rt_drop(rt);
597                                 return err;
598                         }
599 
600                         /* Neighbour tables are full and nothing
601                            can be released. Try to shrink route cache,
602                            it is most likely it holds some neighbour records.
603                          */
604                         if (attempts-- > 0) {
                                    /* Temporarily override the two global
                                     * tunables so that the collector runs now
                                     * and with elasticity 1, then restore them
                                     * and redo the lookup from scratch (the
                                     * bucket lock was released above). */
605                                 int saved_elasticity = ip_rt_gc_elasticity;
606                                 int saved_int = ip_rt_gc_min_interval;
607                                 ip_rt_gc_elasticity = 1;
608                                 ip_rt_gc_min_interval = 0;
609                                 rt_garbage_collect();
610                                 ip_rt_gc_min_interval = saved_int;
611                                 ip_rt_gc_elasticity = saved_elasticity;
612                                 goto restart;
613                         }
614 
615                         if (net_ratelimit())
616                                 printk("Neighbour table overflow.\n");
617                         rt_drop(rt);
618                         return -ENOBUFS;
619                 }
620         }
621 
            /* No duplicate found: link the new entry at the head of the chain. */
622         rt->u.rt_next = rt_hash_table[hash].chain;
623 #if RT_CACHE_DEBUG >= 2
624         if (rt->u.rt_next) {
625                 struct rtable * trt;
626                 printk("rt_cache @%02x: %u.%u.%u.%u", hash, NIPQUAD(rt->rt_dst));
627                 for (trt=rt->u.rt_next; trt; trt=trt->u.rt_next)
628                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
629                 printk("\n");
630         }
631 #endif
632         rt_hash_table[hash].chain = rt;
633         write_unlock_bh(&rt_hash_table[hash].lock);
634         *rp = rt;
635         return 0;
636 }
637 
638 void rt_bind_peer(struct rtable *rt, int create)
639 {
640         static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
641         struct inet_peer *peer;
642 
643         peer = inet_getpeer(rt->rt_dst, create);
644 
645         spin_lock_bh(&rt_peer_lock);
646         if (rt->peer == NULL) {
647                 rt->peer = peer;
648                 peer = NULL;
649         }
650         spin_unlock_bh(&rt_peer_lock);
651         if (peer)
652                 inet_putpeer(peer);
653 }
654 
655 /*
656  * Peer allocation may fail only in serious out-of-memory conditions.  However
657  * we still can generate some output.
658  * Random ID selection looks a bit dangerous because we have no chances to
659  * select ID being unique in a reasonable period of time.
660  * But broken packet identifier may be better than no packet at all.
661  */
662 static void ip_select_fb_ident(struct iphdr *iph)
663 {
664         static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
665         static u32 ip_fallback_id;
666         u32 salt;
667 
668         spin_lock_bh(&ip_fb_id_lock);
669         salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
670         iph->id = salt & 0xFFFF;
671         ip_fallback_id = salt;
672         spin_unlock_bh(&ip_fb_id_lock);
673 }
674 
675 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst)
676 {
677         struct rtable *rt = (struct rtable *) dst;
678 
679         if (rt) {
680                 if (rt->peer == NULL)
681                         rt_bind_peer(rt, 1);
682 
683                 /* If peer is attached to destination, it is never detached,
684                    so that we need not to grab a lock to dereference it.
685                  */
686                 if (rt->peer) {
687                         iph->id = inet_getid(rt->peer);
688                         return;
689                 }
690         } else {
691                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));
692         }
693 
694         ip_select_fb_ident(iph);
695 }
696 
697 static void rt_del(unsigned hash, struct rtable *rt)
698 {
699         struct rtable **rthp;
700 
701         write_lock_bh(&rt_hash_table[hash].lock);
702         ip_rt_put(rt);
703         for (rthp = &rt_hash_table[hash].chain; *rthp; rthp = &(*rthp)->u.rt_next) {
704                 if (*rthp == rt) {
705                         *rthp = rt->u.rt_next;
706                         rt_free(rt);
707                         break;
708                 }
709         }
710         write_unlock_bh(&rt_hash_table[hash].lock);
711 }
712 
/*
 * Apply a routing redirect to the route cache.
 *
 *   old_gw - gateway currently in use (the originator of the redirect,
 *            according to the log message below)
 *   daddr  - destination the redirect is about
 *   new_gw - gateway advised for that destination
 *   saddr  - source address of the affected traffic
 *   tos    - type of service; masked with IPTOS_RT_MASK before use
 *   dev    - device the redirect is associated with
 *
 * After validating new_gw, every combination of source key {saddr, 0} and
 * output-interface key {dev->ifindex, 0} is looked up in the cache.  A
 * matching output entry that currently goes through old_gw on dev is
 * copied, the copy is pointed at new_gw and flagged RTCF_REDIRECTED, and
 * -- provided a neighbour in a NUD_VALID state can be bound -- the copy
 * replaces the old entry.
 *
 * The in_device reference taken at the top is released on every exit path
 * that follows the NULL check.
 */
713 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
714                     u32 saddr, u8 tos, struct net_device *dev)
715 {
716         int i, k;
717         struct in_device *in_dev = in_dev_get(dev);
718         struct rtable *rth, **rthp;
719         u32  skeys[2] = { saddr, 0 };
720         int  ikeys[2] = { dev->ifindex, 0 };
721 
722         tos &= IPTOS_RT_MASK;
723 
724         if (!in_dev)
725                 return;
726 
            /* Reject a redirect that changes nothing, that the interface is
             * configured not to accept, or whose new gateway is a
             * multicast, bad-class or zero-net address. */
727         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
728             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
729                 goto reject_redirect;
730 
731         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
732                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
733                         goto reject_redirect;
734                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
735                         goto reject_redirect;
736         } else {
737                 if (inet_addr_type(new_gw) != RTN_UNICAST)
738                         goto reject_redirect;
739         }
740 
741         for (i=0; i<2; i++) {
742                 for (k=0; k<2; k++) {
743                         unsigned hash = rt_hash_code(daddr, skeys[i]^(ikeys[k]<<5), tos);
744 
745                         rthp=&rt_hash_table[hash].chain;
746 
747                         read_lock(&rt_hash_table[hash].lock);
748                         while ( (rth = *rthp) != NULL) {
749                                 struct rtable *rt;
750 
                                    /* Only an output entry (iif == 0) with
                                     * exactly this key is of interest. */
751                                 if (rth->key.dst != daddr ||
752                                     rth->key.src != skeys[i] ||
753                                     rth->key.tos != tos ||
754                                     rth->key.oif != ikeys[k] ||
755                                     rth->key.iif != 0) {
756                                         rthp = &rth->u.rt_next;
757                                         continue;
758                                 }
759 
                                    /* Key matches but the entry does not use
                                     * old_gw on dev (or carries an error):
                                     * stop searching this chain. */
760                                 if (rth->rt_dst != daddr ||
761                                     rth->rt_src != saddr ||
762                                     rth->u.dst.error ||
763                                     rth->rt_gateway != old_gw ||
764                                     rth->u.dst.dev != dev)
765                                         break;
766 
                                    /* Take a reference on the old entry before
                                     * the read lock is dropped. */
767                                 dst_clone(&rth->u.dst);
768                                 read_unlock(&rt_hash_table[hash].lock);
769 
770                                 rt = dst_alloc(&ipv4_dst_ops);
771                                 if (rt == NULL) {
772                                         ip_rt_put(rth);
773                                         in_dev_put(in_dev);
774                                         return;
775                                 }
776 
777                                 /*
778                                  * Copy all the information.
779                                  */
780                                 *rt = *rth;
                                    /* The struct copy duplicated counters and
                                     * pointers of the old entry; reset the
                                     * counters and take or clear the copied
                                     * references (device, neighbour, hh, and
                                     * the peer further down). */
781                                 rt->u.dst.__use = 1;
782                                 atomic_set(&rt->u.dst.__refcnt, 1);
783                                 if (rt->u.dst.dev)
784                                         dev_hold(rt->u.dst.dev);
785                                 rt->u.dst.lastuse = jiffies;
786                                 rt->u.dst.neighbour = NULL;
787                                 rt->u.dst.hh = NULL;
788                                 rt->u.dst.obsolete = 0;
789 
790                                 rt->rt_flags |= RTCF_REDIRECTED;
791 
792                                 /* Gateway is different ... */
793                                 rt->rt_gateway = new_gw;
794 
795                                 /* Redirect received -> path was valid */
796                                 dst_confirm(&rth->u.dst);
797 
798                                 if (rt->peer)
799                                         atomic_inc(&rt->peer->refcnt);
800 
                                    /* The new gateway must have a bound
                                     * neighbour in a NUD_VALID state;
                                     * otherwise trigger neighbour resolution
                                     * (if one was bound), discard the copy
                                     * and keep the old entry. */
801                                 if (arp_bind_neighbour(&rt->u.dst) ||
802                                     !(rt->u.dst.neighbour->nud_state&NUD_VALID)) {
803                                         if (rt->u.dst.neighbour)
804                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
805                                         ip_rt_put(rth);
806                                         rt_drop(rt);
807                                         goto do_next;
808                                 }
809 
                                    /* rt_del() also releases the reference
                                     * taken with dst_clone() above. */
810                                 rt_del(hash, rth);
                                    /* On success release the reference held
                                     * on the entry returned in rt. */
811                                 if (!rt_intern_hash(hash, rt, &rt))
812                                         ip_rt_put(rt);
813                                 goto do_next;
814                         }
                            /* Reached when the chain held no usable entry;
                             * the paths that jump to do_next have already
                             * released the read lock. */
815                         read_unlock(&rt_hash_table[hash].lock);
816                 do_next:
817                         ;
818                 }
819         }
820         in_dev_put(in_dev);
821         return;
822 
823 reject_redirect:
824 #ifdef CONFIG_IP_ROUTE_VERBOSE
825         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
826                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about %u.%u.%u.%u ignored.\n"
827                        "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, tos %02x\n",
828                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
829                        NIPQUAD(saddr), NIPQUAD(daddr), tos);
830 #endif
831         in_dev_put(in_dev);
832 }
833 
834 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
835 {
836         struct rtable *rt = (struct rtable*)dst;
837 
838         if (rt != NULL) {
839                 if (dst->obsolete) {
840                         ip_rt_put(rt);
841                         return NULL;
842                 }
843                 if ((rt->rt_flags&RTCF_REDIRECTED) || rt->u.dst.expires) {
844                         unsigned hash = rt_hash_code(rt->key.dst, rt->key.src^(rt->key.oif<<5), rt->key.tos);
845 #if RT_CACHE_DEBUG >= 1
846                         printk(KERN_DEBUG "ip_rt_advice: redirect to %u.%u.%u.%u/%02x dropped\n",
847                                 NIPQUAD(rt->rt_dst), rt->key.tos);
848 #endif
849                         rt_del(hash, rt);
850                         return NULL;
851                 }
852         }
853         return dst;
854 }
855 
/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff; after that we stop sending them
 *         altogether, assuming that the host ignores our redirects.
 *      2. If we have not seen any packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         has forgotten the redirected route and start sending
 *         redirects again.
 *
 * This algorithm is much cheaper and more intelligent than the dumb load
 * limiting in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. needed" (breaks PMTU discovery) in icmp.c.
 */
871 
872 void ip_rt_send_redirect(struct sk_buff *skb)
873 {
874         struct rtable *rt = (struct rtable*)skb->dst;
875         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
876 
877         if (!in_dev)
878                 return;
879 
880         if (!IN_DEV_TX_REDIRECTS(in_dev))
881                 goto out;
882 
883         /* No redirected packets during ip_rt_redirect_silence;
884          * reset the algorithm.
885          */
886         if (jiffies - rt->u.dst.rate_last > ip_rt_redirect_silence)
887                 rt->u.dst.rate_tokens = 0;
888 
889         /* Too many ignored redirects; do not send anything
890          * set u.dst.rate_last to the last seen redirected packet.
891          */
892         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
893                 rt->u.dst.rate_last = jiffies;
894                 goto out;
895         }
896 
897         /* Check for load limit; set rate_last to the latest sent
898          * redirect.
899          */
900         if (jiffies - rt->u.dst.rate_last > (ip_rt_redirect_load<<rt->u.dst.rate_tokens)) {
901                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
902                 rt->u.dst.rate_last = jiffies;
903                 ++rt->u.dst.rate_tokens;
904 #ifdef CONFIG_IP_ROUTE_VERBOSE
905                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
906                     rt->u.dst.rate_tokens == ip_rt_redirect_number && net_ratelimit())
907                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores redirects for "
908                                 "%u.%u.%u.%u to %u.%u.%u.%u.\n",
909                                 NIPQUAD(rt->rt_src), rt->rt_iif,
910                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
911 #endif
912         }
913 out:
914         in_dev_put(in_dev);
915 }
916 
917 static int ip_error(struct sk_buff *skb)
918 {
919         struct rtable *rt = (struct rtable*)skb->dst;
920         unsigned long now;
921         int code;
922 
923         switch (rt->u.dst.error) {
924         case EINVAL:
925         default:
926                 kfree_skb(skb);
927                 return 0;
928         case EHOSTUNREACH:
929                 code = ICMP_HOST_UNREACH;
930                 break;
931         case ENETUNREACH:
932                 code = ICMP_NET_UNREACH;
933                 break;
934         case EACCES:
935                 code = ICMP_PKT_FILTERED;
936                 break;
937         }
938 
939         now = jiffies;
940         if ((rt->u.dst.rate_tokens += (now - rt->u.dst.rate_last)) > ip_rt_error_burst)
941                 rt->u.dst.rate_tokens = ip_rt_error_burst;
942         rt->u.dst.rate_last = now;
943         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
944                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
945                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
946         }
947 
948         kfree_skb(skb);
949         return 0;
950 } 
951 
/*
 *	Table of MTU plateaus, in descending order, used to guess the
 *	next smaller path MTU.
 *
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 *
 *	The table is only ever read, hence const.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 *	Return the largest plateau that is strictly smaller than old_mtu,
 *	or 68 when old_mtu does not exceed the smallest table entry.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	unsigned int i;	/* unsigned: compared against a sizeof-derived count */

	for (i = 0; i < sizeof(mtu_plateau)/sizeof(mtu_plateau[0]); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
969 
970 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
971 {
972         int i;
973         unsigned short old_mtu = ntohs(iph->tot_len);
974         struct rtable *rth;
975         u32  skeys[2] = { iph->saddr, 0, };
976         u32  daddr = iph->daddr;
977         u8   tos = iph->tos & IPTOS_RT_MASK;
978         unsigned short est_mtu = 0;
979 
980         if (ipv4_config.no_pmtu_disc)
981                 return 0;
982 
983         for (i=0; i<2; i++) {
984                 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
985 
986                 read_lock(&rt_hash_table[hash].lock);
987                 for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
988                         if (rth->key.dst == daddr &&
989                             rth->key.src == skeys[i] &&
990                             rth->rt_dst == daddr &&
991                             rth->rt_src == iph->saddr &&
992                             rth->key.tos == tos &&
993                             rth->key.iif == 0 &&
994                             !(rth->u.dst.mxlock&(1<<RTAX_MTU))) {
995                                 unsigned short mtu = new_mtu;
996 
997                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
998 
999                                         /* BSD 4.2 compatibility hack :-( */
1000                                         if (mtu == 0 && old_mtu >= rth->u.dst.pmtu &&
1001                                             old_mtu >= 68 + (iph->ihl<<2))
1002                                                 old_mtu -= iph->ihl<<2;
1003 
1004                                         mtu = guess_mtu(old_mtu);
1005                                 }
1006                                 if (mtu <= rth->u.dst.pmtu) {
1007                                         if (mtu < rth->u.dst.pmtu) { 
1008                                                 dst_confirm(&rth->u.dst);
1009                                                 if (mtu < ip_rt_min_pmtu) {
1010                                                         mtu = ip_rt_min_pmtu;
1011                                                         rth->u.dst.mxlock |= (1<<RTAX_MTU);
1012                                                 }
1013                                                 rth->u.dst.pmtu = mtu;
1014                                                 dst_set_expires(&rth->u.dst, ip_rt_mtu_expires);
1015                                         }
1016                                         est_mtu = mtu;
1017                                 }
1018                         }
1019                 }
1020                 read_unlock(&rt_hash_table[hash].lock);
1021         }
1022         return est_mtu ? : new_mtu;
1023 }
1024 
1025 void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu)
1026 {
1027         if (dst->pmtu > mtu && mtu >= 68 &&
1028             !(dst->mxlock&(1<<RTAX_MTU))) {
1029                 if (mtu < ip_rt_min_pmtu) {
1030                         mtu = ip_rt_min_pmtu;
1031                         dst->mxlock |= (1<<RTAX_MTU);
1032                 }
1033                 dst->pmtu = mtu;
1034                 dst_set_expires(dst, ip_rt_mtu_expires);
1035         }
1036 }
1037 
1038 static struct dst_entry * ipv4_dst_check(struct dst_entry * dst, u32 cookie)
1039 {
1040         dst_release(dst);
1041         return NULL;
1042 }
1043 
1044 static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst,
1045                                            struct sk_buff *skb)
1046 {
1047         return NULL;
1048 }
1049 
1050 static void ipv4_dst_destroy(struct dst_entry * dst)
1051 {
1052         struct rtable *rt = (struct rtable *) dst;
1053         struct inet_peer *peer = rt->peer;
1054 
1055         if (peer) {
1056                 rt->peer = NULL;
1057                 inet_putpeer(peer);
1058         }
1059 }
1060 
1061 static void ipv4_link_failure(struct sk_buff *skb)
1062 {
1063         struct rtable *rt;
1064 
1065         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1066 
1067         rt = (struct rtable *) skb->dst;
1068         if (rt)
1069                 dst_set_expires(&rt->u.dst, 0);
1070 }
1071 
1072 static int ip_rt_bug(struct sk_buff *skb)
1073 {
1074         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1075                 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1076                 skb->dev ? skb->dev->name : "?");
1077         kfree_skb(skb);
1078         return 0;
1079 }
1080 
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1089 
1090 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1091 {
1092         u32 src;
1093         struct fib_result res;
1094 
1095         if (rt->key.iif == 0)
1096                 src = rt->rt_src;
1097         else if (fib_lookup(&rt->key, &res) == 0) {
1098 #ifdef CONFIG_IP_ROUTE_NAT
1099                 if (res.type == RTN_NAT)
1100                         src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
1101                 else
1102 #endif
1103                         src = FIB_RES_PREFSRC(res);
1104                 fib_res_put(&res);
1105         } else
1106                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
1107         memcpy(addr, &src, 4);
1108 }
1109 
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge tag into the route's tclassid, one 16-bit half at a time: a half
 * of tclassid that is still zero takes the corresponding half of tag; a
 * half that is already non-zero is left alone.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	/* low 16 bits */
	if ((rt->u.dst.tclassid & 0xFFFF) == 0)
		rt->u.dst.tclassid |= (tag & 0xFFFF);

	/* high 16 bits */
	if ((rt->u.dst.tclassid & 0xFFFF0000) == 0)
		rt->u.dst.tclassid |= (tag & 0xFFFF0000);
}
#endif
1119 
1120 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1121 {
1122         struct fib_info *fi = res->fi;
1123 
1124         if (fi) {
1125                 if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1126                         rt->rt_gateway = FIB_RES_GW(*res);
1127                 memcpy(&rt->u.dst.mxlock, fi->fib_metrics, sizeof(fi->fib_metrics));
1128                 if (fi->fib_mtu == 0) {
1129                         rt->u.dst.pmtu = rt->u.dst.dev->mtu;
1130                         if (rt->u.dst.mxlock&(1<<RTAX_MTU) &&
1131                             rt->rt_gateway != rt->rt_dst &&
1132                             rt->u.dst.pmtu > 576)
1133                                 rt->u.dst.pmtu = 576;
1134                 }
1135 #ifdef CONFIG_NET_CLS_ROUTE
1136                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1137 #endif
1138         } else {
1139                 rt->u.dst.pmtu  = rt->u.dst.dev->mtu;
1140         }
1141         if (rt->u.dst.pmtu > IP_MAX_MTU)
1142                 rt->u.dst.pmtu = IP_MAX_MTU;
1143         if (rt->u.dst.advmss == 0)
1144                 rt->u.dst.advmss = max(rt->u.dst.dev->mtu-40, ip_rt_min_advmss);
1145         if (rt->u.dst.advmss > 65535-40)
1146                 rt->u.dst.advmss = 65535-40;
1147 
1148 #ifdef CONFIG_NET_CLS_ROUTE
1149 #ifdef CONFIG_IP_MULTIPLE_TABLES
1150         set_class_tag(rt, fib_rules_tclass(res));
1151 #endif
1152         set_class_tag(rt, itag);
1153 #endif
1154         rt->rt_type = res->type;
1155 }
1156 
1157 static int
1158 ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1159                   u8 tos, struct net_device *dev, int our)
1160 {
1161         unsigned hash;
1162         struct rtable *rth;
1163         u32 spec_dst;
1164         struct in_device *in_dev = in_dev_get(dev);
1165         u32 itag = 0;
1166 
1167         /* Primary sanity checks. */
1168 
1169         if (in_dev == NULL)
1170                 return -EINVAL;
1171 
1172         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1173             skb->protocol != __constant_htons(ETH_P_IP))
1174                 goto e_inval;
1175 
1176         if (ZERONET(saddr)) {
1177                 if (!LOCAL_MCAST(daddr))
1178                         goto e_inval;
1179                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1180         } else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag) < 0)
1181                 goto e_inval;
1182 
1183         rth = dst_alloc(&ipv4_dst_ops);
1184         if (!rth)
1185                 goto e_nobufs;
1186 
1187         rth->u.dst.output= ip_rt_bug;
1188 
1189         atomic_set(&rth->u.dst.__refcnt, 1);
1190         rth->u.dst.flags= DST_HOST;
1191         rth->key.dst    = daddr;
1192         rth->rt_dst     = daddr;
1193         rth->key.tos    = tos;
1194 #ifdef CONFIG_IP_ROUTE_FWMARK
1195         rth->key.fwmark = skb->nfmark;
1196 #endif
1197         rth->key.src    = saddr;
1198         rth->rt_src     = saddr;
1199 #ifdef CONFIG_IP_ROUTE_NAT
1200         rth->rt_dst_map = daddr;
1201         rth->rt_src_map = saddr;
1202 #endif
1203 #ifdef CONFIG_NET_CLS_ROUTE
1204         rth->u.dst.tclassid = itag;
1205 #endif
1206         rth->rt_iif     =
1207         rth->key.iif    = dev->ifindex;
1208         rth->u.dst.dev  = &loopback_dev;
1209         dev_hold(rth->u.dst.dev);
1210         rth->key.oif    = 0;
1211         rth->rt_gateway = daddr;
1212         rth->rt_spec_dst= spec_dst;
1213         rth->rt_type    = RTN_MULTICAST;
1214         rth->rt_flags   = RTCF_MULTICAST;
1215         if (our) {
1216                 rth->u.dst.input= ip_local_deliver;
1217                 rth->rt_flags |= RTCF_LOCAL;
1218         }
1219 
1220 #ifdef CONFIG_IP_MROUTE
1221         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1222                 rth->u.dst.input = ip_mr_input;
1223 #endif
1224 
1225         in_dev_put(in_dev);
1226         hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos);
1227         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1228 
1229 e_nobufs:
1230         in_dev_put(in_dev);
1231         return -ENOBUFS;
1232 
1233 e_inval:
1234         in_dev_put(in_dev);
1235         return -EINVAL;
1236 }
1237 
/*
 *      NOTE. We drop all packets that have local source
 *      addresses, because every properly looped back packet
 *      must already have the correct destination attached by the
 *      output routine.
 *
 *      This approach solves two big problems:
 *      1. Non-simplex devices are handled properly.
 *      2. IP spoofing attempts are filtered with a 100% guarantee.
 */
1247 
1248 int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1249                         u8 tos, struct net_device *dev)
1250 {
1251         struct rt_key   key;
1252         struct fib_result res;
1253         struct in_device *in_dev = in_dev_get(dev);
1254         struct in_device *out_dev = NULL;
1255         unsigned        flags = 0;
1256         u32             itag = 0;
1257         struct rtable * rth;
1258         unsigned        hash;
1259         u32             spec_dst;
1260         int             err = -EINVAL;
1261         int             free_res = 0;
1262 
1263         /*
1264          *      IP on this device is disabled.
1265          */
1266 
1267         if (!in_dev)
1268                 return -EINVAL;
1269 
1270         key.dst = daddr;
1271         key.src = saddr;
1272         key.tos = tos;
1273 #ifdef CONFIG_IP_ROUTE_FWMARK
1274         key.fwmark = skb->nfmark;
1275 #endif
1276         key.iif = dev->ifindex;
1277         key.oif = 0;
1278         key.scope = RT_SCOPE_UNIVERSE;
1279 
1280         hash = rt_hash_code(daddr, saddr^(key.iif<<5), tos);
1281 
1282         /* Check for the most weird martians, which can be not detected
1283            by fib_lookup.
1284          */
1285 
1286         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1287                 goto martian_source;
1288 
1289         if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1290                 goto brd_input;
1291 
1292         /* Accept zero addresses only to limited broadcast;
1293          * I even do not know to fix it or not. Waiting for complains :-)
1294          */
1295         if (ZERONET(saddr))
1296                 goto martian_source;
1297 
1298         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1299                 goto martian_destination;
1300 
1301         /*
1302          *      Now we are ready to route packet.
1303          */
1304         if ((err = fib_lookup(&key, &res)) != 0) {
1305                 if (!IN_DEV_FORWARD(in_dev))
1306                         goto e_inval;
1307                 goto no_route;
1308         }
1309         free_res = 1;
1310 
1311 #ifdef CONFIG_IP_ROUTE_NAT
1312         /* Policy is applied before mapping destination,
1313            but rerouting after map should be made with old source.
1314          */
1315 
1316         if (1) {
1317                 u32 src_map = saddr;
1318                 if (res.r)
1319                         src_map = fib_rules_policy(saddr, &res, &flags);
1320 
1321                 if (res.type == RTN_NAT) {
1322                         key.dst = fib_rules_map_destination(daddr, &res);
1323                         fib_res_put(&res);
1324                         free_res = 0;
1325                         if (fib_lookup(&key, &res))
1326                                 goto e_inval;
1327                         free_res = 1;
1328                         if (res.type != RTN_UNICAST)
1329                                 goto e_inval;
1330                         flags |= RTCF_DNAT;
1331                 }
1332                 key.src = src_map;
1333         }
1334 #endif
1335 
1336         if (res.type == RTN_BROADCAST)
1337                 goto brd_input;
1338 
1339         if (res.type == RTN_LOCAL) {
1340                 int result;
1341                 result = fib_validate_source(saddr, daddr, tos, loopback_dev.ifindex,
1342                                              dev, &spec_dst, &itag);
1343                 if (result < 0)
1344                         goto martian_source;
1345                 if (result)
1346                         flags |= RTCF_DIRECTSRC;
1347                 spec_dst = daddr;
1348                 goto local_input;
1349         }
1350 
1351         if (!IN_DEV_FORWARD(in_dev))
1352                 goto e_inval;
1353         if (res.type != RTN_UNICAST)
1354                 goto martian_destination;
1355 
1356 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1357         if (res.fi->fib_nhs > 1 && key.oif == 0)
1358                 fib_select_multipath(&key, &res);
1359 #endif
1360         out_dev = in_dev_get(FIB_RES_DEV(res));
1361         if (out_dev == NULL) {
1362                 if (net_ratelimit())
1363                         printk(KERN_CRIT "Bug in ip_route_input_slow(). Please, report\n");
1364                 goto e_inval;
1365         }
1366 
1367         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst, &itag);
1368         if (err < 0)
1369                 goto martian_source;
1370 
1371         if (err)
1372                 flags |= RTCF_DIRECTSRC;
1373 
1374         if (out_dev == in_dev && err && !(flags&(RTCF_NAT|RTCF_MASQ)) &&
1375             (IN_DEV_SHARED_MEDIA(out_dev)
1376              || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
1377                 flags |= RTCF_DOREDIRECT;
1378 
1379         if (skb->protocol != __constant_htons(ETH_P_IP)) {
1380                 /* Not IP (i.e. ARP). Do not create route, if it is
1381                  * invalid for proxy arp. DNAT routes are always valid.
1382                  */
1383                 if (out_dev == in_dev && !(flags&RTCF_DNAT))
1384                         goto e_inval;
1385         }
1386 
1387         rth = dst_alloc(&ipv4_dst_ops);
1388         if (!rth)
1389                 goto e_nobufs;
1390 
1391         atomic_set(&rth->u.dst.__refcnt, 1);
1392         rth->u.dst.flags= DST_HOST;
1393         rth->key.dst    = daddr;
1394         rth->rt_dst     = daddr;
1395         rth->key.tos    = tos;
1396 #ifdef CONFIG_IP_ROUTE_FWMARK
1397         rth->key.fwmark = skb->nfmark;
1398 #endif
1399         rth->key.src    = saddr;
1400         rth->rt_src     = saddr;
1401         rth->rt_gateway = daddr;
1402 #ifdef CONFIG_IP_ROUTE_NAT
1403         rth->rt_src_map = key.src;
1404         rth->rt_dst_map = key.dst;
1405         if (flags&RTCF_DNAT)
1406                 rth->rt_gateway = key.dst;
1407 #endif
1408         rth->rt_iif     =
1409         rth->key.iif    = dev->ifindex;
1410         rth->u.dst.dev  = out_dev->dev;
1411         dev_hold(rth->u.dst.dev);
1412         rth->key.oif    = 0;
1413         rth->rt_spec_dst= spec_dst;
1414 
1415         rth->u.dst.input = ip_forward;
1416         rth->u.dst.output = ip_output;
1417 
1418         rt_set_nexthop(rth, &res, itag);
1419 
1420         rth->rt_flags = flags;
1421 
1422 #ifdef CONFIG_NET_FASTROUTE
1423         if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
1424                 struct net_device *odev = rth->u.dst.dev;
1425                 if (odev != dev &&
1426                     dev->accept_fastpath &&
1427                     odev->mtu >= dev->mtu &&
1428                     dev->accept_fastpath(dev, &rth->u.dst) == 0)
1429                         rth->rt_flags |= RTCF_FAST;
1430         }
1431 #endif
1432 
1433 intern:
1434         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1435 done:
1436         in_dev_put(in_dev);
1437         if (out_dev)
1438                 in_dev_put(out_dev);
1439         if (free_res)
1440                 fib_res_put(&res);
1441         return err;
1442 
1443 brd_input:
1444         if (skb->protocol != __constant_htons(ETH_P_IP))
1445                 goto e_inval;
1446 
1447         if (ZERONET(saddr)) {
1448                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1449         } else {
1450                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag);
1451                 if (err < 0)
1452                         goto martian_source;
1453                 if (err)
1454                         flags |= RTCF_DIRECTSRC;
1455         }
1456         flags |= RTCF_BROADCAST;
1457         res.type = RTN_BROADCAST;
1458 
1459 local_input:
1460         rth = dst_alloc(&ipv4_dst_ops);
1461         if (!rth)
1462                 goto e_nobufs;
1463 
1464         rth->u.dst.output= ip_rt_bug;
1465 
1466         atomic_set(&rth->u.dst.__refcnt, 1);
1467         rth->u.dst.flags= DST_HOST;
1468         rth->key.dst    = daddr;
1469         rth->rt_dst     = daddr;
1470         rth->key.tos    = tos;
1471 #ifdef CONFIG_IP_ROUTE_FWMARK
1472         rth->key.fwmark = skb->nfmark;
1473 #endif
1474         rth->key.src    = saddr;
1475         rth->rt_src     = saddr;
1476 #ifdef CONFIG_IP_ROUTE_NAT
1477         rth->rt_dst_map = key.dst;
1478         rth->rt_src_map = key.src;
1479 #endif
1480 #ifdef CONFIG_NET_CLS_ROUTE
1481         rth->u.dst.tclassid = itag;
1482 #endif
1483         rth->rt_iif     =
1484         rth->key.iif    = dev->ifindex;
1485         rth->u.dst.dev  = &loopback_dev;
1486         dev_hold(rth->u.dst.dev);
1487         rth->key.oif    = 0;
1488         rth->rt_gateway = daddr;
1489         rth->rt_spec_dst= spec_dst;
1490         rth->u.dst.input= ip_local_deliver;
1491         rth->rt_flags   = flags|RTCF_LOCAL;
1492         if (res.type == RTN_UNREACHABLE) {
1493                 rth->u.dst.input= ip_error;
1494                 rth->u.dst.error= -err;
1495                 rth->rt_flags   &= ~RTCF_LOCAL;
1496         }
1497         rth->rt_type    = res.type;
1498         goto intern;
1499 
1500 no_route:
1501         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1502         res.type = RTN_UNREACHABLE;
1503         goto local_input;
1504 
1505         /*
1506          *      Do not cache martian addresses: they should be logged (RFC1812)
1507          */
1508 martian_destination:
1509 #ifdef CONFIG_IP_ROUTE_VERBOSE
1510         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1511                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from %u.%u.%u.%u, dev %s\n",
1512                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1513 #endif
1514 e_inval:
1515         err = -EINVAL;
1516         goto done;
1517 
1518 e_nobufs:
1519         err = -ENOBUFS;
1520         goto done;
1521 
1522 martian_source:
1523 #ifdef CONFIG_IP_ROUTE_VERBOSE
1524         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1525                 /*
1526                  *      RFC1812 recommendation, if source is martian,
1527                  *      the only hint is MAC header.
1528                  */
1529                 printk(KERN_WARNING "martian source %u.%u.%u.%u from %u.%u.%u.%u, on dev %s\n",
1530                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1531                 if (dev->hard_header_len) {
1532                         int i;
1533                         unsigned char *p = skb->mac.raw;
1534                         printk(KERN_WARNING "ll header: ");
1535                         for (i=0; i<dev->hard_header_len; i++, p++) {
1536                                 printk("%02x", *p);
1537                                 if(i<(dev->hard_header_len-1))
1538                                         printk(":");
1539                         }
1540                         printk("\n");
1541                 }
1542         }
1543 #endif
1544         goto e_inval;
1545 }
1546 
1547 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
1548                    u8 tos, struct net_device *dev)
1549 {
1550         struct rtable * rth;
1551         unsigned        hash;
1552         int iif = dev->ifindex;
1553 
1554         tos &= IPTOS_RT_MASK;
1555         hash = rt_hash_code(daddr, saddr^(iif<<5), tos);
1556 
1557         read_lock(&rt_hash_table[hash].lock);
1558         for (rth=rt_hash_table[hash].chain; rth; rth=rth->u.rt_next) {
1559                 if (rth->key.dst == daddr &&
1560                     rth->key.src == saddr &&
1561                     rth->key.iif == iif &&
1562                     rth->key.oif == 0 &&
1563 #ifdef CONFIG_IP_ROUTE_FWMARK
1564                     rth->key.fwmark == skb->nfmark &&
1565 #endif
1566                     rth->key.tos == tos) {
1567                         rth->u.dst.lastuse = jiffies;
1568                         dst_hold(&rth->u.dst);
1569                         rth->u.dst.__use++;
1570                         read_unlock(&rt_hash_table[hash].lock);
1571                         skb->dst = (struct dst_entry*)rth;
1572                         return 0;
1573                 }
1574         }
1575         read_unlock(&rt_hash_table[hash].lock);
1576 
1577         /* Multicast recognition logic is moved from route cache to here.
1578            The problem was that too many Ethernet cards have broken/missing
1579            hardware multicast filters :-( As result the host on multicasting
1580            network acquires a lot of useless route cache entries, sort of
1581            SDR messages from all the world. Now we try to get rid of them.
1582            Really, provided software IP multicast filter is organized
1583            reasonably (at least, hashed), it does not result in a slowdown
1584            comparing with route cache reject entries.
1585            Note, that multicast routers are not affected, because
1586            route cache entry is created eventually.
1587          */
1588         if (MULTICAST(daddr)) {
1589                 struct in_device *in_dev;
1590 
1591                 read_lock(&inetdev_lock);
1592                 if ((in_dev = __in_dev_get(dev)) != NULL) {
1593                         int our = ip_check_mc(in_dev, daddr);
1594                         if (our
1595 #ifdef CONFIG_IP_MROUTE
1596                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1597 #endif
1598                             ) {
1599                                 read_unlock(&inetdev_lock);
1600                                 return ip_route_input_mc(skb, daddr, saddr, tos, dev, our);
1601                         }
1602                 }
1603                 read_unlock(&inetdev_lock);
1604                 return -EINVAL;
1605         }
1606         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1607 }
1608 
1609 /*
1610  * Major route resolver routine.
1611  */
1612 
1613 int ip_route_output_slow(struct rtable **rp, const struct rt_key *oldkey)
1614 {
1615         struct rt_key key;
1616         struct fib_result res;
1617         unsigned flags = 0;
1618         struct rtable *rth;
1619         struct net_device *dev_out = NULL;
1620         unsigned hash;
1621         int free_res = 0;
1622         int err;
1623         u32 tos;
1624 
1625         tos = oldkey->tos & (IPTOS_RT_MASK|RTO_ONLINK);
1626         key.dst = oldkey->dst;
1627         key.src = oldkey->src;
1628         key.tos = tos&IPTOS_RT_MASK;
1629         key.iif = loopback_dev.ifindex;
1630         key.oif = oldkey->oif;
1631 #ifdef CONFIG_IP_ROUTE_FWMARK
1632         key.fwmark = oldkey->fwmark;
1633 #endif
1634         key.scope = (tos&RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
1635         res.fi = NULL;
1636 #ifdef CONFIG_IP_MULTIPLE_TABLES
1637         res.r = NULL;
1638 #endif
1639 
1640         if (oldkey->src) {
1641                 if (MULTICAST(oldkey->src)
1642                     || BADCLASS(oldkey->src)
1643                     || ZERONET(oldkey->src))
1644                         return -EINVAL;
1645 
1646                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1647                 dev_out = ip_dev_find(oldkey->src);
1648                 if (dev_out == NULL)
1649                         return -EINVAL;
1650 
1651                 /* I removed check for oif == dev_out->oif here.
1652                    It was wrong by three reasons:
1653                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
1654                       assigned to multiple interfaces.
1655                    2. Moreover, we are allowed to send packets with saddr
1656                       of another iface. --ANK
1657                  */
1658 
1659                 if (oldkey->oif == 0
1660                     && (MULTICAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF)) {
1661                         /* Special hack: user can direct multicasts
1662                            and limited broadcast via necessary interface
1663                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1664                            This hack is not just for fun, it allows
1665                            vic,vat and friends to work.
1666                            They bind socket to loopback, set ttl to zero
1667                            and expect that it will work.
1668                            From the viewpoint of routing cache they are broken,
1669                            because we are not allowed to build multicast path
1670                            with loopback source addr (look, routing cache
1671                            cannot know, that ttl is zero, so that packet
1672                            will not leave this host and route is valid).
1673                            Luckily, this hack is good workaround.
1674                          */
1675 
1676                         key.oif = dev_out->ifindex;
1677                         goto make_route;
1678                 }
1679                 if (dev_out)
1680                         dev_put(dev_out);
1681                 dev_out = NULL;
1682         }
1683         if (oldkey->oif) {
1684                 dev_out = dev_get_by_index(oldkey->oif);
1685                 if (dev_out == NULL)
1686                         return -ENODEV;
1687                 if (__in_dev_get(dev_out) == NULL) {
1688                         dev_put(dev_out);
1689                         return -ENODEV; /* Wrong error code */
1690                 }
1691 
1692                 if (LOCAL_MCAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF) {
1693                         if (!key.src)
1694                                 key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
1695                         goto make_route;
1696                 }
1697                 if (!key.src) {
1698                         if (MULTICAST(oldkey->dst))
1699                                 key.src = inet_select_addr(dev_out, 0, key.scope);
1700                         else if (!oldkey->dst)
1701                                 key.src = inet_select_addr(dev_out, 0, RT_SCOPE_HOST);
1702                 }
1703         }
1704 
1705         if (!key.dst) {
1706                 key.dst = key.src;
1707                 if (!key.dst)
1708                         key.dst = key.src = htonl(INADDR_LOOPBACK);
1709                 if (dev_out)
1710                         dev_put(dev_out);
1711                 dev_out = &loopback_dev;
1712                 dev_hold(dev_out);
1713                 key.oif = loopback_dev.ifindex;
1714                 res.type = RTN_LOCAL;
1715                 flags |= RTCF_LOCAL;
1716                 goto make_route;
1717         }
1718 
1719         if (fib_lookup(&key, &res)) {
1720                 res.fi = NULL;
1721                 if (oldkey->oif) {
1722                         /* Apparently, routing tables are wrong. Assume,
1723                            that the destination is on link.
1724 
1725                            WHY? DW.
1726                            Because we are allowed to send to iface
1727                            even if it has NO routes and NO assigned
1728                            addresses. When oif is specified, routing
1729                            tables are looked up with only one purpose:
1730                            to catch if destination is gatewayed, rather than
1731                            direct. Moreover, if MSG_DONTROUTE is set,
1732                            we send packet, ignoring both routing tables
1733                            and ifaddr state. --ANK
1734 
1735 
1736                            We could make it even if oif is unknown,
1737                            likely IPv6, but we do not.
1738                          */
1739 
1740                         if (key.src == 0)
1741                                 key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
1742                         res.type = RTN_UNICAST;
1743                         goto make_route;
1744                 }
1745                 if (dev_out)
1746                         dev_put(dev_out);
1747                 return -ENETUNREACH;
1748         }
1749         free_res = 1;
1750 
1751         if (res.type == RTN_NAT)
1752                 goto e_inval;
1753 
1754         if (res.type == RTN_LOCAL) {
1755                 if (!key.src)
1756                         key.src = key.dst;
1757                 if (dev_out)
1758                         dev_put(dev_out);
1759                 dev_out = &loopback_dev;
1760                 dev_hold(dev_out);
1761                 key.oif = dev_out->ifindex;
1762                 if (res.fi)
1763                         fib_info_put(res.fi);
1764                 res.fi = NULL;
1765                 flags |= RTCF_LOCAL;
1766                 goto make_route;
1767         }
1768 
1769 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1770         if (res.fi->fib_nhs > 1 && key.oif == 0)
1771                 fib_select_multipath(&key, &res);
1772         else
1773 #endif
1774         if (res.prefixlen==0 && res.type == RTN_UNICAST && key.oif == 0)
1775                 fib_select_default(&key, &res);
1776 
1777         if (!key.src)
1778                 key.src = FIB_RES_PREFSRC(res);
1779 
1780         if (dev_out)
1781                 dev_put(dev_out);
1782         dev_out = FIB_RES_DEV(res);
1783         dev_hold(dev_out);
1784         key.oif = dev_out->ifindex;
1785 
1786 make_route:
1787         if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
1788                 goto e_inval;
1789 
1790         if (key.dst == 0xFFFFFFFF)
1791                 res.type = RTN_BROADCAST;
1792         else if (MULTICAST(key.dst))
1793                 res.type = RTN_MULTICAST;
1794         else if (BADCLASS(key.dst) || ZERONET(key.dst))
1795                 goto e_inval;
1796 
1797         if (dev_out->flags&IFF_LOOPBACK)
1798                 flags |= RTCF_LOCAL;
1799 
1800         if (res.type == RTN_BROADCAST) {
1801                 flags |= RTCF_BROADCAST|RTCF_LOCAL;
1802                 if (res.fi) {
1803                         fib_info_put(res.fi);
1804                         res.fi = NULL;
1805                 }
1806         } else if (res.type == RTN_MULTICAST) {
1807                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
1808                 read_lock(&inetdev_lock);
1809                 if (!__in_dev_get(dev_out) || !ip_check_mc(__in_dev_get(dev_out), oldkey->dst))
1810                         flags &= ~RTCF_LOCAL;
1811                 read_unlock(&inetdev_lock);
1812                 /* If multicast route do not exist use
1813                    default one, but do not gateway in this case.
1814                    Yes, it is hack.
1815                  */
1816                 if (res.fi && res.prefixlen < 4) {
1817                         fib_info_put(res.fi);
1818                         res.fi = NULL;
1819                 }
1820         }
1821 
1822         rth = dst_alloc(&ipv4_dst_ops);
1823         if (!rth)
1824                 goto e_nobufs;
1825 
1826         atomic_set(&rth->u.dst.__refcnt, 1);
1827         rth->u.dst.flags= DST_HOST;
1828         rth->key.dst    = oldkey->dst;
1829         rth->key.tos    = tos;
1830         rth->key.src    = oldkey->src;
1831         rth->key.iif    = 0;
1832         rth->key.oif    = oldkey->oif;
1833 #ifdef CONFIG_IP_ROUTE_FWMARK
1834         rth->key.fwmark = oldkey->fwmark;
1835 #endif
1836         rth->rt_dst     = key.dst;
1837         rth->rt_src     = key.src;
1838 #ifdef CONFIG_IP_ROUTE_NAT
1839         rth->rt_dst_map = key.dst;
1840         rth->rt_src_map = key.src;
1841 #endif
1842         rth->rt_iif     = oldkey->oif ? : dev_out->ifindex;
1843         rth->u.dst.dev  = dev_out;
1844         dev_hold(dev_out);
1845         rth->rt_gateway = key.dst;
1846         rth->rt_spec_dst= key.src;
1847 
1848         rth->u.dst.output=ip_output;
1849 
1850         if (flags&RTCF_LOCAL) {
1851                 rth->u.dst.input = ip_local_deliver;
1852                 rth->rt_spec_dst = key.dst;
1853         }
1854         if (flags&(RTCF_BROADCAST|RTCF_MULTICAST)) {
1855                 rth->rt_spec_dst = key.src;
1856                 if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK))
1857                         rth->u.dst.output = ip_mc_output;
1858 #ifdef CONFIG_IP_MROUTE
1859                 if (res.type == RTN_MULTICAST) {
1860                         struct in_device *in_dev = in_dev_get(dev_out);
1861                         if (in_dev) {
1862                                 if (IN_DEV_MFORWARD(in_dev) && !LOCAL_MCAST(oldkey->dst)) {
1863                                         rth->u.dst.input = ip_mr_input;
1864                                         rth->u.dst.output = ip_mc_output;
1865                                 }
1866                                 in_dev_put(in_dev);
1867                         }
1868                 }
1869 #endif
1870         }
1871 
1872         rt_set_nexthop(rth, &res, 0);
1873 
1874         rth->rt_flags = flags;
1875 
1876         hash = rt_hash_code(oldkey->dst, oldkey->src^(oldkey->oif<<5), tos);
1877         err = rt_intern_hash(hash, rth, rp);
1878 done:
1879         if (free_res)
1880                 fib_res_put(&res);
1881         if (dev_out)
1882                 dev_put(dev_out);
1883         return err;
1884 
1885 e_inval:
1886         err = -EINVAL;
1887         goto done;
1888 e_nobufs:
1889         err = -ENOBUFS;
1890         goto done;
1891 }
1892 
1893 int ip_route_output_key(struct rtable **rp, const struct rt_key *key)
1894 {
1895         unsigned hash;
1896         struct rtable *rth;
1897 
1898         hash = rt_hash_code(key->dst, key->src^(key->oif<<5), key->tos);
1899 
1900         read_lock_bh(&rt_hash_table[hash].lock);
1901         for (rth=rt_hash_table[hash].chain; rth; rth=rth->u.rt_next) {
1902                 if (rth->key.dst == key->dst &&
1903                     rth->key.src == key->src &&
1904                     rth->key.iif == 0 &&
1905                     rth->key.oif == key->oif &&
1906 #ifdef CONFIG_IP_ROUTE_FWMARK
1907                     rth->key.fwmark == key->fwmark &&
1908 #endif
1909                     !((rth->key.tos^key->tos)&(IPTOS_RT_MASK|RTO_ONLINK)) &&
1910                     ((key->tos&RTO_TPROXY) || !(rth->rt_flags&RTCF_TPROXY))
1911                 ) {
1912                         rth->u.dst.lastuse = jiffies;
1913                         dst_hold(&rth->u.dst);
1914                         rth->u.dst.__use++;
1915                         read_unlock_bh(&rt_hash_table[hash].lock);
1916                         *rp = rth;
1917                         return 0;
1918                 }
1919         }
1920         read_unlock_bh(&rt_hash_table[hash].lock);
1921 
1922         return ip_route_output_slow(rp, key);
1923 }       
1924 
1925 #ifdef CONFIG_RTNETLINK
1926 
1927 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait)
1928 {
1929         struct rtable *rt = (struct rtable*)skb->dst;
1930         struct rtmsg *r;
1931         struct nlmsghdr  *nlh;
1932         unsigned char    *b = skb->tail;
1933         struct rta_cacheinfo ci;
1934 #ifdef CONFIG_IP_MROUTE
1935         struct rtattr *eptr;
1936 #endif
1937 
1938         nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
1939         r = NLMSG_DATA(nlh);
1940         nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
1941         r->rtm_family = AF_INET;
1942         r->rtm_dst_len = 32;
1943         r->rtm_src_len = 0;
1944         r->rtm_tos = rt->key.tos;
1945         r->rtm_table = RT_TABLE_MAIN;
1946         r->rtm_type = rt->rt_type;
1947         r->rtm_scope = RT_SCOPE_UNIVERSE;
1948         r->rtm_protocol = RTPROT_UNSPEC;
1949         r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED;
1950         if (rt->rt_flags & RTCF_NOTIFY)
1951                 r->rtm_flags |= RTM_F_NOTIFY;
1952         RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
1953         if (rt->key.src) {
1954                 r->rtm_src_len = 32;
1955                 RTA_PUT(skb, RTA_SRC, 4, &rt->key.src);
1956         }
1957         if (rt->u.dst.dev)
1958                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
1959 #ifdef CONFIG_NET_CLS_ROUTE
1960         if (rt->u.dst.tclassid)
1961                 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
1962 #endif
1963         if (rt->key.iif)
1964                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
1965         else if (rt->rt_src != rt->key.src)
1966                 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
1967         if (rt->rt_dst != rt->rt_gateway)
1968                 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
1969         if (rtnetlink_put_metrics(skb, &rt->u.dst.mxlock) < 0)
1970                 goto rtattr_failure;
1971         ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
1972         ci.rta_used = rt->u.dst.__use;
1973         ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1974         if (rt->u.dst.expires)
1975                 ci.rta_expires = rt->u.dst.expires - jiffies;
1976         else
1977                 ci.rta_expires = 0;
1978         ci.rta_error = rt->u.dst.error;
1979         ci.rta_id = 0;
1980         ci.rta_ts = 0;
1981         ci.rta_tsage = 0;
1982         if (rt->peer) {
1983                 ci.rta_id = rt->peer->ip_id_count;
1984                 if (rt->peer->tcp_ts_stamp) {
1985                         ci.rta_ts = rt->peer->tcp_ts;
1986                         ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
1987                 }
1988         }
1989 #ifdef CONFIG_IP_MROUTE
1990         eptr = (struct rtattr*)skb->tail;
1991 #endif
1992         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1993         if (rt->key.iif) {
1994 #ifdef CONFIG_IP_MROUTE
1995                 u32 dst = rt->rt_dst;
1996 
1997                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) {
1998                         int err = ipmr_get_route(skb, r, nowait);
1999                         if (err <= 0) {
2000                                 if (!nowait) {
2001                                         if (err == 0)
2002                                                 return 0;
2003                                         goto nlmsg_failure;
2004                                 } else {
2005                                         if (err == -EMSGSIZE)
2006                                                 goto nlmsg_failure;
2007                                         ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2008                                 }
2009                         }
2010                 } else
2011 #endif
2012                 {
2013                         RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
2014                 }
2015         }
2016 
2017         nlh->nlmsg_len = skb->tail - b;
2018         return skb->len;
2019 
2020 nlmsg_failure:
2021 rtattr_failure:
2022         skb_trim(skb, b - skb->data);
2023         return -1;
2024 }
2025 
2026 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2027 {
2028         struct rtattr **rta = arg;
2029         struct rtmsg *rtm = NLMSG_DATA(nlh);
2030         struct rtable *rt = NULL;
2031         u32 dst = 0;
2032         u32 src = 0;
2033         int iif = 0;
2034         int err;
2035         struct sk_buff *skb;
2036 
2037         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2038         if (skb == NULL)
2039                 return -ENOBUFS;
2040 
2041         /* Reserve room for dummy headers, this skb can pass
2042            through good chunk of routing engine.
2043          */
2044         skb->mac.raw = skb->data;
2045         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2046 
2047         if (rta[RTA_SRC-1])
2048                 memcpy(&src, RTA_DATA(rta[RTA_SRC-1]), 4);
2049         if (rta[RTA_DST-1])
2050                 memcpy(&dst, RTA_DATA(rta[RTA_DST-1]), 4);
2051         if (rta[RTA_IIF-1])
2052                 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
2053 
2054         if (iif) {
2055                 struct net_device *dev;
2056                 dev = __dev_get_by_index(iif);
2057                 if (!dev)
2058                         return -ENODEV;
2059                 skb->protocol = __constant_htons(ETH_P_IP);
2060                 skb->dev = dev;
2061                 local_bh_disable();
2062                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2063                 local_bh_enable();
2064                 rt = (struct rtable*)skb->dst;
2065                 if (!err && rt->u.dst.error)
2066                         err = -rt->u.dst.error;
2067         } else {
2068                 int oif = 0;
2069                 if (rta[RTA_OIF-1])
2070                         memcpy(&oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
2071                 err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif);
2072         }
2073         if (err) {
2074                 kfree_skb(skb);
2075                 return err;
2076         }
2077 
2078         skb->dst = &rt->u.dst;
2079         if (rtm->rtm_flags & RTM_F_NOTIFY)
2080                 rt->rt_flags |= RTCF_NOTIFY;
2081 
2082         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2083 
2084         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0);
2085         if (err == 0)
2086                 return 0;
2087         if (err < 0)
2088                 return -EMSGSIZE;
2089 
2090         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2091         if (err < 0)
2092                 return err;
2093         return 0;
2094 }
2095 
2096 
2097 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2098 {
2099         struct rtable *rt;
2100         int h, s_h;
2101         int idx, s_idx;
2102 
2103         s_h = cb->args[0];
2104         s_idx = idx = cb->args[1];
2105         for (h=0; h <= rt_hash_mask; h++) {
2106                 if (h < s_h) continue;
2107                 if (h > s_h)
2108                         s_idx = 0;
2109                 read_lock_bh(&rt_hash_table[h].lock);
2110                 for (rt = rt_hash_table[h].chain, idx = 0; rt; rt = rt->u.rt_next, idx++) {
2111                         if (idx < s_idx)
2112                                 continue;
2113                         skb->dst = dst_clone(&rt->u.dst);
2114                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2115                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) {
2116                                 dst_release(xchg(&skb->dst, NULL));
2117                                 read_unlock_bh(&rt_hash_table[h].lock);
2118                                 goto done;
2119                         }
2120                         dst_release(xchg(&skb->dst, NULL));
2121                 }
2122                 read_unlock_bh(&rt_hash_table[h].lock);
2123         }
2124 
2125 done:
2126         cb->args[0] = h;
2127         cb->args[1] = idx;
2128         return skb->len;
2129 }
2130 
2131 #endif /* CONFIG_RTNETLINK */
2132 
/*
 * Multicast event hook: flushes the routing cache by calling
 * rt_cache_flush(0).  The in_dev argument is accepted but not consulted.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}
2137 
2138 
2139 
2140 #ifdef CONFIG_SYSCTL
2141 
2142 static int flush_delay;
2143 
2144 static
2145 int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2146                               void *buffer, size_t *lenp)
2147 {
2148         if (write) {
2149                 proc_dointvec(ctl, write, filp, buffer, lenp);
2150                 rt_cache_flush(flush_delay);
2151                 return 0;
2152         } else
2153                 return -EINVAL;
2154 }
2155 
2156 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int *name, int nlen,
2157                          void *oldval, size_t *oldlenp,
2158                          void *newval, size_t newlen, 
2159                          void **context)
2160 {
2161         int delay;
2162         if (newlen != sizeof(int))
2163                 return -EINVAL;
2164         if (get_user(delay,(int *)newval))
2165                 return -EFAULT; 
2166         rt_cache_flush(delay); 
2167         return 0;
2168 }
2169 
2170 ctl_table ipv4_route_table[] = {
2171         {NET_IPV4_ROUTE_FLUSH, "flush",
2172          &flush_delay, sizeof(int), 0644, NULL,
2173          &ipv4_sysctl_rtcache_flush, &ipv4_sysctl_rtcache_flush_strategy },
2174         {NET_IPV4_ROUTE_MIN_DELAY, "min_delay",
2175          &ip_rt_min_delay, sizeof(int), 0644, NULL,
2176          &proc_dointvec_jiffies, &sysctl_jiffies},
2177         {NET_IPV4_ROUTE_MAX_DELAY, "max_delay",
2178          &ip_rt_max_delay, sizeof(int), 0644, NULL,
2179          &proc_dointvec_jiffies, &sysctl_jiffies},
2180         {NET_IPV4_ROUTE_GC_THRESH, "gc_thresh",
2181          &ipv4_dst_ops.gc_thresh, sizeof(int), 0644, NULL,
2182          &proc_dointvec},
2183         {NET_IPV4_ROUTE_MAX_SIZE, "max_size",
2184          &ip_rt_max_size, sizeof(int), 0644, NULL,
2185          &proc_dointvec},
2186         {NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval",
2187          &ip_rt_gc_min_interval, sizeof(int), 0644, NULL,
2188          &proc_dointvec_jiffies, &sysctl_jiffies},
2189         {NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout",
2190          &ip_rt_gc_timeout, sizeof(int), 0644, NULL,
2191          &proc_dointvec_jiffies, &sysctl_jiffies},
2192         {NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval",
2193          &ip_rt_gc_interval, sizeof(int), 0644, NULL,
2194          &proc_dointvec_jiffies, &sysctl_jiffies},
2195         {NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load",
2196          &ip_rt_redirect_load, sizeof(int), 0644, NULL,
2197          &proc_dointvec},
2198         {NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number",
2199          &ip_rt_redirect_number, sizeof(int), 0644, NULL,
2200          &proc_dointvec},
2201         {NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence",
2202          &ip_rt_redirect_silence, sizeof(int), 0644, NULL,
2203          &proc_dointvec},
2204         {NET_IPV4_ROUTE_ERROR_COST, "error_cost",
2205          &ip_rt_error_cost, sizeof(int), 0644, NULL,
2206          &proc_dointvec},
2207         {NET_IPV4_ROUTE_ERROR_BURST, "error_burst",
2208          &ip_rt_error_burst, sizeof(int), 0644, NULL,
2209          &proc_dointvec},
2210         {NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity",
2211          &ip_rt_gc_elasticity, sizeof(int), 0644, NULL,
2212          &proc_dointvec},
2213         {NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires",
2214          &ip_rt_mtu_expires, sizeof(int), 0644, NULL,
2215          &proc_dointvec_jiffies, &sysctl_jiffies},
2216         {NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu",
2217          &ip_rt_min_pmtu, sizeof(int), 0644, NULL,
2218          &proc_dointvec},
2219         {NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss",
2220          &ip_rt_min_advmss, sizeof(int), 0644, NULL,
2221          &proc_dointvec},
2222          {0}
2223 };
2224 #endif
2225 
2226 #ifdef CONFIG_NET_CLS_ROUTE
2227 struct ip_rt_acct *ip_rt_acct;
2228 
2229 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2230                            int length, int *eof, void *data)
2231 {
2232         *start=buffer;
2233 
2234         if ((offset&3) || (length&3))
2235                 return -EIO;
2236 
2237         if (offset + length >= sizeof(struct ip_rt_acct)*256) {
2238                 length = sizeof(struct ip_rt_acct)*256 - offset;
2239                 *eof = 1;
2240         }
2241         if (length > 0) {
2242                 u32 *dst = (u32*)buffer;
2243                 u32 *src = (u32*)(((u8*)ip_rt_acct) + offset);
2244 
2245                 memcpy(dst, src, length);
2246 
2247 #ifdef CONFIG_SMP
2248                 if (smp_num_cpus > 1 || cpu_logical_map(0) != 0) {
2249                         int i;
2250                         int cnt = length/4;
2251 
2252                         for (i=0; i<smp_num_cpus; i++) {
2253                                 int cpu = cpu_logical_map(i);
2254                                 int k;
2255 
2256                                 if (cpu == 0)
2257                                         continue;
2258 
2259                                 src = (u32*)(((u8*)ip_rt_acct) + offset +
2260                                              cpu*256*sizeof(struct ip_rt_acct));
2261 
2262                                 for (k=0; k<cnt; k++)
2263                                         dst[k] += src[k];
2264                         }
2265                 }
2266 #endif
2267                 return length;
2268         }
2269         return 0;
2270 }
2271 #endif
2272 
2273 void __init ip_rt_init(void)
2274 {
2275         int i, order, goal;
2276 
2277 #ifdef CONFIG_NET_CLS_ROUTE
2278         for (order=0;
2279              (PAGE_SIZE<<order) < 256*sizeof(ip_rt_acct)*NR_CPUS; order++)
2280                 /* NOTHING */;
2281         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2282         if (!ip_rt_acct)
2283                 panic("IP: failed to allocate ip_rt_acct\n");
2284         memset(ip_rt_acct, 0, PAGE_SIZE<<order);
2285 #endif
2286 
2287         ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
2288                                                      sizeof(struct rtable),
2289                                                      0, SLAB_HWCACHE_ALIGN,
2290                                                      NULL, NULL);
2291 
2292         if (!ipv4_dst_ops.kmem_cachep)
2293                 panic("IP: failed to allocate ip_dst_cache\n");
2294 
2295         goal = num_physpages >> (26 - PAGE_SHIFT);
2296 
2297         for (order = 0; (1UL << order) < goal; order++)
2298                 /* NOTHING */;
2299 
2300         do {
2301                 rt_hash_mask = (1UL << order) * PAGE_SIZE /
2302                         sizeof(struct rt_hash_bucket);
2303                 while (rt_hash_mask & (rt_hash_mask-1))
2304                         rt_hash_mask--;
2305                 rt_hash_table = (struct rt_hash_bucket *)
2306                         __get_free_pages(GFP_ATOMIC, order);
2307         } while (rt_hash_table == NULL && --order > 0);
2308 
2309         if (!rt_hash_table)
2310                 panic("Failed to allocate IP route cache hash table\n");
2311 
2312         printk("IP: routing cache hash table of %u buckets, %ldKbytes\n",
2313                rt_hash_mask,
2314                (long) (rt_hash_mask*sizeof(struct rt_hash_bucket))/1024);
2315 
2316         for (rt_hash_log=0; (1<<rt_hash_log) != rt_hash_mask; rt_hash_log++)
2317                 /* NOTHING */;
2318 
2319         rt_hash_mask--;
2320         for (i = 0; i <= rt_hash_mask; i++) {
2321                 rt_hash_table[i].lock = RW_LOCK_UNLOCKED;
2322                 rt_hash_table[i].chain = NULL;
2323         }
2324 
2325         ipv4_dst_ops.gc_thresh = (rt_hash_mask+1);
2326         ip_rt_max_size = (rt_hash_mask+1)*16;
2327 
2328         devinet_init();
2329         ip_fib_init();
2330 
2331         rt_flush_timer.function = rt_run_flush;
2332         rt_periodic_timer.function = rt_check_expire;
2333 
2334         /* All the timers, started at system startup tend
2335            to synchronize. Perturb it a bit.
2336          */
2337         rt_periodic_timer.expires = jiffies + net_random()%ip_rt_gc_interval
2338                 + ip_rt_gc_interval;
2339         add_timer(&rt_periodic_timer);
2340 
2341         proc_net_create ("rt_cache", 0, rt_cache_get_info);
2342 #ifdef CONFIG_NET_CLS_ROUTE
2343         create_proc_read_entry("net/rt_acct", 0, 0, ip_rt_acct_read, NULL);
2344 #endif
2345 }
2346 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.