/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.91 2000/10/03 07:29:00 anton Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Splitted to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT	(300*HZ)

int ip_rt_min_delay		= 2*HZ;
int ip_rt_max_delay		= 10*HZ;
int ip_rt_max_size;
int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
int ip_rt_gc_interval		= 60*HZ;
int ip_rt_gc_min_interval	= 5*HZ;
int ip_rt_redirect_number	= 9;
int ip_rt_redirect_load		= HZ/50;
int ip_rt_redirect_silence	= ((HZ/50) << (9+1));
int ip_rt_error_cost		= HZ;
int ip_rt_error_burst		= 5*HZ;
int ip_rt_gc_elasticity		= 8;
int ip_rt_mtu_expires		= 10*60*HZ;
int ip_rt_min_pmtu		= 512+20+20;
int ip_rt_min_advmss		= 536;

static unsigned long rt_deadline;

#define RTprint(a...)	printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32);
static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
					  struct sk_buff *);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *);
static void		 ipv4_link_failure(struct sk_buff *skb);
static int rt_garbage_collect(void);

struct dst_ops ipv4_dst_ops = {
	AF_INET,
	__constant_htons(ETH_P_IP),
	0,
	rt_garbage_collect,
	ipv4_dst_check,
	ipv4_dst_reroute,
	ipv4_dst_destroy,
	ipv4_negative_advice,
	ipv4_link_failure,
	sizeof(struct rtable),
};

#ifdef CONFIG_INET_ECN
#define ECN_OR_COST(class)	TC_PRIO_##class
#else
#define ECN_OR_COST(class)	TC_PRIO_FILLER
#endif

__u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};

/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) BH-protected rwlocks protect the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
*/ struct rt_hash_bucket { struct rtable *chain; rwlock_t lock; } __attribute__((__aligned__(8))); static struct rt_hash_bucket *rt_hash_table; static unsigned rt_hash_mask; static int rt_hash_log; static int rt_intern_hash(unsigned hash, struct rtable * rth, struct rtable ** res); 206 static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos) { unsigned hash = ((daddr&0xF0F0F0F0)>>4)|((daddr&0x0F0F0F0F)<<4); hash ^= saddr^tos; hash ^= (hash>>16); 211 return (hash^(hash>>8)) & rt_hash_mask; } 214 static int rt_cache_get_info(char *buffer, char **start, off_t offset, int length) { int len=0; off_t pos=0; char temp[129]; struct rtable *r; int i; pos = 128; 224 if (offset<128) { sprintf(buffer,"%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\tHHUptod\tSpecDst"); len = 128; } 229 for (i = rt_hash_mask; i>=0; i--) { 230 read_lock_bh(&rt_hash_table[i].lock); 231 for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) { /* * Spin through entries until we are ready */ pos += 128; 237 if (pos <= offset) { len = 0; 239 continue; } sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X", r->u.dst.dev ? r->u.dst.dev->name : "*", (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway, r->rt_flags, atomic_read(&r->u.dst.__refcnt), r->u.dst.__use, 0, (unsigned long)r->rt_src, (int)r->u.dst.advmss + 40, r->u.dst.window, (int)((r->u.dst.rtt>>3) + r->u.dst.rttvar), r->key.tos, r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1, r->u.dst.hh ? (r->u.dst.hh->hh_output == dev_queue_xmit) : 0, r->rt_spec_dst); sprintf(buffer+len,"%-127s\n",temp); len += 128; 258 if (pos >= offset+length) { 259 read_unlock_bh(&rt_hash_table[i].lock); 260 goto done; } } 263 read_unlock_bh(&rt_hash_table[i].lock); } done: *start = buffer+len-(pos-offset); len = pos-offset; 269 if (len>length) len = length; 271 return len; } 274 static __inline__ void rt_free(struct rtable *rt) { dst_free(&rt->u.dst); } 279 static __inline__ void rt_drop(struct rtable *rt) { ip_rt_put(rt); dst_free(&rt->u.dst); } 285 static __inline__ int rt_fast_clean(struct rtable *rth) { /* Kill broadcast/multicast entries very aggresively, if they collide in hash table with more useful entries */ return ((rth->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST)) 290 && rth->key.iif && rth->u.rt_next); } 293 static __inline__ int rt_valuable(struct rtable *rth) { return ((rth->rt_flags&(RTCF_REDIRECTED|RTCF_NOTIFY)) 296 || rth->u.dst.expires); } 299 static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2) { int age; 303 if (atomic_read(&rth->u.dst.__refcnt)) 304 return 0; 306 if (rth->u.dst.expires && (long)(rth->u.dst.expires - jiffies) <= 0) 307 return 1; age = jiffies - rth->u.dst.lastuse; 310 if (age <= tmo1 && !rt_fast_clean(rth)) 311 return 0; 312 if (age <= tmo2 && rt_valuable(rth)) 313 return 0; 314 return 1; } /* This runs via a timer and thus is always in BH context. 
*/ 318 static void SMP_TIMER_NAME(rt_check_expire)(unsigned long dummy) { int i, t; static int rover; struct rtable *rth, **rthp; unsigned long now = jiffies; i = rover; 327 for (t=(ip_rt_gc_interval<<rt_hash_log); t>=0; t -= ip_rt_gc_timeout) { unsigned tmo = ip_rt_gc_timeout; i = (i + 1) & rt_hash_mask; rthp = &rt_hash_table[i].chain; write_lock(&rt_hash_table[i].lock); 334 while ((rth = *rthp) != NULL) { 335 if (rth->u.dst.expires) { /* Entry is expired even if it is in use */ 337 if ((long)(now - rth->u.dst.expires) <= 0) { tmo >>= 1; rthp = &rth->u.rt_next; 340 continue; } 342 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { tmo >>= 1; rthp = &rth->u.rt_next; 345 continue; } /* * Cleanup aged off entries. */ *rthp = rth->u.rt_next; rt_free(rth); } 354 write_unlock(&rt_hash_table[i].lock); /* Fallback loop breaker. */ 357 if ((jiffies - now) > 0) 358 break; } rover = i; mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); } SMP_TIMER_DEFINE(rt_check_expire, rt_gc_task); /* This can run from both BH and non-BH contexts, the latter * in the case of a forced flush event. */ 369 static void SMP_TIMER_NAME(rt_run_flush)(unsigned long dummy) { int i; struct rtable * rth, * next; rt_deadline = 0; 376 for (i=rt_hash_mask; i>=0; i--) { 377 write_lock_bh(&rt_hash_table[i].lock); rth = rt_hash_table[i].chain; 379 if (rth) rt_hash_table[i].chain = NULL; 381 write_unlock_bh(&rt_hash_table[i].lock); 383 for (; rth; rth=next) { next = rth->u.rt_next; rt_free(rth); } } } SMP_TIMER_DEFINE(rt_run_flush, rt_cache_flush_task); static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED; 394 void rt_cache_flush(int delay) { unsigned long now = jiffies; int user_mode = !in_softirq(); 399 if (delay < 0) delay = ip_rt_min_delay; 402 spin_lock_bh(&rt_flush_lock); 404 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) { long tmo = (long)(rt_deadline - now); /* If flush timer is already running and flush request is not immediate (delay > 0): if deadline is not achieved, prolongate timer to "delay", otherwise fire it at deadline time. */ 414 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay) tmo = 0; 417 if (delay > tmo) delay = tmo; } 421 if (delay <= 0) { 422 spin_unlock_bh(&rt_flush_lock); SMP_TIMER_NAME(rt_run_flush)(0); 424 return; } 427 if (rt_deadline == 0) rt_deadline = now + ip_rt_max_delay; mod_timer(&rt_flush_timer, now+delay); 431 spin_unlock_bh(&rt_flush_lock); } /* Short description of GC goals. We want to build algorithm, which will keep routing cache at some equilibrium point, when number of aged off entries is kept approximately equal to newly generated ones. Current expiration strength is variable "expire". We try to adjust it dynamically, so that if networking is idle expires is large enough to keep enough of warm entries, and when load increases it reduces to limit cache size. */ 447 static int rt_garbage_collect(void) { static unsigned expire = RT_GC_TIMEOUT; static unsigned long last_gc; static int rover; static int equilibrium; struct rtable *rth, **rthp; unsigned long now = jiffies; int goal; /* * Garbage collection is pretty expensive, * do not make it too frequently. */ if (now - last_gc < ip_rt_gc_min_interval && 462 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) 463 return 0; /* Calculate number of entries, which we want to expire now. 
*/ goal = atomic_read(&ipv4_dst_ops.entries) - (ip_rt_gc_elasticity<<rt_hash_log); 467 if (goal <= 0) { 468 if (equilibrium < ipv4_dst_ops.gc_thresh) equilibrium = ipv4_dst_ops.gc_thresh; goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; 471 if (goal > 0) { equilibrium += min(goal/2, rt_hash_mask+1); goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; } 475 } else { /* We are in dangerous area. Try to reduce cache really * aggressively. */ goal = max(goal/2, rt_hash_mask+1); equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal; } 483 if (now - last_gc >= ip_rt_gc_min_interval) last_gc = now; 486 if (goal <= 0) { equilibrium += goal; 488 goto work_done; } 491 do { int i, k; 494 for (i=rt_hash_mask, k=rover; i>=0; i--) { unsigned tmo = expire; k = (k + 1) & rt_hash_mask; rthp = &rt_hash_table[k].chain; 499 write_lock_bh(&rt_hash_table[k].lock); 500 while ((rth = *rthp) != NULL) { 501 if (!rt_may_expire(rth, tmo, expire)) { tmo >>= 1; rthp = &rth->u.rt_next; 504 continue; } *rthp = rth->u.rt_next; rt_free(rth); goal--; } 510 write_unlock_bh(&rt_hash_table[k].lock); 511 if (goal <= 0) 512 break; } rover = k; 516 if (goal <= 0) 517 goto work_done; /* Goal is not achieved. We stop process if: - if expire reduced to zero. Otherwise, expire is halfed. - if table is not full. - if we are called from interrupt. - jiffies check is just fallback/debug loop breaker. We will not spin here for long time in any case. */ 528 if (expire == 0) 529 break; expire >>= 1; #if RT_CACHE_DEBUG >= 2 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, i); #endif 536 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) 537 return 0; 538 } while (!in_softirq() && jiffies - now < 1); 540 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) 541 return 0; 542 if (net_ratelimit()) printk("dst cache overflow\n"); 544 return 1; work_done: expire += ip_rt_gc_min_interval; if (expire > ip_rt_gc_timeout || 549 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh) expire = ip_rt_gc_timeout; #if RT_CACHE_DEBUG >= 2 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, rover); #endif 554 return 0; } 557 static int rt_intern_hash(unsigned hash, struct rtable * rt, struct rtable ** rp) { struct rtable *rth, **rthp; unsigned long now = jiffies; int attempts = !in_softirq(); restart: rthp = &rt_hash_table[hash].chain; 566 write_lock_bh(&rt_hash_table[hash].lock); 567 while ((rth = *rthp) != NULL) { 568 if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) { /* Put it first */ *rthp = rth->u.rt_next; rth->u.rt_next = rt_hash_table[hash].chain; rt_hash_table[hash].chain = rth; rth->u.dst.__use++; dst_hold(&rth->u.dst); rth->u.dst.lastuse = now; 577 write_unlock_bh(&rt_hash_table[hash].lock); rt_drop(rt); *rp = rth; 581 return 0; } rthp = &rth->u.rt_next; } /* Try to bind route to arp only if it is output route or unicast forwarding path. */ 590 if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) { int err = arp_bind_neighbour(&rt->u.dst); 592 if (err) { 593 write_unlock_bh(&rt_hash_table[hash].lock); 595 if (err != -ENOBUFS) { rt_drop(rt); 597 return err; } /* Neighbour tables are full and nothing can be released. Try to shrink route cache, it is most likely it holds some neighbour records. 
*/ 604 if (attempts-- > 0) { int saved_elasticity = ip_rt_gc_elasticity; int saved_int = ip_rt_gc_min_interval; ip_rt_gc_elasticity = 1; ip_rt_gc_min_interval = 0; rt_garbage_collect(); ip_rt_gc_min_interval = saved_int; ip_rt_gc_elasticity = saved_elasticity; 612 goto restart; } 615 if (net_ratelimit()) printk("Neighbour table overflow.\n"); rt_drop(rt); 618 return -ENOBUFS; } } rt->u.rt_next = rt_hash_table[hash].chain; #if RT_CACHE_DEBUG >= 2 if (rt->u.rt_next) { struct rtable * trt; printk("rt_cache @%02x: %u.%u.%u.%u", hash, NIPQUAD(rt->rt_dst)); for (trt=rt->u.rt_next; trt; trt=trt->u.rt_next) printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst)); printk("\n"); } #endif rt_hash_table[hash].chain = rt; 633 write_unlock_bh(&rt_hash_table[hash].lock); *rp = rt; 635 return 0; } 638 void rt_bind_peer(struct rtable *rt, int create) { static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED; struct inet_peer *peer; peer = inet_getpeer(rt->rt_dst, create); 645 spin_lock_bh(&rt_peer_lock); 646 if (rt->peer == NULL) { rt->peer = peer; peer = NULL; } 650 spin_unlock_bh(&rt_peer_lock); 651 if (peer) inet_putpeer(peer); } /* * Peer allocation may fail only in serious out-of-memory conditions. However * we still can generate some output. * Random ID selection looks a bit dangerous because we have no chances to * select ID being unique in a reasonable period of time. * But broken packet identifier may be better than no packet at all. */ 662 static void ip_select_fb_ident(struct iphdr *iph) { static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED; static u32 ip_fallback_id; u32 salt; 668 spin_lock_bh(&ip_fb_id_lock); salt = secure_ip_id(ip_fallback_id ^ iph->daddr); iph->id = salt & 0xFFFF; ip_fallback_id = salt; 672 spin_unlock_bh(&ip_fb_id_lock); } 675 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst) { struct rtable *rt = (struct rtable *) dst; 679 if (rt) { 680 if (rt->peer == NULL) rt_bind_peer(rt, 1); /* If peer is attached to destination, it is never detached, so that we need not to grab a lock to dereference it. 
*/ 686 if (rt->peer) { iph->id = inet_getid(rt->peer); 688 return; } 690 } else { printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph)); } ip_select_fb_ident(iph); } 697 static void rt_del(unsigned hash, struct rtable *rt) { struct rtable **rthp; 701 write_lock_bh(&rt_hash_table[hash].lock); ip_rt_put(rt); 703 for (rthp = &rt_hash_table[hash].chain; *rthp; rthp = &(*rthp)->u.rt_next) { 704 if (*rthp == rt) { *rthp = rt->u.rt_next; rt_free(rt); 707 break; } } 710 write_unlock_bh(&rt_hash_table[hash].lock); } 713 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, u32 saddr, u8 tos, struct net_device *dev) { int i, k; struct in_device *in_dev = in_dev_get(dev); struct rtable *rth, **rthp; u32 skeys[2] = { saddr, 0 }; int ikeys[2] = { dev->ifindex, 0 }; tos &= IPTOS_RT_MASK; 724 if (!in_dev) 725 return; if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) 728 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw)) 729 goto reject_redirect; 731 if (!IN_DEV_SHARED_MEDIA(in_dev)) { 732 if (!inet_addr_onlink(in_dev, new_gw, old_gw)) 733 goto reject_redirect; 734 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) 735 goto reject_redirect; 736 } else { 737 if (inet_addr_type(new_gw) != RTN_UNICAST) 738 goto reject_redirect; } 741 for (i=0; i<2; i++) { 742 for (k=0; k<2; k++) { unsigned hash = rt_hash_code(daddr, skeys[i]^(ikeys[k]<<5), tos); rthp=&rt_hash_table[hash].chain; read_lock(&rt_hash_table[hash].lock); 748 while ( (rth = *rthp) != NULL) { struct rtable *rt; if (rth->key.dst != daddr || rth->key.src != skeys[i] || rth->key.tos != tos || rth->key.oif != ikeys[k] || 755 rth->key.iif != 0) { rthp = &rth->u.rt_next; 757 continue; } if (rth->rt_dst != daddr || rth->rt_src != saddr || rth->u.dst.error || rth->rt_gateway != old_gw || 764 rth->u.dst.dev != dev) 765 break; dst_clone(&rth->u.dst); 768 read_unlock(&rt_hash_table[hash].lock); rt = dst_alloc(&ipv4_dst_ops); 771 if (rt == NULL) { ip_rt_put(rth); in_dev_put(in_dev); 774 return; } /* * Copy all the information. */ *rt = *rth; rt->u.dst.__use = 1; atomic_set(&rt->u.dst.__refcnt, 1); 783 if (rt->u.dst.dev) dev_hold(rt->u.dst.dev); rt->u.dst.lastuse = jiffies; rt->u.dst.neighbour = NULL; rt->u.dst.hh = NULL; rt->u.dst.obsolete = 0; rt->rt_flags |= RTCF_REDIRECTED; /* Gateway is different ... 
*/ rt->rt_gateway = new_gw; /* Redirect received -> path was valid */ dst_confirm(&rth->u.dst); 798 if (rt->peer) atomic_inc(&rt->peer->refcnt); if (arp_bind_neighbour(&rt->u.dst) || 802 !(rt->u.dst.neighbour->nud_state&NUD_VALID)) { 803 if (rt->u.dst.neighbour) neigh_event_send(rt->u.dst.neighbour, NULL); ip_rt_put(rth); rt_drop(rt); 807 goto do_next; } rt_del(hash, rth); 811 if (!rt_intern_hash(hash, rt, &rt)) ip_rt_put(rt); 813 goto do_next; } 815 read_unlock(&rt_hash_table[hash].lock); do_next: ; } } in_dev_put(in_dev); 821 return; reject_redirect: #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about %u.%u.%u.%u ignored.\n" " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, tos %02x\n", NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw), NIPQUAD(saddr), NIPQUAD(daddr), tos); #endif in_dev_put(in_dev); } 834 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) { struct rtable *rt = (struct rtable*)dst; 838 if (rt != NULL) { 839 if (dst->obsolete) { ip_rt_put(rt); 841 return NULL; } 843 if ((rt->rt_flags&RTCF_REDIRECTED) || rt->u.dst.expires) { unsigned hash = rt_hash_code(rt->key.dst, rt->key.src^(rt->key.oif<<5), rt->key.tos); #if RT_CACHE_DEBUG >= 1 printk(KERN_DEBUG "ip_rt_advice: redirect to %u.%u.%u.%u/%02x dropped\n", NIPQUAD(rt->rt_dst), rt->key.tos); #endif rt_del(hash, rt); 850 return NULL; } } 853 return dst; } /* * Algorithm: * 1. The first ip_rt_redirect_number redirects are sent * with exponential backoff, then we stop sending them at all, * assuming that the host ignores our redirects. * 2. If we did not see packets requiring redirects * during ip_rt_redirect_silence, we assume that the host * forgot redirected route and start to send redirects again. * * This algorithm is much cheaper and more intelligent than dumb load limiting * in icmp.c. * * NOTE. Do not forget to inhibit load limiting for redirects (redundant) * and "frag. need" (breaks PMTU discovery) in icmp.c. */ 872 void ip_rt_send_redirect(struct sk_buff *skb) { struct rtable *rt = (struct rtable*)skb->dst; struct in_device *in_dev = in_dev_get(rt->u.dst.dev); 877 if (!in_dev) 878 return; 880 if (!IN_DEV_TX_REDIRECTS(in_dev)) 881 goto out; /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. */ 886 if (jiffies - rt->u.dst.rate_last > ip_rt_redirect_silence) rt->u.dst.rate_tokens = 0; /* Too many ignored redirects; do not send anything * set u.dst.rate_last to the last seen redirected packet. */ 892 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) { rt->u.dst.rate_last = jiffies; 894 goto out; } /* Check for load limit; set rate_last to the latest sent * redirect. 
*/ 900 if (jiffies - rt->u.dst.rate_last > (ip_rt_redirect_load<<rt->u.dst.rate_tokens)) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); rt->u.dst.rate_last = jiffies; ++rt->u.dst.rate_tokens; #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && rt->u.dst.rate_tokens == ip_rt_redirect_number && net_ratelimit()) printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores redirects for " "%u.%u.%u.%u to %u.%u.%u.%u.\n", NIPQUAD(rt->rt_src), rt->rt_iif, NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway)); #endif } out: in_dev_put(in_dev); } 917 static int ip_error(struct sk_buff *skb) { struct rtable *rt = (struct rtable*)skb->dst; unsigned long now; int code; 923 switch (rt->u.dst.error) { 924 case EINVAL: 925 default: kfree_skb(skb); 927 return 0; 928 case EHOSTUNREACH: code = ICMP_HOST_UNREACH; 930 break; 931 case ENETUNREACH: code = ICMP_NET_UNREACH; 933 break; 934 case EACCES: code = ICMP_PKT_FILTERED; 936 break; } now = jiffies; 940 if ((rt->u.dst.rate_tokens += (now - rt->u.dst.rate_last)) > ip_rt_error_burst) rt->u.dst.rate_tokens = ip_rt_error_burst; rt->u.dst.rate_last = now; 943 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) { rt->u.dst.rate_tokens -= ip_rt_error_cost; icmp_send(skb, ICMP_DEST_UNREACH, code, 0); } kfree_skb(skb); 949 return 0; } /* * The last two values are not from the RFC but * are needed for AMPRnet AX.25 paths. */ static unsigned short mtu_plateau[] = {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; 960 static __inline__ unsigned short guess_mtu(unsigned short old_mtu) { int i; 964 for (i = 0; i < sizeof(mtu_plateau)/sizeof(mtu_plateau[0]); i++) 965 if (old_mtu > mtu_plateau[i]) 966 return mtu_plateau[i]; 967 return 68; } 970 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) { int i; unsigned short old_mtu = ntohs(iph->tot_len); struct rtable *rth; u32 skeys[2] = { iph->saddr, 0, }; u32 daddr = iph->daddr; u8 tos = iph->tos & IPTOS_RT_MASK; unsigned short est_mtu = 0; 980 if (ipv4_config.no_pmtu_disc) 981 return 0; 983 for (i=0; i<2; i++) { unsigned hash = rt_hash_code(daddr, skeys[i], tos); read_lock(&rt_hash_table[hash].lock); 987 for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) { if (rth->key.dst == daddr && rth->key.src == skeys[i] && rth->rt_dst == daddr && rth->rt_src == iph->saddr && rth->key.tos == tos && rth->key.iif == 0 && 994 !(rth->u.dst.mxlock&(1<<RTAX_MTU))) { unsigned short mtu = new_mtu; 997 if (new_mtu < 68 || new_mtu >= old_mtu) { /* BSD 4.2 compatibility hack :-( */ if (mtu == 0 && old_mtu >= rth->u.dst.pmtu && 1001 old_mtu >= 68 + (iph->ihl<<2)) old_mtu -= iph->ihl<<2; mtu = guess_mtu(old_mtu); } 1006 if (mtu <= rth->u.dst.pmtu) { 1007 if (mtu < rth->u.dst.pmtu) { dst_confirm(&rth->u.dst); 1009 if (mtu < ip_rt_min_pmtu) { mtu = ip_rt_min_pmtu; rth->u.dst.mxlock |= (1<<RTAX_MTU); } rth->u.dst.pmtu = mtu; dst_set_expires(&rth->u.dst, ip_rt_mtu_expires); } est_mtu = mtu; } } } 1020 read_unlock(&rt_hash_table[hash].lock); } 1022 return est_mtu ? 
: new_mtu; } 1025 void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu) { if (dst->pmtu > mtu && mtu >= 68 && 1028 !(dst->mxlock&(1<<RTAX_MTU))) { 1029 if (mtu < ip_rt_min_pmtu) { mtu = ip_rt_min_pmtu; dst->mxlock |= (1<<RTAX_MTU); } dst->pmtu = mtu; dst_set_expires(dst, ip_rt_mtu_expires); } } 1038 static struct dst_entry * ipv4_dst_check(struct dst_entry * dst, u32 cookie) { dst_release(dst); 1041 return NULL; } 1044 static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst, struct sk_buff *skb) { 1047 return NULL; } 1050 static void ipv4_dst_destroy(struct dst_entry * dst) { struct rtable *rt = (struct rtable *) dst; struct inet_peer *peer = rt->peer; 1055 if (peer) { rt->peer = NULL; inet_putpeer(peer); } } 1061 static void ipv4_link_failure(struct sk_buff *skb) { struct rtable *rt; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); rt = (struct rtable *) skb->dst; 1068 if (rt) dst_set_expires(&rt->u.dst, 0); } 1072 static int ip_rt_bug(struct sk_buff *skb) { printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n", NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr), skb->dev ? skb->dev->name : "?"); kfree_skb(skb); 1078 return 0; } /* We do not cache source address of outgoing interface, because it is used only by IP RR, TS and SRR options, so that it out of fast path. BTW remember: "addr" is allowed to be not aligned in IP options! */ 1090 void ip_rt_get_source(u8 *addr, struct rtable *rt) { u32 src; struct fib_result res; 1095 if (rt->key.iif == 0) src = rt->rt_src; 1097 else if (fib_lookup(&rt->key, &res) == 0) { #ifdef CONFIG_IP_ROUTE_NAT if (res.type == RTN_NAT) src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); else #endif src = FIB_RES_PREFSRC(res); fib_res_put(&res); 1105 } else src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); memcpy(addr, &src, 4); } #ifdef CONFIG_NET_CLS_ROUTE static void set_class_tag(struct rtable *rt, u32 tag) { if (!(rt->u.dst.tclassid&0xFFFF)) rt->u.dst.tclassid |= tag&0xFFFF; if (!(rt->u.dst.tclassid&0xFFFF0000)) rt->u.dst.tclassid |= tag&0xFFFF0000; } #endif 1120 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) { struct fib_info *fi = res->fi; 1124 if (fi) { 1125 if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) rt->rt_gateway = FIB_RES_GW(*res); memcpy(&rt->u.dst.mxlock, fi->fib_metrics, sizeof(fi->fib_metrics)); 1128 if (fi->fib_mtu == 0) { rt->u.dst.pmtu = rt->u.dst.dev->mtu; if (rt->u.dst.mxlock&(1<<RTAX_MTU) && rt->rt_gateway != rt->rt_dst && 1132 rt->u.dst.pmtu > 576) rt->u.dst.pmtu = 576; } #ifdef CONFIG_NET_CLS_ROUTE rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid; #endif 1138 } else { rt->u.dst.pmtu = rt->u.dst.dev->mtu; } 1141 if (rt->u.dst.pmtu > IP_MAX_MTU) rt->u.dst.pmtu = IP_MAX_MTU; 1143 if (rt->u.dst.advmss == 0) rt->u.dst.advmss = max(rt->u.dst.dev->mtu-40, ip_rt_min_advmss); 1145 if (rt->u.dst.advmss > 65535-40) rt->u.dst.advmss = 65535-40; #ifdef CONFIG_NET_CLS_ROUTE #ifdef CONFIG_IP_MULTIPLE_TABLES set_class_tag(rt, fib_rules_tclass(res)); #endif set_class_tag(rt, itag); #endif rt->rt_type = res->type; } static int 1158 ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, u8 tos, struct net_device *dev, int our) { unsigned hash; struct rtable *rth; u32 spec_dst; struct in_device *in_dev = in_dev_get(dev); u32 itag = 0; /* Primary sanity checks. 
*/ 1169 if (in_dev == NULL) 1170 return -EINVAL; if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) || 1173 skb->protocol != __constant_htons(ETH_P_IP)) 1174 goto e_inval; 1176 if (ZERONET(saddr)) { 1177 if (!LOCAL_MCAST(daddr)) 1178 goto e_inval; spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 1180 } else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag) < 0) 1181 goto e_inval; rth = dst_alloc(&ipv4_dst_ops); 1184 if (!rth) 1185 goto e_nobufs; rth->u.dst.output= ip_rt_bug; atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; rth->key.dst = daddr; rth->rt_dst = daddr; rth->key.tos = tos; #ifdef CONFIG_IP_ROUTE_FWMARK rth->key.fwmark = skb->nfmark; #endif rth->key.src = saddr; rth->rt_src = saddr; #ifdef CONFIG_IP_ROUTE_NAT rth->rt_dst_map = daddr; rth->rt_src_map = saddr; #endif #ifdef CONFIG_NET_CLS_ROUTE rth->u.dst.tclassid = itag; #endif rth->rt_iif = rth->key.iif = dev->ifindex; rth->u.dst.dev = &loopback_dev; dev_hold(rth->u.dst.dev); rth->key.oif = 0; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; rth->rt_type = RTN_MULTICAST; rth->rt_flags = RTCF_MULTICAST; 1215 if (our) { rth->u.dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; } #ifdef CONFIG_IP_MROUTE if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) rth->u.dst.input = ip_mr_input; #endif in_dev_put(in_dev); hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos); 1227 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); e_nobufs: in_dev_put(in_dev); 1231 return -ENOBUFS; e_inval: in_dev_put(in_dev); 1235 return -EINVAL; } /* * NOTE. We drop all the packets that has local source * addresses, because every properly looped back packet * must have correct destination already attached by output routine. * * Such approach solves two big problems: * 1. Not simplex devices are handled properly. * 2. IP spoofing attempts are filtered with 100% of guarantee. */ 1248 int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, u8 tos, struct net_device *dev) { struct rt_key key; struct fib_result res; struct in_device *in_dev = in_dev_get(dev); struct in_device *out_dev = NULL; unsigned flags = 0; u32 itag = 0; struct rtable * rth; unsigned hash; u32 spec_dst; int err = -EINVAL; int free_res = 0; /* * IP on this device is disabled. */ 1267 if (!in_dev) 1268 return -EINVAL; key.dst = daddr; key.src = saddr; key.tos = tos; #ifdef CONFIG_IP_ROUTE_FWMARK key.fwmark = skb->nfmark; #endif key.iif = dev->ifindex; key.oif = 0; key.scope = RT_SCOPE_UNIVERSE; hash = rt_hash_code(daddr, saddr^(key.iif<<5), tos); /* Check for the most weird martians, which can be not detected by fib_lookup. */ 1286 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr)) 1287 goto martian_source; 1289 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0)) 1290 goto brd_input; /* Accept zero addresses only to limited broadcast; * I even do not know to fix it or not. Waiting for complains :-) */ 1295 if (ZERONET(saddr)) 1296 goto martian_source; 1298 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr)) 1299 goto martian_destination; /* * Now we are ready to route packet. */ 1304 if ((err = fib_lookup(&key, &res)) != 0) { 1305 if (!IN_DEV_FORWARD(in_dev)) 1306 goto e_inval; 1307 goto no_route; } free_res = 1; #ifdef CONFIG_IP_ROUTE_NAT /* Policy is applied before mapping destination, but rerouting after map should be made with old source. 
*/ if (1) { u32 src_map = saddr; if (res.r) src_map = fib_rules_policy(saddr, &res, &flags); if (res.type == RTN_NAT) { key.dst = fib_rules_map_destination(daddr, &res); fib_res_put(&res); free_res = 0; if (fib_lookup(&key, &res)) goto e_inval; free_res = 1; if (res.type != RTN_UNICAST) goto e_inval; flags |= RTCF_DNAT; } key.src = src_map; } #endif 1336 if (res.type == RTN_BROADCAST) 1337 goto brd_input; 1339 if (res.type == RTN_LOCAL) { int result; result = fib_validate_source(saddr, daddr, tos, loopback_dev.ifindex, dev, &spec_dst, &itag); 1343 if (result < 0) 1344 goto martian_source; 1345 if (result) flags |= RTCF_DIRECTSRC; spec_dst = daddr; 1348 goto local_input; } 1351 if (!IN_DEV_FORWARD(in_dev)) 1352 goto e_inval; 1353 if (res.type != RTN_UNICAST) 1354 goto martian_destination; #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res.fi->fib_nhs > 1 && key.oif == 0) fib_select_multipath(&key, &res); #endif out_dev = in_dev_get(FIB_RES_DEV(res)); 1361 if (out_dev == NULL) { 1362 if (net_ratelimit()) printk(KERN_CRIT "Bug in ip_route_input_slow(). Please, report\n"); 1364 goto e_inval; } err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst, &itag); 1368 if (err < 0) 1369 goto martian_source; 1371 if (err) flags |= RTCF_DIRECTSRC; if (out_dev == in_dev && err && !(flags&(RTCF_NAT|RTCF_MASQ)) && (IN_DEV_SHARED_MEDIA(out_dev) 1376 || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res)))) flags |= RTCF_DOREDIRECT; 1379 if (skb->protocol != __constant_htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is * invalid for proxy arp. DNAT routes are always valid. */ 1383 if (out_dev == in_dev && !(flags&RTCF_DNAT)) 1384 goto e_inval; } rth = dst_alloc(&ipv4_dst_ops); 1388 if (!rth) 1389 goto e_nobufs; atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; rth->key.dst = daddr; rth->rt_dst = daddr; rth->key.tos = tos; #ifdef CONFIG_IP_ROUTE_FWMARK rth->key.fwmark = skb->nfmark; #endif rth->key.src = saddr; rth->rt_src = saddr; rth->rt_gateway = daddr; #ifdef CONFIG_IP_ROUTE_NAT rth->rt_src_map = key.src; rth->rt_dst_map = key.dst; if (flags&RTCF_DNAT) rth->rt_gateway = key.dst; #endif rth->rt_iif = rth->key.iif = dev->ifindex; rth->u.dst.dev = out_dev->dev; dev_hold(rth->u.dst.dev); rth->key.oif = 0; rth->rt_spec_dst= spec_dst; rth->u.dst.input = ip_forward; rth->u.dst.output = ip_output; rt_set_nexthop(rth, &res, itag); rth->rt_flags = flags; #ifdef CONFIG_NET_FASTROUTE if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) { struct net_device *odev = rth->u.dst.dev; if (odev != dev && dev->accept_fastpath && odev->mtu >= dev->mtu && dev->accept_fastpath(dev, &rth->u.dst) == 0) rth->rt_flags |= RTCF_FAST; } #endif intern: err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); done: in_dev_put(in_dev); 1437 if (out_dev) in_dev_put(out_dev); 1439 if (free_res) fib_res_put(&res); 1441 return err; brd_input: 1444 if (skb->protocol != __constant_htons(ETH_P_IP)) 1445 goto e_inval; 1447 if (ZERONET(saddr)) { spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 1449 } else { err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag); 1451 if (err < 0) 1452 goto martian_source; 1453 if (err) flags |= RTCF_DIRECTSRC; } flags |= RTCF_BROADCAST; res.type = RTN_BROADCAST; local_input: rth = dst_alloc(&ipv4_dst_ops); 1461 if (!rth) 1462 goto e_nobufs; rth->u.dst.output= ip_rt_bug; atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; rth->key.dst = daddr; rth->rt_dst = daddr; rth->key.tos = tos; #ifdef CONFIG_IP_ROUTE_FWMARK 
rth->key.fwmark = skb->nfmark; #endif rth->key.src = saddr; rth->rt_src = saddr; #ifdef CONFIG_IP_ROUTE_NAT rth->rt_dst_map = key.dst; rth->rt_src_map = key.src; #endif #ifdef CONFIG_NET_CLS_ROUTE rth->u.dst.tclassid = itag; #endif rth->rt_iif = rth->key.iif = dev->ifindex; rth->u.dst.dev = &loopback_dev; dev_hold(rth->u.dst.dev); rth->key.oif = 0; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; rth->u.dst.input= ip_local_deliver; rth->rt_flags = flags|RTCF_LOCAL; 1492 if (res.type == RTN_UNREACHABLE) { rth->u.dst.input= ip_error; rth->u.dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } rth->rt_type = res.type; 1498 goto intern; no_route: spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); res.type = RTN_UNREACHABLE; 1503 goto local_input; /* * Do not cache martian addresses: they should be logged (RFC1812) */ martian_destination: #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) printk(KERN_WARNING "martian destination %u.%u.%u.%u from %u.%u.%u.%u, dev %s\n", NIPQUAD(daddr), NIPQUAD(saddr), dev->name); #endif e_inval: err = -EINVAL; 1516 goto done; e_nobufs: err = -ENOBUFS; 1520 goto done; martian_source: #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) { /* * RFC1812 recommendation, if source is martian, * the only hint is MAC header. */ printk(KERN_WARNING "martian source %u.%u.%u.%u from %u.%u.%u.%u, on dev %s\n", NIPQUAD(daddr), NIPQUAD(saddr), dev->name); if (dev->hard_header_len) { int i; unsigned char *p = skb->mac.raw; printk(KERN_WARNING "ll header: "); for (i=0; i<dev->hard_header_len; i++, p++) { printk("%02x", *p); if(i<(dev->hard_header_len-1)) printk(":"); } printk("\n"); } } #endif 1544 goto e_inval; } 1547 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr, u8 tos, struct net_device *dev) { struct rtable * rth; unsigned hash; int iif = dev->ifindex; tos &= IPTOS_RT_MASK; hash = rt_hash_code(daddr, saddr^(iif<<5), tos); read_lock(&rt_hash_table[hash].lock); 1558 for (rth=rt_hash_table[hash].chain; rth; rth=rth->u.rt_next) { if (rth->key.dst == daddr && rth->key.src == saddr && rth->key.iif == iif && rth->key.oif == 0 && #ifdef CONFIG_IP_ROUTE_FWMARK rth->key.fwmark == skb->nfmark && #endif 1566 rth->key.tos == tos) { rth->u.dst.lastuse = jiffies; dst_hold(&rth->u.dst); rth->u.dst.__use++; 1570 read_unlock(&rt_hash_table[hash].lock); skb->dst = (struct dst_entry*)rth; 1572 return 0; } } 1575 read_unlock(&rt_hash_table[hash].lock); /* Multicast recognition logic is moved from route cache to here. The problem was that too many Ethernet cards have broken/missing hardware multicast filters :-( As result the host on multicasting network acquires a lot of useless route cache entries, sort of SDR messages from all the world. Now we try to get rid of them. Really, provided software IP multicast filter is organized reasonably (at least, hashed), it does not result in a slowdown comparing with route cache reject entries. Note, that multicast routers are not affected, because route cache entry is created eventually. 
*/ 1588 if (MULTICAST(daddr)) { struct in_device *in_dev; read_lock(&inetdev_lock); 1592 if ((in_dev = __in_dev_get(dev)) != NULL) { int our = ip_check_mc(in_dev, daddr); 1594 if (our #ifdef CONFIG_IP_MROUTE || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) #endif ) { 1599 read_unlock(&inetdev_lock); 1600 return ip_route_input_mc(skb, daddr, saddr, tos, dev, our); } } 1603 read_unlock(&inetdev_lock); 1604 return -EINVAL; } 1606 return ip_route_input_slow(skb, daddr, saddr, tos, dev); } /* * Major route resolver routine. */ 1613 int ip_route_output_slow(struct rtable **rp, const struct rt_key *oldkey) { struct rt_key key; struct fib_result res; unsigned flags = 0; struct rtable *rth; struct net_device *dev_out = NULL; unsigned hash; int free_res = 0; int err; u32 tos; tos = oldkey->tos & (IPTOS_RT_MASK|RTO_ONLINK); key.dst = oldkey->dst; key.src = oldkey->src; key.tos = tos&IPTOS_RT_MASK; key.iif = loopback_dev.ifindex; key.oif = oldkey->oif; #ifdef CONFIG_IP_ROUTE_FWMARK key.fwmark = oldkey->fwmark; #endif key.scope = (tos&RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE; res.fi = NULL; #ifdef CONFIG_IP_MULTIPLE_TABLES res.r = NULL; #endif 1640 if (oldkey->src) { if (MULTICAST(oldkey->src) || BADCLASS(oldkey->src) 1643 || ZERONET(oldkey->src)) 1644 return -EINVAL; /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ dev_out = ip_dev_find(oldkey->src); 1648 if (dev_out == NULL) 1649 return -EINVAL; /* I removed check for oif == dev_out->oif here. It was wrong by three reasons: 1. ip_dev_find(saddr) can return wrong iface, if saddr is assigned to multiple interfaces. 2. Moreover, we are allowed to send packets with saddr of another iface. --ANK */ if (oldkey->oif == 0 1660 && (MULTICAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF)) { /* Special hack: user can direct multicasts and limited broadcast via necessary interface without fiddling with IP_MULTICAST_IF or IP_PKTINFO. This hack is not just for fun, it allows vic,vat and friends to work. They bind socket to loopback, set ttl to zero and expect that it will work. From the viewpoint of routing cache they are broken, because we are not allowed to build multicast path with loopback source addr (look, routing cache cannot know, that ttl is zero, so that packet will not leave this host and route is valid). Luckily, this hack is good workaround. */ key.oif = dev_out->ifindex; 1677 goto make_route; } 1679 if (dev_out) dev_put(dev_out); dev_out = NULL; } 1683 if (oldkey->oif) { dev_out = dev_get_by_index(oldkey->oif); 1685 if (dev_out == NULL) 1686 return -ENODEV; 1687 if (__in_dev_get(dev_out) == NULL) { dev_put(dev_out); 1689 return -ENODEV; /* Wrong error code */ } 1692 if (LOCAL_MCAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF) { 1693 if (!key.src) key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); 1695 goto make_route; } 1697 if (!key.src) { 1698 if (MULTICAST(oldkey->dst)) key.src = inet_select_addr(dev_out, 0, key.scope); 1700 else if (!oldkey->dst) key.src = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } } 1705 if (!key.dst) { key.dst = key.src; 1707 if (!key.dst) key.dst = key.src = htonl(INADDR_LOOPBACK); 1709 if (dev_out) dev_put(dev_out); dev_out = &loopback_dev; dev_hold(dev_out); key.oif = loopback_dev.ifindex; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; 1716 goto make_route; } 1719 if (fib_lookup(&key, &res)) { res.fi = NULL; 1721 if (oldkey->oif) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. WHY? DW. 
Because we are allowed to send to iface even if it has NO routes and NO assigned addresses. When oif is specified, routing tables are looked up with only one purpose: to catch if destination is gatewayed, rather than direct. Moreover, if MSG_DONTROUTE is set, we send packet, ignoring both routing tables and ifaddr state. --ANK We could make it even if oif is unknown, likely IPv6, but we do not. */ 1740 if (key.src == 0) key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); res.type = RTN_UNICAST; 1743 goto make_route; } 1745 if (dev_out) dev_put(dev_out); 1747 return -ENETUNREACH; } free_res = 1; 1751 if (res.type == RTN_NAT) 1752 goto e_inval; 1754 if (res.type == RTN_LOCAL) { 1755 if (!key.src) key.src = key.dst; 1757 if (dev_out) dev_put(dev_out); dev_out = &loopback_dev; dev_hold(dev_out); key.oif = dev_out->ifindex; 1762 if (res.fi) fib_info_put(res.fi); res.fi = NULL; flags |= RTCF_LOCAL; 1766 goto make_route; } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res.fi->fib_nhs > 1 && key.oif == 0) fib_select_multipath(&key, &res); else #endif 1774 if (res.prefixlen==0 && res.type == RTN_UNICAST && key.oif == 0) fib_select_default(&key, &res); 1777 if (!key.src) key.src = FIB_RES_PREFSRC(res); 1780 if (dev_out) dev_put(dev_out); dev_out = FIB_RES_DEV(res); dev_hold(dev_out); key.oif = dev_out->ifindex; make_route: 1787 if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK)) 1788 goto e_inval; 1790 if (key.dst == 0xFFFFFFFF) res.type = RTN_BROADCAST; 1792 else if (MULTICAST(key.dst)) res.type = RTN_MULTICAST; 1794 else if (BADCLASS(key.dst) || ZERONET(key.dst)) 1795 goto e_inval; 1797 if (dev_out->flags&IFF_LOOPBACK) flags |= RTCF_LOCAL; 1800 if (res.type == RTN_BROADCAST) { flags |= RTCF_BROADCAST|RTCF_LOCAL; 1802 if (res.fi) { fib_info_put(res.fi); res.fi = NULL; } 1806 } else if (res.type == RTN_MULTICAST) { flags |= RTCF_MULTICAST|RTCF_LOCAL; read_lock(&inetdev_lock); 1809 if (!__in_dev_get(dev_out) || !ip_check_mc(__in_dev_get(dev_out), oldkey->dst)) flags &= ~RTCF_LOCAL; 1811 read_unlock(&inetdev_lock); /* If multicast route do not exist use default one, but do not gateway in this case. Yes, it is hack. */ 1816 if (res.fi && res.prefixlen < 4) { fib_info_put(res.fi); res.fi = NULL; } } rth = dst_alloc(&ipv4_dst_ops); 1823 if (!rth) 1824 goto e_nobufs; atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; rth->key.dst = oldkey->dst; rth->key.tos = tos; rth->key.src = oldkey->src; rth->key.iif = 0; rth->key.oif = oldkey->oif; #ifdef CONFIG_IP_ROUTE_FWMARK rth->key.fwmark = oldkey->fwmark; #endif rth->rt_dst = key.dst; rth->rt_src = key.src; #ifdef CONFIG_IP_ROUTE_NAT rth->rt_dst_map = key.dst; rth->rt_src_map = key.src; #endif rth->rt_iif = oldkey->oif ? 
: dev_out->ifindex; rth->u.dst.dev = dev_out; dev_hold(dev_out); rth->rt_gateway = key.dst; rth->rt_spec_dst= key.src; rth->u.dst.output=ip_output; 1850 if (flags&RTCF_LOCAL) { rth->u.dst.input = ip_local_deliver; rth->rt_spec_dst = key.dst; } 1854 if (flags&(RTCF_BROADCAST|RTCF_MULTICAST)) { rth->rt_spec_dst = key.src; 1856 if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK)) rth->u.dst.output = ip_mc_output; #ifdef CONFIG_IP_MROUTE if (res.type == RTN_MULTICAST) { struct in_device *in_dev = in_dev_get(dev_out); if (in_dev) { if (IN_DEV_MFORWARD(in_dev) && !LOCAL_MCAST(oldkey->dst)) { rth->u.dst.input = ip_mr_input; rth->u.dst.output = ip_mc_output; } in_dev_put(in_dev); } } #endif } rt_set_nexthop(rth, &res, 0); rth->rt_flags = flags; hash = rt_hash_code(oldkey->dst, oldkey->src^(oldkey->oif<<5), tos); err = rt_intern_hash(hash, rth, rp); done: 1879 if (free_res) fib_res_put(&res); 1881 if (dev_out) dev_put(dev_out); 1883 return err; e_inval: err = -EINVAL; 1887 goto done; e_nobufs: err = -ENOBUFS; 1890 goto done; } 1893 int ip_route_output_key(struct rtable **rp, const struct rt_key *key) { unsigned hash; struct rtable *rth; hash = rt_hash_code(key->dst, key->src^(key->oif<<5), key->tos); 1900 read_lock_bh(&rt_hash_table[hash].lock); 1901 for (rth=rt_hash_table[hash].chain; rth; rth=rth->u.rt_next) { if (rth->key.dst == key->dst && rth->key.src == key->src && rth->key.iif == 0 && rth->key.oif == key->oif && #ifdef CONFIG_IP_ROUTE_FWMARK rth->key.fwmark == key->fwmark && #endif !((rth->key.tos^key->tos)&(IPTOS_RT_MASK|RTO_ONLINK)) && 1910 ((key->tos&RTO_TPROXY) || !(rth->rt_flags&RTCF_TPROXY)) 1911 ) { rth->u.dst.lastuse = jiffies; dst_hold(&rth->u.dst); rth->u.dst.__use++; 1915 read_unlock_bh(&rt_hash_table[hash].lock); *rp = rth; 1917 return 0; } } 1920 read_unlock_bh(&rt_hash_table[hash].lock); 1922 return ip_route_output_slow(rp, key); } #ifdef CONFIG_RTNETLINK static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait) { struct rtable *rt = (struct rtable*)skb->dst; struct rtmsg *r; struct nlmsghdr *nlh; unsigned char *b = skb->tail; struct rta_cacheinfo ci; #ifdef CONFIG_IP_MROUTE struct rtattr *eptr; #endif nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r)); r = NLMSG_DATA(nlh); nlh->nlmsg_flags = (nowait && pid) ? 
NLM_F_MULTI : 0; r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; r->rtm_tos = rt->key.tos; r->rtm_table = RT_TABLE_MAIN; r->rtm_type = rt->rt_type; r->rtm_scope = RT_SCOPE_UNIVERSE; r->rtm_protocol = RTPROT_UNSPEC; r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED; if (rt->rt_flags & RTCF_NOTIFY) r->rtm_flags |= RTM_F_NOTIFY; RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst); if (rt->key.src) { r->rtm_src_len = 32; RTA_PUT(skb, RTA_SRC, 4, &rt->key.src); } if (rt->u.dst.dev) RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex); #ifdef CONFIG_NET_CLS_ROUTE if (rt->u.dst.tclassid) RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid); #endif if (rt->key.iif) RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst); else if (rt->rt_src != rt->key.src) RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src); if (rt->rt_dst != rt->rt_gateway) RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway); if (rtnetlink_put_metrics(skb, &rt->u.dst.mxlock) < 0) goto rtattr_failure; ci.rta_lastuse = jiffies - rt->u.dst.lastuse; ci.rta_used = rt->u.dst.__use; ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt); if (rt->u.dst.expires) ci.rta_expires = rt->u.dst.expires - jiffies; else ci.rta_expires = 0; ci.rta_error = rt->u.dst.error; ci.rta_id = 0; ci.rta_ts = 0; ci.rta_tsage = 0; if (rt->peer) { ci.rta_id = rt->peer->ip_id_count; if (rt->peer->tcp_ts_stamp) { ci.rta_ts = rt->peer->tcp_ts; ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp; } } #ifdef CONFIG_IP_MROUTE eptr = (struct rtattr*)skb->tail; #endif RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci); if (rt->key.iif) { #ifdef CONFIG_IP_MROUTE u32 dst = rt->rt_dst; if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) { int err = ipmr_get_route(skb, r, nowait); if (err <= 0) { if (!nowait) { if (err == 0) return 0; goto nlmsg_failure; } else { if (err == -EMSGSIZE) goto nlmsg_failure; ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err; } } } else #endif { RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif); } } nlh->nlmsg_len = skb->tail - b; return skb->len; nlmsg_failure: rtattr_failure: skb_trim(skb, b - skb->data); return -1; } int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) { struct rtattr **rta = arg; struct rtmsg *rtm = NLMSG_DATA(nlh); struct rtable *rt = NULL; u32 dst = 0; u32 src = 0; int iif = 0; int err; struct sk_buff *skb; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) return -ENOBUFS; /* Reserve room for dummy headers, this skb can pass through good chunk of routing engine. 
*/ skb->mac.raw = skb->data; skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); if (rta[RTA_SRC-1]) memcpy(&src, RTA_DATA(rta[RTA_SRC-1]), 4); if (rta[RTA_DST-1]) memcpy(&dst, RTA_DATA(rta[RTA_DST-1]), 4); if (rta[RTA_IIF-1]) memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int)); if (iif) { struct net_device *dev; dev = __dev_get_by_index(iif); if (!dev) return -ENODEV; skb->protocol = __constant_htons(ETH_P_IP); skb->dev = dev; local_bh_disable(); err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); local_bh_enable(); rt = (struct rtable*)skb->dst; if (!err && rt->u.dst.error) err = -rt->u.dst.error; } else { int oif = 0; if (rta[RTA_OIF-1]) memcpy(&oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int)); err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif); } if (err) { kfree_skb(skb); return err; } skb->dst = &rt->u.dst; if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid; err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0); if (err == 0) return 0; if (err < 0) return -EMSGSIZE; err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); if (err < 0) return err; return 0; } int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct rtable *rt; int h, s_h; int idx, s_idx; s_h = cb->args[0]; s_idx = idx = cb->args[1]; for (h=0; h <= rt_hash_mask; h++) { if (h < s_h) continue; if (h > s_h) s_idx = 0; read_lock_bh(&rt_hash_table[h].lock); for (rt = rt_hash_table[h].chain, idx = 0; rt; rt = rt->u.rt_next, idx++) { if (idx < s_idx) continue; skb->dst = dst_clone(&rt->u.dst); if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) { dst_release(xchg(&skb->dst, NULL)); read_unlock_bh(&rt_hash_table[h].lock); goto done; } dst_release(xchg(&skb->dst, NULL)); } read_unlock_bh(&rt_hash_table[h].lock); } done: cb->args[0] = h; cb->args[1] = idx; return skb->len; } #endif /* CONFIG_RTNETLINK */ 2133 void ip_rt_multicast_event(struct in_device *in_dev) { rt_cache_flush(0); } #ifdef CONFIG_SYSCTL static int flush_delay; static 2145 int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp, void *buffer, size_t *lenp) { 2148 if (write) { proc_dointvec(ctl, write, filp, buffer, lenp); rt_cache_flush(flush_delay); 2151 return 0; 2152 } else 2153 return -EINVAL; } 2156 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int *name, int nlen, void *oldval, size_t *oldlenp, void *newval, size_t newlen, void **context) { int delay; 2162 if (newlen != sizeof(int)) 2163 return -EINVAL; 2164 if (get_user(delay,(int *)newval)) 2165 return -EFAULT; rt_cache_flush(delay); 2167 return 0; } ctl_table ipv4_route_table[] = { {NET_IPV4_ROUTE_FLUSH, "flush", &flush_delay, sizeof(int), 0644, NULL, &ipv4_sysctl_rtcache_flush, &ipv4_sysctl_rtcache_flush_strategy }, {NET_IPV4_ROUTE_MIN_DELAY, "min_delay", &ip_rt_min_delay, sizeof(int), 0644, NULL, &proc_dointvec_jiffies, &sysctl_jiffies}, {NET_IPV4_ROUTE_MAX_DELAY, "max_delay", &ip_rt_max_delay, sizeof(int), 0644, NULL, &proc_dointvec_jiffies, &sysctl_jiffies}, {NET_IPV4_ROUTE_GC_THRESH, "gc_thresh", &ipv4_dst_ops.gc_thresh, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ROUTE_MAX_SIZE, "max_size", &ip_rt_max_size, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval", &ip_rt_gc_min_interval, sizeof(int), 0644, NULL, &proc_dointvec_jiffies, &sysctl_jiffies}, {NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout", &ip_rt_gc_timeout, sizeof(int), 0644, NULL, 
&proc_dointvec_jiffies, &sysctl_jiffies}, {NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval", &ip_rt_gc_interval, sizeof(int), 0644, NULL, &proc_dointvec_jiffies, &sysctl_jiffies}, {NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load", &ip_rt_redirect_load, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number", &ip_rt_redirect_number, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence", &ip_rt_redirect_silence, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ROUTE_ERROR_COST, "error_cost", &ip_rt_error_cost, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ROUTE_ERROR_BURST, "error_burst", &ip_rt_error_burst, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity", &ip_rt_gc_elasticity, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires", &ip_rt_mtu_expires, sizeof(int), 0644, NULL, &proc_dointvec_jiffies, &sysctl_jiffies}, {NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu", &ip_rt_min_pmtu, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss", &ip_rt_min_advmss, sizeof(int), 0644, NULL, &proc_dointvec}, {0} }; #endif #ifdef CONFIG_NET_CLS_ROUTE struct ip_rt_acct *ip_rt_acct; static int ip_rt_acct_read(char *buffer, char **start, off_t offset, int length, int *eof, void *data) { *start=buffer; if ((offset&3) || (length&3)) return -EIO; if (offset + length >= sizeof(struct ip_rt_acct)*256) { length = sizeof(struct ip_rt_acct)*256 - offset; *eof = 1; } if (length > 0) { u32 *dst = (u32*)buffer; u32 *src = (u32*)(((u8*)ip_rt_acct) + offset); memcpy(dst, src, length); #ifdef CONFIG_SMP if (smp_num_cpus > 1 || cpu_logical_map(0) != 0) { int i; int cnt = length/4; for (i=0; i<smp_num_cpus; i++) { int cpu = cpu_logical_map(i); int k; if (cpu == 0) continue; src = (u32*)(((u8*)ip_rt_acct) + offset + cpu*256*sizeof(struct ip_rt_acct)); for (k=0; k<cnt; k++) dst[k] += src[k]; } } #endif return length; } return 0; } #endif 2273 void __init ip_rt_init(void) { int i, order, goal; #ifdef CONFIG_NET_CLS_ROUTE for (order=0; (PAGE_SIZE<<order) < 256*sizeof(ip_rt_acct)*NR_CPUS; order++) /* NOTHING */; ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); memset(ip_rt_acct, 0, PAGE_SIZE<<order); #endif ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0, SLAB_HWCACHE_ALIGN, NULL, NULL); 2292 if (!ipv4_dst_ops.kmem_cachep) panic("IP: failed to allocate ip_dst_cache\n"); goal = num_physpages >> (26 - PAGE_SHIFT); 2297 for (order = 0; (1UL << order) < goal; order++) /* NOTHING */; 2300 do { rt_hash_mask = (1UL << order) * PAGE_SIZE / sizeof(struct rt_hash_bucket); 2303 while (rt_hash_mask & (rt_hash_mask-1)) rt_hash_mask--; rt_hash_table = (struct rt_hash_bucket *) __get_free_pages(GFP_ATOMIC, order); 2307 } while (rt_hash_table == NULL && --order > 0); 2309 if (!rt_hash_table) panic("Failed to allocate IP route cache hash table\n"); printk("IP: routing cache hash table of %u buckets, %ldKbytes\n", rt_hash_mask, (long) (rt_hash_mask*sizeof(struct rt_hash_bucket))/1024); 2316 for (rt_hash_log=0; (1<<rt_hash_log) != rt_hash_mask; rt_hash_log++) /* NOTHING */; rt_hash_mask--; 2320 for (i = 0; i <= rt_hash_mask; i++) { rt_hash_table[i].lock = RW_LOCK_UNLOCKED; rt_hash_table[i].chain = NULL; } ipv4_dst_ops.gc_thresh = (rt_hash_mask+1); ip_rt_max_size = (rt_hash_mask+1)*16; devinet_init(); ip_fib_init(); rt_flush_timer.function = rt_run_flush; 
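	/* Note: rt_flush_timer is armed on demand from rt_cache_flush() via
	 * mod_timer(); only the periodic expiry timer is started below. */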
rt_periodic_timer.function = rt_check_expire; /* All the timers, started at system startup tend to synchronize. Perturb it a bit. */ rt_periodic_timer.expires = jiffies + net_random()%ip_rt_gc_interval + ip_rt_gc_interval; add_timer(&rt_periodic_timer); proc_net_create ("rt_cache", 0, rt_cache_get_info); #ifdef CONFIG_NET_CLS_ROUTE create_proc_read_entry("net/rt_acct", 0, 0, ip_rt_acct_read, NULL); #endif }
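
/*
 * Illustrative sketch, not part of the original file: how a typical
 * in-kernel caller of this module resolves an output route through the
 * cache and releases it again.  The helper name below is hypothetical;
 * ip_route_output() (the wrapper around ip_route_output_key() above,
 * also used by inet_rtm_getroute()) and ip_rt_put() are the real entry
 * points.
 */
static int __attribute__((unused)) example_output_lookup(u32 daddr, u32 saddr,
							  u8 tos, int oif)
{
	struct rtable *rt;
	int err;

	/* Consults rt_hash_table first, falling back to ip_route_output_slow(). */
	err = ip_route_output(&rt, daddr, saddr, tos, oif);
	if (err)
		return err;

	/* ... attach &rt->u.dst to an skb, read rt->rt_gateway, etc. ... */

	ip_rt_put(rt);	/* drop the reference taken by the lookup */
	return 0;
}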