/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.91 2000/10/03 07:29:00 anton Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Splitted to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT	(300*HZ)

int ip_rt_min_delay		= 2*HZ;
int ip_rt_max_delay		= 10*HZ;
int ip_rt_max_size;
int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
int ip_rt_gc_interval		= 60*HZ;
int ip_rt_gc_min_interval	= 5*HZ;
int ip_rt_redirect_number	= 9;
int ip_rt_redirect_load		= HZ/50;
int ip_rt_redirect_silence	= ((HZ/50) << (9+1));
int ip_rt_error_cost		= HZ;
int ip_rt_error_burst		= 5*HZ;
int ip_rt_gc_elasticity		= 8;
int ip_rt_mtu_expires		= 10*60*HZ;
int ip_rt_min_pmtu		= 512+20+20;
int ip_rt_min_advmss		= 536;

static unsigned long rt_deadline;

#define RTprint(a...)	printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32);
static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
					  struct sk_buff *);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *);
static void		 ipv4_link_failure(struct sk_buff *skb);
static int rt_garbage_collect(void);

struct dst_ops ipv4_dst_ops = {
	AF_INET,
	__constant_htons(ETH_P_IP),
	0,
	rt_garbage_collect,
	ipv4_dst_check,
	ipv4_dst_reroute,
	ipv4_dst_destroy,
	ipv4_negative_advice,
	ipv4_link_failure,
	sizeof(struct rtable),
};

#ifdef CONFIG_INET_ECN
#define ECN_OR_COST(class)	TC_PRIO_##class
#else
#define ECN_OR_COST(class)	TC_PRIO_FILLER
#endif

__u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};

/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) BH-protected rwlocks protect the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
*/ struct rt_hash_bucket { struct rtable *chain; rwlock_t lock; } __attribute__((__aligned__(8))); static struct rt_hash_bucket *rt_hash_table; static unsigned rt_hash_mask; static int rt_hash_log; static int rt_intern_hash(unsigned hash, struct rtable * rth, struct rtable ** res); 206 static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos) { unsigned hash = ((daddr&0xF0F0F0F0)>>4)|((daddr&0x0F0F0F0F)<<4); hash ^= saddr^tos; hash ^= (hash>>16); 211 return (hash^(hash>>8)) & rt_hash_mask; } 214 static int rt_cache_get_info(char *buffer, char **start, off_t offset, int length) { int len=0; off_t pos=0; char temp[129]; struct rtable *r; int i; pos = 128; 224 if (offset<128) { sprintf(buffer,"%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\tHHUptod\tSpecDst"); len = 128; } 229 for (i = rt_hash_mask; i>=0; i--) { 230 read_lock_bh(&rt_hash_table[i].lock); 231 for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) { /* * Spin through entries until we are ready */ pos += 128; 237 if (pos <= offset) { len = 0; 239 continue; } sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X", r->u.dst.dev ? r->u.dst.dev->name : "*", (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway, r->rt_flags, atomic_read(&r->u.dst.__refcnt), r->u.dst.__use, 0, (unsigned long)r->rt_src, (int)r->u.dst.advmss + 40, r->u.dst.window, (int)((r->u.dst.rtt>>3) + r->u.dst.rttvar), r->key.tos, r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1, r->u.dst.hh ? (r->u.dst.hh->hh_output == dev_queue_xmit) : 0, r->rt_spec_dst); sprintf(buffer+len,"%-127s\n",temp); len += 128; 258 if (pos >= offset+length) { 259 read_unlock_bh(&rt_hash_table[i].lock); 260 goto done; } } 263 read_unlock_bh(&rt_hash_table[i].lock); } done: *start = buffer+len-(pos-offset); len = pos-offset; 269 if (len>length) len = length; 271 return len; } 274 static __inline__ void rt_free(struct rtable *rt) { dst_free(&rt->u.dst); } 279 static __inline__ void rt_drop(struct rtable *rt) { ip_rt_put(rt); dst_free(&rt->u.dst); } 285 static __inline__ int rt_fast_clean(struct rtable *rth) { /* Kill broadcast/multicast entries very aggresively, if they collide in hash table with more useful entries */ return ((rth->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST)) 290 && rth->key.iif && rth->u.rt_next); } 293 static __inline__ int rt_valuable(struct rtable *rth) { return ((rth->rt_flags&(RTCF_REDIRECTED|RTCF_NOTIFY)) 296 || rth->u.dst.expires); } 299 static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2) { int age; 303 if (atomic_read(&rth->u.dst.__refcnt)) 304 return 0; 306 if (rth->u.dst.expires && (long)(rth->u.dst.expires - jiffies) <= 0) 307 return 1; age = jiffies - rth->u.dst.lastuse; 310 if (age <= tmo1 && !rt_fast_clean(rth)) 311 return 0; 312 if (age <= tmo2 && rt_valuable(rth)) 313 return 0; 314 return 1; } /* This runs via a timer and thus is always in BH context. 
*/ 318 static void SMP_TIMER_NAME(rt_check_expire)(unsigned long dummy) { int i, t; static int rover; struct rtable *rth, **rthp; unsigned long now = jiffies; i = rover; 327 for (t=(ip_rt_gc_interval<<rt_hash_log); t>=0; t -= ip_rt_gc_timeout) { unsigned tmo = ip_rt_gc_timeout; i = (i + 1) & rt_hash_mask; rthp = &rt_hash_table[i].chain; write_lock(&rt_hash_table[i].lock); 334 while ((rth = *rthp) != NULL) { 335 if (rth->u.dst.expires) { /* Entry is expired even if it is in use */ 337 if ((long)(now - rth->u.dst.expires) <= 0) { tmo >>= 1; rthp = &rth->u.rt_next; 340 continue; } 342 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { tmo >>= 1; rthp = &rth->u.rt_next; 345 continue; } /* * Cleanup aged off entries. */ *rthp = rth->u.rt_next; rt_free(rth); } 354 write_unlock(&rt_hash_table[i].lock); /* Fallback loop breaker. */ 357 if ((jiffies - now) > 0) 358 break; } rover = i; mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); } SMP_TIMER_DEFINE(rt_check_expire, rt_gc_task); /* This can run from both BH and non-BH contexts, the latter * in the case of a forced flush event. */ 369 static void SMP_TIMER_NAME(rt_run_flush)(unsigned long dummy) { int i; struct rtable * rth, * next; rt_deadline = 0; 376 for (i=rt_hash_mask; i>=0; i--) { 377 write_lock_bh(&rt_hash_table[i].lock); rth = rt_hash_table[i].chain; 379 if (rth) rt_hash_table[i].chain = NULL; 381 write_unlock_bh(&rt_hash_table[i].lock); 383 for (; rth; rth=next) { next = rth->u.rt_next; rt_free(rth); } } } SMP_TIMER_DEFINE(rt_run_flush, rt_cache_flush_task); static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED; 394 void rt_cache_flush(int delay) { unsigned long now = jiffies; int user_mode = !in_softirq(); 399 if (delay < 0) delay = ip_rt_min_delay; 402 spin_lock_bh(&rt_flush_lock); 404 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) { long tmo = (long)(rt_deadline - now); /* If flush timer is already running and flush request is not immediate (delay > 0): if deadline is not achieved, prolongate timer to "delay", otherwise fire it at deadline time. */ 414 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay) tmo = 0; 417 if (delay > tmo) delay = tmo; } 421 if (delay <= 0) { 422 spin_unlock_bh(&rt_flush_lock); SMP_TIMER_NAME(rt_run_flush)(0); 424 return; } 427 if (rt_deadline == 0) rt_deadline = now + ip_rt_max_delay; mod_timer(&rt_flush_timer, now+delay); 431 spin_unlock_bh(&rt_flush_lock); } /* Short description of GC goals. We want to build algorithm, which will keep routing cache at some equilibrium point, when number of aged off entries is kept approximately equal to newly generated ones. Current expiration strength is variable "expire". We try to adjust it dynamically, so that if networking is idle expires is large enough to keep enough of warm entries, and when load increases it reduces to limit cache size. */ 447 static int rt_garbage_collect(void) { static unsigned expire = RT_GC_TIMEOUT; static unsigned long last_gc; static int rover; static int equilibrium; struct rtable *rth, **rthp; unsigned long now = jiffies; int goal; /* * Garbage collection is pretty expensive, * do not make it too frequently. */ if (now - last_gc < ip_rt_gc_min_interval && 462 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) 463 return 0; /* Calculate number of entries, which we want to expire now. 
*/ goal = atomic_read(&ipv4_dst_ops.entries) - (ip_rt_gc_elasticity<<rt_hash_log); 467 if (goal <= 0) { 468 if (equilibrium < ipv4_dst_ops.gc_thresh) equilibrium = ipv4_dst_ops.gc_thresh; goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; 471 if (goal > 0) { equilibrium += min(goal/2, rt_hash_mask+1); goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; } 475 } else { /* We are in dangerous area. Try to reduce cache really * aggressively. */ goal = max(goal/2, rt_hash_mask+1); equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal; } 483 if (now - last_gc >= ip_rt_gc_min_interval) last_gc = now; 486 if (goal <= 0) { equilibrium += goal; 488 goto work_done; } 491 do { int i, k; 494 for (i=rt_hash_mask, k=rover; i>=0; i--) { unsigned tmo = expire; k = (k + 1) & rt_hash_mask; rthp = &rt_hash_table[k].chain; 499 write_lock_bh(&rt_hash_table[k].lock); 500 while ((rth = *rthp) != NULL) { 501 if (!rt_may_expire(rth, tmo, expire)) { tmo >>= 1; rthp = &rth->u.rt_next; 504 continue; } *rthp = rth->u.rt_next; rt_free(rth); goal--; } 510 write_unlock_bh(&rt_hash_table[k].lock); 511 if (goal <= 0) 512 break; } rover = k; 516 if (goal <= 0) 517 goto work_done; /* Goal is not achieved. We stop process if: - if expire reduced to zero. Otherwise, expire is halfed. - if table is not full. - if we are called from interrupt. - jiffies check is just fallback/debug loop breaker. We will not spin here for long time in any case. */ 528 if (expire == 0) 529 break; expire >>= 1; #if RT_CACHE_DEBUG >= 2 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, i); #endif 536 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) 537 return 0; 538 } while (!in_softirq() && jiffies - now < 1); 540 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) 541 return 0; 542 if (net_ratelimit()) printk("dst cache overflow\n"); 544 return 1; work_done: expire += ip_rt_gc_min_interval; if (expire > ip_rt_gc_timeout || 549 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh) expire = ip_rt_gc_timeout; #if RT_CACHE_DEBUG >= 2 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, rover); #endif 554 return 0; } 557 static int rt_intern_hash(unsigned hash, struct rtable * rt, struct rtable ** rp) { struct rtable *rth, **rthp; unsigned long now = jiffies; int attempts = !in_softirq(); restart: rthp = &rt_hash_table[hash].chain; 566 write_lock_bh(&rt_hash_table[hash].lock); 567 while ((rth = *rthp) != NULL) { 568 if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) { /* Put it first */ *rthp = rth->u.rt_next; rth->u.rt_next = rt_hash_table[hash].chain; rt_hash_table[hash].chain = rth; rth->u.dst.__use++; dst_hold(&rth->u.dst); rth->u.dst.lastuse = now; 577 write_unlock_bh(&rt_hash_table[hash].lock); rt_drop(rt); *rp = rth; 581 return 0; } rthp = &rth->u.rt_next; } /* Try to bind route to arp only if it is output route or unicast forwarding path. */ 590 if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) { int err = arp_bind_neighbour(&rt->u.dst); 592 if (err) { 593 write_unlock_bh(&rt_hash_table[hash].lock); 595 if (err != -ENOBUFS) { rt_drop(rt); 597 return err; } /* Neighbour tables are full and nothing can be released. Try to shrink route cache, it is most likely it holds some neighbour records. 
*/ 604 if (attempts-- > 0) { int saved_elasticity = ip_rt_gc_elasticity; int saved_int = ip_rt_gc_min_interval; ip_rt_gc_elasticity = 1; ip_rt_gc_min_interval = 0; rt_garbage_collect(); ip_rt_gc_min_interval = saved_int; ip_rt_gc_elasticity = saved_elasticity; 612 goto restart; } 615 if (net_ratelimit()) printk("Neighbour table overflow.\n"); rt_drop(rt); 618 return -ENOBUFS; } } rt->u.rt_next = rt_hash_table[hash].chain; #if RT_CACHE_DEBUG >= 2 if (rt->u.rt_next) { struct rtable * trt; printk("rt_cache @%02x: %u.%u.%u.%u", hash, NIPQUAD(rt->rt_dst)); for (trt=rt->u.rt_next; trt; trt=trt->u.rt_next) printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst)); printk("\n"); } #endif rt_hash_table[hash].chain = rt; 633 write_unlock_bh(&rt_hash_table[hash].lock); *rp = rt; 635 return 0; } 638 void rt_bind_peer(struct rtable *rt, int create) { static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED; struct inet_peer *peer; peer = inet_getpeer(rt->rt_dst, create); 645 spin_lock_bh(&rt_peer_lock); 646 if (rt->peer == NULL) { rt->peer = peer; peer = NULL; } 650 spin_unlock_bh(&rt_peer_lock); 651 if (peer) inet_putpeer(peer); } /* * Peer allocation may fail only in serious out-of-memory conditions. However * we still can generate some output. * Random ID selection looks a bit dangerous because we have no chances to * select ID being unique in a reasonable period of time. * But broken packet identifier may be better than no packet at all. */ 662 static void ip_select_fb_ident(struct iphdr *iph) { static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED; static u32 ip_fallback_id; u32 salt; 668 spin_lock_bh(&ip_fb_id_lock); salt = secure_ip_id(ip_fallback_id ^ iph->daddr); iph->id = salt & 0xFFFF; ip_fallback_id = salt; 672 spin_unlock_bh(&ip_fb_id_lock); } 675 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst) { struct rtable *rt = (struct rtable *) dst; 679 if (rt) { 680 if (rt->peer == NULL) rt_bind_peer(rt, 1); /* If peer is attached to destination, it is never detached, so that we need not to grab a lock to dereference it. 
*/ 686 if (rt->peer) { iph->id = inet_getid(rt->peer); 688 return; } 690 } else { printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph)); } ip_select_fb_ident(iph); } 697 static void rt_del(unsigned hash, struct rtable *rt) { struct rtable **rthp; 701 write_lock_bh(&rt_hash_table[hash].lock); ip_rt_put(rt); 703 for (rthp = &rt_hash_table[hash].chain; *rthp; rthp = &(*rthp)->u.rt_next) { 704 if (*rthp == rt) { *rthp = rt->u.rt_next; rt_free(rt); 707 break; } } 710 write_unlock_bh(&rt_hash_table[hash].lock); } 713 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, u32 saddr, u8 tos, struct net_device *dev) { int i, k; struct in_device *in_dev = in_dev_get(dev); struct rtable *rth, **rthp; u32 skeys[2] = { saddr, 0 }; int ikeys[2] = { dev->ifindex, 0 }; tos &= IPTOS_RT_MASK; 724 if (!in_dev) 725 return; if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) 728 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw)) 729 goto reject_redirect; 731 if (!IN_DEV_SHARED_MEDIA(in_dev)) { 732 if (!inet_addr_onlink(in_dev, new_gw, old_gw)) 733 goto reject_redirect; 734 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) 735 goto reject_redirect; 736 } else { 737 if (inet_addr_type(new_gw) != RTN_UNICAST) 738 goto reject_redirect; } 741 for (i=0; i<2; i++) { 742 for (k=0; k<2; k++) { unsigned hash = rt_hash_code(daddr, skeys[i]^(ikeys[k]<<5), tos); rthp=&rt_hash_table[hash].chain; read_lock(&rt_hash_table[hash].lock); 748 while ( (rth = *rthp) != NULL) { struct rtable *rt; if (rth->key.dst != daddr || rth->key.src != skeys[i] || rth->key.tos != tos || rth->key.oif != ikeys[k] || 755 rth->key.iif != 0) { rthp = &rth->u.rt_next; 757 continue; } if (rth->rt_dst != daddr || rth->rt_src != saddr || rth->u.dst.error || rth->rt_gateway != old_gw || 764 rth->u.dst.dev != dev) 765 break; dst_clone(&rth->u.dst); 768 read_unlock(&rt_hash_table[hash].lock); rt = dst_alloc(&ipv4_dst_ops); 771 if (rt == NULL) { ip_rt_put(rth); in_dev_put(in_dev); 774 return; } /* * Copy all the information. */ *rt = *rth; rt->u.dst.__use = 1; atomic_set(&rt->u.dst.__refcnt, 1); 783 if (rt->u.dst.dev) dev_hold(rt->u.dst.dev); rt->u.dst.lastuse = jiffies; rt->u.dst.neighbour = NULL; rt->u.dst.hh = NULL; rt->u.dst.obsolete = 0; rt->rt_flags |= RTCF_REDIRECTED; /* Gateway is different ... 
*/ rt->rt_gateway = new_gw; /* Redirect received -> path was valid */ dst_confirm(&rth->u.dst); 798 if (rt->peer) atomic_inc(&rt->peer->refcnt); if (arp_bind_neighbour(&rt->u.dst) || 802 !(rt->u.dst.neighbour->nud_state&NUD_VALID)) { 803 if (rt->u.dst.neighbour) neigh_event_send(rt->u.dst.neighbour, NULL); ip_rt_put(rth); rt_drop(rt); 807 goto do_next; } rt_del(hash, rth); 811 if (!rt_intern_hash(hash, rt, &rt)) ip_rt_put(rt); 813 goto do_next; } 815 read_unlock(&rt_hash_table[hash].lock); do_next: ; } } in_dev_put(in_dev); 821 return; reject_redirect: #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about %u.%u.%u.%u ignored.\n" " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, tos %02x\n", NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw), NIPQUAD(saddr), NIPQUAD(daddr), tos); #endif in_dev_put(in_dev); } 834 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) { struct rtable *rt = (struct rtable*)dst; 838 if (rt != NULL) { 839 if (dst->obsolete) { ip_rt_put(rt); 841 return NULL; } 843 if ((rt->rt_flags&RTCF_REDIRECTED) || rt->u.dst.expires) { unsigned hash = rt_hash_code(rt->key.dst, rt->key.src^(rt->key.oif<<5), rt->key.tos); #if RT_CACHE_DEBUG >= 1 printk(KERN_DEBUG "ip_rt_advice: redirect to %u.%u.%u.%u/%02x dropped\n", NIPQUAD(rt->rt_dst), rt->key.tos); #endif rt_del(hash, rt); 850 return NULL; } } 853 return dst; } /* * Algorithm: * 1. The first ip_rt_redirect_number redirects are sent * with exponential backoff, then we stop sending them at all, * assuming that the host ignores our redirects. * 2. If we did not see packets requiring redirects * during ip_rt_redirect_silence, we assume that the host * forgot redirected route and start to send redirects again. * * This algorithm is much cheaper and more intelligent than dumb load limiting * in icmp.c. * * NOTE. Do not forget to inhibit load limiting for redirects (redundant) * and "frag. need" (breaks PMTU discovery) in icmp.c. */ 872 void ip_rt_send_redirect(struct sk_buff *skb) { struct rtable *rt = (struct rtable*)skb->dst; struct in_device *in_dev = in_dev_get(rt->u.dst.dev); 877 if (!in_dev) 878 return; 880 if (!IN_DEV_TX_REDIRECTS(in_dev)) 881 goto out; /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. */ 886 if (jiffies - rt->u.dst.rate_last > ip_rt_redirect_silence) rt->u.dst.rate_tokens = 0; /* Too many ignored redirects; do not send anything * set u.dst.rate_last to the last seen redirected packet. */ 892 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) { rt->u.dst.rate_last = jiffies; 894 goto out; } /* Check for load limit; set rate_last to the latest sent * redirect. 
*/ 900 if (jiffies - rt->u.dst.rate_last > (ip_rt_redirect_load<<rt->u.dst.rate_tokens)) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); rt->u.dst.rate_last = jiffies; ++rt->u.dst.rate_tokens; #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && rt->u.dst.rate_tokens == ip_rt_redirect_number && net_ratelimit()) printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores redirects for " "%u.%u.%u.%u to %u.%u.%u.%u.\n", NIPQUAD(rt->rt_src), rt->rt_iif, NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway)); #endif } out: in_dev_put(in_dev); } 917 static int ip_error(struct sk_buff *skb) { struct rtable *rt = (struct rtable*)skb->dst; unsigned long now; int code; 923 switch (rt->u.dst.error) { 924 case EINVAL: 925 default: kfree_skb(skb); 927 return 0; 928 case EHOSTUNREACH: code = ICMP_HOST_UNREACH; 930 break; 931 case ENETUNREACH: code = ICMP_NET_UNREACH; 933 break; 934 case EACCES: code = ICMP_PKT_FILTERED; 936 break; } now = jiffies; 940 if ((rt->u.dst.rate_tokens += (now - rt->u.dst.rate_last)) > ip_rt_error_burst) rt->u.dst.rate_tokens = ip_rt_error_burst; rt->u.dst.rate_last = now; 943 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) { rt->u.dst.rate_tokens -= ip_rt_error_cost; icmp_send(skb, ICMP_DEST_UNREACH, code, 0); } kfree_skb(skb); 949 return 0; } /* * The last two values are not from the RFC but * are needed for AMPRnet AX.25 paths. */ static unsigned short mtu_plateau[] = {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; 960 static __inline__ unsigned short guess_mtu(unsigned short old_mtu) { int i; 964 for (i = 0; i < sizeof(mtu_plateau)/sizeof(mtu_plateau[0]); i++) 965 if (old_mtu > mtu_plateau[i]) 966 return mtu_plateau[i]; 967 return 68; } 970 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) { int i; unsigned short old_mtu = ntohs(iph->tot_len); struct rtable *rth; u32 skeys[2] = { iph->saddr, 0, }; u32 daddr = iph->daddr; u8 tos = iph->tos & IPTOS_RT_MASK; unsigned short est_mtu = 0; 980 if (ipv4_config.no_pmtu_disc) 981 return 0; 983 for (i=0; i<2; i++) { unsigned hash = rt_hash_code(daddr, skeys[i], tos); read_lock(&rt_hash_table[hash].lock); 987 for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) { if (rth->key.dst == daddr && rth->key.src == skeys[i] && rth->rt_dst == daddr && rth->rt_src == iph->saddr && rth->key.tos == tos && rth->key.iif == 0 && 994 !(rth->u.dst.mxlock&(1<<RTAX_MTU))) { unsigned short mtu = new_mtu; 997 if (new_mtu < 68 || new_mtu >= old_mtu) { /* BSD 4.2 compatibility hack :-( */ if (mtu == 0 && old_mtu >= rth->u.dst.pmtu && 1001 old_mtu >= 68 + (iph->ihl<<2)) old_mtu -= iph->ihl<<2; mtu = guess_mtu(old_mtu); } 1006 if (mtu <= rth->u.dst.pmtu) { 1007 if (mtu < rth->u.dst.pmtu) { dst_confirm(&rth->u.dst); 1009 if (mtu < ip_rt_min_pmtu) { mtu = ip_rt_min_pmtu; rth->u.dst.mxlock |= (1<<RTAX_MTU); } rth->u.dst.pmtu = mtu; dst_set_expires(&rth->u.dst, ip_rt_mtu_expires); } est_mtu = mtu; } } } 1020 read_unlock(&rt_hash_table[hash].lock); } 1022 return est_mtu ? 
: new_mtu; } 1025 void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu) { if (dst->pmtu > mtu && mtu >= 68 && 1028 !(dst->mxlock&(1<<RTAX_MTU))) { 1029 if (mtu < ip_rt_min_pmtu) { mtu = ip_rt_min_pmtu; dst->mxlock |= (1<<RTAX_MTU); } dst->pmtu = mtu; dst_set_expires(dst, ip_rt_mtu_expires); } } 1038 static struct dst_entry * ipv4_dst_check(struct dst_entry * dst, u32 cookie) { dst_release(dst); 1041 return NULL; } 1044 static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst, struct sk_buff *skb) { 1047 return NULL; } 1050 static void ipv4_dst_destroy(struct dst_entry * dst) { struct rtable *rt = (struct rtable *) dst; struct inet_peer *peer = rt->peer; 1055 if (peer) { rt->peer = NULL; inet_putpeer(peer); } } 1061 static void ipv4_link_failure(struct sk_buff *skb) { struct rtable *rt; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); rt = (struct rtable *) skb->dst; 1068 if (rt) dst_set_expires(&rt->u.dst, 0); } 1072 static int ip_rt_bug(struct sk_buff *skb) { printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n", NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr), skb->dev ? skb->dev->name : "?"); kfree_skb(skb); 1078 return 0; } /* We do not cache source address of outgoing interface, because it is used only by IP RR, TS and SRR options, so that it out of fast path. BTW remember: "addr" is allowed to be not aligned in IP options! */ 1090 void ip_rt_get_source(u8 *addr, struct rtable *rt) { u32 src; struct fib_result res; 1095 if (rt->key.iif == 0) src = rt->rt_src; 1097 else if (fib_lookup(&rt->key, &res) == 0) { #ifdef CONFIG_IP_ROUTE_NAT if (res.type == RTN_NAT) src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); else #endif src = FIB_RES_PREFSRC(res); fib_res_put(&res); 1105 } else src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); memcpy(addr, &src, 4); } #ifdef CONFIG_NET_CLS_ROUTE static void set_class_tag(struct rtable *rt, u32 tag) { if (!(rt->u.dst.tclassid&0xFFFF)) rt->u.dst.tclassid |= tag&0xFFFF; if (!(rt->u.dst.tclassid&0xFFFF0000)) rt->u.dst.tclassid |= tag&0xFFFF0000; } #endif 1120 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) { struct fib_info *fi = res->fi; 1124 if (fi) { 1125 if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) rt->rt_gateway = FIB_RES_GW(*res); memcpy(&rt->u.dst.mxlock, fi->fib_metrics, sizeof(fi->fib_metrics)); 1128 if (fi->fib_mtu == 0) { rt->u.dst.pmtu = rt->u.dst.dev->mtu; if (rt->u.dst.mxlock&(1<<RTAX_MTU) && rt->rt_gateway != rt->rt_dst && 1132 rt->u.dst.pmtu > 576) rt->u.dst.pmtu = 576; } #ifdef CONFIG_NET_CLS_ROUTE rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid; #endif 1138 } else { rt->u.dst.pmtu = rt->u.dst.dev->mtu; } 1141 if (rt->u.dst.pmtu > IP_MAX_MTU) rt->u.dst.pmtu = IP_MAX_MTU; 1143 if (rt->u.dst.advmss == 0) rt->u.dst.advmss = max(rt->u.dst.dev->mtu-40, ip_rt_min_advmss); 1145 if (rt->u.dst.advmss > 65535-40) rt->u.dst.advmss = 65535-40; #ifdef CONFIG_NET_CLS_ROUTE #ifdef CONFIG_IP_MULTIPLE_TABLES set_class_tag(rt, fib_rules_tclass(res)); #endif set_class_tag(rt, itag); #endif rt->rt_type = res->type; } static int 1158 ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, u8 tos, struct net_device *dev, int our) { unsigned hash; struct rtable *rth; u32 spec_dst; struct in_device *in_dev = in_dev_get(dev); u32 itag = 0; /* Primary sanity checks. 
*/ 1169 if (in_dev == NULL) 1170 return -EINVAL; if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) || 1173 skb->protocol != __constant_htons(ETH_P_IP)) 1174 goto e_inval; 1176 if (ZERONET(saddr)) { 1177 if (!LOCAL_MCAST(daddr)) 1178 goto e_inval; spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 1180 } else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag) < 0) 1181 goto e_inval; rth = dst_alloc(&ipv4_dst_ops); 1184 if (!rth) 1185 goto e_nobufs; rth->u.dst.output= ip_rt_bug; atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; rth->key.dst = daddr; rth->rt_dst = daddr; rth->key.tos = tos; #ifdef CONFIG_IP_ROUTE_FWMARK rth->key.fwmark = skb->nfmark; #endif rth->key.src = saddr; rth->rt_src = saddr; #ifdef CONFIG_IP_ROUTE_NAT rth->rt_dst_map = daddr; rth->rt_src_map = saddr; #endif #ifdef CONFIG_NET_CLS_ROUTE rth->u.dst.tclassid = itag; #endif rth->rt_iif = rth->key.iif = dev->ifindex; rth->u.dst.dev = &loopback_dev; dev_hold(rth->u.dst.dev); rth->key.oif = 0; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; rth->rt_type = RTN_MULTICAST; rth->rt_flags = RTCF_MULTICAST; 1215 if (our) { rth->u.dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; } #ifdef CONFIG_IP_MROUTE if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) rth->u.dst.input = ip_mr_input; #endif in_dev_put(in_dev); hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos); 1227 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); e_nobufs: in_dev_put(in_dev); 1231 return -ENOBUFS; e_inval: in_dev_put(in_dev); 1235 return -EINVAL; } /* * NOTE. We drop all the packets that has local source * addresses, because every properly looped back packet * must have correct destination already attached by output routine. * * Such approach solves two big problems: * 1. Not simplex devices are handled properly. * 2. IP spoofing attempts are filtered with 100% of guarantee. */ 1248 int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, u8 tos, struct net_device *dev) { struct rt_key key; struct fib_result res; struct in_device *in_dev = in_dev_get(dev); struct in_device *out_dev = NULL; unsigned flags = 0; u32 itag = 0; struct rtable * rth; unsigned hash; u32 spec_dst; int err = -EINVAL; int free_res = 0; /* * IP on this device is disabled. */ 1267 if (!in_dev) 1268 return -EINVAL; key.dst = daddr; key.src = saddr; key.tos = tos; #ifdef CONFIG_IP_ROUTE_FWMARK key.fwmark = skb->nfmark; #endif key.iif = dev->ifindex; key.oif = 0; key.scope = RT_SCOPE_UNIVERSE; hash = rt_hash_code(daddr, saddr^(key.iif<<5), tos); /* Check for the most weird martians, which can be not detected by fib_lookup. */ 1286 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr)) 1287 goto martian_source; 1289 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0)) 1290 goto brd_input; /* Accept zero addresses only to limited broadcast; * I even do not know to fix it or not. Waiting for complains :-) */ 1295 if (ZERONET(saddr)) 1296 goto martian_source; 1298 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr)) 1299 goto martian_destination; /* * Now we are ready to route packet. */ 1304 if ((err = fib_lookup(&key, &res)) != 0) { 1305 if (!IN_DEV_FORWARD(in_dev)) 1306 goto e_inval; 1307 goto no_route; } free_res = 1; #ifdef CONFIG_IP_ROUTE_NAT /* Policy is applied before mapping destination, but rerouting after map should be made with old source. 
*/ if (1) { u32 src_map = saddr; if (res.r) src_map = fib_rules_policy(saddr, &res, &flags); if (res.type == RTN_NAT) { key.dst = fib_rules_map_destination(daddr, &res); fib_res_put(&res); free_res = 0; if (fib_lookup(&key, &res)) goto e_inval; free_res = 1; if (res.type != RTN_UNICAST) goto e_inval; flags |= RTCF_DNAT; } key.src = src_map; } #endif 1336 if (res.type == RTN_BROADCAST) 1337 goto brd_input; 1339 if (res.type == RTN_LOCAL) { int result; result = fib_validate_source(saddr, daddr, tos, loopback_dev.ifindex, dev, &spec_dst, &itag); 1343 if (result < 0) 1344 goto martian_source; 1345 if (result) flags |= RTCF_DIRECTSRC; spec_dst = daddr; 1348 goto local_input; } 1351 if (!IN_DEV_FORWARD(in_dev)) 1352 goto e_inval; 1353 if (res.type != RTN_UNICAST) 1354 goto martian_destination; #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res.fi->fib_nhs > 1 && key.oif == 0) fib_select_multipath(&key, &res); #endif out_dev = in_dev_get(FIB_RES_DEV(res)); 1361 if (out_dev == NULL) { 1362 if (net_ratelimit()) printk(KERN_CRIT "Bug in ip_route_input_slow(). Please, report\n"); 1364 goto e_inval; } err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst, &itag); 1368 if (err < 0) 1369 goto martian_source; 1371 if (err) flags |= RTCF_DIRECTSRC; if (out_dev == in_dev && err && !(flags&(RTCF_NAT|RTCF_MASQ)) && (IN_DEV_SHARED_MEDIA(out_dev) 1376 || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res)))) flags |= RTCF_DOREDIRECT; 1379 if (skb->protocol != __constant_htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is * invalid for proxy arp. DNAT routes are always valid. */ 1383 if (out_dev == in_dev && !(flags&RTCF_DNAT)) 1384 goto e_inval; } rth = dst_alloc(&ipv4_dst_ops); 1388 if (!rth) 1389 goto e_nobufs; atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; rth->key.dst = daddr; rth->rt_dst = daddr; rth->key.tos = tos; #ifdef CONFIG_IP_ROUTE_FWMARK rth->key.fwmark = skb->nfmark; #endif rth->key.src = saddr; rth->rt_src = saddr; rth->rt_gateway = daddr; #ifdef CONFIG_IP_ROUTE_NAT rth->rt_src_map = key.src; rth->rt_dst_map = key.dst; if (flags&RTCF_DNAT) rth->rt_gateway = key.dst; #endif rth->rt_iif = rth->key.iif = dev->ifindex; rth->u.dst.dev = out_dev->dev; dev_hold(rth->u.dst.dev); rth->key.oif = 0; rth->rt_spec_dst= spec_dst; rth->u.dst.input = ip_forward; rth->u.dst.output = ip_output; rt_set_nexthop(rth, &res, itag); rth->rt_flags = flags; #ifdef CONFIG_NET_FASTROUTE if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) { struct net_device *odev = rth->u.dst.dev; if (odev != dev && dev->accept_fastpath && odev->mtu >= dev->mtu && dev->accept_fastpath(dev, &rth->u.dst) == 0) rth->rt_flags |= RTCF_FAST; } #endif intern: err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); done: in_dev_put(in_dev); 1437 if (out_dev) in_dev_put(out_dev); 1439 if (free_res) fib_res_put(&res); 1441 return err; brd_input: 1444 if (skb->protocol != __constant_htons(ETH_P_IP)) 1445 goto e_inval; 1447 if (ZERONET(saddr)) { spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 1449 } else { err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag); 1451 if (err < 0) 1452 goto martian_source; 1453 if (err) flags |= RTCF_DIRECTSRC; } flags |= RTCF_BROADCAST; res.type = RTN_BROADCAST; local_input: rth = dst_alloc(&ipv4_dst_ops); 1461 if (!rth) 1462 goto e_nobufs; rth->u.dst.output= ip_rt_bug; atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; rth->key.dst = daddr; rth->rt_dst = daddr; rth->key.tos = tos; #ifdef CONFIG_IP_ROUTE_FWMARK 
rth->key.fwmark = skb->nfmark; #endif rth->key.src = saddr; rth->rt_src = saddr; #ifdef CONFIG_IP_ROUTE_NAT rth->rt_dst_map = key.dst; rth->rt_src_map = key.src; #endif #ifdef CONFIG_NET_CLS_ROUTE rth->u.dst.tclassid = itag; #endif rth->rt_iif = rth->key.iif = dev->ifindex; rth->u.dst.dev = &loopback_dev; dev_hold(rth->u.dst.dev); rth->key.oif = 0; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; rth->u.dst.input= ip_local_deliver; rth->rt_flags = flags|RTCF_LOCAL; 1492 if (res.type == RTN_UNREACHABLE) { rth->u.dst.input= ip_error; rth->u.dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } rth->rt_type = res.type; 1498 goto intern; no_route: spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); res.type = RTN_UNREACHABLE; 1503 goto local_input; /* * Do not cache martian addresses: they should be logged (RFC1812) */ martian_destination: #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) printk(KERN_WARNING "martian destination %u.%u.%u.%u from %u.%u.%u.%u, dev %s\n", NIPQUAD(daddr), NIPQUAD(saddr), dev->name); #endif e_inval: err = -EINVAL; 1516 goto done; e_nobufs: err = -ENOBUFS; 1520 goto done; martian_source: #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) { /* * RFC1812 recommendation, if source is martian, * the only hint is MAC header. */ printk(KERN_WARNING "martian source %u.%u.%u.%u from %u.%u.%u.%u, on dev %s\n", NIPQUAD(daddr), NIPQUAD(saddr), dev->name); if (dev->hard_header_len) { int i; unsigned char *p = skb->mac.raw; printk(KERN_WARNING "ll header: "); for (i=0; i<dev->hard_header_len; i++, p++) { printk("%02x", *p); if(i<(dev->hard_header_len-1)) printk(":"); } printk("\n"); } } #endif 1544 goto e_inval; } 1547 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr, u8 tos, struct net_device *dev) { struct rtable * rth; unsigned hash; int iif = dev->ifindex; tos &= IPTOS_RT_MASK; hash = rt_hash_code(daddr, saddr^(iif<<5), tos); read_lock(&rt_hash_table[hash].lock); 1558 for (rth=rt_hash_table[hash].chain; rth; rth=rth->u.rt_next) { if (rth->key.dst == daddr && rth->key.src == saddr && rth->key.iif == iif && rth->key.oif == 0 && #ifdef CONFIG_IP_ROUTE_FWMARK rth->key.fwmark == skb->nfmark && #endif 1566 rth->key.tos == tos) { rth->u.dst.lastuse = jiffies; dst_hold(&rth->u.dst); rth->u.dst.__use++; 1570 read_unlock(&rt_hash_table[hash].lock); skb->dst = (struct dst_entry*)rth; 1572 return 0; } } 1575 read_unlock(&rt_hash_table[hash].lock); /* Multicast recognition logic is moved from route cache to here. The problem was that too many Ethernet cards have broken/missing hardware multicast filters :-( As result the host on multicasting network acquires a lot of useless route cache entries, sort of SDR messages from all the world. Now we try to get rid of them. Really, provided software IP multicast filter is organized reasonably (at least, hashed), it does not result in a slowdown comparing with route cache reject entries. Note, that multicast routers are not affected, because route cache entry is created eventually. 
*/ 1588 if (MULTICAST(daddr)) { struct in_device *in_dev; read_lock(&inetdev_lock); 1592 if ((in_dev = __in_dev_get(dev)) != NULL) { int our = ip_check_mc(in_dev, daddr); 1594 if (our #ifdef CONFIG_IP_MROUTE || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) #endif ) { 1599 read_unlock(&inetdev_lock); 1600 return ip_route_input_mc(skb, daddr, saddr, tos, dev, our); } } 1603 read_unlock(&inetdev_lock); 1604 return -EINVAL; } 1606 return ip_route_input_slow(skb, daddr, saddr, tos, dev); } /* * Major route resolver routine. */ 1613 int ip_route_output_slow(struct rtable **rp, const struct rt_key *oldkey) { struct rt_key key; struct fib_result res; unsigned flags = 0; struct rtable *rth; struct net_device *dev_out = NULL; unsigned hash; int free_res = 0; int err; u32 tos; tos = oldkey->tos & (IPTOS_RT_MASK|RTO_ONLINK); key.dst = oldkey->dst; key.src = oldkey->src; key.tos = tos&IPTOS_RT_MASK; key.iif = loopback_dev.ifindex; key.oif = oldkey->oif; #ifdef CONFIG_IP_ROUTE_FWMARK key.fwmark = oldkey->fwmark; #endif key.scope = (tos&RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE; res.fi = NULL; #ifdef CONFIG_IP_MULTIPLE_TABLES res.r = NULL; #endif 1640 if (oldkey->src) { if (MULTICAST(oldkey->src) || BADCLASS(oldkey->src) 1643 || ZERONET(oldkey->src)) 1644 return -EINVAL; /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ dev_out = ip_dev_find(oldkey->src); 1648 if (dev_out == NULL) 1649 return -EINVAL; /* I removed check for oif == dev_out->oif here. It was wrong by three reasons: 1. ip_dev_find(saddr) can return wrong iface, if saddr is assigned to multiple interfaces. 2. Moreover, we are allowed to send packets with saddr of another iface. --ANK */ if (oldkey->oif == 0 1660 && (MULTICAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF)) { /* Special hack: user can direct multicasts and limited broadcast via necessary interface without fiddling with IP_MULTICAST_IF or IP_PKTINFO. This hack is not just for fun, it allows vic,vat and friends to work. They bind socket to loopback, set ttl to zero and expect that it will work. From the viewpoint of routing cache they are broken, because we are not allowed to build multicast path with loopback source addr (look, routing cache cannot know, that ttl is zero, so that packet will not leave this host and route is valid). Luckily, this hack is good workaround. */ key.oif = dev_out->ifindex; 1677 goto make_route; } 1679 if (dev_out) dev_put(dev_out); dev_out = NULL; } 1683 if (oldkey->oif) { dev_out = dev_get_by_index(oldkey->oif); 1685 if (dev_out == NULL) 1686 return -ENODEV; 1687 if (__in_dev_get(dev_out) == NULL) { dev_put(dev_out); 1689 return -ENODEV; /* Wrong error code */ } 1692 if (LOCAL_MCAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF) { 1693 if (!key.src) key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); 1695 goto make_route; } 1697 if (!key.src) { 1698 if (MULTICAST(oldkey->dst)) key.src = inet_select_addr(dev_out, 0, key.scope); 1700 else if (!oldkey->dst) key.src = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } } 1705 if (!key.dst) { key.dst = key.src; 1707 if (!key.dst) key.dst = key.src = htonl(INADDR_LOOPBACK); 1709 if (dev_out) dev_put(dev_out); dev_out = &loopback_dev; dev_hold(dev_out); key.oif = loopback_dev.ifindex; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; 1716 goto make_route; } 1719 if (fib_lookup(&key, &res)) { res.fi = NULL; 1721 if (oldkey->oif) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. WHY? DW. 
Because we are allowed to send to iface even if it has NO routes and NO assigned addresses. When oif is specified, routing tables are looked up with only one purpose: to catch if destination is gatewayed, rather than direct. Moreover, if MSG_DONTROUTE is set, we send packet, ignoring both routing tables and ifaddr state. --ANK We could make it even if oif is unknown, likely IPv6, but we do not. */ 1740 if (key.src == 0) key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); res.type = RTN_UNICAST; 1743 goto make_route; } 1745 if (dev_out) dev_put(dev_out); 1747 return -ENETUNREACH; } free_res = 1; 1751 if (res.type == RTN_NAT) 1752 goto e_inval; 1754 if (res.type == RTN_LOCAL) { 1755 if (!key.src) key.src = key.dst; 1757 if (dev_out) dev_put(dev_out); dev_out = &loopback_dev; dev_hold(dev_out); key.oif = dev_out->ifindex; 1762 if (res.fi) fib_info_put(res.fi); res.fi = NULL; flags |= RTCF_LOCAL; 1766 goto make_route; } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res.fi->fib_nhs > 1 && key.oif == 0) fib_select_multipath(&key, &res); else #endif 1774 if (res.prefixlen==0 && res.type == RTN_UNICAST && key.oif == 0) fib_select_default(&key, &res); 1777 if (!key.src) key.src = FIB_RES_PREFSRC(res); 1780 if (dev_out) dev_put(dev_out); dev_out = FIB_RES_DEV(res); dev_hold(dev_out); key.oif = dev_out->ifindex; make_route: 1787 if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK)) 1788 goto e_inval; 1790 if (key.dst == 0xFFFFFFFF) res.type = RTN_BROADCAST; 1792 else if (MULTICAST(key.dst)) res.type = RTN_MULTICAST; 1794 else if (BADCLASS(key.dst) || ZERONET(key.dst)) 1795 goto e_inval; 1797 if (dev_out->flags&IFF_LOOPBACK) flags |= RTCF_LOCAL; 1800 if (res.type == RTN_BROADCAST) { flags |= RTCF_BROADCAST|RTCF_LOCAL; 1802 if (res.fi) { fib_info_put(res.fi); res.fi = NULL; } 1806 } else if (res.type == RTN_MULTICAST) { flags |= RTCF_MULTICAST|RTCF_LOCAL; read_lock(&inetdev_lock); 1809 if (!__in_dev_get(dev_out) || !ip_check_mc(__in_dev_get(dev_out), oldkey->dst)) flags &= ~RTCF_LOCAL; 1811 read_unlock(&inetdev_lock); /* If multicast route do not exist use default one, but do not gateway in this case. Yes, it is hack. */ 1816 if (res.fi && res.prefixlen < 4) { fib_info_put(res.fi); res.fi = NULL; } } rth = dst_alloc(&ipv4_dst_ops); 1823 if (!rth) 1824 goto e_nobufs; atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; rth->key.dst = oldkey->dst; rth->key.tos = tos; rth->key.src = oldkey->src; rth->key.iif = 0; rth->key.oif = oldkey->oif; #ifdef CONFIG_IP_ROUTE_FWMARK rth->key.fwmark = oldkey->fwmark; #endif rth->rt_dst = key.dst; rth->rt_src = key.src; #ifdef CONFIG_IP_ROUTE_NAT rth->rt_dst_map = key.dst; rth->rt_src_map = key.src; #endif rth->rt_iif = oldkey->oif ? 
: dev_out->ifindex; rth->u.dst.dev = dev_out; dev_hold(dev_out); rth->rt_gateway = key.dst; rth->rt_spec_dst= key.src; rth->u.dst.output=ip_output; 1850 if (flags&RTCF_LOCAL) { rth->u.dst.input = ip_local_deliver; rth->rt_spec_dst = key.dst; } 1854 if (flags&(RTCF_BROADCAST|RTCF_MULTICAST)) { rth->rt_spec_dst = key.src; 1856 if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK)) rth->u.dst.output = ip_mc_output; #ifdef CONFIG_IP_MROUTE if (res.type == RTN_MULTICAST) { struct in_device *in_dev = in_dev_get(dev_out); if (in_dev) { if (IN_DEV_MFORWARD(in_dev) && !LOCAL_MCAST(oldkey->dst)) { rth->u.dst.input = ip_mr_input; rth->u.dst.output = ip_mc_output; } in_dev_put(in_dev); } } #endif } rt_set_nexthop(rth, &res, 0); rth->rt_flags = flags; hash = rt_hash_code(oldkey->dst, oldkey->src^(oldkey->oif<<5), tos); err = rt_intern_hash(hash, rth, rp); done: 1879 if (free_res) fib_res_put(&res); 1881 if (dev_out) dev_put(dev_out); 1883 return err; e_inval: err = -EINVAL; 1887 goto done; e_nobufs: err = -ENOBUFS; 1890 goto done; } 1893 int ip_route_output_key(struct rtable **rp, const struct rt_key *key) { unsigned hash; struct rtable *rth; hash = rt_hash_code(key->dst, key->src^(key->oif<<5), key->tos); 1900 read_lock_bh(&rt_hash_table[hash].lock); 1901 for (rth=rt_hash_table[hash].chain; rth; rth=rth->u.rt_next) { if (rth->key.dst == key->dst && rth->key.src == key->src && rth->key.iif == 0 && rth->key.oif == key->oif && #ifdef CONFIG_IP_ROUTE_FWMARK rth->key.fwmark == key->fwmark && #endif !((rth->key.tos^key->tos)&(IPTOS_RT_MASK|RTO_ONLINK)) && 1910 ((key->tos&RTO_TPROXY) || !(rth->rt_flags&RTCF_TPROXY)) 1911 ) { rth->u.dst.lastuse = jiffies; dst_hold(&rth->u.dst); rth->u.dst.__use++; 1915 read_unlock_bh(&rt_hash_table[hash].lock); *rp = rth; 1917 return 0; } } 1920 read_unlock_bh(&rt_hash_table[hash].lock); 1922 return ip_route_output_slow(rp, key); } #ifdef CONFIG_RTNETLINK static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait) { struct rtable *rt = (struct rtable*)skb->dst; struct rtmsg *r; struct nlmsghdr *nlh; unsigned char *b = skb->tail; struct rta_cacheinfo ci; #ifdef CONFIG_IP_MROUTE struct rtattr *eptr; #endif nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r)); r = NLMSG_DATA(nlh); nlh->nlmsg_flags = (nowait && pid) ? 
NLM_F_MULTI : 0; r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; r->rtm_tos = rt->key.tos; r->rtm_table = RT_TABLE_MAIN; r->rtm_type = rt->rt_type; r->rtm_scope = RT_SCOPE_UNIVERSE; r->rtm_protocol = RTPROT_UNSPEC; r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED; if (rt->rt_flags & RTCF_NOTIFY) r->rtm_flags |= RTM_F_NOTIFY; RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst); if (rt->key.src) { r->rtm_src_len = 32; RTA_PUT(skb, RTA_SRC, 4, &rt->key.src); } if (rt->u.dst.dev) RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex); #ifdef CONFIG_NET_CLS_ROUTE if (rt->u.dst.tclassid) RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid); #endif if (rt->key.iif) RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst); else if (rt->rt_src != rt->key.src) RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src); if (rt->rt_dst != rt->rt_gateway) RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway); if (rtnetlink_put_metrics(skb, &rt->u.dst.mxlock) < 0) goto rtattr_failure; ci.rta_lastuse = jiffies - rt->u.dst.lastuse; ci.rta_used = rt->u.dst.__use; ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt); if (rt->u.dst.expires) ci.rta_expires = rt->u.dst.expires - jiffies; else ci.rta_expires = 0; ci.rta_error = rt->u.dst.error; ci.rta_id = 0; ci.rta_ts = 0; ci.rta_tsage = 0; if (rt->peer) { ci.rta_id = rt->peer->ip_id_count; if (rt->peer->tcp_ts_stamp) { ci.rta_ts = rt->peer->tcp_ts; ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp; } } #ifdef CONFIG_IP_MROUTE eptr = (struct rtattr*)skb->tail; #endif RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci); if (rt->key.iif) { #ifdef CONFIG_IP_MROUTE u32 dst = rt->rt_dst; if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) { int err = ipmr_get_route(skb, r, nowait); if (err <= 0) { if (!nowait) { if (err == 0) return 0; goto nlmsg_failure; } else { if (err == -EMSGSIZE) goto nlmsg_failure; ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err; } } } else #endif { RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif); } } nlh->nlmsg_len = skb->tail - b; return skb->len; nlmsg_failure: rtattr_failure: skb_trim(skb, b - skb->data); return -1; } int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) { struct rtattr **rta = arg; struct rtmsg *rtm = NLMSG_DATA(nlh); struct rtable *rt = NULL; u32 dst = 0; u32 src = 0; int iif = 0; int err; struct sk_buff *skb; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) return -ENOBUFS; /* Reserve room for dummy headers, this skb can pass through good chunk of routing engine. 
*/ skb->mac.raw = skb->data; skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); if (rta[RTA_SRC-1]) memcpy(&src, RTA_DATA(rta[RTA_SRC-1]), 4); if (rta[RTA_DST-1]) memcpy(&dst, RTA_DATA(rta[RTA_DST-1]), 4); if (rta[RTA_IIF-1]) memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int)); if (iif) { struct net_device *dev; dev = __dev_get_by_index(iif); if (!dev) return -ENODEV; skb->protocol = __constant_htons(ETH_P_IP); skb->dev = dev; local_bh_disable(); err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); local_bh_enable(); rt = (struct rtable*)skb->dst; if (!err && rt->u.dst.error) err = -rt->u.dst.error; } else { int oif = 0; if (rta[RTA_OIF-1]) memcpy(&oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int)); err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif); } if (err) { kfree_skb(skb); return err; } skb->dst = &rt->u.dst; if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid; err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0); if (err == 0) return 0; if (err < 0) return -EMSGSIZE; err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); if (err < 0) return err; return 0; } int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct rtable *rt; int h, s_h; int idx, s_idx; s_h = cb->args[0]; s_idx = idx = cb->args[1]; for (h=0; h <= rt_hash_mask; h++) { if (h < s_h) continue; if (h > s_h) s_idx = 0; read_lock_bh(&rt_hash_table[h].lock); for (rt = rt_hash_table[h].chain, idx = 0; rt; rt = rt->u.rt_next, idx++) { if (idx < s_idx) continue; skb->dst = dst_clone(&rt->u.dst); if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) { dst_release(xchg(&skb->dst, NULL)); read_unlock_bh(&rt_hash_table[h].lock); goto done; } dst_release(xchg(&skb->dst, NULL)); } read_unlock_bh(&rt_hash_table[h].lock); } done: cb->args[0] = h; cb->args[1] = idx; return skb->len; } #endif /* CONFIG_RTNETLINK */ 2133 void ip_rt_multicast_event(struct in_device *in_dev) { rt_cache_flush(0); } #ifdef CONFIG_SYSCTL static int flush_delay; static 2145 int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp, void *buffer, size_t *lenp) { 2148 if (write) { proc_dointvec(ctl, write, filp, buffer, lenp); rt_cache_flush(flush_delay); 2151 return 0; 2152 } else 2153 return -EINVAL; } 2156 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int *name, int nlen, void *oldval, size_t *oldlenp, void *newval, size_t newlen, void **context) { int delay; 2162 if (newlen != sizeof(int)) 2163 return -EINVAL; 2164 if (get_user(delay,(int *)newval)) 2165 return -EFAULT; rt_cache_flush(delay); 2167 return 0; } ctl_table ipv4_route_table[] = { {NET_IPV4_ROUTE_FLUSH, "flush", &flush_delay, sizeof(int), 0644, NULL, &ipv4_sysctl_rtcache_flush, &ipv4_sysctl_rtcache_flush_strategy }, {NET_IPV4_ROUTE_MIN_DELAY, "min_delay", &ip_rt_min_delay, sizeof(int), 0644, NULL, &proc_dointvec_jiffies, &sysctl_jiffies}, {NET_IPV4_ROUTE_MAX_DELAY, "max_delay", &ip_rt_max_delay, sizeof(int), 0644, NULL, &proc_dointvec_jiffies, &sysctl_jiffies}, {NET_IPV4_ROUTE_GC_THRESH, "gc_thresh", &ipv4_dst_ops.gc_thresh, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ROUTE_MAX_SIZE, "max_size", &ip_rt_max_size, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval", &ip_rt_gc_min_interval, sizeof(int), 0644, NULL, &proc_dointvec_jiffies, &sysctl_jiffies}, {NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout", &ip_rt_gc_timeout, sizeof(int), 0644, NULL, 
&proc_dointvec_jiffies, &sysctl_jiffies}, {NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval", &ip_rt_gc_interval, sizeof(int), 0644, NULL, &proc_dointvec_jiffies, &sysctl_jiffies}, {NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load", &ip_rt_redirect_load, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number", &ip_rt_redirect_number, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence", &ip_rt_redirect_silence, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ROUTE_ERROR_COST, "error_cost", &ip_rt_error_cost, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ROUTE_ERROR_BURST, "error_burst", &ip_rt_error_burst, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity", &ip_rt_gc_elasticity, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires", &ip_rt_mtu_expires, sizeof(int), 0644, NULL, &proc_dointvec_jiffies, &sysctl_jiffies}, {NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu", &ip_rt_min_pmtu, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss", &ip_rt_min_advmss, sizeof(int), 0644, NULL, &proc_dointvec}, {0} }; #endif #ifdef CONFIG_NET_CLS_ROUTE struct ip_rt_acct *ip_rt_acct; static int ip_rt_acct_read(char *buffer, char **start, off_t offset, int length, int *eof, void *data) { *start=buffer; if ((offset&3) || (length&3)) return -EIO; if (offset + length >= sizeof(struct ip_rt_acct)*256) { length = sizeof(struct ip_rt_acct)*256 - offset; *eof = 1; } if (length > 0) { u32 *dst = (u32*)buffer; u32 *src = (u32*)(((u8*)ip_rt_acct) + offset); memcpy(dst, src, length); #ifdef CONFIG_SMP if (smp_num_cpus > 1 || cpu_logical_map(0) != 0) { int i; int cnt = length/4; for (i=0; i<smp_num_cpus; i++) { int cpu = cpu_logical_map(i); int k; if (cpu == 0) continue; src = (u32*)(((u8*)ip_rt_acct) + offset + cpu*256*sizeof(struct ip_rt_acct)); for (k=0; k<cnt; k++) dst[k] += src[k]; } } #endif return length; } return 0; } #endif 2273 void __init ip_rt_init(void) { int i, order, goal; #ifdef CONFIG_NET_CLS_ROUTE for (order=0; (PAGE_SIZE<<order) < 256*sizeof(ip_rt_acct)*NR_CPUS; order++) /* NOTHING */; ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); memset(ip_rt_acct, 0, PAGE_SIZE<<order); #endif ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0, SLAB_HWCACHE_ALIGN, NULL, NULL); 2292 if (!ipv4_dst_ops.kmem_cachep) panic("IP: failed to allocate ip_dst_cache\n"); goal = num_physpages >> (26 - PAGE_SHIFT); 2297 for (order = 0; (1UL << order) < goal; order++) /* NOTHING */; 2300 do { rt_hash_mask = (1UL << order) * PAGE_SIZE / sizeof(struct rt_hash_bucket); 2303 while (rt_hash_mask & (rt_hash_mask-1)) rt_hash_mask--; rt_hash_table = (struct rt_hash_bucket *) __get_free_pages(GFP_ATOMIC, order); 2307 } while (rt_hash_table == NULL && --order > 0); 2309 if (!rt_hash_table) panic("Failed to allocate IP route cache hash table\n"); printk("IP: routing cache hash table of %u buckets, %ldKbytes\n", rt_hash_mask, (long) (rt_hash_mask*sizeof(struct rt_hash_bucket))/1024); 2316 for (rt_hash_log=0; (1<<rt_hash_log) != rt_hash_mask; rt_hash_log++) /* NOTHING */; rt_hash_mask--; 2320 for (i = 0; i <= rt_hash_mask; i++) { rt_hash_table[i].lock = RW_LOCK_UNLOCKED; rt_hash_table[i].chain = NULL; } ipv4_dst_ops.gc_thresh = (rt_hash_mask+1); ip_rt_max_size = (rt_hash_mask+1)*16; devinet_init(); ip_fib_init(); rt_flush_timer.function = rt_run_flush; 
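	/* Note: rt_flush_timer is armed on demand from rt_cache_flush() via
	 * mod_timer(); only the periodic expiry timer is started below. */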
rt_periodic_timer.function = rt_check_expire; /* All the timers, started at system startup tend to synchronize. Perturb it a bit. */ rt_periodic_timer.expires = jiffies + net_random()%ip_rt_gc_interval + ip_rt_gc_interval; add_timer(&rt_periodic_timer); proc_net_create ("rt_cache", 0, rt_cache_get_info); #ifdef CONFIG_NET_CLS_ROUTE create_proc_read_entry("net/rt_acct", 0, 0, ip_rt_acct_read, NULL); #endif }
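
/*
 * Illustrative sketch, not part of the original file: how a typical
 * in-kernel caller of this module resolves an output route through the
 * cache and releases it again.  The helper name below is hypothetical;
 * ip_route_output() (the wrapper around ip_route_output_key() above,
 * also used by inet_rtm_getroute()) and ip_rt_put() are the real entry
 * points.
 */
static int __attribute__((unused)) example_output_lookup(u32 daddr, u32 saddr,
							  u8 tos, int oif)
{
	struct rtable *rt;
	int err;

	/* Consults rt_hash_table first, falling back to ip_route_output_slow(). */
	err = ip_route_output(&rt, daddr, saddr, tos, oif);
	if (err)
		return err;

	/* ... attach &rt->u.dst to an skb, read rt->rt_gateway, etc. ... */

	ip_rt_put(rt);	/* drop the reference taken by the lookup */
	return 0;
}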