/*
       * INET		An implementation of the TCP/IP protocol suite for the LINUX
       *		operating system.  INET is implemented using the  BSD Socket
       *		interface as the means of communication with the user level.
       *
       *		RAW - implementation of IP "raw" sockets.
       *
       * Version:	$Id: raw.c,v 1.56 2000/11/28 13:38:38 davem Exp $
       *
       * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
       *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
       *
       * Fixes:
       *		Alan Cox	:	verify_area() fixed up
       *		Alan Cox	:	ICMP error handling
       *		Alan Cox	:	EMSGSIZE if you send too big a packet
       *		Alan Cox	: 	Now uses generic datagrams and shared skbuff
       *					library. No more peek crashes, no more backlogs
       *		Alan Cox	:	Checks sk->broadcast.
       *		Alan Cox	:	Uses skb_free_datagram/skb_copy_datagram
       *		Alan Cox	:	Raw passes ip options too
       *		Alan Cox	:	Setsocketopt added
       *		Alan Cox	:	Fixed error return for broadcasts
       *		Alan Cox	:	Removed wake_up calls
       *		Alan Cox	:	Use ttl/tos
       *		Alan Cox	:	Cleaned up old debugging
       *		Alan Cox	:	Use new kernel side addresses
       *	Arnt Gulbrandsen	:	Fixed MSG_DONTROUTE in raw sockets.
       *		Alan Cox	:	BSD style RAW socket demultiplexing.
       *		Alan Cox	:	Beginnings of mrouted support.
       *		Alan Cox	:	Added IP_HDRINCL option.
       *		Alan Cox	:	Skip broadcast check if BSDism set.
       *		David S. Miller	:	New socket lookup architecture.
       *
       *		This program is free software; you can redistribute it and/or
       *		modify it under the terms of the GNU General Public License
       *		as published by the Free Software Foundation; either version
       *		2 of the License, or (at your option) any later version.
       */
       
      #include <linux/config.h> 
      #include <asm/system.h>
      #include <asm/uaccess.h>
      #include <linux/types.h>
      #include <linux/sched.h>
      #include <linux/errno.h>
      #include <linux/timer.h>
      #include <linux/mm.h>
      #include <linux/kernel.h>
      #include <linux/fcntl.h>
      #include <linux/socket.h>
      #include <linux/in.h>
      #include <linux/inet.h>
      #include <linux/netdevice.h>
      #include <linux/mroute.h>
      #include <net/ip.h>
      #include <net/protocol.h>
      #include <linux/skbuff.h>
      #include <net/sock.h>
      #include <net/icmp.h>
      #include <net/udp.h>
      #include <net/raw.h>
      #include <net/inet_common.h>
      #include <net/checksum.h>
      
      /*
       * Hash table of raw IPv4 sockets.  Sockets are chained per bucket
       * through sk->next / sk->pprev; the bucket index used when hashing is
       * sk->num & (RAWV4_HTABLE_SIZE - 1).
       */
      struct sock *raw_v4_htable[RAWV4_HTABLE_SIZE];
      /* Reader/writer lock taken around every walk or update of raw_v4_htable. */
      rwlock_t raw_v4_lock = RW_LOCK_UNLOCKED;
      
  69  static void raw_v4_hash(struct sock *sk)
      {
      	struct sock **skp = &raw_v4_htable[sk->num & (RAWV4_HTABLE_SIZE - 1)];
      
  73  	write_lock_bh(&raw_v4_lock);
  74  	if ((sk->next = *skp) != NULL)
      		(*skp)->pprev = &sk->next;
      	*skp = sk;
      	sk->pprev = skp;
      	sock_prot_inc_use(sk->prot);
       	sock_hold(sk);
  80  	write_unlock_bh(&raw_v4_lock);
      }
      
  83  static void raw_v4_unhash(struct sock *sk)
      {
  85   	write_lock_bh(&raw_v4_lock);
  86  	if (sk->pprev) {
  87  		if (sk->next)
      			sk->next->pprev = sk->pprev;
      		*sk->pprev = sk->next;
      		sk->pprev = NULL;
      		sock_prot_dec_use(sk->prot);
      		__sock_put(sk);
      	}
  94  	write_unlock_bh(&raw_v4_lock);
      }
      
  97  struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num,
      			     unsigned long raddr, unsigned long laddr,
      			     int dif)
      {
      	struct sock *s = sk;
      
 103  	for(s = sk; s; s = s->next) {
      		if((s->num == num) 				&&
      		   !(s->daddr && s->daddr != raddr) 		&&
      		   !(s->rcv_saddr && s->rcv_saddr != laddr)	&&
 107  		   !(s->bound_dev_if && s->bound_dev_if != dif))
 108  			break; /* gotcha */
      	}
 110  	return s;
      }
      
      /*
       *	0 - deliver
       *	1 - block
       */
 117  static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
      {
      	int    type;
      
      	type = skb->h.icmph->type;
 122  	if (type < 32)
 123  		return test_bit(type, &sk->tp_pinfo.tp_raw4.filter);
      
      	/* Do not block unknown ICMP types */
 126  	return 0;
      }
      
      /* IP input processing comes here for RAW socket delivery.
       * This is fun as to avoid copies we want to make no surplus
       * copies.
       *
       * RFC 1122: SHOULD pass TOS value up to the transport layer.
       * -> It does. And not only TOS, but all IP header.
       */
 136  struct sock *raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
      {
      	struct sock *sk;
      
      	read_lock(&raw_v4_lock);
 141  	if ((sk = raw_v4_htable[hash]) == NULL)
 142  		goto out;
      	sk = __raw_v4_lookup(sk, iph->protocol,
      			     iph->saddr, iph->daddr,
      			     skb->dev->ifindex);
      
 147  	while(sk != NULL) {
      		struct sock *sknext = __raw_v4_lookup(sk->next, iph->protocol,
      						      iph->saddr, iph->daddr,
      						      skb->dev->ifindex);
      		if (iph->protocol != IPPROTO_ICMP ||
 152  		    ! icmp_filter(sk, skb)) {
      			struct sk_buff *clone;
      
 155  			if(sknext == NULL)
 156  				break;
      			clone = skb_clone(skb, GFP_ATOMIC);
      			/* Not releasing hash table! */
 159  			if(clone)
      				raw_rcv(sk, clone);
      		}
      		sk = sknext;
      	}
      out:
 165  	if (sk)
      		sock_hold(sk);
 167  	read_unlock(&raw_v4_lock);
      
 169  	return sk;
      }
      
 172  void raw_err (struct sock *sk, struct sk_buff *skb)
      {
      	int type = skb->h.icmph->type;
      	int code = skb->h.icmph->code;
      	u32 info = 0;
      	int err = 0;
      	int harderr = 0;
      
      	/* Report error on raw socket, if:
      	   1. User requested ip_recverr.
      	   2. Socket is connected (otherwise the error indication
      	      is useless without ip_recverr and error is hard.
      	 */
 185  	if (!sk->protinfo.af_inet.recverr && sk->state != TCP_ESTABLISHED)
 186  		return;
      
 188  	switch (type) {
 189  	default:
 190  	case ICMP_TIME_EXCEEDED:
      		err = EHOSTUNREACH;
 192  		break;
 193  	case ICMP_SOURCE_QUENCH:
 194  		return;
 195  	case ICMP_PARAMETERPROB:
      		err = EPROTO;
      		info = ntohl(skb->h.icmph->un.gateway)>>24;
      		harderr = 1;
 199  		break;
 200  	case ICMP_DEST_UNREACH:
      		err = EHOSTUNREACH;
 202  		if (code > NR_ICMP_UNREACH)
 203  			break;
      		err = icmp_err_convert[code].errno;
      		harderr = icmp_err_convert[code].fatal;
 206  		if (code == ICMP_FRAG_NEEDED) {
      			harderr = (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT);
      			err = EMSGSIZE;
      			info = ntohs(skb->h.icmph->un.frag.mtu);
      		}
      	}
      
 213  	if (sk->protinfo.af_inet.recverr)
      		ip_icmp_error(sk, skb, err, 0, info, (u8 *)(skb->h.icmph + 1));
      
 216  	if (sk->protinfo.af_inet.recverr || harderr) {
      		sk->err = err;
      		sk->error_report(sk);
      	}
      }
      
 222  static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
      {
      	/* Charge it to the socket. */
      	
 226  	if (sock_queue_rcv_skb(sk,skb)<0)
      	{
      		IP_INC_STATS(IpInDiscards);
      		kfree_skb(skb);
 230  		return NET_RX_DROP;
      	}
      
      	IP_INC_STATS(IpInDelivers);
 234  	return NET_RX_SUCCESS;
      }
      
      /*
       *	This should be the easiest of all, all we do is
       *	copy it into a buffer. All demultiplexing is done
       *	in ip.c
       */
      
 243  int raw_rcv(struct sock *sk, struct sk_buff *skb)
      {
      	/* Now we need to copy this into memory. */
      	skb_trim(skb, ntohs(skb->nh.iph->tot_len));
      	
      	skb->h.raw = skb->nh.raw;
      
      	raw_rcv_skb(sk, skb);
 251  	return 0;
      }
      
      /*
       *	Per-send context handed to the raw_getfrag()/raw_getrawfrag()
       *	callbacks through their opaque pointer argument.
       */
      struct rawfakehdr 
      {
      	struct  iovec *iov;	/* user data to copy from (set from msg->msg_iov) */
      	u32	saddr;		/* source address; fills in a zero saddr of a user-built header */
      	struct dst_entry *dst;	/* dst entry of the chosen route, passed to ip_select_ident() */
      };
      
      /*
       *	Send a RAW IP packet.
       */
      
      /*
       *	Callback support is trivial for SOCK_RAW
       */
        
 269  static int raw_getfrag(const void *p, char *to, unsigned int offset, unsigned int fraglen)
      {
      	struct rawfakehdr *rfh = (struct rawfakehdr *) p;
 272  	return memcpy_fromiovecend(to, rfh->iov, offset, fraglen);
      }
      
      /*
       *	IPPROTO_RAW needs extra work.
       */
       
 279  static int raw_getrawfrag(const void *p, char *to, unsigned int offset, unsigned int fraglen)
      {
      	struct rawfakehdr *rfh = (struct rawfakehdr *) p;
      
 283  	if (memcpy_fromiovecend(to, rfh->iov, offset, fraglen))
 284  		return -EFAULT;
      
 286  	if (offset==0) {
      		struct iphdr *iph = (struct iphdr *)to;
 288  		if (!iph->saddr)
      			iph->saddr = rfh->saddr;
      		iph->check=0;
      		iph->tot_len=htons(fraglen);	/* This is right as you can't frag
      						   RAW packets */
      		/*
      	 	 *	Deliberate breach of modularity to keep 
      	 	 *	ip_build_xmit clean (well less messy).
      		 */
 297  		if (!iph->id)
      			ip_select_ident(iph, rfh->dst);
      		iph->check=ip_fast_csum((unsigned char *)iph, iph->ihl);
      	}
 301  	return 0;
      }
      
 304  static int raw_sendmsg(struct sock *sk, struct msghdr *msg, int len)
      {
      	struct ipcm_cookie ipc;
      	struct rawfakehdr rfh;
      	struct rtable *rt = NULL;
      	int free = 0;
      	u32 daddr;
      	u8  tos;
      	int err;
      
      	/* This check is ONLY to check for arithmetic overflow
      	   on integer(!) len. Not more! Real check will be made
      	   in ip_build_xmit --ANK
      
      	   BTW socket.c -> af_*.c -> ... make multiple
      	   invalid conversions size_t -> int. We MUST repair it f.e.
      	   by replacing all of them with size_t and revise all
      	   the places sort of len += sizeof(struct iphdr)
      	   If len was ULONG_MAX-10 it would be cathastrophe  --ANK
      	 */
      
 325  	if (len < 0 || len > 0xFFFF)
 326  		return -EMSGSIZE;
      
      	/*
      	 *	Check the flags.
      	 */
      
 332  	if (msg->msg_flags & MSG_OOB)		/* Mirror BSD error message compatibility */
 333  		return -EOPNOTSUPP;
      			 
      	/*
      	 *	Get and verify the address. 
      	 */
      
 339  	if (msg->msg_namelen) {
      		struct sockaddr_in *usin = (struct sockaddr_in*)msg->msg_name;
 341  		if (msg->msg_namelen < sizeof(*usin))
 342  			return(-EINVAL);
 343  		if (usin->sin_family != AF_INET) {
      			static int complained;
 345  			if (!complained++)
      				printk(KERN_INFO "%s forgot to set AF_INET in raw sendmsg. Fix it!\n", current->comm);
 347  			if (usin->sin_family)
 348  				return -EINVAL;
      		}
      		daddr = usin->sin_addr.s_addr;
      		/* ANK: I did not forget to get protocol from port field.
      		 * I just do not know, who uses this weirdness.
      		 * IP_HDRINCL is much more convenient.
      		 */
 355  	} else {
 356  		if (sk->state != TCP_ESTABLISHED) 
 357  			return(-EINVAL);
      		daddr = sk->daddr;
      	}
      
      	ipc.addr = sk->saddr;
      	ipc.opt = NULL;
      	ipc.oif = sk->bound_dev_if;
      
 365  	if (msg->msg_controllen) {
      		int tmp = ip_cmsg_send(msg, &ipc);
 367  		if (tmp)
 368  			return tmp;
 369  		if (ipc.opt)
      			free=1;
      	}
      
      	rfh.saddr = ipc.addr;
      	ipc.addr = daddr;
      
 376  	if (!ipc.opt)
      		ipc.opt = sk->protinfo.af_inet.opt;
      
 379  	if (ipc.opt) {
      		err = -EINVAL;
      		/* Linux does not mangle headers on raw sockets,
      		 * so that IP options + IP_HDRINCL is non-sense.
      		 */
 384  		if (sk->protinfo.af_inet.hdrincl)
 385  			goto done;
 386  		if (ipc.opt->srr) {
 387  			if (!daddr)
 388  				goto done;
      			daddr = ipc.opt->faddr;
      		}
      	}
      	tos = RT_TOS(sk->protinfo.af_inet.tos) | sk->localroute;
 393  	if (msg->msg_flags&MSG_DONTROUTE)
      		tos |= RTO_ONLINK;
      
 396  	if (MULTICAST(daddr)) {
 397  		if (!ipc.oif)
      			ipc.oif = sk->protinfo.af_inet.mc_index;
 399  		if (!rfh.saddr)
      			rfh.saddr = sk->protinfo.af_inet.mc_addr;
      	}
      
      	err = ip_route_output(&rt, daddr, rfh.saddr, tos, ipc.oif);
      
 405  	if (err)
 406  		goto done;
      
      	err = -EACCES;
 409  	if (rt->rt_flags&RTCF_BROADCAST && !sk->broadcast)
 410  		goto done;
      
 412  	if (msg->msg_flags&MSG_CONFIRM)
 413  		goto do_confirm;
      back_from_confirm:
      
      	rfh.iov = msg->msg_iov;
      	rfh.saddr = rt->rt_src;
      	rfh.dst = &rt->u.dst;
 419  	if (!ipc.addr)
      		ipc.addr = rt->rt_dst;
      	err=ip_build_xmit(sk, sk->protinfo.af_inet.hdrincl ? raw_getrawfrag : raw_getfrag,
      			  &rfh, len, &ipc, rt, msg->msg_flags);
      
      done:
 425  	if (free)
      		kfree(ipc.opt);
      	ip_rt_put(rt);
      
 429  	return err<0 ? err : len;
      
      do_confirm:
      	dst_confirm(&rt->u.dst);
 433  	if (!(msg->msg_flags&MSG_PROBE) || len)
 434  		goto back_from_confirm;
      	err = 0;
 436  	goto done;
      }
      
 439  static void raw_close(struct sock *sk, long timeout)
      {
              /*
      	 * Raw sockets may have direct kernel refereneces. Kill them.
      	 */
      	ip_ra_control(sk, 0, NULL);
      
      	inet_sock_release(sk);
      }
      
      /* This gets rid of all the nasties in af_inet. -DaveM */
 450  static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
      {
      	struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
      	int chk_addr_ret;
      
 455  	if((sk->state != TCP_CLOSE) || (addr_len < sizeof(struct sockaddr_in)))
 456  		return -EINVAL;
      	chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
      	if(addr->sin_addr.s_addr != 0 && chk_addr_ret != RTN_LOCAL &&
 459  	   chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
 460  		return -EADDRNOTAVAIL;
      	sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr;
 462  	if(chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
      		sk->saddr = 0;  /* Use device */
      	sk_dst_reset(sk);
 465  	return 0;
      }
      
      /*
       *	This should be easy, if there is something there
       *	we return it, otherwise we block.
       */
      
 473  int raw_recvmsg(struct sock *sk, struct msghdr *msg, int len,
      		int noblock, int flags,int *addr_len)
      {
      	int copied=0;
      	struct sk_buff *skb;
      	int err;
      	struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
      
 481  	if (flags & MSG_OOB)
 482  		return -EOPNOTSUPP;
      
 484  	if (addr_len)
      		*addr_len=sizeof(*sin);
      
 487  	if (flags & MSG_ERRQUEUE)
 488  		return ip_recv_error(sk, msg, len);
      
      	skb=skb_recv_datagram(sk,flags,noblock,&err);
 491  	if(skb==NULL)
 492   		return err;
      
      	copied = skb->len;
 495  	if (len < copied)
      	{
      		msg->msg_flags |= MSG_TRUNC;
      		copied = len;
      	}
      	
      	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
 502  	if (err)
 503  		goto done;
      
      	sock_recv_timestamp(msg, sk, skb);
      
      	/* Copy the address. */
 508  	if (sin) {
      		sin->sin_family = AF_INET;
      		sin->sin_addr.s_addr = skb->nh.iph->saddr;
      	}
 512  	if (sk->protinfo.af_inet.cmsg_flags)
      		ip_cmsg_recv(msg, skb);
      done:
      	skb_free_datagram(sk, skb);
 516  	return (err ? : copied);
      }
      
 519  static int raw_init(struct sock *sk)
      {
      	struct raw_opt *tp = &(sk->tp_pinfo.tp_raw4);
 522  	if (sk->num == IPPROTO_ICMP)
      		memset(&tp->filter, 0, sizeof(tp->filter));
 524  	return 0;
      }
      
 527  static int raw_seticmpfilter(struct sock *sk, char *optval, int optlen)
      {
 529  	if (optlen > sizeof(struct icmp_filter))
      		optlen = sizeof(struct icmp_filter);
 531  	if (copy_from_user(&sk->tp_pinfo.tp_raw4.filter, optval, optlen))
 532  		return -EFAULT;
 533  	return 0;
      }
      
 536  static int raw_geticmpfilter(struct sock *sk, char *optval, int *optlen)
      {
      	int len;
      
 540  	if (get_user(len,optlen))
 541  		return -EFAULT;
 542  	if (len > sizeof(struct icmp_filter))
      		len = sizeof(struct icmp_filter);
 544  	if (put_user(len, optlen))
 545  		return -EFAULT;
 546  	if (copy_to_user(optval, &sk->tp_pinfo.tp_raw4.filter, len))
 547  		return -EFAULT;
 548  	return 0;
      }
      
 551  static int raw_setsockopt(struct sock *sk, int level, int optname, 
      			  char *optval, int optlen)
      {
 554  	if (level != SOL_RAW)
 555  		return ip_setsockopt(sk, level, optname, optval, optlen);
      
 557  	switch (optname) {
 558  	case ICMP_FILTER:
 559  		if (sk->num != IPPROTO_ICMP)
 560  			return -EOPNOTSUPP;
 561  		return raw_seticmpfilter(sk, optval, optlen);
      	};
      
 564  	return -ENOPROTOOPT;
      }
      
 567  static int raw_getsockopt(struct sock *sk, int level, int optname, 
      			  char *optval, int *optlen)
      {
 570  	if (level != SOL_RAW)
 571  		return ip_getsockopt(sk, level, optname, optval, optlen);
      
 573  	switch (optname) {
 574  	case ICMP_FILTER:
 575  		if (sk->num != IPPROTO_ICMP)
 576  			return -EOPNOTSUPP;
 577  		return raw_geticmpfilter(sk, optval, optlen);
      	};
      
 580  	return -ENOPROTOOPT;
      }
      
 583  static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
      {
 585  	switch(cmd) {
 586  		case SIOCOUTQ:
      		{
      			int amount = atomic_read(&sk->wmem_alloc);
 589  			return put_user(amount, (int *)arg);
      		}
 591  		case SIOCINQ:
      		{
      			struct sk_buff *skb;
      			int amount = 0;
      
 596  			spin_lock_irq(&sk->receive_queue.lock);
      			skb = skb_peek(&sk->receive_queue);
 598  			if (skb != NULL)
      				amount = skb->len;
 600  			spin_unlock_irq(&sk->receive_queue.lock);
 601  			return put_user(amount, (int *)arg);
      		}
      
 604  		default:
      #ifdef CONFIG_IP_MROUTE
      			return ipmr_ioctl(sk, cmd, arg);
      #else
 608  			return -ENOIOCTLCMD;
      #endif
      	}
      }
      
 613  static void get_raw_sock(struct sock *sp, char *tmpbuf, int i)
      {
      	unsigned int dest, src;
      	__u16 destp, srcp;
      
      	dest  = sp->daddr;
      	src   = sp->rcv_saddr;
      	destp = 0;
      	srcp  = sp->num;
      	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
      		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld %d %p",
      		i, src, srcp, dest, destp, sp->state, 
      		atomic_read(&sp->wmem_alloc), atomic_read(&sp->rmem_alloc),
      		0, 0L, 0,
      		sock_i_uid(sp), 0,
      		sock_i_ino(sp),
      		atomic_read(&sp->refcnt), sp);
      }
      
 632  int raw_get_info(char *buffer, char **start, off_t offset, int length)
      {
      	int len = 0, num = 0, i;
      	off_t pos = 0;
      	off_t begin;
      	char tmpbuf[129];
      
 639  	if (offset < 128) 
      		len += sprintf(buffer, "%-127s\n",
      			       "  sl  local_address rem_address   st tx_queue "
      			       "rx_queue tr tm->when retrnsmt   uid  timeout inode");
      	pos = 128;
      	read_lock(&raw_v4_lock);
 645  	for (i = 0; i < RAWV4_HTABLE_SIZE; i++) {
      		struct sock *sk;
      
 648  		for (sk = raw_v4_htable[i]; sk; sk = sk->next, num++) {
 649  			if (sk->family != PF_INET)
 650  				continue;
      			pos += 128;
 652  			if (pos <= offset)
 653  				continue;
      			get_raw_sock(sk, tmpbuf, i);
      			len += sprintf(buffer+len, "%-127s\n", tmpbuf);
 656  			if(len >= length)
 657  				goto out;
      		}
      	}
      out:
 661  	read_unlock(&raw_v4_lock);
      	begin = len - (pos - offset);
      	*start = buffer + begin;
      	len -= begin;
 665  	if(len > length)
      		len = length;
 667  	if (len < 0)
      		len = 0; 
 669  	return len;
      }
      
      /*
       *	Protocol operations table for raw IPv4 sockets.  connect and
       *	disconnect reuse the UDP implementations.  Everything else is
       *	defined in this file.
       */
      struct proto raw_prot = {
      	.name		= "RAW",
      	.close		= raw_close,
      	.connect	= udp_connect,
      	.disconnect	= udp_disconnect,
      	.ioctl		= raw_ioctl,
      	.init		= raw_init,
      	.setsockopt	= raw_setsockopt,
      	.getsockopt	= raw_getsockopt,
      	.sendmsg	= raw_sendmsg,
      	.recvmsg	= raw_recvmsg,
      	.bind		= raw_bind,
      	.backlog_rcv	= raw_rcv_skb,
      	.hash		= raw_v4_hash,
      	.unhash		= raw_v4_unhash,
      };