/*
 * linux/fs/buffer.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 */

/*
 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
 * been avoided by NEVER letting an interrupt change a buffer (except for the
 * data, of course), but instead letting the caller do it.
 */

/* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */

/* Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 */

/* Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. -DaveM
 */

/* Added 32k buffer block sizes - these are required for older ARM systems.
 * - RMK
 */

/* Thread it... -DaveM */

/* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */

#include <linux/config.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/malloc.h>
#include <linux/locks.h>
#include <linux/errno.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/sysrq.h>
#include <linux/file.h>
#include <linux/init.h>
#include <linux/quotaops.h>
#include <linux/iobuf.h>
#include <linux/highmem.h>

#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/bitops.h>
#include <asm/mmu_context.h>

#define NR_SIZES 7
static char buffersize_index[65] =
{-1,  0,  1, -1,  2, -1, -1, -1,  3, -1, -1, -1, -1, -1, -1, -1,
  4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  6};

#define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
#define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
#define NR_RESERVED (2*MAX_BUF_PER_PAGE)
#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
					     number of unused buffer heads */

/* Anti-deadlock ordering:
 *	lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
 */

#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)

/*
 * Hash table gook..
 */
static unsigned int bh_hash_mask;
static unsigned int bh_hash_shift;
static struct buffer_head **hash_table;
static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;

static struct buffer_head *lru_list[NR_LIST];
static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
static int nr_buffers_type[NR_LIST];
static unsigned long size_buffers_type[NR_LIST];

static struct buffer_head * unused_list;
static int nr_unused_buffer_heads;
static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);

struct bh_free_head {
	struct buffer_head *list;
	spinlock_t lock;
};
static struct bh_free_head free_list[NR_SIZES];

static int grow_buffers(int size);
static void __refile_buffer(struct buffer_head *);

/* This is used by some architectures to estimate available memory. */
atomic_t buffermem_pages = ATOMIC_INIT(0);

/* Here is the parameter block for the bdflush process. If you add or
 * remove any of the parameters, make sure to update kernel/sysctl.c.
 */

#define N_PARAM 9

/* The dummy values in this structure are left in there for compatibility
 * with old programs that play with the /proc entries.
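 *
 * Purely as a hedged illustration (none of the following is code in this
 * file): with the usual sysctl wiring in kernel/sysctl.c these nine
 * integers are normally visible as /proc/sys/vm/bdflush, so a setup script
 * might retune the flush thresholds along the lines of
 *
 *	echo 40 64 64 256 500 3000 80 0 0 > /proc/sys/vm/bdflush
 *
 * where the 1st and 7th values correspond to nfract and nfract_sync below,
 * and the jiffies-based values assume HZ=100.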
*/ union bdflush_param { struct { int nfract; /* Percentage of buffer cache dirty to activate bdflush */ int ndirty; /* Maximum number of dirty blocks to write out per wake-cycle */ int nrefill; /* Number of clean buffers to try to obtain each time we call refill */ int dummy1; /* unused */ int interval; /* jiffies delay between kupdate flushes */ int age_buffer; /* Time for normal buffer to age before we flush it */ int nfract_sync; /* Percentage of buffer cache dirty to activate bdflush synchronously */ int dummy2; /* unused */ int dummy3; /* unused */ } b_un; unsigned int data[N_PARAM]; } bdf_prm = {{30, 64, 64, 256, 5*HZ, 30*HZ, 60, 0, 0}}; /* These are the min and max parameter values that we will allow to be assigned */ int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 0, 0, 0}; int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 100, 0, 0}; /* * Rewrote the wait-routines to use the "new" wait-queue functionality, * and getting rid of the cli-sti pairs. The wait-queue routines still * need cli-sti, but now it's just a couple of 386 instructions or so. * * Note that the real wait_on_buffer() is an inline function that checks * if 'b_wait' is set before calling this, so that the queues aren't set * up unnecessarily. */ 145 void __wait_on_buffer(struct buffer_head * bh) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); atomic_inc(&bh->b_count); add_wait_queue(&bh->b_wait, &wait); 152 do { run_task_queue(&tq_disk); 154 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 155 if (!buffer_locked(bh)) 156 break; schedule(); 158 } while (buffer_locked(bh)); tsk->state = TASK_RUNNING; remove_wait_queue(&bh->b_wait, &wait); atomic_dec(&bh->b_count); } /* Call sync_buffers with wait!=0 to ensure that the call does not * return until all buffer writes have completed. Sync() may return * before the writes have finished; fsync() may not. */ /* Godamity-damn. Some buffers (bitmaps for filesystems) * spontaneously dirty themselves without ever brelse being called. * We will ultimately want to put these in a separate list, but for * now we search all of the lists for dirty buffers. */ 174 static int sync_buffers(kdev_t dev, int wait) { int i, retry, pass = 0, err = 0; struct buffer_head * bh, *next; /* One pass for no-wait, three for wait: * 0) write out all dirty, unlocked buffers; * 1) write out all dirty buffers, waiting if locked; * 2) wait for completion by waiting for all buffers to unlock. */ 184 do { retry = 0; /* We search all lists as a failsafe mechanism, not because we expect * there to be dirty buffers on any of the other lists. */ repeat: spin_lock(&lru_list_lock); bh = lru_list[BUF_DIRTY]; 193 if (!bh) 194 goto repeat2; 196 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) { next = bh->b_next_free; 199 if (!lru_list[BUF_DIRTY]) 200 break; 201 if (dev && bh->b_dev != dev) 202 continue; 203 if (buffer_locked(bh)) { /* Buffer is locked; skip it unless wait is * requested AND pass > 0. */ 207 if (!wait || !pass) { retry = 1; 209 continue; } atomic_inc(&bh->b_count); 212 spin_unlock(&lru_list_lock); wait_on_buffer (bh); atomic_dec(&bh->b_count); 215 goto repeat; } /* If an unlocked buffer is not uptodate, there has * been an IO error. Skip it. */ if (wait && buffer_req(bh) && !buffer_locked(bh) && 222 !buffer_dirty(bh) && !buffer_uptodate(bh)) { err = -EIO; 224 continue; } /* Don't write clean buffers. Don't write ANY buffers * on the third pass. 
		 */
		if (!buffer_dirty(bh) || pass >= 2)
			continue;

		atomic_inc(&bh->b_count);
		spin_unlock(&lru_list_lock);
		ll_rw_block(WRITE, 1, &bh);
		atomic_dec(&bh->b_count);
		retry = 1;
		goto repeat;
	}

 repeat2:
	bh = lru_list[BUF_LOCKED];
	if (!bh) {
		spin_unlock(&lru_list_lock);
		break;
	}
	for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
		next = bh->b_next_free;

		if (!lru_list[BUF_LOCKED])
			break;
		if (dev && bh->b_dev != dev)
			continue;
		if (buffer_locked(bh)) {
			/* Buffer is locked; skip it unless wait is
			 * requested AND pass > 0.
			 */
			if (!wait || !pass) {
				retry = 1;
				continue;
			}
			atomic_inc(&bh->b_count);
			spin_unlock(&lru_list_lock);
			wait_on_buffer (bh);
			spin_lock(&lru_list_lock);
			atomic_dec(&bh->b_count);
			goto repeat2;
		}
	}
	spin_unlock(&lru_list_lock);

	/* If we are waiting for the sync to succeed, and if any dirty
	 * blocks were written, then repeat; on the second pass, only
	 * wait for buffers being written (do not pass to write any
	 * more buffers on the second pass).
	 */
	} while (wait && retry && ++pass<=2);

	return err;
}

void sync_dev(kdev_t dev)
{
	sync_supers(dev);
	sync_inodes(dev);
	DQUOT_SYNC(dev);
	/* sync all the dirty buffers out to disk only _after_ all the
	   high level layers have finished generating dirty buffer data
	   (or we'd return with some buffers still dirty on the blockdevice,
	   breaking the semantics of this call) */
	sync_buffers(dev, 0);

	/*
	 * FIXME(eric) we need to sync the physical devices here.
	 * This is because some (scsi) controllers have huge amounts of
	 * cache onboard (hundreds of Mb), and we need to instruct
	 * them to commit all of the dirty memory to disk, and we should
	 * not return until this has happened.
	 *
	 * This would need to get implemented by going through the assorted
	 * layers so that each block major number can be synced, and this
	 * would call down into the upper and mid-layer scsi.
	 */
}

int fsync_dev(kdev_t dev)
{
	sync_buffers(dev, 0);

	lock_kernel();
	sync_supers(dev);
	sync_inodes(dev);
	DQUOT_SYNC(dev);
	unlock_kernel();

	return sync_buffers(dev, 1);
}

asmlinkage long sys_sync(void)
{
	fsync_dev(0);
	return 0;
}

/*
 *	filp may be NULL if called via the msync of a vma.
 */

int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
{
	struct inode * inode = dentry->d_inode;
	struct super_block * sb;
	kdev_t dev;
	int ret;

	lock_kernel();
	/* sync the inode to buffers */
	write_inode_now(inode, 0);

	/* sync the superblock to buffers */
	sb = inode->i_sb;
	lock_super(sb);
	if (sb->s_op && sb->s_op->write_super)
		sb->s_op->write_super(sb);
	unlock_super(sb);

	/* .. finally sync the buffers to disk */
	dev = inode->i_dev;
	ret = sync_buffers(dev, 1);
	unlock_kernel();
	return ret;
}

asmlinkage long sys_fsync(unsigned int fd)
{
	struct file * file;
	struct dentry * dentry;
	struct inode * inode;
	int err;

	err = -EBADF;
	file = fget(fd);
	if (!file)
		goto out;

	dentry = file->f_dentry;
	inode = dentry->d_inode;

	err = -EINVAL;
	if (!file->f_op || !file->f_op->fsync)
		goto out_putf;

	/* We need to protect against concurrent writers..
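	 *
	 * The ordering below matters: filemap_fdatasync() starts write-out
	 * of the inode's dirty pages, the filesystem's ->fsync() method
	 * takes care of the metadata, and filemap_fdatawait() then waits
	 * for the data IO to finish, all under i_sem so that concurrent
	 * writers are kept out in the meantime.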
*/ down(&inode->i_sem); filemap_fdatasync(inode->i_mapping); err = file->f_op->fsync(file, dentry, 0); filemap_fdatawait(inode->i_mapping); up(&inode->i_sem); out_putf: fput(file); out: 381 return err; } 384 asmlinkage long sys_fdatasync(unsigned int fd) { struct file * file; struct dentry * dentry; struct inode * inode; int err; err = -EBADF; file = fget(fd); 393 if (!file) 394 goto out; dentry = file->f_dentry; inode = dentry->d_inode; err = -EINVAL; 400 if (!file->f_op || !file->f_op->fsync) 401 goto out_putf; down(&inode->i_sem); filemap_fdatasync(inode->i_mapping); err = file->f_op->fsync(file, dentry, 1); filemap_fdatawait(inode->i_mapping); up(&inode->i_sem); out_putf: fput(file); out: 412 return err; } /* After several hours of tedious analysis, the following hash * function won. Do not mess with it... -DaveM */ #define _hashfn(dev,block) \ ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \ (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \ ((block) << (bh_hash_shift - 12)))) #define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)] 424 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head) { 426 if ((bh->b_next = *head) != NULL) bh->b_next->b_pprev = &bh->b_next; *head = bh; bh->b_pprev = head; } 432 static __inline__ void __hash_unlink(struct buffer_head *bh) { 434 if (bh->b_pprev) { 435 if (bh->b_next) bh->b_next->b_pprev = bh->b_pprev; *(bh->b_pprev) = bh->b_next; bh->b_pprev = NULL; } } 442 static void __insert_into_lru_list(struct buffer_head * bh, int blist) { struct buffer_head **bhp = &lru_list[blist]; 446 if(!*bhp) { *bhp = bh; bh->b_prev_free = bh; } bh->b_next_free = *bhp; bh->b_prev_free = (*bhp)->b_prev_free; (*bhp)->b_prev_free->b_next_free = bh; (*bhp)->b_prev_free = bh; nr_buffers_type[blist]++; size_buffers_type[blist] += bh->b_size; } 458 static void __remove_from_lru_list(struct buffer_head * bh, int blist) { 460 if (bh->b_prev_free || bh->b_next_free) { bh->b_prev_free->b_next_free = bh->b_next_free; bh->b_next_free->b_prev_free = bh->b_prev_free; 463 if (lru_list[blist] == bh) lru_list[blist] = bh->b_next_free; 465 if (lru_list[blist] == bh) lru_list[blist] = NULL; bh->b_next_free = bh->b_prev_free = NULL; nr_buffers_type[blist]--; size_buffers_type[blist] -= bh->b_size; } } 473 static void __remove_from_free_list(struct buffer_head * bh, int index) { 475 if(bh->b_next_free == bh) free_list[index].list = NULL; 477 else { bh->b_prev_free->b_next_free = bh->b_next_free; bh->b_next_free->b_prev_free = bh->b_prev_free; 480 if (free_list[index].list == bh) free_list[index].list = bh->b_next_free; } bh->b_next_free = bh->b_prev_free = NULL; } /* must be called with both the hash_table_lock and the lru_list_lock held */ 488 static void __remove_from_queues(struct buffer_head *bh) { __hash_unlink(bh); __remove_from_lru_list(bh, bh->b_list); } 494 static void __insert_into_queues(struct buffer_head *bh) { struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr); __hash_link(bh, head); __insert_into_lru_list(bh, bh->b_list); } /* This function must only run if there are no other * references _anywhere_ to this buffer head. 
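 * By the time it is called the buffer has already been unhashed and taken
 * off the LRU lists by its caller, so all that is left to do is reset its
 * state and link it onto the size-indexed free list under that list's lock.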
 */
static void put_last_free(struct buffer_head * bh)
{
	struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
	struct buffer_head **bhp = &head->list;

	bh->b_state = 0;

	spin_lock(&head->lock);
	bh->b_dev = B_FREE;
	if(!*bhp) {
		*bhp = bh;
		bh->b_prev_free = bh;
	}
	bh->b_next_free = *bhp;
	bh->b_prev_free = (*bhp)->b_prev_free;
	(*bhp)->b_prev_free->b_next_free = bh;
	(*bhp)->b_prev_free = bh;
	spin_unlock(&head->lock);
}

/*
 * Why like this, I hear you say... The reason is race-conditions.
 * As we don't lock buffers (unless we are reading them, that is),
 * something might happen to it while we sleep (ie a read-error
 * will force it bad). This shouldn't really happen currently, but
 * the code is ready.
 */
static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size)
{
	struct buffer_head *bh = hash(dev, block);

	for (; bh; bh = bh->b_next)
		if (bh->b_blocknr == block && bh->b_size == size &&
		    bh->b_dev == dev)
			break;
	if (bh)
		atomic_inc(&bh->b_count);

	return bh;
}

struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
{
	struct buffer_head *bh;

	read_lock(&hash_table_lock);
	bh = __get_hash_table(dev, block, size);
	read_unlock(&hash_table_lock);

	return bh;
}

unsigned int get_hardblocksize(kdev_t dev)
{
	/*
	 * Get the hard sector size for the given device.  If we don't know
	 * what it is, return 0.
	 */
	if (hardsect_size[MAJOR(dev)] != NULL) {
		int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];

		if (blksize != 0)
			return blksize;
	}

	/*
	 * We don't know what the hardware sector size for this device is.
	 * Return 0 indicating that we don't know.
	 */
	return 0;
}

void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode)
{
	spin_lock(&lru_list_lock);
	if (bh->b_inode)
		list_del(&bh->b_inode_buffers);
	bh->b_inode = inode;
	list_add(&bh->b_inode_buffers, &inode->i_dirty_buffers);
	spin_unlock(&lru_list_lock);
}

/* The caller must have the lru_list lock before calling the
   remove_inode_queue functions.  */
static void __remove_inode_queue(struct buffer_head *bh)
{
	bh->b_inode = NULL;
	list_del(&bh->b_inode_buffers);
}

static inline void remove_inode_queue(struct buffer_head *bh)
{
	if (bh->b_inode)
		__remove_inode_queue(bh);
}

int inode_has_buffers(struct inode *inode)
{
	int ret;

	spin_lock(&lru_list_lock);
	ret = !list_empty(&inode->i_dirty_buffers);
	spin_unlock(&lru_list_lock);

	return ret;
}

/* If invalidate_buffers() will trash dirty buffers, it means some kind
   of fs corruption is going on. Trashing dirty data always implies losing
   information that was supposed to be just stored on the physical layer
   by the user.

   Thus invalidate_buffers in general usage is not allowed to trash dirty
   buffers. For example ioctl(BLKFLSBUF) expects dirty data to be preserved.

   NOTE: In the case where the user removed a removable-media-disk even if
   there's still dirty data not synced on disk (due to a bug in the device
   driver or to an error by the user), by not destroying the dirty buffers
   we could generate corruption also on the next media inserted, thus a
   parameter is necessary to handle this case in the safest way possible
   (trying not to corrupt the newly inserted disk as well with data
   belonging to the old, now corrupted, disk).

   Also for the ramdisk the natural thing to do in order to release the
   ramdisk memory is to destroy dirty buffers.

   These are two special cases. In normal usage the device driver issues
   a sync on the device (without waiting for I/O completion) and then an
   invalidate_buffers call that doesn't trash dirty buffers. */
void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
{
	int i, nlist, slept;
	struct buffer_head * bh, * bh_next;

 retry:
	slept = 0;
	spin_lock(&lru_list_lock);
	for(nlist = 0; nlist < NR_LIST; nlist++) {
		bh = lru_list[nlist];
		if (!bh)
			continue;
		for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
			bh_next = bh->b_next_free;

			/* Another device? */
			if (bh->b_dev != dev)
				continue;
			/* Part of a mapping? */
			if (bh->b_page->mapping)
				continue;
			if (buffer_locked(bh)) {
				atomic_inc(&bh->b_count);
				spin_unlock(&lru_list_lock);
				wait_on_buffer(bh);
				slept = 1;
				spin_lock(&lru_list_lock);
				atomic_dec(&bh->b_count);
			}

			write_lock(&hash_table_lock);
			if (!atomic_read(&bh->b_count) &&
			    (destroy_dirty_buffers || !buffer_dirty(bh))) {
				remove_inode_queue(bh);
				__remove_from_queues(bh);
				put_last_free(bh);
			}
			/* else complain loudly? */

			write_unlock(&hash_table_lock);
			if (slept)
				goto out;
		}
	}
out:
	spin_unlock(&lru_list_lock);
	if (slept)
		goto retry;
}

void set_blocksize(kdev_t dev, int size)
{
	extern int *blksize_size[];
	int i, nlist, slept;
	struct buffer_head * bh, * bh_next;

	if (!blksize_size[MAJOR(dev)])
		return;

	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
		panic("Invalid blocksize passed to set_blocksize");

	if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
		blksize_size[MAJOR(dev)][MINOR(dev)] = size;
		return;
	}
	if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
		return;
	sync_buffers(dev, 2);
	blksize_size[MAJOR(dev)][MINOR(dev)] = size;

 retry:
	slept = 0;
	spin_lock(&lru_list_lock);
	for(nlist = 0; nlist < NR_LIST; nlist++) {
		bh = lru_list[nlist];
		if (!bh)
			continue;
		for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
			bh_next = bh->b_next_free;
			if (bh->b_dev != dev || bh->b_size == size)
				continue;
			if (buffer_locked(bh)) {
				atomic_inc(&bh->b_count);
				spin_unlock(&lru_list_lock);
				wait_on_buffer(bh);
				slept = 1;
				spin_lock(&lru_list_lock);
				atomic_dec(&bh->b_count);
			}

			write_lock(&hash_table_lock);
			if (!atomic_read(&bh->b_count)) {
				if (buffer_dirty(bh))
					printk(KERN_WARNING
					       "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
					       kdevname(dev), bh->b_blocknr, bh->b_size);
				remove_inode_queue(bh);
				__remove_from_queues(bh);
				put_last_free(bh);
			} else {
				if (atomic_set_buffer_clean(bh))
					__refile_buffer(bh);
				clear_bit(BH_Uptodate, &bh->b_state);
				printk(KERN_WARNING
				       "set_blocksize: "
				       "b_count %d, dev %s, block %lu, from %p\n",
				       atomic_read(&bh->b_count), bdevname(bh->b_dev),
				       bh->b_blocknr, __builtin_return_address(0));
			}
			write_unlock(&hash_table_lock);
			if (slept)
				goto out;
		}
	}
 out:
	spin_unlock(&lru_list_lock);
	if (slept)
		goto retry;
}

/*
 * We used to try various strange things. Let's not.
 * We'll just try to balance dirty buffers, and possibly
 * launder some pages.
*/ 760 static void refill_freelist(int size) { balance_dirty(NODEV); 763 if (free_shortage()) page_launder(GFP_BUFFER, 0); grow_buffers(size); } 768 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) { bh->b_list = BUF_CLEAN; bh->b_end_io = handler; bh->b_private = private; } 775 static void end_buffer_io_async(struct buffer_head * bh, int uptodate) { static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED; unsigned long flags; struct buffer_head *tmp; struct page *page; mark_buffer_uptodate(bh, uptodate); /* This is a temporary buffer used for page I/O. */ page = bh->b_page; 787 if (!uptodate) SetPageError(page); /* * Be _very_ careful from here on. Bad things can happen if * two buffer heads end IO at almost the same time and both * decide that the page is now completely done. * * Async buffer_heads are here only as labels for IO, and get * thrown away once the IO for this page is complete. IO is * deemed complete once all buffers have been visited * (b_count==0) and are now unlocked. We must make sure that * only the _last_ buffer that decrements its count is the one * that unlock the page.. */ 802 spin_lock_irqsave(&page_uptodate_lock, flags); unlock_buffer(bh); atomic_dec(&bh->b_count); tmp = bh->b_this_page; 806 while (tmp != bh) { 807 if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp)) 808 goto still_busy; tmp = tmp->b_this_page; } /* OK, the async IO on this page is complete. */ 813 spin_unlock_irqrestore(&page_uptodate_lock, flags); /* * if none of the buffers had errors then we can set the * page uptodate: */ 819 if (!PageError(page)) SetPageUptodate(page); /* * Run the hooks that have to be done when a page I/O has completed. */ 825 if (PageTestandClearDecrAfter(page)) atomic_dec(&nr_async_pages); 828 UnlockPage(page); 830 return; still_busy: 833 spin_unlock_irqrestore(&page_uptodate_lock, flags); 834 return; } /* * Synchronise all the inode's dirty buffers to the disk. * * We have conflicting pressures: we want to make sure that all * initially dirty buffers get waited on, but that any subsequently * dirtied buffers don't. After all, we don't want fsync to last * forever if somebody is actively writing to the file. * * Do this in two main stages: first we copy dirty buffers to a * temporary inode list, queueing the writes as we go. Then we clean * up, waiting for those writes to complete. * * During this second stage, any subsequent updates to the file may end * up refiling the buffer on the original inode's dirty list again, so * there is a chance we will end up with a buffer queued for write but * not yet completed on that list. So, as a final cleanup we go through * the osync code to catch these locked, dirty buffers without requeuing * any newly dirty buffers for write. 
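 *
 * As a purely illustrative (hypothetical) sketch of the O_SYNC pattern that
 * osync_inode_buffers() further below is meant for - queue each write as
 * the buffer is dirtied, then wait for the lot:
 *
 *	mark_buffer_dirty(bh);
 *	buffer_insert_inode_queue(bh, inode);
 *	ll_rw_block(WRITE, 1, &bh);
 *	...
 *	err = osync_inode_buffers(inode);
 *
 * whereas the function below also queues writes for buffers that were only
 * marked dirty and never submitted.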
*/ 857 int fsync_inode_buffers(struct inode *inode) { struct buffer_head *bh; struct inode tmp; int err = 0, err2; 863 INIT_LIST_HEAD(&tmp.i_dirty_buffers); spin_lock(&lru_list_lock); 867 while (!list_empty(&inode->i_dirty_buffers)) { bh = BH_ENTRY(inode->i_dirty_buffers.next); list_del(&bh->b_inode_buffers); 870 if (!buffer_dirty(bh) && !buffer_locked(bh)) bh->b_inode = NULL; 872 else { bh->b_inode = &tmp; list_add(&bh->b_inode_buffers, &tmp.i_dirty_buffers); 875 if (buffer_dirty(bh)) { atomic_inc(&bh->b_count); 877 spin_unlock(&lru_list_lock); ll_rw_block(WRITE, 1, &bh); brelse(bh); spin_lock(&lru_list_lock); } } } 885 while (!list_empty(&tmp.i_dirty_buffers)) { bh = BH_ENTRY(tmp.i_dirty_buffers.prev); remove_inode_queue(bh); atomic_inc(&bh->b_count); 889 spin_unlock(&lru_list_lock); wait_on_buffer(bh); 891 if (!buffer_uptodate(bh)) err = -EIO; brelse(bh); spin_lock(&lru_list_lock); } 897 spin_unlock(&lru_list_lock); err2 = osync_inode_buffers(inode); 900 if (err) 901 return err; 902 else 903 return err2; } /* * osync is designed to support O_SYNC io. It waits synchronously for * all already-submitted IO to complete, but does not queue any new * writes to the disk. * * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as * you dirty the buffers, and then use osync_inode_buffers to wait for * completion. Any other dirty buffers which are not yet queued for * write will not be flushed to disk by the osync. */ 918 int osync_inode_buffers(struct inode *inode) { struct buffer_head *bh; struct list_head *list; int err = 0; spin_lock(&lru_list_lock); repeat: for (list = inode->i_dirty_buffers.prev; 929 bh = BH_ENTRY(list), list != &inode->i_dirty_buffers; list = bh->b_inode_buffers.prev) { 931 if (buffer_locked(bh)) { atomic_inc(&bh->b_count); 933 spin_unlock(&lru_list_lock); wait_on_buffer(bh); 935 if (!buffer_uptodate(bh)) err = -EIO; brelse(bh); spin_lock(&lru_list_lock); 939 goto repeat; } } 943 spin_unlock(&lru_list_lock); 944 return err; } /* * Invalidate any and all dirty buffers on a given inode. We are * probably unmounting the fs, but that doesn't mean we have already * done a sync(). Just drop the buffers from the inode list. */ 953 void invalidate_inode_buffers(struct inode *inode) { struct list_head *list, *next; spin_lock(&lru_list_lock); list = inode->i_dirty_buffers.next; 959 while (list != &inode->i_dirty_buffers) { next = list->next; remove_inode_queue(BH_ENTRY(list)); list = next; } 964 spin_unlock(&lru_list_lock); } /* * Ok, this is getblk, and it isn't very clear, again to hinder * race-conditions. Most of the code is seldom used, (ie repeating), * so it should be much more efficient than it looks. * * The algorithm is changed: hopefully better, and an elusive bug removed. * * 14.02.92: changed it to sync dirty buffers a bit: better performance * when the filesystem starts to get full of dirty blocks (I hope). */ 978 struct buffer_head * getblk(kdev_t dev, int block, int size) { struct buffer_head * bh; int isize; repeat: spin_lock(&lru_list_lock); write_lock(&hash_table_lock); bh = __get_hash_table(dev, block, size); 987 if (bh) 988 goto out; isize = BUFSIZE_INDEX(size); spin_lock(&free_list[isize].lock); bh = free_list[isize].list; 993 if (bh) { __remove_from_free_list(bh, isize); atomic_set(&bh->b_count, 1); } 997 spin_unlock(&free_list[isize].lock); /* * OK, FINALLY we know that this buffer is the only one of * its kind, we hold a reference (b_count>0), it is unlocked, * and it is clean. 
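	 * (It came straight off the size-indexed free list, so b_size
	 * already matches; all that is left is to stamp it with the device
	 * and block number and hash it back in.)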
	 */
	if (bh) {
		init_buffer(bh, NULL, NULL);
		bh->b_dev = dev;
		bh->b_blocknr = block;
		bh->b_state = 1 << BH_Mapped;

		/* Insert the buffer into the regular lists */
		__insert_into_queues(bh);
	out:
		write_unlock(&hash_table_lock);
		spin_unlock(&lru_list_lock);
		touch_buffer(bh);
		return bh;
	}

	/*
	 * If we block while refilling the free list, somebody may
	 * create the buffer first ... search the hashes again.
	 */
	write_unlock(&hash_table_lock);
	spin_unlock(&lru_list_lock);
	refill_freelist(size);
	goto repeat;
}

/* -1 -> no need to flush
    0 -> async flush
    1 -> sync flush (wait for I/O completion) */
int balance_dirty_state(kdev_t dev)
{
	unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
	int shortage;

	dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
	tot = nr_free_buffer_pages();

	dirty *= 100;
	soft_dirty_limit = tot * bdf_prm.b_un.nfract;
	hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync;

	/* First, check for the "real" dirty limit. */
	if (dirty > soft_dirty_limit) {
		if (dirty > hard_dirty_limit)
			return 1;
		return 0;
	}

	/*
	 * If we are about to get low on free pages and
	 * cleaning the inactive_dirty pages would help
	 * fix this, wake up bdflush.
	 */
	shortage = free_shortage();
	if (shortage && nr_inactive_dirty_pages > shortage &&
			nr_inactive_dirty_pages > freepages.high)
		return 0;

	return -1;
}

/*
 * if a new dirty buffer is created we need to balance bdflush.
 *
 * in the future we might want to make bdflush aware of different
 * pressures on different devices - thus the (currently unused)
 * 'dev' parameter.
 */
void balance_dirty(kdev_t dev)
{
	int state = balance_dirty_state(dev);

	if (state < 0)
		return;
	wakeup_bdflush(state);
}

static __inline__ void __mark_dirty(struct buffer_head *bh)
{
	bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
	refile_buffer(bh);
}

/* atomic version, the user must call balance_dirty() by hand
   as soon as it becomes possible to block */
void __mark_buffer_dirty(struct buffer_head *bh)
{
	if (!atomic_set_buffer_dirty(bh))
		__mark_dirty(bh);
}

void mark_buffer_dirty(struct buffer_head *bh)
{
	if (!atomic_set_buffer_dirty(bh)) {
		__mark_dirty(bh);
		balance_dirty(bh->b_dev);
	}
}

/*
 * A buffer may need to be moved from one buffer list to another
 * (e.g. in case it is not shared any more). Handle this.
 */
static void __refile_buffer(struct buffer_head *bh)
{
	int dispose = BUF_CLEAN;
	if (buffer_locked(bh))
		dispose = BUF_LOCKED;
	if (buffer_dirty(bh))
		dispose = BUF_DIRTY;
	if (buffer_protected(bh))
		dispose = BUF_PROTECTED;
	if (dispose != bh->b_list) {
		__remove_from_lru_list(bh, bh->b_list);
		bh->b_list = dispose;
		if (dispose == BUF_CLEAN)
			remove_inode_queue(bh);
		__insert_into_lru_list(bh, dispose);
	}
}

void refile_buffer(struct buffer_head *bh)
{
	spin_lock(&lru_list_lock);
	__refile_buffer(bh);
	spin_unlock(&lru_list_lock);
}

/*
 * Release a buffer head
 */
void __brelse(struct buffer_head * buf)
{
	if (atomic_read(&buf->b_count)) {
		atomic_dec(&buf->b_count);
		return;
	}
	printk("VFS: brelse: Trying to free free buffer\n");
}

/*
 * bforget() is like brelse(), except it puts the buffer on the
 * free list if it can.. We can NOT free the buffer if:
 *  - there are other users of it
 *  - it is locked and thus can have active IO
 */
void __bforget(struct buffer_head * buf)
{
	/* grab the lru lock here to block bdflush.
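	 * Holding lru_list_lock and hash_table_lock keeps bdflush and hash
	 * lookups from re-finding the buffer while we check that ours is
	 * the last reference; if it is not, or the buffer is locked for IO,
	 * we just drop our reference, much as brelse() would.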
*/ spin_lock(&lru_list_lock); write_lock(&hash_table_lock); 1154 if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf)) 1155 goto in_use; __hash_unlink(buf); remove_inode_queue(buf); 1158 write_unlock(&hash_table_lock); __remove_from_lru_list(buf, buf->b_list); 1160 spin_unlock(&lru_list_lock); put_last_free(buf); 1162 return; in_use: 1165 write_unlock(&hash_table_lock); 1166 spin_unlock(&lru_list_lock); } /* * bread() reads a specified block and returns the buffer that contains * it. It returns NULL if the block was unreadable. */ 1173 struct buffer_head * bread(kdev_t dev, int block, int size) { struct buffer_head * bh; bh = getblk(dev, block, size); 1178 if (buffer_uptodate(bh)) 1179 return bh; ll_rw_block(READ, 1, &bh); wait_on_buffer(bh); 1182 if (buffer_uptodate(bh)) 1183 return bh; brelse(bh); 1185 return NULL; } /* * Note: the caller should wake up the buffer_wait list if needed. */ 1191 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh) { 1193 if (bh->b_inode) 1194 BUG(); 1195 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) { kmem_cache_free(bh_cachep, bh); 1197 } else { bh->b_blocknr = -1; init_waitqueue_head(&bh->b_wait); nr_unused_buffer_heads++; bh->b_next_free = unused_list; bh->b_this_page = NULL; unused_list = bh; } } /* * Reserve NR_RESERVED buffer heads for async IO requests to avoid * no-buffer-head deadlock. Return NULL on failure; waiting for * buffer heads is now handled in create_buffers(). */ 1212 static struct buffer_head * get_unused_buffer_head(int async) { struct buffer_head * bh; spin_lock(&unused_list_lock); 1217 if (nr_unused_buffer_heads > NR_RESERVED) { bh = unused_list; unused_list = bh->b_next_free; nr_unused_buffer_heads--; 1221 spin_unlock(&unused_list_lock); 1222 return bh; } 1224 spin_unlock(&unused_list_lock); /* This is critical. We can't swap out pages to get * more buffer heads, because the swap-out may need * more buffer-heads itself. Thus SLAB_BUFFER. */ 1230 if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) { memset(bh, 0, sizeof(*bh)); init_waitqueue_head(&bh->b_wait); 1233 return bh; } /* * If we need an async buffer, use the reserved buffer heads. */ 1239 if (async) { spin_lock(&unused_list_lock); 1241 if (unused_list) { bh = unused_list; unused_list = bh->b_next_free; nr_unused_buffer_heads--; 1245 spin_unlock(&unused_list_lock); 1246 return bh; } 1248 spin_unlock(&unused_list_lock); } #if 0 /* * (Pending further analysis ...) * Ordinary (non-async) requests can use a different memory priority * to free up pages. Any swapping thus generated will use async * buffer heads. */ if(!async && (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) { memset(bh, 0, sizeof(*bh)); init_waitqueue_head(&bh->b_wait); return bh; } #endif 1265 return NULL; } 1268 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset) { bh->b_page = page; 1271 if (offset >= PAGE_SIZE) 1272 BUG(); 1273 if (PageHighMem(page)) /* * This catches illegal uses and preserves the offset: */ bh->b_data = (char *)(0 + offset); 1278 else bh->b_data = page_address(page) + offset; } /* * Create the appropriate buffers when given a page for data area and * the size of each buffer.. Use the bh->b_this_page linked list to * follow the buffers created. Return NULL if unable to create more * buffers. * The async flag is used to differentiate async IO (paging, swapping) * from ordinary buffer allocations, and only async requests are allowed * to sleep waiting for buffer heads. 
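 * Note that the buffers are carved out from the end of the page downwards,
 * so the list head that is returned is the buffer at offset 0 within the
 * page.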
*/ 1291 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async) { struct buffer_head *bh, *head; long offset; try_again: head = NULL; offset = PAGE_SIZE; 1299 while ((offset -= size) >= 0) { bh = get_unused_buffer_head(async); 1301 if (!bh) 1302 goto no_grow; bh->b_dev = B_FREE; /* Flag as unused */ bh->b_this_page = head; head = bh; bh->b_state = 0; bh->b_next_free = NULL; bh->b_pprev = NULL; atomic_set(&bh->b_count, 0); bh->b_size = size; set_bh_page(bh, page, offset); bh->b_list = BUF_CLEAN; bh->b_end_io = NULL; } 1319 return head; /* * In case anything failed, we just free everything we got. */ no_grow: 1324 if (head) { spin_lock(&unused_list_lock); 1326 do { bh = head; head = head->b_this_page; __put_unused_buffer_head(bh); 1330 } while (head); 1331 spin_unlock(&unused_list_lock); /* Wake up any waiters ... */ wake_up(&buffer_wait); } /* * Return failure for non-async IO requests. Async IO requests * are not allowed to fail, so we have to wait until buffer heads * become available. But we don't want tasks sleeping with * partially complete buffers, so all were released above. */ 1343 if (!async) 1344 return NULL; /* We're _really_ low on memory. Now we just * wait for old buffer heads to become free due to * finishing IO. Since this is an async request and * the reserve list is empty, we're sure there are * async buffer heads in use. */ run_task_queue(&tq_disk); /* * Set our state for sleeping, then check again for buffer heads. * This ensures we won't miss a wake_up from an interrupt. */ 1358 wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE); 1359 goto try_again; } 1362 static void unmap_buffer(struct buffer_head * bh) { 1364 if (buffer_mapped(bh)) { mark_buffer_clean(bh); wait_on_buffer(bh); clear_bit(BH_Uptodate, &bh->b_state); clear_bit(BH_Mapped, &bh->b_state); clear_bit(BH_Req, &bh->b_state); clear_bit(BH_New, &bh->b_state); } } /* * We don't have to release all buffers here, but * we have to be sure that no dirty buffer is left * and no IO is going on (no buffer is locked), because * we have truncated the file and are going to free the * blocks on-disk.. */ 1381 int block_flushpage(struct page *page, unsigned long offset) { struct buffer_head *head, *bh, *next; unsigned int curr_off = 0; 1386 if (!PageLocked(page)) 1387 BUG(); 1388 if (!page->buffers) 1389 return 1; head = page->buffers; bh = head; 1393 do { unsigned int next_off = curr_off + bh->b_size; next = bh->b_this_page; /* * is this block fully flushed? */ 1400 if (offset <= curr_off) unmap_buffer(bh); curr_off = next_off; bh = next; 1404 } while (bh != head); /* * subtle. We release buffer-heads only if this is * the 'final' flushpage. We have invalidated the get_block * cached value unconditionally, so real IO is not * possible anymore. * * If the free doesn't work out, the buffers can be * left around - they just turn into anonymous buffers * instead. 
*/ 1416 if (!offset) { 1417 if (!try_to_free_buffers(page, 0)) { atomic_inc(&buffermem_pages); 1419 return 0; } } 1423 return 1; } 1426 static void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize) { struct buffer_head *bh, *head, *tail; head = create_buffers(page, blocksize, 1); 1431 if (page->buffers) 1432 BUG(); bh = head; 1435 do { bh->b_dev = dev; bh->b_blocknr = 0; bh->b_end_io = NULL; tail = bh; bh = bh->b_this_page; 1441 } while (bh); tail->b_this_page = head; page->buffers = head; page_cache_get(page); } /* * We are taking a block for data and we don't want any output from any * buffer-cache aliases starting from return from that function and * until the moment when something will explicitly mark the buffer * dirty (hopefully that will not happen until we will free that block ;-) * We don't even need to mark it not-uptodate - nobody can expect * anything from a newly allocated buffer anyway. We used to used * unmap_buffer() for such invalidation, but that was wrong. We definitely * don't want to mark the alias unmapped, for example - it would confuse * anyone who might pick it with bread() afterwards... */ 1459 static void unmap_underlying_metadata(struct buffer_head * bh) { struct buffer_head *old_bh; old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size); 1464 if (old_bh) { mark_buffer_clean(old_bh); wait_on_buffer(old_bh); clear_bit(BH_Req, &old_bh->b_state); /* Here we could run brelse or bforget. We use bforget because it will try to put the buffer in the freelist. */ __bforget(old_bh); } } /* * NOTE! All mapped/uptodate combinations are valid: * * Mapped Uptodate Meaning * * No No "unknown" - must do get_block() * No Yes "hole" - zero-filled * Yes No "allocated" - allocated on disk, not read in * Yes Yes "valid" - allocated and up-to-date in memory. * * "Dirty" is valid only with the last case (mapped+uptodate). */ /* * block_write_full_page() is SMP-safe - currently it's still * being called with the kernel lock held, but the code is ready. */ 1492 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block) { int err, i; unsigned long block; struct buffer_head *bh, *head; 1498 if (!PageLocked(page)) 1499 BUG(); 1501 if (!page->buffers) create_empty_buffers(page, inode->i_dev, inode->i_sb->s_blocksize); head = page->buffers; block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); bh = head; i = 0; /* Stage 1: make sure we have all the buffers mapped! */ 1511 do { /* * If the buffer isn't up-to-date, we can't be sure * that the buffer has been initialized with the proper * block number information etc.. 
		 *
		 * Leave it to the low-level FS to make all those
		 * decisions (block #0 may actually be a valid block)
		 */
		if (!buffer_mapped(bh)) {
			err = get_block(inode, block, bh, 1);
			if (err)
				goto out;
			if (buffer_new(bh))
				unmap_underlying_metadata(bh);
		}
		bh = bh->b_this_page;
		block++;
	} while (bh != head);

	/* Stage 2: lock the buffers, mark them clean */
	do {
		lock_buffer(bh);
		bh->b_end_io = end_buffer_io_async;
		atomic_inc(&bh->b_count);
		set_bit(BH_Uptodate, &bh->b_state);
		clear_bit(BH_Dirty, &bh->b_state);
		bh = bh->b_this_page;
	} while (bh != head);

	/* Stage 3: submit the IO */
	do {
		submit_bh(WRITE, bh);
		bh = bh->b_this_page;
	} while (bh != head);

	/* Done - end_buffer_io_async will unlock */
	SetPageUptodate(page);
	return 0;

out:
	ClearPageUptodate(page);
	UnlockPage(page);
	return err;
}

static int __block_prepare_write(struct inode *inode, struct page *page,
		unsigned from, unsigned to, get_block_t *get_block)
{
	unsigned block_start, block_end;
	unsigned long block;
	int err = 0;
	unsigned blocksize, bbits;
	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
	char *kaddr = kmap(page);

	blocksize = inode->i_sb->s_blocksize;
	if (!page->buffers)
		create_empty_buffers(page, inode->i_dev, blocksize);
	head = page->buffers;

	bbits = inode->i_sb->s_blocksize_bits;
	block = page->index << (PAGE_CACHE_SHIFT - bbits);

	for(bh = head, block_start = 0; bh != head || !block_start;
	    block++, block_start=block_end, bh = bh->b_this_page) {
		if (!bh)
			BUG();
		block_end = block_start+blocksize;
		if (block_end <= from)
			continue;
		if (block_start >= to)
			break;
		if (!buffer_mapped(bh)) {
			err = get_block(inode, block, bh, 1);
			if (err)
				goto out;
			if (buffer_new(bh)) {
				unmap_underlying_metadata(bh);
				if (Page_Uptodate(page)) {
					set_bit(BH_Uptodate, &bh->b_state);
					continue;
				}
				if (block_end > to)
					memset(kaddr+to, 0, block_end-to);
				if (block_start < from)
					memset(kaddr+block_start, 0, from-block_start);
				if (block_end > to || block_start < from)
					flush_dcache_page(page);
				continue;
			}
		}
		if (Page_Uptodate(page)) {
			set_bit(BH_Uptodate, &bh->b_state);
			continue;
		}
		if (!buffer_uptodate(bh) &&
		     (block_start < from || block_end > to)) {
			ll_rw_block(READ, 1, &bh);
			*wait_bh++=bh;
		}
	}
	/*
	 * If we issued read requests - let them complete.
	 */
	while(wait_bh > wait) {
		wait_on_buffer(*--wait_bh);
		err = -EIO;
		if (!buffer_uptodate(*wait_bh))
			goto out;
	}
	return 0;
out:
	return err;
}

static int __block_commit_write(struct inode *inode, struct page *page,
		unsigned from, unsigned to)
{
	unsigned block_start, block_end;
	int partial = 0, need_balance_dirty = 0;
	unsigned blocksize;
	struct buffer_head *bh, *head;

	blocksize = inode->i_sb->s_blocksize;

	for(bh = head = page->buffers, block_start = 0;
	    bh != head || !block_start;
	    block_start=block_end, bh = bh->b_this_page) {
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (!buffer_uptodate(bh))
				partial = 1;
		} else {
			set_bit(BH_Uptodate, &bh->b_state);
			if (!atomic_set_buffer_dirty(bh)) {
				__mark_dirty(bh);
				buffer_insert_inode_queue(bh, inode);
				need_balance_dirty = 1;
			}
		}
	}

	if (need_balance_dirty)
		balance_dirty(bh->b_dev);
	/*
	 * If this is a partial write that happened to make all buffers
	 * uptodate then we can optimize away a bogus readpage() for
	 * the next read(). Here we 'discover' whether the page went
	 * uptodate as a result of this (potentially partial) write.
*/ 1662 if (!partial) SetPageUptodate(page); 1664 return 0; } /* * Generic "read page" function for block devices that have the normal * get_block functionality. This is most of the block device filesystems. * Reads the page asynchronously --- the unlock_buffer() and * mark_buffer_uptodate() functions propagate buffer state into the * page struct once IO has completed. */ 1674 int block_read_full_page(struct page *page, get_block_t *get_block) { struct inode *inode = page->mapping->host; unsigned long iblock, lblock; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; unsigned int blocksize, blocks; int nr, i; 1682 if (!PageLocked(page)) 1683 PAGE_BUG(page); blocksize = inode->i_sb->s_blocksize; 1685 if (!page->buffers) create_empty_buffers(page, inode->i_dev, blocksize); head = page->buffers; blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits; iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits; bh = head; nr = 0; i = 0; 1696 do { 1697 if (buffer_uptodate(bh)) 1698 continue; 1700 if (!buffer_mapped(bh)) { 1701 if (iblock < lblock) { 1702 if (get_block(inode, iblock, bh, 0)) 1703 continue; } 1705 if (!buffer_mapped(bh)) { memset(kmap(page) + i*blocksize, 0, blocksize); 1707 flush_dcache_page(page); 1708 kunmap(page); set_bit(BH_Uptodate, &bh->b_state); 1710 continue; } /* get_block() might have updated the buffer synchronously */ 1713 if (buffer_uptodate(bh)) 1714 continue; } arr[nr] = bh; nr++; 1719 } while (i++, iblock++, (bh = bh->b_this_page) != head); 1721 if (!nr) { /* * all buffers are uptodate - we can set the page * uptodate as well. */ SetPageUptodate(page); 1727 UnlockPage(page); 1728 return 0; } /* Stage two: lock the buffers */ 1732 for (i = 0; i < nr; i++) { struct buffer_head * bh = arr[i]; lock_buffer(bh); bh->b_end_io = end_buffer_io_async; atomic_inc(&bh->b_count); } /* Stage 3: start the IO */ 1740 for (i = 0; i < nr; i++) submit_bh(READ, arr[i]); 1743 return 0; } /* * For moronic filesystems that do not allow holes in file. * We may have to extend the file. 
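 * The loop below therefore creates, zero-fills and commits every page
 * between the old end of file (*bytes) and the page being written,
 * rounding *bytes up to a block boundary as it goes, so the file never
 * ends up containing an unallocated hole.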
*/ 1751 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes) { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; struct page *new_page; unsigned long pgpos; long status; unsigned zerofrom; unsigned blocksize = inode->i_sb->s_blocksize; char *kaddr; 1762 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) { status = -ENOMEM; new_page = grab_cache_page(mapping, pgpos); 1765 if (!new_page) 1766 goto out; /* we might sleep */ 1768 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) { 1769 UnlockPage(new_page); page_cache_release(new_page); 1771 continue; } zerofrom = *bytes & ~PAGE_CACHE_MASK; 1774 if (zerofrom & (blocksize-1)) { *bytes |= (blocksize-1); (*bytes)++; } status = __block_prepare_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE, get_block); 1780 if (status) 1781 goto out_unmap; kaddr = page_address(new_page); memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); 1784 flush_dcache_page(new_page); __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE); 1786 kunmap(new_page); 1787 UnlockPage(new_page); page_cache_release(new_page); } 1791 if (page->index < pgpos) { /* completely inside the area */ zerofrom = offset; 1794 } else { /* page covers the boundary, find the boundary offset */ zerofrom = *bytes & ~PAGE_CACHE_MASK; /* if we will expand the thing last block will be filled */ 1799 if (to > zerofrom && (zerofrom & (blocksize-1))) { *bytes |= (blocksize-1); (*bytes)++; } /* starting below the boundary? Nothing to zero out */ 1805 if (offset <= zerofrom) zerofrom = offset; } status = __block_prepare_write(inode, page, zerofrom, to, get_block); 1809 if (status) 1810 goto out1; kaddr = page_address(page); 1812 if (zerofrom < offset) { memset(kaddr+zerofrom, 0, offset-zerofrom); 1814 flush_dcache_page(page); __block_commit_write(inode, page, zerofrom, offset); } 1817 return 0; out1: ClearPageUptodate(page); 1820 kunmap(page); 1821 return status; out_unmap: ClearPageUptodate(new_page); 1825 kunmap(new_page); 1826 UnlockPage(new_page); page_cache_release(new_page); out: 1829 return status; } 1832 int block_prepare_write(struct page *page, unsigned from, unsigned to, get_block_t *get_block) { struct inode *inode = page->mapping->host; int err = __block_prepare_write(inode, page, from, to, get_block); 1837 if (err) { ClearPageUptodate(page); 1839 kunmap(page); } 1841 return err; } 1844 int generic_commit_write(struct file *file, struct page *page, unsigned from, unsigned to) { struct inode *inode = page->mapping->host; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; __block_commit_write(inode,page,from,to); 1850 kunmap(page); 1851 if (pos > inode->i_size) { inode->i_size = pos; mark_inode_dirty(inode); } 1855 return 0; } 1858 int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block) { unsigned long index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); unsigned blocksize, iblock, length, pos; struct inode *inode = mapping->host; struct page *page; struct buffer_head *bh; int err; blocksize = inode->i_sb->s_blocksize; length = offset & (blocksize - 1); /* Block boundary? 
Nothing to do */ 1872 if (!length) 1873 return 0; length = blocksize - length; iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); page = grab_cache_page(mapping, index); err = PTR_ERR(page); 1880 if (IS_ERR(page)) 1881 goto out; 1883 if (!page->buffers) create_empty_buffers(page, inode->i_dev, blocksize); /* Find the buffer that contains "offset" */ bh = page->buffers; pos = blocksize; 1889 while (offset >= pos) { bh = bh->b_this_page; iblock++; pos += blocksize; } err = 0; 1896 if (!buffer_mapped(bh)) { /* Hole? Nothing to do */ 1898 if (buffer_uptodate(bh)) 1899 goto unlock; get_block(inode, iblock, bh, 0); /* Still unmapped? Nothing to do */ 1902 if (!buffer_mapped(bh)) 1903 goto unlock; } /* Ok, it's mapped. Make sure it's up-to-date */ 1907 if (Page_Uptodate(page)) set_bit(BH_Uptodate, &bh->b_state); 1910 if (!buffer_uptodate(bh)) { err = -EIO; ll_rw_block(READ, 1, &bh); wait_on_buffer(bh); /* Uhhuh. Read error. Complain and punt. */ 1915 if (!buffer_uptodate(bh)) 1916 goto unlock; } memset(kmap(page) + offset, 0, length); 1920 flush_dcache_page(page); 1921 kunmap(page); __mark_buffer_dirty(bh); err = 0; unlock: 1927 UnlockPage(page); page_cache_release(page); out: 1930 return err; } 1933 int block_write_full_page(struct page *page, get_block_t *get_block) { struct inode *inode = page->mapping->host; unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; unsigned offset; int err; /* easy case */ 1941 if (page->index < end_index) 1942 return __block_write_full_page(inode, page, get_block); /* things got complicated... */ offset = inode->i_size & (PAGE_CACHE_SIZE-1); /* OK, are we completely out? */ 1947 if (page->index >= end_index+1 || !offset) { 1948 UnlockPage(page); 1949 return -EIO; } /* Sigh... will have to work, then... */ err = __block_prepare_write(inode, page, 0, offset, get_block); 1954 if (!err) { memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset); 1956 flush_dcache_page(page); __block_commit_write(inode,page,0,offset); done: 1959 kunmap(page); 1960 UnlockPage(page); 1961 return err; } ClearPageUptodate(page); 1964 goto done; } 1967 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block) { struct buffer_head tmp; struct inode *inode = mapping->host; tmp.b_state = 0; tmp.b_blocknr = 0; get_block(inode, block, &tmp, 0); 1974 return tmp.b_blocknr; } /* * IO completion routine for a buffer_head being used for kiobuf IO: we * can't dispatch the kiobuf callback until io_count reaches 0. */ 1982 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate) { struct kiobuf *kiobuf; mark_buffer_uptodate(bh, uptodate); kiobuf = bh->b_private; unlock_buffer(bh); end_kio_request(kiobuf, uptodate); } /* * For brw_kiovec: submit a set of buffer_head temporary IOs and wait * for them to complete. Clean up the buffer_heads afterwards. */ 1999 static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size) { int iosize; int i; struct buffer_head *tmp; iosize = 0; spin_lock(&unused_list_lock); 2009 for (i = nr; --i >= 0; ) { iosize += size; tmp = bh[i]; 2012 if (buffer_locked(tmp)) { 2013 spin_unlock(&unused_list_lock); wait_on_buffer(tmp); spin_lock(&unused_list_lock); } 2018 if (!buffer_uptodate(tmp)) { /* We are traversing bh'es in reverse order so clearing iosize on error calculates the amount of IO before the first error. 
*/ iosize = 0; } __put_unused_buffer_head(tmp); } 2027 spin_unlock(&unused_list_lock); 2029 return iosize; } /* * Start I/O on a physical range of kernel memory, defined by a vector * of kiobuf structs (much like a user-space iovec list). * * The kiobuf must already be locked for IO. IO is submitted * asynchronously: you need to check page->locked, page->uptodate, and * maybe wait on page->wait. * * It is up to the caller to make sure that there are enough blocks * passed in to completely map the iobufs to disk. */ 2044 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], kdev_t dev, unsigned long b[], int size) { int err; int length; int transferred; int i; int bufind; int pageind; int bhind; int offset; unsigned long blocknr; struct kiobuf * iobuf = NULL; struct page * map; struct buffer_head *tmp, *bh[KIO_MAX_SECTORS]; 2060 if (!nr) 2061 return 0; /* * First, do some alignment and validity checks */ 2066 for (i = 0; i < nr; i++) { iobuf = iovec[i]; if ((iobuf->offset & (size-1)) || 2069 (iobuf->length & (size-1))) 2070 return -EINVAL; 2071 if (!iobuf->nr_pages) panic("brw_kiovec: iobuf not initialised"); } /* * OK to walk down the iovec doing page IO on each page we find. */ bufind = bhind = transferred = err = 0; 2079 for (i = 0; i < nr; i++) { iobuf = iovec[i]; offset = iobuf->offset; length = iobuf->length; iobuf->errno = 0; 2085 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) { map = iobuf->maplist[pageind]; 2087 if (!map) { err = -EFAULT; 2089 goto error; } 2092 while (length > 0) { blocknr = b[bufind++]; tmp = get_unused_buffer_head(0); 2095 if (!tmp) { err = -ENOMEM; 2097 goto error; } tmp->b_dev = B_FREE; tmp->b_size = size; set_bh_page(tmp, map, offset); tmp->b_this_page = tmp; init_buffer(tmp, end_buffer_io_kiobuf, iobuf); tmp->b_dev = dev; tmp->b_blocknr = blocknr; tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req); 2110 if (rw == WRITE) { set_bit(BH_Uptodate, &tmp->b_state); clear_bit(BH_Dirty, &tmp->b_state); } bh[bhind++] = tmp; length -= size; offset += size; atomic_inc(&iobuf->io_count); submit_bh(rw, tmp); /* * Wait for IO if we have got too much */ 2125 if (bhind >= KIO_MAX_SECTORS) { err = wait_kio(rw, bhind, bh, size); 2127 if (err >= 0) transferred += err; 2129 else 2130 goto finished; bhind = 0; } 2134 if (offset >= PAGE_SIZE) { offset = 0; 2136 break; } } /* End of block loop */ } /* End of page loop */ } /* End of iovec loop */ /* Is there any IO still left to submit? */ 2143 if (bhind) { err = wait_kio(rw, bhind, bh, size); 2145 if (err >= 0) transferred += err; 2147 else 2148 goto finished; } finished: 2152 if (transferred) 2153 return transferred; 2154 return err; error: /* We got an error allocating the bh'es. Just free the current buffer_heads and exit. */ spin_lock(&unused_list_lock); 2160 for (i = bhind; --i >= 0; ) { __put_unused_buffer_head(bh[i]); } 2163 spin_unlock(&unused_list_lock); 2164 goto finished; } /* * Start I/O on a page. * This function expects the page to be locked and may return * before I/O is complete. You then have to check page->locked, * page->uptodate, and maybe wait on page->wait. * * brw_page() is SMP-safe, although it's being called with the * kernel lock held - but the code is ready. * * FIXME: we need a swapper_inode->get_block function to remove * some of the bmap kludges and interface ugliness here. 
*/ 2179 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size) { struct buffer_head *head, *bh; 2183 if (!PageLocked(page)) panic("brw_page: page not locked for I/O"); 2186 if (!page->buffers) create_empty_buffers(page, dev, size); head = bh = page->buffers; /* Stage 1: lock all the buffers */ 2191 do { lock_buffer(bh); bh->b_blocknr = *(b++); set_bit(BH_Mapped, &bh->b_state); bh->b_end_io = end_buffer_io_async; atomic_inc(&bh->b_count); bh = bh->b_this_page; 2198 } while (bh != head); /* Stage 2: start the IO */ 2201 do { submit_bh(rw, bh); bh = bh->b_this_page; 2204 } while (bh != head); 2205 return 0; } 2208 int block_symlink(struct inode *inode, const char *symname, int len) { struct address_space *mapping = inode->i_mapping; struct page *page = grab_cache_page(mapping, 0); int err = -ENOMEM; char *kaddr; 2215 if (!page) 2216 goto fail; err = mapping->a_ops->prepare_write(NULL, page, 0, len-1); 2218 if (err) 2219 goto fail_map; kaddr = page_address(page); memcpy(kaddr, symname, len-1); mapping->a_ops->commit_write(NULL, page, 0, len-1); /* * Notice that we are _not_ going to block here - end of page is * unmapped, so this will only try to map the rest of page, see * that it is unmapped (typically even will not look into inode - * ->i_size will be enough for everything) and zero it out. * OTOH it's obviously correct and should make the page up-to-date. */ err = mapping->a_ops->readpage(NULL, page); wait_on_page(page); page_cache_release(page); 2233 if (err < 0) 2234 goto fail; mark_inode_dirty(inode); 2236 return 0; fail_map: 2238 UnlockPage(page); page_cache_release(page); fail: 2241 return err; } /* * Try to increase the number of buffers available: the size argument * is used to determine what kind of buffers we want. */ 2248 static int grow_buffers(int size) { struct page * page; struct buffer_head *bh, *tmp; struct buffer_head * insert_point; int isize; 2255 if ((size & 511) || (size > PAGE_SIZE)) { printk("VFS: grow_buffers: size = %d\n",size); 2257 return 0; } page = alloc_page(GFP_BUFFER); 2261 if (!page) 2262 goto out; LockPage(page); bh = create_buffers(page, size, 0); 2265 if (!bh) 2266 goto no_buffer_head; isize = BUFSIZE_INDEX(size); spin_lock(&free_list[isize].lock); insert_point = free_list[isize].list; tmp = bh; 2273 while (1) { 2274 if (insert_point) { tmp->b_next_free = insert_point->b_next_free; tmp->b_prev_free = insert_point; insert_point->b_next_free->b_prev_free = tmp; insert_point->b_next_free = tmp; 2279 } else { tmp->b_prev_free = tmp; tmp->b_next_free = tmp; } insert_point = tmp; 2284 if (tmp->b_this_page) tmp = tmp->b_this_page; 2286 else 2287 break; } tmp->b_this_page = bh; free_list[isize].list = bh; 2291 spin_unlock(&free_list[isize].lock); page->buffers = bh; page->flags &= ~(1 << PG_referenced); lru_cache_add(page); 2296 UnlockPage(page); atomic_inc(&buffermem_pages); 2298 return 1; no_buffer_head: 2301 UnlockPage(page); page_cache_release(page); out: 2304 return 0; } /* * Sync all the buffers on one page.. * * If we have old buffers that are locked, we'll * wait on them, but we won't wait on the new ones * we're writing out now. * * This all is required so that we can free up memory * later. 
/*
 * Sync all the buffers on one page..
 *
 * If we have old buffers that are locked, we'll
 * wait on them, but we won't wait on the new ones
 * we're writing out now.
 *
 * This all is required so that we can free up memory
 * later.
 *
 * Wait:
 *	0 - no wait (this does not get called - see try_to_free_buffers below)
 *	1 - start IO for dirty buffers
 *	2 - wait for completion of locked buffers
 */
static void sync_page_buffers(struct buffer_head *bh, int wait)
{
	struct buffer_head * tmp = bh;

	do {
		struct buffer_head *p = tmp;
		tmp = tmp->b_this_page;
		if (buffer_locked(p)) {
			if (wait > 1)
				__wait_on_buffer(p);
		} else if (buffer_dirty(p))
			ll_rw_block(WRITE, 1, &p);
	} while (tmp != bh);
}

/*
 * Can the buffer be thrown out?
 */
#define BUFFER_BUSY_BITS	((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
#define buffer_busy(bh)		(atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))

/*
 * try_to_free_buffers() checks if all the buffers on this particular page
 * are unused, and frees the page if so.
 *
 * Wake up bdflush() if this fails - if we're running low on memory due
 * to dirty buffers, we need to flush them out as quickly as possible.
 *
 * NOTE: There are quite a number of ways that threads of control can
 *       obtain a reference to a buffer head within a page.  So we must
 *       lock out all of these paths to cleanly toss the page.
 */
int try_to_free_buffers(struct page * page, int wait)
{
	struct buffer_head * tmp, * bh = page->buffers;
	int index = BUFSIZE_INDEX(bh->b_size);
	int loop = 0;

cleaned_buffers_try_again:
	spin_lock(&lru_list_lock);
	write_lock(&hash_table_lock);
	spin_lock(&free_list[index].lock);
	tmp = bh;
	do {
		struct buffer_head *p = tmp;

		tmp = tmp->b_this_page;
		if (buffer_busy(p))
			goto busy_buffer_page;
	} while (tmp != bh);

	spin_lock(&unused_list_lock);
	tmp = bh;
	do {
		struct buffer_head * p = tmp;
		tmp = tmp->b_this_page;

		/* The buffer can be either on the regular
		 * queues or on the free list..
		 */
		if (p->b_dev != B_FREE) {
			remove_inode_queue(p);
			__remove_from_queues(p);
		} else
			__remove_from_free_list(p, index);
		__put_unused_buffer_head(p);
	} while (tmp != bh);
	spin_unlock(&unused_list_lock);

	/* Wake up anyone waiting for buffer heads */
	wake_up(&buffer_wait);

	/* And free the page */
	page->buffers = NULL;
	page_cache_release(page);
	spin_unlock(&free_list[index].lock);
	write_unlock(&hash_table_lock);
	spin_unlock(&lru_list_lock);
	return 1;

busy_buffer_page:
	/* Uhhuh, start writeback so that we don't end up with all dirty pages */
	spin_unlock(&free_list[index].lock);
	write_unlock(&hash_table_lock);
	spin_unlock(&lru_list_lock);
	if (wait) {
		sync_page_buffers(bh, wait);
		/* We waited synchronously, so we can free the buffers. */
		if (wait > 1 && !loop) {
			loop = 1;
			goto cleaned_buffers_try_again;
		}
	}
	return 0;
}

/* ================== Debugging =================== */
void show_buffers(void)
{
#ifdef CONFIG_SMP
	struct buffer_head * bh;
	int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
	int protected = 0;
	int nlist;
	static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
#endif

	printk("Buffer memory:   %6dkB\n",
			atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));

#ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
	if (!spin_trylock(&lru_list_lock))
		return;
	for (nlist = 0; nlist < NR_LIST; nlist++) {
		found = locked = dirty = used = lastused = protected = 0;
		bh = lru_list[nlist];
		if (!bh) continue;

		do {
			found++;
			if (buffer_locked(bh))
				locked++;
			if (buffer_protected(bh))
				protected++;
			if (buffer_dirty(bh))
				dirty++;
			if (atomic_read(&bh->b_count))
				used++, lastused = found;
			bh = bh->b_next_free;
		} while (bh != lru_list[nlist]);
		{
			int tmp = nr_buffers_type[nlist];
			if (found != tmp)
				printk("%9s: BUG -> found %d, reported %d\n",
				       buf_types[nlist], found, tmp);
		}
		printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
		       "%d locked, %d protected, %d dirty\n",
		       buf_types[nlist], found, size_buffers_type[nlist]>>10,
		       used, lastused, locked, protected, dirty);
	}
	spin_unlock(&lru_list_lock);
#endif
}

/* ===================== Init ======================= */

/*
 * allocate the hash table and init the free list
 * Use gfp() for the hash table to decrease TLB misses, use
 * SLAB cache for buffer heads.
 */
void __init buffer_init(unsigned long mempages)
{
	int order, i;
	unsigned int nr_hash;

	/* The buffer cache hash table is less important these days,
	 * trim it a bit.
	 */
	mempages >>= 14;

	mempages *= sizeof(struct buffer_head *);

	for (order = 0; (1 << order) < mempages; order++)
		;

	/* try to allocate something until we get it or we're asking
	   for something that is really too small */

	do {
		unsigned long tmp;

		nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
		bh_hash_mask = (nr_hash - 1);

		tmp = nr_hash;
		bh_hash_shift = 0;
		while ((tmp >>= 1UL) != 0UL)
			bh_hash_shift++;

		hash_table = (struct buffer_head **)
		    __get_free_pages(GFP_ATOMIC, order);
	} while (hash_table == NULL && --order > 0);
	printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
	       nr_hash, order, (PAGE_SIZE << order));

	if (!hash_table)
		panic("Failed to allocate buffer hash table\n");

	/* Setup hash chains. */
	for (i = 0; i < nr_hash; i++)
		hash_table[i] = NULL;

	/* Setup free lists. */
	for (i = 0; i < NR_SIZES; i++) {
		free_list[i].list = NULL;
		free_list[i].lock = SPIN_LOCK_UNLOCKED;
	}

	/* Setup lru lists. */
	for (i = 0; i < NR_LIST; i++)
		lru_list[i] = NULL;
}
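/*
 * Illustrative sketch of the sizing arithmetic above, standalone and out of
 * the build: assuming 4kB pages and 4-byte pointers, a machine with
 * mempages = 32768 (128MB) ends up with order = 3, nr_hash = 8192,
 * bh_hash_mask = 0x1fff and bh_hash_shift = 13.  The allocation-failure
 * fallback (the --order retry) is ignored here.
 */
#if 0
static void example_hash_sizing(unsigned long mempages)
{
	unsigned long bytes, tmp;
	unsigned int order, nr_hash, mask, shift;

	bytes = (mempages >> 14) * sizeof(void *);	/* bytes of table wanted */
	for (order = 0; (1UL << order) < bytes; order++)
		;
	nr_hash = (4096UL << order) / sizeof(void *);	/* PAGE_SIZE assumed 4kB */
	mask = nr_hash - 1;
	for (shift = 0, tmp = nr_hash; (tmp >>= 1) != 0; )
		shift++;
	printk("order %u: %u entries, mask %#x, shift %u\n",
	       order, nr_hash, mask, shift);
}
#endif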
/* ====================== bdflush support =================== */

/* This is a simple kernel daemon, whose job it is to provide a dynamic
 * response to dirty buffers.  Once this process is activated, we write back
 * a limited number of buffers to the disks and then go back to sleep again.
 */

/* This is the _only_ function that deals with flushing async writes
   to disk.
   NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
   as all dirty buffers live _only_ in the DIRTY lru list.
   As we never browse the LOCKED and CLEAN lru lists they are in fact
   completely useless. */
static int flush_dirty_buffers(int check_flushtime)
{
	struct buffer_head * bh, *next;
	int flushed = 0, i;

restart:
	spin_lock(&lru_list_lock);
	bh = lru_list[BUF_DIRTY];
	if (!bh)
		goto out_unlock;
	for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) {
		next = bh->b_next_free;

		if (!buffer_dirty(bh)) {
			__refile_buffer(bh);
			continue;
		}
		if (buffer_locked(bh))
			continue;

		if (check_flushtime) {
			/* The dirty lru list is chronologically ordered so
			   if the current bh is not yet timed out,
			   then also all the following bhs
			   will be too young. */
			if (time_before(jiffies, bh->b_flushtime))
				goto out_unlock;
		} else {
			if (++flushed > bdf_prm.b_un.ndirty)
				goto out_unlock;
		}

		/* OK, now we are committed to write it out. */
		atomic_inc(&bh->b_count);
		spin_unlock(&lru_list_lock);
		ll_rw_block(WRITE, 1, &bh);
		atomic_dec(&bh->b_count);

		if (current->need_resched)
			schedule();
		goto restart;
	}
out_unlock:
	spin_unlock(&lru_list_lock);

	return flushed;
}

struct task_struct *bdflush_tsk = 0;

void wakeup_bdflush(int block)
{
	if (current != bdflush_tsk) {
		wake_up_process(bdflush_tsk);

		if (block)
			flush_dirty_buffers(0);
	}
}

/*
 * Here we attempt to write back old buffers.  We also try to flush inodes
 * and supers as well, since this function is essentially "update", and
 * otherwise there would be no way of ensuring that these quantities ever
 * get written back.  Ideally, we would have a timestamp on the inodes
 * and superblocks so that we could write back only the old ones as well.
 */
static int sync_old_buffers(void)
{
	lock_kernel();
	sync_supers(0);
	sync_inodes(0);
	unlock_kernel();

	flush_dirty_buffers(1);
	/* must really sync all the active I/O requests to disk here */
	run_task_queue(&tq_disk);
	return 0;
}

int block_sync_page(struct page *page)
{
	run_task_queue(&tq_disk);
	return 0;
}

/* This is the interface to bdflush.  As we get more sophisticated, we can
 * pass tuning parameters to this "process", to adjust how it behaves.
 * We would want to verify each parameter, however, to make sure that it
 * is reasonable. */

asmlinkage long sys_bdflush(int func, long data)
{
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (func == 1) {
		/* do_exit directly and let kupdate do its work alone. */
		do_exit(0);
#if 0 /* left here as it's the only example of lazy-mm-stuff used from
	 a syscall that doesn't care about the current mm context. */
		int error;
		struct mm_struct *user_mm;

		/*
		 * bdflush will spend all of its time in kernel-space,
		 * without touching user-space, so we can switch it into
		 * 'lazy TLB mode' to reduce the cost of context-switches
		 * to and from bdflush.
		 */
		user_mm = start_lazy_tlb();
		error = sync_old_buffers();
		end_lazy_tlb(user_mm);
		return error;
#endif
	}

	/* Basically func 2 means read param 0, 3 means write param 0, etc. */
	if (func >= 2) {
		int i = (func-2) >> 1;
		if (i >= 0 && i < N_PARAM) {
			if ((func & 1) == 0)
				return put_user(bdf_prm.data[i], (int*)data);

			if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
				bdf_prm.data[i] = data;
				return 0;
			}
		}
		return -EINVAL;
	}

	/* Func 0 used to launch the actual bdflush and then never
	 * return (unless explicitly killed). We return zero here to
	 * remain semi-compatible with present update(8) programs. */
	return 0;
}
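/*
 * Hypothetical sketch of the tuning side of this interface (func >= 2) as
 * seen from userspace: parameter N is read with func = 2*N + 2 and written
 * with func = 2*N + 3.  The example below reads and then writes back nfract
 * (parameter 0); it assumes __NR_bdflush from the architecture's unistd.h
 * and requires CAP_SYS_ADMIN.
 */
#if 0
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int nfract;

	if (syscall(__NR_bdflush, 2, &nfract) < 0) {	/* func 2: read param 0 */
		perror("bdflush read");
		return 1;
	}
	printf("nfract = %d%%\n", nfract);

	if (syscall(__NR_bdflush, 3, (long) nfract) < 0)	/* func 3: write param 0 */
		perror("bdflush write");
	return 0;
}
#endif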
/*
 * This is the actual bdflush daemon itself. It used to be started from
 * the syscall above, but now we launch it ourselves internally with
 * kernel_thread(...) directly after the first thread in init/main.c
 */
int bdflush(void *sem)
{
	struct task_struct *tsk = current;
	int flushed;
	/*
	 *	We have a bare-bones task_struct, and really should fill
	 *	in a few more things so "top" and /proc/2/{exe,root,cwd}
	 *	display semi-sane things. Not real crucial though...
	 */

	tsk->session = 1;
	tsk->pgrp = 1;
	strcpy(tsk->comm, "bdflush");
	bdflush_tsk = tsk;

	/* avoid getting signals */
	spin_lock_irq(&tsk->sigmask_lock);
	flush_signals(tsk);
	sigfillset(&tsk->blocked);
	recalc_sigpending(tsk);
	spin_unlock_irq(&tsk->sigmask_lock);

	up((struct semaphore *)sem);

	for (;;) {
		CHECK_EMERGENCY_SYNC

		flushed = flush_dirty_buffers(0);
		if (free_shortage())
			flushed += page_launder(GFP_KERNEL, 0);

		/*
		 * If there are still a lot of dirty buffers around,
		 * skip the sleep and flush some more. Otherwise, we
		 * go to sleep waiting for a wakeup.
		 */
		set_current_state(TASK_INTERRUPTIBLE);
		if (!flushed || balance_dirty_state(NODEV) < 0) {
			run_task_queue(&tq_disk);
			schedule();
		}
		/* Remember to mark us as running otherwise
		   the next schedule will block. */
		__set_current_state(TASK_RUNNING);
	}
}

/*
 * This is the kernel update daemon. It used to live in userspace, but
 * since it needs to run safely we don't want it killed by mistake.
 * You don't need to change your userspace configuration since
 * the userspace `update` will do_exit(0) at the first sys_bdflush().
 */
int kupdate(void *sem)
{
	struct task_struct * tsk = current;
	int interval;

	tsk->session = 1;
	tsk->pgrp = 1;
	strcpy(tsk->comm, "kupdate");

	/* sigstop and sigcont will stop and wakeup kupdate */
	spin_lock_irq(&tsk->sigmask_lock);
	sigfillset(&tsk->blocked);
	siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
	recalc_sigpending(tsk);
	spin_unlock_irq(&tsk->sigmask_lock);

	up((struct semaphore *)sem);

	for (;;) {
		/* update interval */
		interval = bdf_prm.b_un.interval;
		if (interval) {
			tsk->state = TASK_INTERRUPTIBLE;
			schedule_timeout(interval);
		} else {
		stop_kupdate:
			tsk->state = TASK_STOPPED;
			schedule(); /* wait for SIGCONT */
		}
		/* check for sigstop */
		if (signal_pending(tsk)) {
			int stopped = 0;
			spin_lock_irq(&tsk->sigmask_lock);
			if (sigismember(&tsk->pending.signal, SIGSTOP)) {
				sigdelset(&tsk->pending.signal, SIGSTOP);
				stopped = 1;
			}
			recalc_sigpending(tsk);
			spin_unlock_irq(&tsk->sigmask_lock);
			if (stopped)
				goto stop_kupdate;
		}
#ifdef DEBUG
		printk("kupdate() activated...\n");
#endif
		sync_old_buffers();
	}
}

static int __init bdflush_init(void)
{
	DECLARE_MUTEX_LOCKED(sem);
	kernel_thread(bdflush, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
	down(&sem);
	kernel_thread(kupdate, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
	down(&sem);
	return 0;
}

module_init(bdflush_init)
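/*
 * Hypothetical usage note: because kupdate() above leaves only SIGSTOP and
 * SIGCONT unblocked and parks itself in TASK_STOPPED when it sees SIGSTOP,
 * periodic writeback can be paused and resumed from userspace by signalling
 * the kupdate thread.  A rough sketch (finding kupdate's pid, e.g. from ps,
 * is left out; kupdate_pid is a made-up name):
 */
#if 0
#include <signal.h>

	kill(kupdate_pid, SIGSTOP);	/* kupdate stops after its current sleep */
	/* ... periodic writeback is quiescent here ... */
	kill(kupdate_pid, SIGCONT);	/* kupdate wakes up and resumes flushing */
#endif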