/* * linux/mm/filemap.c * * Copyright (C) 1994-1999 Linus Torvalds */ /* * This file handles the generic file mmap semantics used by * most "normal" filesystems (but you don't /have/ to use this: * the NFS filesystem used to do this differently, for example) */ #include <linux/malloc.h> #include <linux/shm.h> #include <linux/mman.h> #include <linux/locks.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/smp_lock.h> #include <linux/blkdev.h> #include <linux/file.h> #include <linux/swapctl.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/mm.h> #include <asm/pgalloc.h> #include <asm/uaccess.h> #include <asm/mman.h> #include <linux/highmem.h> /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. * * Shared mappings now work. 15.8.1995 Bruno. * * finished 'unifying' the page and buffer cache and SMP-threaded the * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com> * * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de> */ atomic_t page_cache_size = ATOMIC_INIT(0); unsigned int page_hash_bits; struct page **page_hash_table; spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED; /* * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with * the pagemap_lru_lock held. */ spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED; #define CLUSTER_PAGES (1 << page_cluster) #define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster) 58 static void add_page_to_hash_queue(struct page * page, struct page **p) { struct page *next = *p; *p = page; page->next_hash = next; page->pprev_hash = p; 65 if (next) next->pprev_hash = &page->next_hash; 67 if (page->buffers) 68 PAGE_BUG(page); atomic_inc(&page_cache_size); } 72 static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page) { struct list_head *head = &mapping->clean_pages; mapping->nrpages++; list_add(&page->list, head); page->mapping = mapping; } 81 static inline void remove_page_from_inode_queue(struct page * page) { struct address_space * mapping = page->mapping; mapping->nrpages--; list_del(&page->list); page->mapping = NULL; } 90 static inline void remove_page_from_hash_queue(struct page * page) { struct page *next = page->next_hash; struct page **pprev = page->pprev_hash; 95 if (next) next->pprev_hash = pprev; *pprev = next; page->pprev_hash = NULL; atomic_dec(&page_cache_size); } /* * Remove a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage * is safe. */ 107 void __remove_inode_page(struct page *page) { 109 if (PageDirty(page)) BUG(); remove_page_from_inode_queue(page); remove_page_from_hash_queue(page); page->mapping = NULL; } 115 void remove_inode_page(struct page *page) { 117 if (!PageLocked(page)) 118 PAGE_BUG(page); spin_lock(&pagecache_lock); __remove_inode_page(page); 122 spin_unlock(&pagecache_lock); } 125 static inline int sync_page(struct page *page) { struct address_space *mapping = page->mapping; 129 if (mapping && mapping->a_ops && mapping->a_ops->sync_page) 130 return mapping->a_ops->sync_page(page); 131 return 0; } /* * Add a page to the dirty page list. 
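 *
 * A hedged sketch of how this is normally reached (assuming the 2.4-era
 * set_page_dirty() wrapper; not a definitive list of callers): the wrapper
 * only drops into __set_page_dirty() on the 0->1 transition of PG_dirty,
 * so the list move and mark_inode_dirty_pages() below run at most once per
 * dirtying of a clean page:
 *
 *	if (!test_and_set_bit(PG_dirty, &page->flags))
 *		__set_page_dirty(page);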
*/ 137 void __set_page_dirty(struct page *page) { struct address_space *mapping = page->mapping; spin_lock(&pagecache_lock); list_del(&page->list); list_add(&page->list, &mapping->dirty_pages); 144 spin_unlock(&pagecache_lock); mark_inode_dirty_pages(mapping->host); } /** * invalidate_inode_pages - Invalidate all the unlocked pages of one inode * @inode: the inode which pages we want to invalidate * * This function only removes the unlocked pages, if you want to * remove all the pages of one inode, you must call truncate_inode_pages. */ 157 void invalidate_inode_pages(struct inode * inode) { struct list_head *head, *curr; struct page * page; head = &inode->i_mapping->clean_pages; spin_lock(&pagecache_lock); spin_lock(&pagemap_lru_lock); curr = head->next; 168 while (curr != head) { page = list_entry(curr, struct page, list); curr = curr->next; /* We cannot invalidate something in use.. */ 173 if (page_count(page) != 1) 174 continue; /* ..or dirty.. */ 177 if (PageDirty(page)) 178 continue; /* ..or locked */ 181 if (TryLockPage(page)) 182 continue; __lru_cache_del(page); __remove_inode_page(page); 186 UnlockPage(page); page_cache_release(page); } 190 spin_unlock(&pagemap_lru_lock); 191 spin_unlock(&pagecache_lock); } 194 static inline void truncate_partial_page(struct page *page, unsigned partial) { memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); 198 if (page->buffers) block_flushpage(page, partial); } 203 static inline void truncate_complete_page(struct page *page) { /* Leave it on the LRU if it gets converted into anonymous buffers */ 206 if (!page->buffers || block_flushpage(page, 0)) lru_cache_del(page); /* * We remove the page from the page cache _after_ we have * destroyed all buffer-cache references to it. Otherwise some * other process might think this inode page is not in the * page cache and creates a buffer-cache alias to it causing * all sorts of fun problems ... */ ClearPageDirty(page); ClearPageUptodate(page); remove_inode_page(page); page_cache_release(page); } static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *)); 223 static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial) { struct list_head *curr; struct page * page; curr = head->next; 229 while (curr != head) { unsigned long offset; page = list_entry(curr, struct page, list); curr = curr->next; offset = page->index; /* Is one of the pages to truncate? */ 237 if ((offset >= start) || (*partial && (offset + 1) == start)) { 238 if (TryLockPage(page)) { page_cache_get(page); 240 spin_unlock(&pagecache_lock); wait_on_page(page); page_cache_release(page); 243 return 1; } page_cache_get(page); 246 spin_unlock(&pagecache_lock); 248 if (*partial && (offset + 1) == start) { truncate_partial_page(page, *partial); *partial = 0; 251 } else truncate_complete_page(page); 254 UnlockPage(page); page_cache_release(page); 256 return 1; } } 259 return 0; } /** * truncate_inode_pages - truncate *all* the pages from an offset * @mapping: mapping to truncate * @lstart: offset from with to truncate * * Truncate the page cache at a set offset, removing the pages * that are beyond that offset (and zeroing out partial pages). * If any page is locked we wait for it to become unlocked. 
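 *
 * A minimal usage sketch, assuming the common call site where an inode has
 * just been shrunk ("newsize" is an illustrative name, not a kernel symbol):
 *
 *	inode->i_size = newsize;
 *	truncate_inode_pages(inode->i_mapping, newsize);
 *
 * i.e. everything from the first page boundary at or above "newsize" is
 * dropped, and the page straddling "newsize" is zeroed beyond it.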
*/ 272 void truncate_inode_pages(struct address_space * mapping, loff_t lstart) { unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); repeat: spin_lock(&pagecache_lock); 279 if (truncate_list_pages(&mapping->clean_pages, start, &partial)) 280 goto repeat; 281 if (truncate_list_pages(&mapping->dirty_pages, start, &partial)) 282 goto repeat; 283 if (truncate_list_pages(&mapping->locked_pages, start, &partial)) 284 goto repeat; 285 spin_unlock(&pagecache_lock); } 288 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page) { 290 goto inside; 292 for (;;) { page = page->next_hash; inside: 295 if (!page) 296 goto not_found; 297 if (page->mapping != mapping) 298 continue; 299 if (page->index == offset) 300 break; } /* * Touching the page may move it to the active list. * If we end up with too few inactive pages, we wake * up kswapd. */ age_page_up(page); 308 if (inactive_shortage() > inactive_target / 2 && free_shortage()) wakeup_kswapd(0); not_found: 311 return page; } /* * By the time this is called, the page is locked and * we don't have to worry about any races any more. * * Start the IO.. */ 320 static int writeout_one_page(struct page *page) { struct buffer_head *bh, *head = page->buffers; bh = head; 325 do { 326 if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh)) 327 continue; bh->b_flushtime = jiffies; ll_rw_block(WRITE, 1, &bh); 331 } while ((bh = bh->b_this_page) != head); 332 return 0; } 335 static int waitfor_one_page(struct page *page) { int error = 0; struct buffer_head *bh, *head = page->buffers; bh = head; 341 do { wait_on_buffer(bh); 343 if (buffer_req(bh) && !buffer_uptodate(bh)) error = -EIO; 345 } while ((bh = bh->b_this_page) != head); 346 return error; } 349 static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *)) { struct list_head *curr; struct page *page; int retval = 0; spin_lock(&pagecache_lock); curr = head->next; 357 while (curr != head) { page = list_entry(curr, struct page, list); curr = curr->next; 360 if (!page->buffers) 361 continue; 362 if (page->index >= end) 363 continue; 364 if (page->index < start) 365 continue; page_cache_get(page); 368 spin_unlock(&pagecache_lock); lock_page(page); /* The buffers could have been free'd while we waited for the page lock */ 372 if (page->buffers) retval |= fn(page); 375 UnlockPage(page); spin_lock(&pagecache_lock); curr = page->list.next; page_cache_release(page); } 380 spin_unlock(&pagecache_lock); 382 return retval; } /* * Two-stage data sync: first start the IO, then go back and * collect the information.. 
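 *
 * Hedged usage sketch: a caller syncing the byte range [byte_start, byte_end)
 * would convert it to page-cache indices first ("byte_start"/"byte_end" are
 * illustrative names):
 *
 *	unsigned long start_idx = byte_start >> PAGE_CACHE_SHIFT;
 *	unsigned long end_idx = (byte_end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 *
 *	error = generic_buffer_fdatasync(inode, start_idx, end_idx);
 *
 * since do_buffer_fdatasync() below treats end_idx as exclusive.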
 */
int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
{
	int retval;

	/* writeout dirty buffers on pages from both clean and dirty lists */
	retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page);
	retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page);
	retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page);

	/* now wait for locked buffers on pages from both clean and dirty lists */
	retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, waitfor_one_page);
	retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page);
	retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page);

	return retval;
}

/**
 * filemap_fdatasync - walk the list of dirty pages of the given address space
 * and writepage() all of them.
 *
 * @mapping: address space structure to write
 *
 */
void filemap_fdatasync(struct address_space * mapping)
{
	int (*writepage)(struct page *) = mapping->a_ops->writepage;

	spin_lock(&pagecache_lock);

	while (!list_empty(&mapping->dirty_pages)) {
		struct page *page = list_entry(mapping->dirty_pages.next, struct page, list);

		list_del(&page->list);
		list_add(&page->list, &mapping->locked_pages);

		if (!PageDirty(page))
			continue;

		page_cache_get(page);
		spin_unlock(&pagecache_lock);

		lock_page(page);

		if (PageDirty(page)) {
			ClearPageDirty(page);
			writepage(page);
		} else
			UnlockPage(page);

		page_cache_release(page);
		spin_lock(&pagecache_lock);
	}
	spin_unlock(&pagecache_lock);
}

/**
 * filemap_fdatawait - walk the list of locked pages of the given address space
 * and wait for all of them.
 *
 * @mapping: address space structure to wait for
 *
 */
void filemap_fdatawait(struct address_space * mapping)
{
	spin_lock(&pagecache_lock);

	while (!list_empty(&mapping->locked_pages)) {
		struct page *page = list_entry(mapping->locked_pages.next, struct page, list);

		list_del(&page->list);
		list_add(&page->list, &mapping->clean_pages);

		if (!PageLocked(page))
			continue;

		page_cache_get(page);
		spin_unlock(&pagecache_lock);

		___wait_on_page(page);

		page_cache_release(page);
		spin_lock(&pagecache_lock);
	}
	spin_unlock(&pagecache_lock);
}

/*
 * Add a page to the inode page cache.
 *
 * The caller must have locked the page and
 * set all the page flags correctly..
 */
void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
{
	if (!PageLocked(page))
		BUG();

	page_cache_get(page);
	spin_lock(&pagecache_lock);
	page->index = index;
	add_page_to_inode_queue(mapping, page);
	add_page_to_hash_queue(page, page_hash(mapping, index));
	lru_cache_add(page);
	spin_unlock(&pagecache_lock);
}

/*
 * This adds a page to the page cache, starting out as locked,
 * owned by us, but unreferenced, not uptodate and with no errors.
*/ 500 static inline void __add_to_page_cache(struct page * page, struct address_space *mapping, unsigned long offset, struct page **hash) { unsigned long flags; 506 if (PageLocked(page)) 507 BUG(); flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced) | (1 << PG_arch_1)); page->flags = flags | (1 << PG_locked); page_cache_get(page); page->index = offset; add_page_to_inode_queue(mapping, page); add_page_to_hash_queue(page, hash); lru_cache_add(page); } 518 void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset) { spin_lock(&pagecache_lock); __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset)); 522 spin_unlock(&pagecache_lock); } 525 static int add_to_page_cache_unique(struct page * page, struct address_space *mapping, unsigned long offset, struct page **hash) { int err; struct page *alias; spin_lock(&pagecache_lock); alias = __find_page_nolock(mapping, offset, *hash); err = 1; 536 if (!alias) { __add_to_page_cache(page,mapping,offset,hash); err = 0; } 541 spin_unlock(&pagecache_lock); 542 return err; } /* * This adds the requested page to the page cache if it isn't already there, * and schedules an I/O to read in its contents from disk. */ 549 static inline int page_cache_read(struct file * file, unsigned long offset) { struct inode *inode = file->f_dentry->d_inode; struct address_space *mapping = inode->i_mapping; struct page **hash = page_hash(mapping, offset); struct page *page; spin_lock(&pagecache_lock); page = __find_page_nolock(mapping, offset, *hash); 558 spin_unlock(&pagecache_lock); 559 if (page) 560 return 0; page = page_cache_alloc(); 563 if (!page) 564 return -ENOMEM; 566 if (!add_to_page_cache_unique(page, mapping, offset, hash)) { int error = mapping->a_ops->readpage(file, page); page_cache_release(page); 569 return error; } /* * We arrive here in the unlikely event that someone * raced with us and added our page to the cache first. */ page_cache_free(page); 576 return 0; } /* * Read in an entire cluster at once. A cluster is usually a 64k- * aligned block that includes the page requested in "offset." */ 583 static int read_cluster_nonblocking(struct file * file, unsigned long offset, unsigned long filesize) { unsigned long pages = CLUSTER_PAGES; offset = CLUSTER_OFFSET(offset); 589 while ((pages-- > 0) && (offset < filesize)) { int error = page_cache_read(file, offset); 591 if (error < 0) 592 return error; offset ++; } 596 return 0; } /* * Wait for a page to get unlocked. * * This must be called with the caller "holding" the page, * ie with increased "page->count" so that the page won't * go away during the wait.. */ 606 void ___wait_on_page(struct page *page) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); add_wait_queue(&page->wait, &wait); 612 do { sync_page(page); 614 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 615 if (!PageLocked(page)) 616 break; run_task_queue(&tq_disk); schedule(); 619 } while (PageLocked(page)); tsk->state = TASK_RUNNING; remove_wait_queue(&page->wait, &wait); } /* * Get a lock on the page, assuming we need to sleep * to get it.. 
*/ 628 static void __lock_page(struct page *page) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); add_wait_queue_exclusive(&page->wait, &wait); 634 for (;;) { sync_page(page); 636 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 637 if (PageLocked(page)) { run_task_queue(&tq_disk); schedule(); 640 continue; } 642 if (!TryLockPage(page)) 643 break; } tsk->state = TASK_RUNNING; remove_wait_queue(&page->wait, &wait); } /* * Get an exclusive lock on the page, optimistically * assuming it's not locked.. */ 654 void lock_page(struct page *page) { 656 if (TryLockPage(page)) __lock_page(page); } /* * a rather lightweight function, finding and getting a reference to a * hashed page atomically, waiting for it if it's locked. */ 664 struct page * __find_get_page(struct address_space *mapping, unsigned long offset, struct page **hash) { struct page *page; /* * We scan the hash list read-only. Addition to and removal from * the hash-list needs a held write-lock. */ spin_lock(&pagecache_lock); page = __find_page_nolock(mapping, offset, *hash); 675 if (page) page_cache_get(page); 677 spin_unlock(&pagecache_lock); 678 return page; } /* * Get the lock to a page atomically. */ 684 struct page * __find_lock_page (struct address_space *mapping, unsigned long offset, struct page **hash) { struct page *page; /* * We scan the hash list read-only. Addition to and removal from * the hash-list needs a held write-lock. */ repeat: spin_lock(&pagecache_lock); page = __find_page_nolock(mapping, offset, *hash); 696 if (page) { page_cache_get(page); 698 spin_unlock(&pagecache_lock); lock_page(page); /* Is the page still hashed? Ok, good.. */ 703 if (page->mapping) 704 return page; /* Nope: we raced. Release and try again.. */ 707 UnlockPage(page); page_cache_release(page); 709 goto repeat; } 711 spin_unlock(&pagecache_lock); 712 return NULL; } #if 0 #define PROFILE_READAHEAD #define DEBUG_READAHEAD #endif /* * We combine this with read-ahead to deactivate pages when we * think there's sequential IO going on. Note that this is * harmless since we don't actually evict the pages from memory * but just move them to the inactive list. * * TODO: * - make the readahead code smarter * - move readahead to the VMA level so we can do the same * trick with mmap() * * Rik van Riel, 2000 */ 733 static void drop_behind(struct file * file, unsigned long index) { struct inode *inode = file->f_dentry->d_inode; struct address_space *mapping = inode->i_mapping; struct page **hash; struct page *page; unsigned long start; /* Nothing to drop-behind if we're on the first page. */ 742 if (!index) 743 return; 745 if (index > file->f_rawin) start = index - file->f_rawin; 747 else start = 0; /* * Go backwards from index-1 and drop all pages in the * readahead window. Since the readahead window may have * been increased since the last time we were called, we * stop when the page isn't there. */ spin_lock(&pagecache_lock); 757 while (--index >= start) { hash = page_hash(mapping, index); page = __find_page_nolock(mapping, index, *hash); 760 if (!page) 761 break; deactivate_page(page); } 764 spin_unlock(&pagecache_lock); } /* * Read-ahead profiling information * -------------------------------- * Every PROFILE_MAXREADCOUNT, the following information is written * to the syslog: * Percentage of asynchronous read-ahead. * Average of read-ahead fields context value. * If DEBUG_READAHEAD is defined, a snapshot of these fields is written * to the syslog. 
*/ #ifdef PROFILE_READAHEAD #define PROFILE_MAXREADCOUNT 1000 static unsigned long total_reada; static unsigned long total_async; static unsigned long total_ramax; static unsigned long total_ralen; static unsigned long total_rawin; static void profile_readahead(int async, struct file *filp) { unsigned long flags; ++total_reada; if (async) ++total_async; total_ramax += filp->f_ramax; total_ralen += filp->f_ralen; total_rawin += filp->f_rawin; if (total_reada > PROFILE_MAXREADCOUNT) { save_flags(flags); cli(); if (!(total_reada > PROFILE_MAXREADCOUNT)) { restore_flags(flags); return; } printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n", total_ramax/total_reada, total_ralen/total_reada, total_rawin/total_reada, (total_async*100)/total_reada); #ifdef DEBUG_READAHEAD printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n", filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend); #endif total_reada = 0; total_async = 0; total_ramax = 0; total_ralen = 0; total_rawin = 0; restore_flags(flags); } } #endif /* defined PROFILE_READAHEAD */ /* * Read-ahead context: * ------------------- * The read ahead context fields of the "struct file" are the following: * - f_raend : position of the first byte after the last page we tried to * read ahead. * - f_ramax : current read-ahead maximum size. * - f_ralen : length of the current IO read block we tried to read-ahead. * - f_rawin : length of the current read-ahead window. * if last read-ahead was synchronous then * f_rawin = f_ralen * otherwise (was asynchronous) * f_rawin = previous value of f_ralen + f_ralen * * Read-ahead limits: * ------------------ * MIN_READAHEAD : minimum read-ahead size when read-ahead. * MAX_READAHEAD : maximum read-ahead size when read-ahead. * * Synchronous read-ahead benefits: * -------------------------------- * Using reasonable IO xfer length from peripheral devices increase system * performances. * Reasonable means, in this context, not too large but not too small. * The actual maximum value is: * MAX_READAHEAD + PAGE_CACHE_SIZE = 76k is CONFIG_READA_SMALL is undefined * and 32K if defined (4K page size assumed). * * Asynchronous read-ahead benefits: * --------------------------------- * Overlapping next read request and user process execution increase system * performance. * * Read-ahead risks: * ----------------- * We have to guess which further data are needed by the user process. * If these data are often not really needed, it's bad for system * performances. * However, we know that files are often accessed sequentially by * application programs and it seems that it is possible to have some good * strategy in that guessing. * We only try to read-ahead files that seems to be read sequentially. * * Asynchronous read-ahead risks: * ------------------------------ * In order to maximize overlapping, we must start some asynchronous read * request from the device, as soon as possible. * We must be very careful about: * - The number of effective pending IO read requests. * ONE seems to be the only reasonable value. * - The total memory pool usage for the file access stream. * This maximum memory usage is implicitly 2 IO read chunks: * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined, * 64k if defined (4K page size assumed). 
*/ 885 static inline int get_max_readahead(struct inode * inode) { 887 if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)]) 888 return MAX_READAHEAD; 889 return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)]; } 892 static void generic_file_readahead(int reada_ok, struct file * filp, struct inode * inode, struct page * page) { unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; unsigned long index = page->index; unsigned long max_ahead, ahead; unsigned long raend; int max_readahead = get_max_readahead(inode); raend = filp->f_raend; max_ahead = 0; /* * The current page is locked. * If the current position is inside the previous read IO request, do not * try to reread previously read ahead pages. * Otherwise decide or not to read ahead some pages synchronously. * If we are not going to read ahead, set the read ahead context for this * page only. */ 913 if (PageLocked(page)) { 914 if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) { raend = index; 916 if (raend < end_index) max_ahead = filp->f_ramax; filp->f_rawin = 0; filp->f_ralen = 1; 920 if (!max_ahead) { filp->f_raend = index + filp->f_ralen; filp->f_rawin += filp->f_ralen; } } } /* * The current page is not locked. * If we were reading ahead and, * if the current max read ahead size is not zero and, * if the current position is inside the last read-ahead IO request, * it is the moment to try to read ahead asynchronously. * We will later force unplug device in order to force asynchronous read IO. */ else if (reada_ok && filp->f_ramax && raend >= 1 && 935 index <= raend && index + filp->f_ralen >= raend) { /* * Add ONE page to max_ahead in order to try to have about the same IO max size * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE. * Compute the position of the last page we have tried to read in order to * begin to read ahead just at the next page. */ raend -= 1; 943 if (raend < end_index) max_ahead = filp->f_ramax + 1; 946 if (max_ahead) { filp->f_rawin = filp->f_ralen; filp->f_ralen = 0; reada_ok = 2; } } /* * Try to read ahead pages. * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the * scheduler, will work enough for us to avoid too bad actuals IO requests. */ ahead = 0; 958 while (ahead < max_ahead) { ahead ++; 960 if ((raend + ahead) >= end_index) 961 break; 962 if (page_cache_read(filp, raend + ahead) < 0) 963 break; } /* * If we tried to read ahead some pages, * If we tried to read ahead asynchronously, * Try to force unplug of the device in order to start an asynchronous * read IO request. * Update the read-ahead context. * Store the length of the current read-ahead window. * Double the current max read ahead size. * That heuristic avoid to do some large IO for files that are not really * accessed sequentially. */ 976 if (ahead) { 977 if (reada_ok == 2) { run_task_queue(&tq_disk); } filp->f_ralen += ahead; filp->f_rawin += filp->f_ralen; filp->f_raend = raend + ahead + 1; filp->f_ramax += filp->f_ramax; 987 if (filp->f_ramax > max_readahead) filp->f_ramax = max_readahead; /* * Move the pages that have already been passed * to the inactive list. */ drop_behind(filp, index); #ifdef PROFILE_READAHEAD profile_readahead((reada_ok == 2), filp); #endif } 1001 return; } /* * This is a generic file read routine, and uses the * inode->i_op->readpage() function for the actual low-level * stuff. * * This is really ugly. But the goto's actually try to clarify some * of the logic when it comes to error handling etc. 
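 *
 * One worked example of the pos -> (index, offset) split used below,
 * assuming 4K pages (PAGE_CACHE_SHIFT == 12):
 *
 *	*ppos  = 10000
 *	index  = 10000 >> PAGE_CACHE_SHIFT	= 2
 *	offset = 10000 & ~PAGE_CACHE_MASK	= 1808	(10000 - 2*4096)
 *
 * so the copy starts 1808 bytes into the third page of the file.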
*/ 1013 void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor) { struct inode *inode = filp->f_dentry->d_inode; struct address_space *mapping = inode->i_mapping; unsigned long index, offset; struct page *cached_page; int reada_ok; int error; int max_readahead = get_max_readahead(inode); cached_page = NULL; index = *ppos >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; /* * If the current position is outside the previous read-ahead window, * we reset the current read-ahead context and set read ahead max to zero * (will be set to just needed value later), * otherwise, we assume that the file accesses are sequential enough to * continue read-ahead. */ 1034 if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) { reada_ok = 0; filp->f_raend = 0; filp->f_ralen = 0; filp->f_ramax = 0; filp->f_rawin = 0; 1040 } else { reada_ok = 1; } /* * Adjust the current value of read-ahead max. * If the read operation stay in the first half page, force no readahead. * Otherwise try to increase read ahead max just enough to do the read request. * Then, at least MIN_READAHEAD if read ahead is ok, * and at most MAX_READAHEAD in all cases. */ 1050 if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) { filp->f_ramax = 0; 1052 } else { unsigned long needed; needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1; 1057 if (filp->f_ramax < needed) filp->f_ramax = needed; 1060 if (reada_ok && filp->f_ramax < MIN_READAHEAD) filp->f_ramax = MIN_READAHEAD; 1062 if (filp->f_ramax > max_readahead) filp->f_ramax = max_readahead; } 1066 for (;;) { struct page *page, **hash; unsigned long end_index, nr; end_index = inode->i_size >> PAGE_CACHE_SHIFT; 1071 if (index > end_index) 1072 break; nr = PAGE_CACHE_SIZE; 1074 if (index == end_index) { nr = inode->i_size & ~PAGE_CACHE_MASK; 1076 if (nr <= offset) 1077 break; } nr = nr - offset; /* * Try to find the data in the page cache.. */ hash = page_hash(mapping, index); spin_lock(&pagecache_lock); page = __find_page_nolock(mapping, index, *hash); 1089 if (!page) 1090 goto no_cached_page; found_page: page_cache_get(page); 1093 spin_unlock(&pagecache_lock); 1095 if (!Page_Uptodate(page)) 1096 goto page_not_up_to_date; generic_file_readahead(reada_ok, filp, inode, page); page_ok: /* If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ 1103 if (mapping->i_mmap_shared != NULL) 1104 flush_dcache_page(page); /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... * * The actor routine returns how many bytes were actually used.. * NOTE! This may not be the same as how much of a user buffer * we filled up (we may be padding etc), so we can only update * "pos" here (the actor routine has to update the user buffer * pointers and the remaining count). */ nr = actor(desc, page, offset, nr); offset += nr; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; page_cache_release(page); 1122 if (nr && desc->count) 1123 continue; 1124 break; /* * Ok, the page was not immediately readable, so let's try to read ahead while we're at it.. */ page_not_up_to_date: generic_file_readahead(reada_ok, filp, inode, page); 1132 if (Page_Uptodate(page)) 1133 goto page_ok; /* Get exclusive access to the page ... */ lock_page(page); /* Did it get unhashed before we got the lock? 
*/ 1139 if (!page->mapping) { 1140 UnlockPage(page); page_cache_release(page); 1142 continue; } /* Did somebody else fill it already? */ 1146 if (Page_Uptodate(page)) { 1147 UnlockPage(page); 1148 goto page_ok; } readpage: /* ... and start the actual read. The read will unlock the page. */ error = mapping->a_ops->readpage(filp, page); 1155 if (!error) { 1156 if (Page_Uptodate(page)) 1157 goto page_ok; /* Again, try some read-ahead while waiting for the page to finish.. */ generic_file_readahead(reada_ok, filp, inode, page); wait_on_page(page); 1162 if (Page_Uptodate(page)) 1163 goto page_ok; error = -EIO; } /* UHHUH! A synchronous read error occurred. Report it */ desc->error = error; page_cache_release(page); 1170 break; no_cached_page: /* * Ok, it wasn't cached, so we need to create a new * page.. * * We get here with the page cache lock held. */ 1179 if (!cached_page) { 1180 spin_unlock(&pagecache_lock); cached_page = page_cache_alloc(); 1182 if (!cached_page) { desc->error = -ENOMEM; 1184 break; } /* * Somebody may have added the page while we * dropped the page cache lock. Check for that. */ spin_lock(&pagecache_lock); page = __find_page_nolock(mapping, index, *hash); 1193 if (page) 1194 goto found_page; } /* * Ok, add the new page to the hash-queues... */ page = cached_page; __add_to_page_cache(page, mapping, index, hash); 1202 spin_unlock(&pagecache_lock); cached_page = NULL; 1205 goto readpage; } *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; filp->f_reada = 1; 1210 if (cached_page) page_cache_free(cached_page); UPDATE_ATIME(inode); } 1215 static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) { char *kaddr; unsigned long left, count = desc->count; 1220 if (size > count) size = count; kaddr = kmap(page); left = __copy_to_user(desc->buf, kaddr + offset, size); 1225 kunmap(page); 1227 if (left) { size -= left; desc->error = -EFAULT; } desc->count = count - size; desc->written += size; desc->buf += size; 1234 return size; } /* * This is the "read()" routine for all filesystems * that can use the page cache directly. */ 1241 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos) { ssize_t retval; retval = -EFAULT; 1246 if (access_ok(VERIFY_WRITE, buf, count)) { retval = 0; 1249 if (count) { read_descriptor_t desc; desc.written = 0; desc.count = count; desc.buf = buf; desc.error = 0; do_generic_file_read(filp, ppos, &desc, file_read_actor); retval = desc.written; 1259 if (!retval) retval = desc.error; } } 1263 return retval; } 1266 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size) { char *kaddr; ssize_t written; unsigned long count = desc->count; struct file *file = (struct file *) desc->buf; mm_segment_t old_fs; 1274 if (size > count) size = count; old_fs = get_fs(); set_fs(KERNEL_DS); kaddr = kmap(page); written = file->f_op->write(file, kaddr + offset, size, &file->f_pos); 1281 kunmap(page); set_fs(old_fs); 1283 if (written < 0) { desc->error = written; written = 0; } desc->count = count - written; desc->written += written; 1289 return written; } 1292 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count) { ssize_t retval; struct file * in_file, * out_file; struct inode * in_inode, * out_inode; /* * Get input file, and verify that it is ok.. 
*/ retval = -EBADF; in_file = fget(in_fd); 1303 if (!in_file) 1304 goto out; 1305 if (!(in_file->f_mode & FMODE_READ)) 1306 goto fput_in; retval = -EINVAL; in_inode = in_file->f_dentry->d_inode; 1309 if (!in_inode) 1310 goto fput_in; 1311 if (!in_inode->i_mapping->a_ops->readpage) 1312 goto fput_in; retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count); 1314 if (retval) 1315 goto fput_in; /* * Get output file, and verify that it is ok.. */ retval = -EBADF; out_file = fget(out_fd); 1322 if (!out_file) 1323 goto fput_in; 1324 if (!(out_file->f_mode & FMODE_WRITE)) 1325 goto fput_out; retval = -EINVAL; 1327 if (!out_file->f_op || !out_file->f_op->write) 1328 goto fput_out; out_inode = out_file->f_dentry->d_inode; retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count); 1331 if (retval) 1332 goto fput_out; retval = 0; 1335 if (count) { read_descriptor_t desc; loff_t pos = 0, *ppos; retval = -EFAULT; ppos = &in_file->f_pos; 1341 if (offset) { 1342 if (get_user(pos, offset)) 1343 goto fput_out; ppos = &pos; } desc.written = 0; desc.count = count; desc.buf = (char *) out_file; desc.error = 0; do_generic_file_read(in_file, ppos, &desc, file_send_actor); retval = desc.written; 1354 if (!retval) retval = desc.error; 1356 if (offset) put_user(pos, offset); } fput_out: fput(out_file); fput_in: fput(in_file); out: 1365 return retval; } /* * Read-ahead and flush behind for MADV_SEQUENTIAL areas. Since we are * sure this is sequential access, we don't need a flexible read-ahead * window size -- we can always use a large fixed size window. */ 1373 static void nopage_sequential_readahead(struct vm_area_struct * vma, unsigned long pgoff, unsigned long filesize) { unsigned long ra_window; ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode); ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1); /* vm_raend is zero if we haven't read ahead in this area yet. */ 1382 if (vma->vm_raend == 0) vma->vm_raend = vma->vm_pgoff + ra_window; /* * If we've just faulted the page half-way through our window, * then schedule reads for the next window, and release the * pages in the previous window. */ 1390 if ((pgoff + (ra_window >> 1)) == vma->vm_raend) { unsigned long start = vma->vm_pgoff + vma->vm_raend; unsigned long end = start + ra_window; 1394 if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff)) end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff; 1396 if (start > end) 1397 return; 1399 while ((start < end) && (start < filesize)) { if (read_cluster_nonblocking(vma->vm_file, 1401 start, filesize) < 0) 1402 break; start += CLUSTER_PAGES; } run_task_queue(&tq_disk); /* if we're far enough past the beginning of this area, recycle pages that are in the previous window. */ 1409 if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) { unsigned long window = ra_window << PAGE_SHIFT; end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT); end -= window + window; filemap_sync(vma, end - window, window, MS_INVALIDATE); } vma->vm_raend += ra_window; } 1420 return; } /* * filemap_nopage() is invoked via the vma operations vector for a * mapped memory region to read in file data during a page fault. * * The goto's are kind of ugly, but this streamlines the normal case of having * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. 
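 *
 * A small worked example of the pgoff computation below, with illustrative
 * numbers and 4K pages assumed: for a vma with vm_start = 0x08050000 and
 * vm_pgoff = 3, a fault at address 0x08052000 gives
 *
 *	pgoff = ((0x08052000 - 0x08050000) >> PAGE_CACHE_SHIFT) + 3 = 2 + 3 = 5
 *
 * i.e. the sixth page of the file backs the faulting address.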
*/ 1431 struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share) { int error; struct file *file = area->vm_file; struct inode *inode = file->f_dentry->d_inode; struct address_space *mapping = inode->i_mapping; struct page *page, **hash, *old_page; unsigned long size, pgoff; pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; retry_all: /* * An external ptracer can access pages that normally aren't * accessible.. */ size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1449 if ((pgoff >= size) && (area->vm_mm == current->mm)) 1450 return NULL; /* * Do we have something in the page cache already? */ hash = page_hash(mapping, pgoff); retry_find: page = __find_get_page(mapping, pgoff, hash); 1458 if (!page) 1459 goto no_cached_page; /* * Ok, found a page in the page cache, now we need to check * that it's up-to-date. */ 1465 if (!Page_Uptodate(page)) 1466 goto page_not_uptodate; success: /* * Try read-ahead for sequential areas. */ 1472 if (VM_SequentialReadHint(area)) nopage_sequential_readahead(area, pgoff, size); /* * Found the page and have a reference on it, need to check sharing * and possibly copy it over to another page.. */ old_page = page; 1480 if (no_share) { struct page *new_page = page_cache_alloc(); 1483 if (new_page) { copy_user_highpage(new_page, old_page, address); 1485 flush_page_to_ram(new_page); 1486 } else new_page = NOPAGE_OOM; page_cache_release(page); 1489 return new_page; } 1492 flush_page_to_ram(old_page); 1493 return old_page; no_cached_page: /* * If the requested offset is within our file, try to read a whole * cluster of pages at once. * * Otherwise, we're off the end of a privately mapped file, * so we need to map a zero page. */ 1503 if ((pgoff < size) && !VM_RandomReadHint(area)) error = read_cluster_nonblocking(file, pgoff, size); 1505 else error = page_cache_read(file, pgoff); /* * The page we want has now been added to the page cache. * In the unlikely event that someone removed it in the * meantime, we'll just come back here and read it again. */ 1513 if (error >= 0) 1514 goto retry_find; /* * An error return from page_cache_read can result if the * system is low on memory, or a problem occurs while trying * to schedule I/O. */ 1521 if (error == -ENOMEM) 1522 return NOPAGE_OOM; 1523 return NULL; page_not_uptodate: lock_page(page); /* Did it get unhashed while we waited for it? */ 1529 if (!page->mapping) { 1530 UnlockPage(page); page_cache_release(page); 1532 goto retry_all; } /* Did somebody else get it up-to-date? */ 1536 if (Page_Uptodate(page)) { 1537 UnlockPage(page); 1538 goto success; } 1541 if (!mapping->a_ops->readpage(file, page)) { wait_on_page(page); 1543 if (Page_Uptodate(page)) 1544 goto success; } /* * Umm, take care of errors if the page isn't up-to-date. * Try to re-read it _once_. We do this synchronously, * because there really aren't any performance issues here * and we need to check for errors. */ lock_page(page); /* Somebody truncated the page on us? */ 1556 if (!page->mapping) { 1557 UnlockPage(page); page_cache_release(page); 1559 goto retry_all; } /* Somebody else successfully read it in? */ 1563 if (Page_Uptodate(page)) { 1564 UnlockPage(page); 1565 goto success; } ClearPageError(page); 1568 if (!mapping->a_ops->readpage(file, page)) { wait_on_page(page); 1570 if (Page_Uptodate(page)) 1571 goto success; } /* * Things didn't work out. Return zero to tell the * mm layer so, possibly freeing the page cache page first. 
*/ page_cache_release(page); 1579 return NULL; } /* Called with mm->page_table_lock held to protect against other * threads/the swapper from ripping pte's out from under us. */ 1585 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { pte_t pte = *ptep; 1590 if (pte_present(pte) && ptep_test_and_clear_dirty(ptep)) { struct page *page = pte_page(pte); flush_tlb_page(vma, address); set_page_dirty(page); } 1595 return 0; } 1598 static inline int filemap_sync_pte_range(pmd_t * pmd, unsigned long address, unsigned long size, struct vm_area_struct *vma, unsigned long offset, unsigned int flags) { pte_t * pte; unsigned long end; int error; 1606 if (pmd_none(*pmd)) 1607 return 0; 1608 if (pmd_bad(*pmd)) { pmd_ERROR(*pmd); 1610 pmd_clear(pmd); 1611 return 0; } pte = pte_offset(pmd, address); offset += address & PMD_MASK; address &= ~PMD_MASK; end = address + size; 1617 if (end > PMD_SIZE) end = PMD_SIZE; error = 0; 1620 do { error |= filemap_sync_pte(pte, vma, address + offset, flags); address += PAGE_SIZE; pte++; 1624 } while (address && (address < end)); 1625 return error; } 1628 static inline int filemap_sync_pmd_range(pgd_t * pgd, unsigned long address, unsigned long size, struct vm_area_struct *vma, unsigned int flags) { pmd_t * pmd; unsigned long offset, end; int error; 1636 if (pgd_none(*pgd)) 1637 return 0; 1638 if (pgd_bad(*pgd)) { pgd_ERROR(*pgd); 1640 pgd_clear(pgd); 1641 return 0; } pmd = pmd_offset(pgd, address); offset = address & PGDIR_MASK; address &= ~PGDIR_MASK; end = address + size; 1647 if (end > PGDIR_SIZE) end = PGDIR_SIZE; error = 0; 1650 do { error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags); address = (address + PMD_SIZE) & PMD_MASK; pmd++; 1654 } while (address && (address < end)); 1655 return error; } 1658 int filemap_sync(struct vm_area_struct * vma, unsigned long address, size_t size, unsigned int flags) { pgd_t * dir; unsigned long end = address + size; int error = 0; /* Aquire the lock early; it may be possible to avoid dropping * and reaquiring it repeatedly. */ spin_lock(&vma->vm_mm->page_table_lock); dir = pgd_offset(vma->vm_mm, address); 1671 flush_cache_range(vma->vm_mm, end - size, end); 1672 if (address >= end) 1673 BUG(); 1674 do { error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags); address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; 1678 } while (address && (address < end)); flush_tlb_range(vma->vm_mm, end - size, end); 1681 spin_unlock(&vma->vm_mm->page_table_lock); 1683 return error; } /* * Shared mappings need to be able to do the right thing at * close/unmap/sync. They will also use the private file as * backing-store for swapping.. */ static struct vm_operations_struct file_shared_mmap = { nopage: filemap_nopage, }; /* * Private mappings just need to be able to load in the map. * * (This is actually used for shared mappings as well, if we * know they can't ever get write permissions..) 
 */
static struct vm_operations_struct file_private_mmap = {
	nopage:		filemap_nopage,
};

/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	struct vm_operations_struct * ops;
	struct inode *inode = file->f_dentry->d_inode;

	ops = &file_private_mmap;
	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
		if (!inode->i_mapping->a_ops->writepage)
			return -EINVAL;
		ops = &file_shared_mmap;
	}
	if (!inode->i_sb || !S_ISREG(inode->i_mode))
		return -EACCES;
	if (!inode->i_mapping->a_ops->readpage)
		return -ENOEXEC;
	UPDATE_ATIME(inode);
	vma->vm_ops = ops;
	return 0;
}

/*
 * The msync() system call.
 */

static int msync_interval(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, int flags)
{
	struct file * file = vma->vm_file;
	if (file && (vma->vm_flags & VM_SHARED)) {
		int error;
		error = filemap_sync(vma, start, end-start, flags);

		if (!error && (flags & MS_SYNC)) {
			struct inode * inode = file->f_dentry->d_inode;
			down(&inode->i_sem);
			filemap_fdatasync(inode->i_mapping);
			if (file->f_op && file->f_op->fsync)
				error = file->f_op->fsync(file, file->f_dentry, 1);
			filemap_fdatawait(inode->i_mapping);
			up(&inode->i_sem);
		}
		return error;
	}
	return 0;
}

asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
{
	unsigned long end;
	struct vm_area_struct * vma;
	int unmapped_error, error = -EINVAL;

	down(&current->mm->mmap_sem);
	if (start & ~PAGE_MASK)
		goto out;
	len = (len + ~PAGE_MASK) & PAGE_MASK;
	end = start + len;
	if (end < start)
		goto out;
	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
		goto out;
	error = 0;
	if (end == start)
		goto out;
	/*
	 * If the interval [start,end) covers some unmapped address ranges,
	 * just ignore them, but return -EFAULT at the end.
	 */
	vma = find_vma(current->mm, start);
	unmapped_error = 0;
	for (;;) {
		/* Still start < end. */
		error = -EFAULT;
		if (!vma)
			goto out;
		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			unmapped_error = -EFAULT;
			start = vma->vm_start;
		}
		/* Here vma->vm_start <= start < vma->vm_end. */
		if (end <= vma->vm_end) {
			if (start < end) {
				error = msync_interval(vma, start, end, flags);
				if (error)
					goto out;
			}
			error = unmapped_error;
			goto out;
		}
		/* Here vma->vm_start <= start < vma->vm_end < end. */
		error = msync_interval(vma, start, vma->vm_end, flags);
		if (error)
			goto out;
		start = vma->vm_end;
		vma = vma->vm_next;
	}
out:
	up(&current->mm->mmap_sem);
	return error;
}

static inline void setup_read_behavior(struct vm_area_struct * vma, int behavior)
{
	VM_ClearReadHint(vma);
	switch(behavior) {
		case MADV_SEQUENTIAL:
			vma->vm_flags |= VM_SEQ_READ;
			break;
		case MADV_RANDOM:
			vma->vm_flags |= VM_RAND_READ;
			break;
		default:
			break;
	}
	return;
}

static long madvise_fixup_start(struct vm_area_struct * vma,
	unsigned long end, int behavior)
{
	struct vm_area_struct * n;

	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!n)
		return -EAGAIN;
	*n = *vma;
	n->vm_end = end;
	setup_read_behavior(n, behavior);
	n->vm_raend = 0;
	get_file(n->vm_file);
	if (n->vm_ops && n->vm_ops->open)
		n->vm_ops->open(n);
	lock_vma_mappings(vma);
	spin_lock(&vma->vm_mm->page_table_lock);
	vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
	vma->vm_start = end;
	__insert_vm_struct(current->mm, n);
	spin_unlock(&vma->vm_mm->page_table_lock);
	unlock_vma_mappings(vma);
	return 0;
}

static long madvise_fixup_end(struct vm_area_struct * vma,
	unsigned long start, int behavior)
{
	struct vm_area_struct * n;

	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!n)
		return -EAGAIN;
	*n = *vma;
	n->vm_start = start;
	n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
	setup_read_behavior(n, behavior);
	n->vm_raend = 0;
	get_file(n->vm_file);
	if (n->vm_ops && n->vm_ops->open)
		n->vm_ops->open(n);
	lock_vma_mappings(vma);
	spin_lock(&vma->vm_mm->page_table_lock);
	vma->vm_end = start;
	__insert_vm_struct(current->mm, n);
	spin_unlock(&vma->vm_mm->page_table_lock);
	unlock_vma_mappings(vma);
	return 0;
}

static long madvise_fixup_middle(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, int behavior)
{
	struct vm_area_struct * left, * right;

	left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!left)
		return -EAGAIN;
	right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!right) {
		kmem_cache_free(vm_area_cachep, left);
		return -EAGAIN;
	}
	*left = *vma;
	*right = *vma;
	left->vm_end = start;
	right->vm_start = end;
	right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
	left->vm_raend = 0;
	right->vm_raend = 0;
	atomic_add(2, &vma->vm_file->f_count);
	if (vma->vm_ops && vma->vm_ops->open) {
		vma->vm_ops->open(left);
		vma->vm_ops->open(right);
	}
	lock_vma_mappings(vma);
	spin_lock(&vma->vm_mm->page_table_lock);
	vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
	vma->vm_start = start;
	vma->vm_end = end;
	setup_read_behavior(vma, behavior);
	vma->vm_raend = 0;
	__insert_vm_struct(current->mm, left);
	__insert_vm_struct(current->mm, right);
	spin_unlock(&vma->vm_mm->page_table_lock);
	unlock_vma_mappings(vma);
	return 0;
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
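 *
 * A rough sketch of the four cases handled below, for a vma covering
 * [A,B) and an madvise range [start,end) inside it:
 *
 *	start == A, end == B:  retune the existing vma in place
 *	start == A, end <  B:  madvise_fixup_start()  ->  [A,end) [end,B)
 *	start >  A, end == B:  madvise_fixup_end()    ->  [A,start) [start,B)
 *	start >  A, end <  B:  madvise_fixup_middle() ->  [A,start) [start,end) [end,B)
 *
 * In each case only the piece covering [start,end) gets the new read-ahead
 * behavior.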
*/ 1920 static long madvise_behavior(struct vm_area_struct * vma, unsigned long start, unsigned long end, int behavior) { int error = 0; /* This caps the number of vma's this process can own */ 1926 if (vma->vm_mm->map_count > MAX_MAP_COUNT) 1927 return -ENOMEM; 1929 if (start == vma->vm_start) { 1930 if (end == vma->vm_end) { setup_read_behavior(vma, behavior); vma->vm_raend = 0; 1933 } else error = madvise_fixup_start(vma, end, behavior); 1935 } else { 1936 if (end == vma->vm_end) error = madvise_fixup_end(vma, start, behavior); 1938 else error = madvise_fixup_middle(vma, start, end, behavior); } 1942 return error; } /* * Schedule all required I/O operations, then run the disk queue * to make sure they are started. Do not wait for completion. */ 1949 static long madvise_willneed(struct vm_area_struct * vma, unsigned long start, unsigned long end) { long error = -EBADF; struct file * file; unsigned long size, rlim_rss; /* Doesn't work if there's no mapped file. */ 1957 if (!vma->vm_file) 1958 return error; file = vma->vm_file; size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 1964 if (end > vma->vm_end) end = vma->vm_end; end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; /* Make sure this doesn't exceed the process's max rss. */ error = -EIO; rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur : LONG_MAX; /* default: see resource.h */ 1972 if ((vma->vm_mm->rss + (end - start)) > rlim_rss) 1973 return error; /* round to cluster boundaries if this isn't a "random" area. */ 1976 if (!VM_RandomReadHint(vma)) { start = CLUSTER_OFFSET(start); end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1); 1980 while ((start < end) && (start < size)) { error = read_cluster_nonblocking(file, start, size); start += CLUSTER_PAGES; 1983 if (error < 0) 1984 break; } 1986 } else { 1987 while ((start < end) && (start < size)) { error = page_cache_read(file, start); start++; 1990 if (error < 0) 1991 break; } } /* Don't wait for someone else to push these requests. */ run_task_queue(&tq_disk); 1998 return error; } /* * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about * data it wants to keep. Be sure to free swap resources too. The * zap_page_range call sets things up for refill_inactive to actually free * these pages later if no one else has touched them in the meantime, * although we could add these pages to a global reuse list for * refill_inactive to pick up before reclaiming other pages. * * NB: This interface discards data rather than pushes it out to swap, * as some implementations do. This has performance implications for * applications like large transactional databases which want to discard * pages in anonymous maps after committing to backing store the data * that was kept in them. There is no reason to write this data out to * the swap area if the application is discarding it. * * An interface that causes the system to free clean pages and flush * dirty pages is already available as msync(MS_INVALIDATE). 
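 *
 * Hedged user-space sketch of the intended use (illustrative only, not part
 * of the kernel; "buf", "len" and "fd" are made-up names):
 *
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
 *	... consume the data ...
 *	madvise(buf, len, MADV_DONTNEED);
 *
 * after which the pages are simply gone; a later access refaults the data
 * from the backing file rather than from swap.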
*/ 2020 static long madvise_dontneed(struct vm_area_struct * vma, unsigned long start, unsigned long end) { 2023 if (vma->vm_flags & VM_LOCKED) 2024 return -EINVAL; 2026 flush_cache_range(vma->vm_mm, start, end); zap_page_range(vma->vm_mm, start, end - start); flush_tlb_range(vma->vm_mm, start, end); 2029 return 0; } 2032 static long madvise_vma(struct vm_area_struct * vma, unsigned long start, unsigned long end, int behavior) { long error = -EBADF; 2037 switch (behavior) { 2038 case MADV_NORMAL: 2039 case MADV_SEQUENTIAL: 2040 case MADV_RANDOM: error = madvise_behavior(vma, start, end, behavior); 2042 break; 2044 case MADV_WILLNEED: error = madvise_willneed(vma, start, end); 2046 break; 2048 case MADV_DONTNEED: error = madvise_dontneed(vma, start, end); 2050 break; 2052 default: error = -EINVAL; 2054 break; } 2057 return error; } /* * The madvise(2) system call. * * Applications can use madvise() to advise the kernel how it should * handle paging I/O in this VM area. The idea is to help the kernel * use appropriate read-ahead and caching techniques. The information * provided is advisory only, and can be safely disregarded by the * kernel without affecting the correct operation of the application. * * behavior values: * MADV_NORMAL - the default behavior is to read clusters. This * results in some read-ahead and read-behind. * MADV_RANDOM - the system should read the minimum amount of data * on any access, since it is unlikely that the appli- * cation will need more than what it asks for. * MADV_SEQUENTIAL - pages in the given range will probably be accessed * once, so they can be aggressively read ahead, and * can be freed soon after they are accessed. * MADV_WILLNEED - the application is notifying the system to read * some pages ahead. * MADV_DONTNEED - the application is finished with the given range, * so the kernel can free resources associated with it. * * return values: * zero - success * -EINVAL - start + len < 0, start is not page-aligned, * "behavior" is not a valid value, or application * is attempting to release locked or shared pages. * -ENOMEM - addresses in the specified range are not currently * mapped, or are outside the AS of the process. * -EIO - an I/O error occurred while paging in data. * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. */ 2094 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior) { unsigned long end; struct vm_area_struct * vma; int unmapped_error = 0; int error = -EINVAL; down(¤t->mm->mmap_sem); 2103 if (start & ~PAGE_MASK) 2104 goto out; len = (len + ~PAGE_MASK) & PAGE_MASK; end = start + len; 2107 if (end < start) 2108 goto out; error = 0; 2111 if (end == start) 2112 goto out; /* * If the interval [start,end) covers some unmapped address * ranges, just ignore them, but return -ENOMEM at the end. */ vma = find_vma(current->mm, start); 2119 for (;;) { /* Still start < end. */ error = -ENOMEM; 2122 if (!vma) 2123 goto out; /* Here start < vma->vm_end. */ 2126 if (start < vma->vm_start) { unmapped_error = -ENOMEM; start = vma->vm_start; } /* Here vma->vm_start <= start < vma->vm_end. */ 2132 if (end <= vma->vm_end) { 2133 if (start < end) { error = madvise_vma(vma, start, end, behavior); 2136 if (error) 2137 goto out; } error = unmapped_error; 2140 goto out; } /* Here vma->vm_start <= start < vma->vm_end < end. 
*/ error = madvise_vma(vma, start, vma->vm_end, behavior); 2145 if (error) 2146 goto out; start = vma->vm_end; vma = vma->vm_next; } out: up(¤t->mm->mmap_sem); 2153 return error; } /* * Later we can get more picky about what "in core" means precisely. * For now, simply check to see if the page is in the page cache, * and is up to date; i.e. that no page-in operation would be required * at this time if an application were to map and access this page. */ 2162 static unsigned char mincore_page(struct vm_area_struct * vma, unsigned long pgoff) { unsigned char present = 0; struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data; struct page * page, ** hash = page_hash(as, pgoff); spin_lock(&pagecache_lock); page = __find_page_nolock(as, pgoff, *hash); 2171 if ((page) && (Page_Uptodate(page))) present = 1; 2173 spin_unlock(&pagecache_lock); 2175 return present; } 2178 static long mincore_vma(struct vm_area_struct * vma, unsigned long start, unsigned long end, unsigned char * vec) { long error, i, remaining; unsigned char * tmp; error = -ENOMEM; 2185 if (!vma->vm_file) 2186 return error; start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 2189 if (end > vma->vm_end) end = vma->vm_end; end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; error = -EAGAIN; tmp = (unsigned char *) __get_free_page(GFP_KERNEL); 2195 if (!tmp) 2196 return error; /* (end - start) is # of pages, and also # of bytes in "vec */ remaining = (end - start), error = 0; 2202 for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) { int j = 0; long thispiece = (remaining < PAGE_SIZE) ? remaining : PAGE_SIZE; 2207 while (j < thispiece) tmp[j++] = mincore_page(vma, start++); 2210 if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) { error = -EFAULT; 2212 break; } } free_page((unsigned long) tmp); 2217 return error; } /* * The mincore(2) system call. * * mincore() returns the memory residency status of the pages in the * current process's address space specified by [addr, addr + len). * The status is returned in a vector of bytes. The least significant * bit of each byte is 1 if the referenced page is in memory, otherwise * it is zero. * * Because the status of a page can change after mincore() checks it * but before it returns to the application, the returned vector may * contain stale information. Only locked pages are guaranteed to * remain in memory. * * return values: * zero - success * -EFAULT - vec points to an illegal address * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE, * or len has a nonpositive value * -ENOMEM - Addresses in the range [addr, addr + len] are * invalid for the address space of this process, or * specify one or more pages which are not currently * mapped * -EAGAIN - A kernel resource was temporarily unavailable. */ 2245 asmlinkage long sys_mincore(unsigned long start, size_t len, unsigned char * vec) { int index = 0; unsigned long end; struct vm_area_struct * vma; int unmapped_error = 0; long error = -EINVAL; down(¤t->mm->mmap_sem); 2256 if (start & ~PAGE_CACHE_MASK) 2257 goto out; len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK; end = start + len; 2260 if (end < start) 2261 goto out; error = 0; 2264 if (end == start) 2265 goto out; /* * If the interval [start,end) covers some unmapped address * ranges, just ignore them, but return -ENOMEM at the end. */ vma = find_vma(current->mm, start); 2272 for (;;) { /* Still start < end. */ error = -ENOMEM; 2275 if (!vma) 2276 goto out; /* Here start < vma->vm_end. 
static inline
struct page *__read_cache_page(struct address_space *mapping,
				unsigned long index,
				int (*filler)(void *,struct page*),
				void *data)
{
	struct page **hash = page_hash(mapping, index);
	struct page *page, *cached_page = NULL;
	int err;
repeat:
	page = __find_get_page(mapping, index, hash);
	if (!page) {
		if (!cached_page) {
			cached_page = page_cache_alloc();
			if (!cached_page)
				return ERR_PTR(-ENOMEM);
		}
		page = cached_page;
		if (add_to_page_cache_unique(page, mapping, index, hash))
			goto repeat;
		cached_page = NULL;
		err = filler(data, page);
		if (err < 0) {
			page_cache_release(page);
			page = ERR_PTR(err);
		}
	}
	if (cached_page)
		page_cache_free(cached_page);
	return page;
}

/*
 * Read into the page cache. If a page already exists,
 * and Page_Uptodate() is not set, try to fill the page.
 */
struct page *read_cache_page(struct address_space *mapping,
				unsigned long index,
				int (*filler)(void *,struct page*),
				void *data)
{
	struct page *page;
	int err;

retry:
	page = __read_cache_page(mapping, index, filler, data);
	if (IS_ERR(page) || Page_Uptodate(page))
		goto out;

	lock_page(page);
	if (!page->mapping) {
		UnlockPage(page);
		page_cache_release(page);
		goto retry;
	}
	if (Page_Uptodate(page)) {
		UnlockPage(page);
		goto out;
	}
	err = filler(data, page);
	if (err < 0) {
		page_cache_release(page);
		page = ERR_PTR(err);
	}
out:
	return page;
}

static inline struct page * __grab_cache_page(struct address_space *mapping,
				unsigned long index, struct page **cached_page)
{
	struct page *page, **hash = page_hash(mapping, index);
repeat:
	page = __find_lock_page(mapping, index, hash);
	if (!page) {
		if (!*cached_page) {
			*cached_page = page_cache_alloc();
			if (!*cached_page)
				return NULL;
		}
		page = *cached_page;
		if (add_to_page_cache_unique(page, mapping, index, hash))
			goto repeat;
		*cached_page = NULL;
	}
	return page;
}

/*
 * Returns locked page at given index in given cache, creating it if needed.
 */
struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
{
	struct page *cached_page = NULL;
	struct page *page = __grab_cache_page(mapping, index, &cached_page);
	if (cached_page)
		page_cache_free(cached_page);
	return page;
}

static inline void remove_suid(struct inode *inode)
{
	unsigned int mode;

	/* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
	mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;

	/* was any of the uid bits set? */
	mode &= inode->i_mode;
	if (mode && !capable(CAP_FSETID)) {
		inode->i_mode &= ~mode;
		mark_inode_dirty(inode);
	}
}
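/*
 * A worked example of the bit arithmetic in remove_suid() above, using
 * the usual octal mode values (shown purely for illustration):
 *
 *	S_ISUID = 04000, S_ISGID = 02000, S_IXGRP = 00010,
 *	so S_ISGID/S_IXGRP = 0200.
 *
 *	i_mode = 06755 (setuid, setgid, rwxr-xr-x):
 *		(i_mode & S_IXGRP) * 0200 = 00010 * 0200 = 02000 = S_ISGID
 *		mode = 02000 | 04000 = 06000;  mode &= i_mode -> 06000,
 *		so both S_ISUID and S_ISGID get cleared by the write.
 *
 *	i_mode = 02644 (setgid without group execute, the mandatory
 *	locking marker):
 *		(i_mode & S_IXGRP) * 0200 = 0;  mode = 04000;
 *		mode &= i_mode -> 0, so nothing is cleared and the
 *		mandatory-locking bit survives.
 */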
/*
 * Write to a file through the page cache.
 *
 * We currently put everything into the page cache prior to writing it.
 * This is not a problem when writing full pages. With partial pages,
 * however, we first have to read the data into the cache, then
 * dirty the page, and finally schedule it for writing. Alternatively, we
 * could write-through just the portion of data that would go into that
 * page, but that would kill performance for applications that write data
 * line by line, and it's prone to race conditions.
 *
 * Note that this routine doesn't try to keep track of dirty pages. Each
 * file system has to do this all by itself, unfortunately.
 *							okir@monad.swb.de
 */
ssize_t
generic_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
{
	struct inode	*inode = file->f_dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;
	unsigned long	limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
	loff_t		pos;
	struct page	*page, *cached_page;
	unsigned long	written;
	long		status;
	int		err;

	cached_page = NULL;

	down(&inode->i_sem);

	pos = *ppos;
	err = -EINVAL;
	if (pos < 0)
		goto out;

	err = file->f_error;
	if (err) {
		file->f_error = 0;
		goto out;
	}

	written = 0;

	if (file->f_flags & O_APPEND)
		pos = inode->i_size;

	/*
	 * Check whether we've reached the file size limit.
	 */
	err = -EFBIG;
	if (limit != RLIM_INFINITY) {
		if (pos >= limit) {
			send_sig(SIGXFSZ, current, 0);
			goto out;
		}
		if (count > limit - pos) {
			send_sig(SIGXFSZ, current, 0);
			count = limit - pos;
		}
	}

	status = 0;
	if (count) {
		remove_suid(inode);
		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
		mark_inode_dirty_sync(inode);
	}

	while (count) {
		unsigned long bytes, index, offset;
		char *kaddr;
		int deactivate = 1;

		/*
		 * Try to find the page in the cache. If it isn't there,
		 * allocate a free page.
		 */
		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
		index = pos >> PAGE_CACHE_SHIFT;
		bytes = PAGE_CACHE_SIZE - offset;
		if (bytes > count) {
			bytes = count;
			deactivate = 0;
		}

		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 */
		{
			volatile unsigned char dummy;
			__get_user(dummy, buf);
			__get_user(dummy, buf+bytes-1);
		}

		status = -ENOMEM;	/* we'll assign it later anyway */
		page = __grab_cache_page(mapping, index, &cached_page);
		if (!page)
			break;

		/* We have exclusive IO access to the page.. */
		if (!PageLocked(page)) {
			PAGE_BUG(page);
		}

		status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
		if (status)
			goto unlock;
		kaddr = page_address(page);
		status = copy_from_user(kaddr+offset, buf, bytes);
		flush_dcache_page(page);
		if (status)
			goto fail_write;
		status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
		if (!status)
			status = bytes;

		if (status >= 0) {
			written += status;
			count -= status;
			pos += status;
			buf += status;
		}
unlock:
		/* Mark it unlocked again and drop the page.. */
		UnlockPage(page);
		if (deactivate)
			deactivate_page(page);
		page_cache_release(page);

		if (status < 0)
			break;
	}
	*ppos = pos;

	if (cached_page)
		page_cache_free(cached_page);

	/* For now, when the user asks for O_SYNC, we'll actually
	 * provide O_DSYNC. */
	if ((status >= 0) && (file->f_flags & O_SYNC))
		status = generic_osync_inode(inode, 1); /* 1 means datasync */

	err = written ? written : status;
out:
	up(&inode->i_sem);
	return err;
fail_write:
	status = -EFAULT;
	ClearPageUptodate(page);
	kunmap(page);
	goto unlock;
}
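/*
 * For illustration only: a filesystem that wants this write path
 * normally points its file write method at generic_file_write() and
 * supplies prepare_write()/commit_write() in its
 * address_space_operations, roughly like this (the myfs_* names are
 * hypothetical):
 *
 *	static struct address_space_operations myfs_aops = {
 *		readpage:	myfs_readpage,
 *		writepage:	myfs_writepage,
 *		prepare_write:	myfs_prepare_write,
 *		commit_write:	myfs_commit_write,
 *	};
 *
 *	static struct file_operations myfs_file_ops = {
 *		read:		generic_file_read,
 *		write:		generic_file_write,
 *		mmap:		generic_file_mmap,
 *	};
 *
 * prepare_write() gets the page ready for a write of [offset, offset+bytes)
 * (typically mapping buffers and reading in any part of the page the
 * write does not cover), and commit_write() marks the written range
 * up to date and dirty.
 */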
void __init page_cache_init(unsigned long mempages)
{
	unsigned long htable_size, order;

	htable_size = mempages;
	htable_size *= sizeof(struct page *);
	for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
		;

	do {
		unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);

		page_hash_bits = 0;
		while((tmp >>= 1UL) != 0UL)
			page_hash_bits++;

		page_hash_table = (struct page **)
			__get_free_pages(GFP_ATOMIC, order);
	} while(page_hash_table == NULL && --order > 0);

	printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
	       (1 << page_hash_bits), order, (PAGE_SIZE << order));
	if (!page_hash_table)
		panic("Failed to allocate page hash table\n");
	memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
}
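/*
 * A worked example of the sizing above, assuming 4 KB pages and 32-bit
 * pointers (sizeof(struct page *) == 4) on a 128 MB machine, i.e.
 * mempages == 32768:
 *
 *	htable_size    = 32768 * 4 = 131072 bytes
 *	order          = 5		(PAGE_SIZE << 5 == 131072)
 *	hash entries   = 131072 / 4 = 32768
 *	page_hash_bits = 15		(32768 == 1 << 15)
 *
 * so the boot message would report 32768 entries in a 128 KB (order 5)
 * allocation.  If __get_free_pages() fails, the loop retries with
 * successively smaller orders and panics only if nothing down to
 * order 1 can be allocated.
 */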