/*
       *  linux/mm/vmscan.c
       *
       *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
       *
       *  Swap reorganised 29.12.95, Stephen Tweedie.
       *  kswapd added: 7.1.96  sct
       *  Removed kswapd_ctl limits, and swap out as many pages as needed
       *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
       *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
       *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
       *  Multiqueue VM started 5.8.00, Rik van Riel.
       */
      
      #include <linux/slab.h>
      #include <linux/kernel_stat.h>
      #include <linux/swap.h>
      #include <linux/swapctl.h>
      #include <linux/smp_lock.h>
      #include <linux/pagemap.h>
      #include <linux/init.h>
      #include <linux/highmem.h>
      #include <linux/file.h>
      
      #include <asm/pgalloc.h>
      
/*
 * The swap-out functions return 1 if they successfully
 * threw something out and we got a free page. They return
 * zero if they couldn't do anything, and any other value
 * indicates that rss was decreased but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
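
/*
 * Caller-side sketch (added for illustration, not original code): the
 * page-table walkers below simply propagate a non-zero result up and
 * stop the current walk, e.g.
 *
 *	result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
 *	if (result)
 *		return result;		(a page was freed, or we slept)
 *	... result == 0: nothing freed, move on to the next pte ...
 */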
static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
      {
      	pte_t pte;
      	swp_entry_t entry;
      	struct page * page;
      	int onlist;
      
      	pte = *page_table;
  46  	if (!pte_present(pte))
  47  		goto out_failed;
      	page = pte_page(pte);
  49  	if ((!VALID_PAGE(page)) || PageReserved(page))
  50  		goto out_failed;
      
  52  	if (!mm->swap_cnt)
  53  		return 1;
      
      	mm->swap_cnt--;
      
      	onlist = PageActive(page);
      	/* Don't look at this pte if it's been accessed recently. */
  59  	if (ptep_test_and_clear_young(page_table)) {
      		age_page_up(page);
  61  		goto out_failed;
      	}
  63  	if (!onlist)
      		/* The page is still mapped, so it can't be freeable... */
      		age_page_down_ageonly(page);
      
      	/*
      	 * If the page is in active use by us, or if the page
      	 * is in active use by others, don't unmap it or
      	 * (worse) start unneeded IO.
      	 */
  72  	if (page->age > 0)
  73  		goto out_failed;
      
  75  	if (TryLockPage(page))
  76  		goto out_failed;
      
      	/* From this point on, the odds are that we're going to
      	 * nuke this pte, so read and clear the pte.  This hook
      	 * is needed on CPUs which update the accessed and dirty
      	 * bits in hardware.
      	 */
      	pte = ptep_get_and_clear(page_table);
      	flush_tlb_page(vma, address);
      
      	/*
      	 * Is the page already in the swap cache? If so, then
      	 * we can just drop our reference to it without doing
      	 * any IO - it's already up-to-date on disk.
      	 *
      	 * Return 0, as we didn't actually free any real
      	 * memory, and we should just continue our scan.
      	 */
  94  	if (PageSwapCache(page)) {
      		entry.val = page->index;
  96  		if (pte_dirty(pte))
      			set_page_dirty(page);
      set_swap_pte:
      		swap_duplicate(entry);
      		set_pte(page_table, swp_entry_to_pte(entry));
      drop_pte:
 102  		UnlockPage(page);
      		mm->rss--;
      		deactivate_page(page);
      		page_cache_release(page);
      out_failed:
 107  		return 0;
      	}
      
      	/*
      	 * Is it a clean page? Then it must be recoverable
      	 * by just paging it in again, and we can just drop
      	 * it..
      	 *
      	 * However, this won't actually free any real
      	 * memory, as the page will just be in the page cache
      	 * somewhere, and as such we should just continue
      	 * our scan.
      	 *
      	 * Basically, this just makes it possible for us to do
      	 * some real work in the future in "refill_inactive()".
      	 */
 123  	flush_cache_page(vma, address);
 124  	if (!pte_dirty(pte))
 125  		goto drop_pte;
      
      	/*
      	 * Ok, it's really dirty. That means that
      	 * we should either create a new swap cache
      	 * entry for it, or we should write it back
      	 * to its own backing store.
      	 */
 133  	if (page->mapping) {
      		set_page_dirty(page);
 135  		goto drop_pte;
      	}
      
      	/*
      	 * This is a dirty, swappable page.  First of all,
      	 * get a suitable swap entry for it, and make sure
      	 * we have the swap cache set up to associate the
      	 * page with that swap entry.
      	 */
      	entry = get_swap_page();
 145  	if (!entry.val)
 146  		goto out_unlock_restore; /* No swap space left */
      
      	/* Add it to the swap cache and mark it dirty */
      	add_to_swap_cache(page, entry);
      	set_page_dirty(page);
 151  	goto set_swap_pte;
      
      out_unlock_restore:
      	set_pte(page_table, pte);
 155  	UnlockPage(page);
 156  	return 0;
      }
      
/*
 * A new implementation of swap_out().  We do not swap complete processes,
 * but only a small number of blocks before we continue with the next
 * process.  The number of blocks actually swapped is determined by the
 * number of page faults this process recently had, so we won't keep
 * swapping out heavily used processes all the time ...
 *
 * Note: the priority argument is a hint on how much CPU to spend on the
 *       swap block search, not a hint on how many blocks to swap from
 *       each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
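
/*
 * Overview of the walk (added for clarity): swap_out() picks a victim
 * mm and then walks its address space top-down,
 *
 *	swap_out_mm()  ->  swap_out_vma()     for each vma
 *	swap_out_vma() ->  swap_out_pgd()     for each page directory slot
 *	swap_out_pgd() ->  swap_out_pmd()     for each page middle directory
 *	swap_out_pmd() ->  try_to_swap_out()  for each pte
 *
 * The walk stops as soon as one of these calls returns non-zero, and
 * mm->swap_address remembers where to continue next time.
 */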
      
static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pte_t * pte;
	unsigned long pmd_end;

	if (pmd_none(*dir))
		return 0;
	if (pmd_bad(*dir)) {
		pmd_ERROR(*dir);
		pmd_clear(dir);
		return 0;
	}

	pte = pte_offset(dir, address);

	pmd_end = (address + PMD_SIZE) & PMD_MASK;
	if (end > pmd_end)
		end = pmd_end;

	do {
		int result;
		mm->swap_address = address + PAGE_SIZE;
		result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
		if (result)
			return result;
		address += PAGE_SIZE;
		pte++;
	} while (address && (address < end));
	return 0;
}
      
static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pmd_t * pmd;
	unsigned long pgd_end;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		pgd_ERROR(*dir);
		pgd_clear(dir);
		return 0;
	}

	pmd = pmd_offset(dir, address);

	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (pgd_end && (end > pgd_end))
		end = pgd_end;

	do {
		int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);
		if (result)
			return result;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address && (address < end));
	return 0;
}
      
static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask)
{
	pgd_t *pgdir;
	unsigned long end;

	/* Don't swap out areas which are locked down */
	if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
		return 0;

	pgdir = pgd_offset(mm, address);

	end = vma->vm_end;
	if (address >= end)
		BUG();
	do {
		int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);
		if (result)
			return result;
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	} while (address && (address < end));
	return 0;
}
      
static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
      {
      	int result = 0;
      	unsigned long address;
      	struct vm_area_struct* vma;
      
      	/*
      	 * Go through process' page directory.
      	 */
      
      	/*
      	 * Find the proper vm-area after freezing the vma chain 
      	 * and ptes.
      	 */
      	spin_lock(&mm->page_table_lock);
      	address = mm->swap_address;
      	vma = find_vma(mm, address);
 274  	if (vma) {
 275  		if (address < vma->vm_start)
      			address = vma->vm_start;
      
 278  		for (;;) {
      			result = swap_out_vma(mm, vma, address, gfp_mask);
 280  			if (result)
 281  				goto out_unlock;
      			vma = vma->vm_next;
 283  			if (!vma)
 284  				break;
      			address = vma->vm_start;
      		}
      	}
      	/* Reset to 0 when we reach the end of address space */
      	mm->swap_address = 0;
      	mm->swap_cnt = 0;
      
      out_unlock:
 293  	spin_unlock(&mm->page_table_lock);
 294  	return result;
      }
      
      /*
       * Select the task with maximal swap_cnt and try to swap out a page.
       * N.B. This function returns only 0 or 1.  Return values != 1 from
       * the lower level routines result in continued processing.
       */
      #define SWAP_SHIFT 5
      #define SWAP_MIN 8
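
/*
 * Worked example (hypothetical numbers, for illustration only):
 * with nr_threads == 100 and priority == 6 we get
 *
 *	counter = (100 << SWAP_SHIFT) >> 6 = 3200 >> 6 = 50
 *
 * attempts, and an mm with rss == 1000 pages is given
 *
 *	swap_cnt = 1000 >> SWAP_SHIFT = 31	(but never below SWAP_MIN)
 *
 * i.e. roughly rss/32 swap-out tries per pass for the largest user.
 */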
      
static int swap_out(unsigned int priority, int gfp_mask)
      {
      	int counter;
      	int __ret = 0;
      
      	/* 
      	 * We make one or two passes through the task list, indexed by 
      	 * assign = {0, 1}:
      	 *   Pass 1: select the swappable task with maximal RSS that has
      	 *         not yet been swapped out. 
      	 *   Pass 2: re-assign rss swap_cnt values, then select as above.
      	 *
      	 * With this approach, there's no need to remember the last task
      	 * swapped out.  If the swap-out fails, we clear swap_cnt so the 
      	 * task won't be selected again until all others have been tried.
      	 *
      	 * Think of swap_cnt as a "shadow rss" - it tells us which process
      	 * we want to page out (always try largest first).
      	 */
      	counter = (nr_threads << SWAP_SHIFT) >> priority;
 325  	if (counter < 1)
      		counter = 1;
      
 328  	for (; counter >= 0; counter--) {
      		struct list_head *p;
      		unsigned long max_cnt = 0;
      		struct mm_struct *best = NULL;
      		int assign = 0;
      		int found_task = 0;
      	select:
      		spin_lock(&mmlist_lock);
      		p = init_mm.mmlist.next;
 337  		for (; p != &init_mm.mmlist; p = p->next) {
      			struct mm_struct *mm = list_entry(p, struct mm_struct, mmlist);
 339  	 		if (mm->rss <= 0)
 340  				continue;
      			found_task++;
      			/* Refresh swap_cnt? */
 343  			if (assign == 1) {
      				mm->swap_cnt = (mm->rss >> SWAP_SHIFT);
 345  				if (mm->swap_cnt < SWAP_MIN)
      					mm->swap_cnt = SWAP_MIN;
      			}
 348  			if (mm->swap_cnt > max_cnt) {
      				max_cnt = mm->swap_cnt;
      				best = mm;
      			}
      		}
      
      		/* Make sure it doesn't disappear */
 355  		if (best)
      			atomic_inc(&best->mm_users);
 357  		spin_unlock(&mmlist_lock);
      
		/*
		 * We have dropped the mmlist_lock, but we
		 * know that "best" still exists: we are running
		 * with the big kernel lock, and exit_mm()
		 * cannot race with us.
		 */
 365  		if (!best) {
 366  			if (!assign && found_task > 0) {
      				assign = 1;
 368  				goto select;
      			}
 370  			break;
 371  		} else {
      			__ret = swap_out_mm(best, gfp_mask);
      			mmput(best);
 374  			break;
      		}
      	}
 377  	return __ret;
      }
      
      
/**
 * reclaim_page - reclaims one page from the inactive_clean list
 * @zone: reclaim a page from this zone
 *
 * The pages on the inactive_clean list can be reclaimed instantly.
 * The tests look impressive, but most of the time we'll grab
 * the first page of the list and exit successfully.
 */
struct page * reclaim_page(zone_t * zone)
      {
      	struct page * page = NULL;
      	struct list_head * page_lru;
      	int maxscan;
      
      	/*
      	 * We only need the pagemap_lru_lock if we don't reclaim the page,
      	 * but we have to grab the pagecache_lock before the pagemap_lru_lock
      	 * to avoid deadlocks and most of the time we'll succeed anyway.
      	 */
      	spin_lock(&pagecache_lock);
      	spin_lock(&pagemap_lru_lock);
      	maxscan = zone->inactive_clean_pages;
      	while ((page_lru = zone->inactive_clean_list.prev) !=
 404  			&zone->inactive_clean_list && maxscan--) {
      		page = list_entry(page_lru, struct page, lru);
      
      		/* Wrong page on list?! (list corruption, should not happen) */
 408  		if (!PageInactiveClean(page)) {
      			printk("VM: reclaim_page, wrong page on list.\n");
      			list_del(page_lru);
      			page->zone->inactive_clean_pages--;
 412  			continue;
      		}
      
      		/* Page is or was in use?  Move it to the active list. */
      		if (PageTestandClearReferenced(page) || page->age > 0 ||
 417  				(!page->buffers && page_count(page) > 1)) {
 418  			del_page_from_inactive_clean_list(page);
 419  			add_page_to_active_list(page);
 420  			continue;
      		}
      
      		/* The page is dirty, or locked, move to inactive_dirty list. */
 424  		if (page->buffers || PageDirty(page) || TryLockPage(page)) {
 425  			del_page_from_inactive_clean_list(page);
 426  			add_page_to_inactive_dirty_list(page);
 427  			continue;
      		}
      
      		/* OK, remove the page from the caches. */
 431                  if (PageSwapCache(page)) {
      			__delete_from_swap_cache(page);
 433  			goto found_page;
      		}
      
 436  		if (page->mapping) {
      			__remove_inode_page(page);
 438  			goto found_page;
      		}
      
      		/* We should never ever get here. */
      		printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
      		list_del(page_lru);
      		zone->inactive_clean_pages--;
 445  		UnlockPage(page);
      	}
      	/* Reset page pointer, maybe we encountered an unfreeable page. */
      	page = NULL;
 449  	goto out;
      
      found_page:
 452  	del_page_from_inactive_clean_list(page);
 453  	UnlockPage(page);
      	page->age = PAGE_AGE_START;
 455  	if (page_count(page) != 1)
      		printk("VM: reclaim_page, found page with count %d!\n",
      				page_count(page));
      out:
 459  	spin_unlock(&pagemap_lru_lock);
 460  	spin_unlock(&pagecache_lock);
      	memory_pressure++;
 462  	return page;
      }
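
/*
 * Typical use (see kreclaimd() below): refill a zone's free list with
 * clean inactive pages until it is back above pages_low:
 *
 *	while (zone->free_pages < zone->pages_low) {
 *		struct page *page = reclaim_page(zone);
 *		if (!page)
 *			break;
 *		__free_page(page);
 *	}
 */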
      
      /**
       * page_launder - clean dirty inactive pages, move to inactive_clean list
       * @gfp_mask: what operations we are allowed to do
       * @sync: should we wait synchronously for the cleaning of pages
       *
       * When this function is called, we are most likely low on free +
       * inactive_clean pages. Since we want to refill those pages as
       * soon as possible, we'll make two loops over the inactive list,
       * one to move the already cleaned pages to the inactive_clean lists
       * and one to (often asynchronously) clean the dirty inactive pages.
       *
       * In situations where kswapd cannot keep up, user processes will
       * end up calling this function. Since the user process needs to
       * have a page before it can continue with its allocation, we'll
       * do synchronous page flushing in that case.
       *
       * This code is heavily inspired by the FreeBSD source code. Thanks
       * go out to Matthew Dillon.
       */
      #define MAX_LAUNDER 		(4 * (1 << page_cluster))
int page_launder(int gfp_mask, int sync)
      {
      	int launder_loop, maxscan, cleaned_pages, maxlaunder;
      	int can_get_io_locks;
      	struct list_head * page_lru;
      	struct page * page;
      
      	/*
      	 * We can only grab the IO locks (eg. for flushing dirty
      	 * buffers to disk) if __GFP_IO is set.
      	 */
      	can_get_io_locks = gfp_mask & __GFP_IO;
      
      	launder_loop = 0;
      	maxlaunder = 0;
      	cleaned_pages = 0;
      
      dirty_page_rescan:
      	spin_lock(&pagemap_lru_lock);
      	maxscan = nr_inactive_dirty_pages;
      	while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
 506  				maxscan-- > 0) {
      		page = list_entry(page_lru, struct page, lru);
      
      		/* Wrong page on list?! (list corruption, should not happen) */
 510  		if (!PageInactiveDirty(page)) {
      			printk("VM: page_launder, wrong page on list.\n");
      			list_del(page_lru);
      			nr_inactive_dirty_pages--;
      			page->zone->inactive_dirty_pages--;
 515  			continue;
      		}
      
      		/* Page is or was in use?  Move it to the active list. */
      		if (PageTestandClearReferenced(page) || page->age > 0 ||
      				(!page->buffers && page_count(page) > 1) ||
 521  				page_ramdisk(page)) {
 522  			del_page_from_inactive_dirty_list(page);
 523  			add_page_to_active_list(page);
 524  			continue;
      		}
      
      		/*
      		 * The page is locked. IO in progress?
      		 * Move it to the back of the list.
      		 */
 531  		if (TryLockPage(page)) {
      			list_del(page_lru);
      			list_add(page_lru, &inactive_dirty_list);
 534  			continue;
      		}
      
      		/*
      		 * Dirty swap-cache page? Write it out if
      		 * last copy..
      		 */
 541  		if (PageDirty(page)) {
      			int (*writepage)(struct page *) = page->mapping->a_ops->writepage;
      			int result;
      
 545  			if (!writepage)
 546  				goto page_active;
      
      			/* First time through? Move it to the back of the list */
 549  			if (!launder_loop) {
      				list_del(page_lru);
      				list_add(page_lru, &inactive_dirty_list);
 552  				UnlockPage(page);
 553  				continue;
      			}
      
      			/* OK, do a physical asynchronous write to swap.  */
      			ClearPageDirty(page);
      			page_cache_get(page);
 559  			spin_unlock(&pagemap_lru_lock);
      
      			result = writepage(page);
      			page_cache_release(page);
      
      			/* And re-start the thing.. */
      			spin_lock(&pagemap_lru_lock);
 566  			if (result != 1)
 567  				continue;
      			/* writepage refused to do anything */
      			set_page_dirty(page);
 570  			goto page_active;
      		}
      
      		/*
      		 * If the page has buffers, try to free the buffer mappings
      		 * associated with this page. If we succeed we either free
      		 * the page (in case it was a buffercache only page) or we
      		 * move the page to the inactive_clean list.
      		 *
      		 * On the first round, we should free all previously cleaned
      		 * buffer pages
      		 */
 582  		if (page->buffers) {
      			int wait, clearedbuf;
      			int freed_page = 0;
      			/*
      			 * Since we might be doing disk IO, we have to
      			 * drop the spinlock and take an extra reference
      			 * on the page so it doesn't go away from under us.
      			 */
 590  			del_page_from_inactive_dirty_list(page);
      			page_cache_get(page);
 592  			spin_unlock(&pagemap_lru_lock);
      
			/* Will we do (asynchronous) IO? */
			if (launder_loop && maxlaunder == 0 && sync)
				wait = 2;	/* Synchronous IO */
			else if (launder_loop && maxlaunder-- > 0)
				wait = 1;	/* Async IO */
			else
				wait = 0;	/* No IO */
      
      			/* Try to free the page buffers. */
      			clearedbuf = try_to_free_buffers(page, wait);
      
      			/*
      			 * Re-take the spinlock. Note that we cannot
      			 * unlock the page yet since we're still
      			 * accessing the page_struct here...
      			 */
      			spin_lock(&pagemap_lru_lock);
      
      			/* The buffers were not freed. */
 613  			if (!clearedbuf) {
 614  				add_page_to_inactive_dirty_list(page);
      
      			/* The page was only in the buffer cache. */
 617  			} else if (!page->mapping) {
      				atomic_dec(&buffermem_pages);
      				freed_page = 1;
      				cleaned_pages++;
      
      			/* The page has more users besides the cache and us. */
 623  			} else if (page_count(page) > 2) {
 624  				add_page_to_active_list(page);
      
      			/* OK, we "created" a freeable page. */
 627  			} else /* page->mapping && page_count(page) == 2 */ {
 628  				add_page_to_inactive_clean_list(page);
      				cleaned_pages++;
      			}
      
			/*
			 * Unlock the page and drop the extra reference.
			 * We can only do it here because we were still
			 * accessing the page struct above.
			 */
 637  			UnlockPage(page);
      			page_cache_release(page);
      
      			/* 
      			 * If we're freeing buffer cache pages, stop when
      			 * we've got enough free memory.
      			 */
 644  			if (freed_page && !free_shortage())
 645  				break;
 646  			continue;
 647  		} else if (page->mapping && !PageDirty(page)) {
      			/*
      			 * If a page had an extra reference in
      			 * deactivate_page(), we will find it here.
      			 * Now the page is really freeable, so we
      			 * move it to the inactive_clean list.
      			 */
 654  			del_page_from_inactive_dirty_list(page);
 655  			add_page_to_inactive_clean_list(page);
 656  			UnlockPage(page);
      			cleaned_pages++;
 658  		} else {
      page_active:
      			/*
      			 * OK, we don't know what to do with the page.
      			 * It's no use keeping it here, so we move it to
      			 * the active list.
      			 */
 665  			del_page_from_inactive_dirty_list(page);
 666  			add_page_to_active_list(page);
 667  			UnlockPage(page);
      		}
      	}
 670  	spin_unlock(&pagemap_lru_lock);
      
	/*
	 * If we don't have enough free pages, we loop back once
	 * to queue the dirty pages for writeout. When we were called
	 * by a user process (that /needs/ a free page) and we didn't
	 * free anything yet, we wait synchronously on the writeout of
	 * MAX_LAUNDER pages.
	 *
	 * We also wake up bdflush, since bdflush should, under most
	 * loads, flush out the dirty pages before we have to wait on
	 * IO.
	 */
 683  	if (can_get_io_locks && !launder_loop && free_shortage()) {
      		launder_loop = 1;
      		/* If we cleaned pages, never do synchronous IO. */
 686  		if (cleaned_pages)
      			sync = 0;
      		/* We only do a few "out of order" flushes. */
      		maxlaunder = MAX_LAUNDER;
      		/* Kflushd takes care of the rest. */
      		wakeup_bdflush(0);
 692  		goto dirty_page_rescan;
      	}
      
      	/* Return the number of pages moved to the inactive_clean list. */
 696  	return cleaned_pages;
      }
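
/*
 * Behaviour sketch (summarising the code above, not adding to it): the
 * first pass only moves already-clean pages and requeues dirty ones.
 * If there is still a free shortage and __GFP_IO was given, a second
 * pass (launder_loop == 1) starts writeback, with at most
 *
 *	MAX_LAUNDER = 4 * (1 << page_cluster)
 *
 * asynchronous "out of order" flushes, e.g. 64 pages for a
 * hypothetical page_cluster of 4; a user caller that cleaned nothing
 * may then wait synchronously on further buffer writeout.
 */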
      
      /**
       * refill_inactive_scan - scan the active list and find pages to deactivate
       * @priority: the priority at which to scan
       * @oneshot: exit after deactivating one page
       *
       * This function will scan a portion of the active list to find
       * unused pages, those pages will then be moved to the inactive list.
       */
int refill_inactive_scan(unsigned int priority, int oneshot)
      {
      	struct list_head * page_lru;
      	struct page * page;
      	int maxscan, page_active = 0;
      	int ret = 0;
      
      	/* Take the lock while messing with the list... */
      	spin_lock(&pagemap_lru_lock);
      	maxscan = nr_active_pages >> priority;
 717  	while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
      		page = list_entry(page_lru, struct page, lru);
      
      		/* Wrong page on list?! (list corruption, should not happen) */
 721  		if (!PageActive(page)) {
      			printk("VM: refill_inactive, wrong page on list.\n");
      			list_del(page_lru);
      			nr_active_pages--;
 725  			continue;
      		}
      
      		/* Do aging on the pages. */
 729  		if (PageTestandClearReferenced(page)) {
      			age_page_up_nolock(page);
      			page_active = 1;
 732  		} else {
      			age_page_down_ageonly(page);
			/*
			 * Since we don't hold a reference on the page
			 * ourselves, we have to do our test a bit more
			 * strictly than deactivate_page() does. This is
			 * needed since otherwise the system could hang,
			 * shuffling unfreeable pages from the active list
			 * to the inactive_dirty list and back again...
			 *
			 * SUBTLE: we can have buffer pages with count 1.
			 */
      			if (page->age == 0 && page_count(page) <=
 745  						(page->buffers ? 2 : 1)) {
      				deactivate_page_nolock(page);
      				page_active = 0;
 748  			} else {
      				page_active = 1;
      			}
      		}
		/*
		 * If the page is still on the active list, move it
		 * to the other end of the list. Otherwise it was
		 * deactivated by deactivate_page_nolock() above and
		 * we exit successfully.
		 */
 757  		if (page_active || PageActive(page)) {
      			list_del(page_lru);
      			list_add(page_lru, &active_list);
 760  		} else {
      			ret = 1;
 762  			if (oneshot)
 763  				break;
      		}
      	}
 766  	spin_unlock(&pagemap_lru_lock);
      
 768  	return ret;
      }
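
/*
 * The two calling styles used in this file:
 *
 *	refill_inactive_scan(6, 0);	 kswapd's background aging: scan
 *					 a small slice of the active list,
 *					 don't stop at the first page
 *
 *	while (refill_inactive_scan(priority, 1))
 *		...;			 refill_inactive(): deactivate one
 *					 page at a time until the target
 *					 count has been reached
 */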
      
      /*
       * Check if there are zones with a severe shortage of free pages,
       * or if all zones have a minor shortage.
       */
int free_shortage(void)
{
	pg_data_t *pgdat = pgdat_list;
	int sum = 0;
	int freeable = nr_free_pages() + nr_inactive_clean_pages();
	int freetarget = freepages.high + inactive_target / 3;

	/* Are we low on free pages globally? */
	if (freeable < freetarget)
		return freetarget - freeable;

	/* If not, are we very low on any particular zone? */
	do {
		int i;
		for (i = 0; i < MAX_NR_ZONES; i++) {
			zone_t *zone = pgdat->node_zones + i;
			if (zone->size && (zone->inactive_clean_pages +
					zone->free_pages < zone->pages_min+1)) {
				/* + 1 to have overlap with alloc_pages() !! */
				sum += zone->pages_min + 1;
				sum -= zone->free_pages;
				sum -= zone->inactive_clean_pages;
			}
		}
		pgdat = pgdat->node_next;
	} while (pgdat);

	return sum;
}
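
/*
 * Worked example for the per-zone check above (hypothetical values):
 * a zone with pages_min == 255, free_pages == 100 and
 * inactive_clean_pages == 50 contributes
 *
 *	(255 + 1) - 100 - 50 = 106
 *
 * pages to the reported shortage; a zone at or above pages_min + 1
 * contributes nothing.
 */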
      
      /*
       * How many inactive pages are we short?
       */
int inactive_shortage(void)
{
	int shortage = 0;

	shortage += freepages.high;
	shortage += inactive_target;
	shortage -= nr_free_pages();
	shortage -= nr_inactive_clean_pages();
	shortage -= nr_inactive_dirty_pages;

	if (shortage > 0)
		return shortage;

	return 0;
}
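
/*
 * Example (hypothetical numbers): with freepages.high == 1024 and
 * inactive_target == 3072, a system with 512 free pages, 256
 * inactive_clean pages and 1024 inactive_dirty pages is
 *
 *	1024 + 3072 - 512 - 256 - 1024 = 2304
 *
 * pages short of its inactive target.
 */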
      
      /*
       * We need to make the locks finer granularity, but right
       * now we need this so that we can do page allocations
       * without holding the kernel lock etc.
       *
       * We want to try to free "count" pages, and we want to 
       * cluster them so that we get good swap-out behaviour.
       *
       * OTOH, if we're a user process (and not kswapd), we
       * really care about latency. In that case we don't try
       * to free too many pages.
       */
static int refill_inactive(unsigned int gfp_mask, int user)
      {
      	int priority, count, start_count, made_progress;
      
      	count = inactive_shortage() + free_shortage();
 841  	if (user)
      		count = (1 << page_cluster);
      	start_count = count;
      
      	/* Always trim SLAB caches when memory gets low. */
      	kmem_cache_reap(gfp_mask);
      
      	priority = 6;
 849  	do {
      		made_progress = 0;
      
 852  		if (current->need_resched) {
 853  			__set_current_state(TASK_RUNNING);
      			schedule();
      		}
      
 857  		while (refill_inactive_scan(priority, 1)) {
      			made_progress = 1;
 859  			if (--count <= 0)
 860  				goto done;
      		}
      
		/*
		 * Don't go too easy on the dentry/inode caches, since
		 * refill_inactive() almost never fails when there's
		 * really plenty of memory free.
		 */
      		shrink_dcache_memory(priority, gfp_mask);
      		shrink_icache_memory(priority, gfp_mask);
      
      		/*
      		 * Then, try to page stuff out..
      		 */
 874  		while (swap_out(priority, gfp_mask)) {
      			made_progress = 1;
 876  			if (--count <= 0)
 877  				goto done;
      		}
      
      		/*
      		 * If we either have enough free memory, or if
      		 * page_launder() will be able to make enough
      		 * free memory, then stop.
      		 */
 885  		if (!inactive_shortage() || !free_shortage())
 886  			goto done;
      
      		/*
      		 * Only switch to a lower "priority" if we
      		 * didn't make any useful progress in the
      		 * last loop.
      		 */
 893  		if (!made_progress)
      			priority--;
 895  	} while (priority >= 0);
      
      	/* Always end on a refill_inactive.., may sleep... */
 898  	while (refill_inactive_scan(0, 1)) {
 899  		if (--count <= 0)
 900  			goto done;
      	}
      
      done:
 904  	return (count < start_count);
      }
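
/*
 * Note on the "user" argument (sketch): a user process only needs one
 * allocation to make progress, so it asks for a page_cluster worth of
 * work, e.g.
 *
 *	count = 1 << page_cluster = 16	(for a hypothetical page_cluster
 *					 of 4)
 *
 * while kswapd asks for the full inactive_shortage() + free_shortage().
 */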
      
static int do_try_to_free_pages(unsigned int gfp_mask, int user)
      {
      	int ret = 0;
      
      	/*
      	 * If we're low on free pages, move pages from the
      	 * inactive_dirty list to the inactive_clean list.
      	 *
      	 * Usually bdflush will have pre-cleaned the pages
      	 * before we get around to moving them to the other
      	 * list, so this is a relatively cheap operation.
      	 */
      	if (free_shortage() || nr_inactive_dirty_pages > nr_free_pages() +
 920  			nr_inactive_clean_pages())
      		ret += page_launder(gfp_mask, user);
      
      	/*
      	 * If needed, we move pages from the active list
      	 * to the inactive list. We also "eat" pages from
      	 * the inode and dentry cache whenever we do this.
      	 */
 928  	if (free_shortage() || inactive_shortage()) {
      		shrink_dcache_memory(6, gfp_mask);
      		shrink_icache_memory(6, gfp_mask);
      		ret += refill_inactive(gfp_mask, user);
 932  	} else {
      		/*
      		 * Reclaim unused slab cache memory.
      		 */
      		kmem_cache_reap(gfp_mask);
      		ret = 1;
      	}
      
 940  	return ret;
      }
      
      DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
      DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
      struct task_struct *kswapd_task;
      
      /*
       * The background pageout daemon, started as a kernel thread
       * from the init process. 
       *
       * This basically trickles out pages so that we have _some_
       * free memory available even if there is no other activity
       * that frees anything up. This is needed for things like routing
       * etc, where we otherwise might have all activity going on in
       * asynchronous contexts that cannot page things out.
       *
       * If there are applications that are active memory-allocators
       * (most normal use), this basically shouldn't matter.
       */
int kswapd(void *unused)
      {
      	struct task_struct *tsk = current;
      
      	tsk->session = 1;
      	tsk->pgrp = 1;
      	strcpy(tsk->comm, "kswapd");
      	sigfillset(&tsk->blocked);
      	kswapd_task = tsk;
      	
      	/*
      	 * Tell the memory management that we're a "memory allocator",
      	 * and that if we need more memory we should get access to it
      	 * regardless (see "__alloc_pages()"). "kswapd" should
      	 * never get caught in the normal page freeing logic.
      	 *
      	 * (Kswapd normally doesn't need memory anyway, but sometimes
      	 * you need a small amount of memory in order to be able to
      	 * page out something else, and this flag essentially protects
      	 * us from recursively trying to free more memory as we're
      	 * trying to free the first piece of memory in the first place).
      	 */
      	tsk->flags |= PF_MEMALLOC;
      
      	/*
      	 * Kswapd main loop.
      	 */
 987  	for (;;) {
      		static int recalc = 0;
      
      		/* If needed, try to free some memory. */
 991  		if (inactive_shortage() || free_shortage()) {
      			int wait = 0;
      			/* Do we need to do some synchronous flushing? */
 994  			if (waitqueue_active(&kswapd_done))
      				wait = 1;
      			do_try_to_free_pages(GFP_KSWAPD, wait);
      		}
      
      		/*
      		 * Do some (very minimal) background scanning. This
      		 * will scan all pages on the active list once
      		 * every minute. This clears old referenced bits
      		 * and moves unused pages to the inactive list.
      		 */
      		refill_inactive_scan(6, 0);
      
      		/* Once a second, recalculate some VM stats. */
1008  		if (time_after(jiffies, recalc + HZ)) {
      			recalc = jiffies;
      			recalculate_vm_stats();
      		}
      
      		/*
      		 * Wake up everybody waiting for free memory
      		 * and unplug the disk queue.
      		 */
      		wake_up_all(&kswapd_done);
      		run_task_queue(&tq_disk);
      
      		/* 
      		 * We go to sleep if either the free page shortage
      		 * or the inactive page shortage is gone. We do this
      		 * because:
      		 * 1) we need no more free pages   or
      		 * 2) the inactive pages need to be flushed to disk,
      		 *    it wouldn't help to eat CPU time now ...
      		 *
      		 * We go to sleep for one second, but if it's needed
      		 * we'll be woken up earlier...
      		 */
1031  		if (!free_shortage() || !inactive_shortage()) {
      			interruptible_sleep_on_timeout(&kswapd_wait, HZ);
		/*
		 * If we couldn't free enough memory, we see if it was
		 * due to the system just not having enough memory.
		 * If that is the case, the only solution is to kill
		 * a process (the alternative is eternal deadlock).
		 *
		 * If there still is enough memory around, we just loop
		 * and try to free some more memory...
		 */
1042  		} else if (out_of_memory()) {
      			oom_kill();
      		}
      	}
      }
      
void wakeup_kswapd(int block)
      {
      	DECLARE_WAITQUEUE(wait, current);
      
1052  	if (current == kswapd_task)
1053  		return;
      
1055  	if (!block) {
1056  		if (waitqueue_active(&kswapd_wait))
      			wake_up(&kswapd_wait);
1058  		return;
      	}
      
      	/*
      	 * Kswapd could wake us up before we get a chance
      	 * to sleep, so we have to be very careful here to
      	 * prevent SMP races...
      	 */
1066  	__set_current_state(TASK_UNINTERRUPTIBLE);
      	add_wait_queue(&kswapd_done, &wait);
      
1069  	if (waitqueue_active(&kswapd_wait))
      		wake_up(&kswapd_wait);
      	schedule();
      
      	remove_wait_queue(&kswapd_done, &wait);
1074  	__set_current_state(TASK_RUNNING);
      }
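
/*
 * Usage sketch (the real callers live elsewhere, e.g. the page
 * allocator):
 *
 *	wakeup_kswapd(0);	 just kick kswapd and return
 *	wakeup_kswapd(1);	 kick kswapd and sleep on kswapd_done
 *				 until it has completed a pass
 *
 * The blocking form is what makes kswapd do synchronous flushing; see
 * the waitqueue_active(&kswapd_done) test in kswapd() above.
 */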
      
      /*
       * Called by non-kswapd processes when they want more
       * memory but are unable to sleep on kswapd because
       * they might be holding some IO locks ...
       */
int try_to_free_pages(unsigned int gfp_mask)
{
	int ret = 1;

	if (gfp_mask & __GFP_WAIT) {
		current->flags |= PF_MEMALLOC;
		ret = do_try_to_free_pages(gfp_mask, 1);
		current->flags &= ~PF_MEMALLOC;
	}

	return ret;
}
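
/*
 * Minimal usage sketch (hypothetical caller): a process that may sleep
 * can try direct reclaim before failing an allocation:
 *
 *	if (!try_to_free_pages(GFP_KERNEL))
 *		... the VM could not make progress ...
 *
 * Callers whose gfp_mask lacks __GFP_WAIT get the optimistic return
 * value of 1 and must retry or fail on their own.
 */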
      
      DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait);
      /*
       * Kreclaimd will move pages from the inactive_clean list to the
       * free list, in order to keep atomic allocations possible under
       * all circumstances. Even when kswapd is blocked on IO.
       */
int kreclaimd(void *unused)
      {
      	struct task_struct *tsk = current;
      	pg_data_t *pgdat;
      
      	tsk->session = 1;
      	tsk->pgrp = 1;
      	strcpy(tsk->comm, "kreclaimd");
      	sigfillset(&tsk->blocked);
      	current->flags |= PF_MEMALLOC;
      
1112  	while (1) {
      
      		/*
      		 * We sleep until someone wakes us up from
      		 * page_alloc.c::__alloc_pages().
      		 */
      		interruptible_sleep_on(&kreclaimd_wait);
      
      		/*
      		 * Move some pages from the inactive_clean lists to
      		 * the free lists, if it is needed.
      		 */
      		pgdat = pgdat_list;
1125  		do {
      			int i;
1127  			for(i = 0; i < MAX_NR_ZONES; i++) {
      				zone_t *zone = pgdat->node_zones + i;
1129  				if (!zone->size)
1130  					continue;
      
1132  				while (zone->free_pages < zone->pages_low) {
      					struct page * page;
      					page = reclaim_page(zone);
1135  					if (!page)
1136  						break;
      					__free_page(page);
      				}
      			}
      			pgdat = pgdat->node_next;
1141  		} while (pgdat);
      	}
      }
      
      
static int __init kswapd_init(void)
{
	printk("Starting kswapd v1.8\n");
	swap_setup();
	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
	kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
	return 0;
}
      
      module_init(kswapd_init)