/*
* linux/mm/vmscan.c
*
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* Swap reorganised 29.12.95, Stephen Tweedie.
* kswapd added: 7.1.96 sct
* Removed kswapd_ctl limits, and swap out as many pages as needed
* to bring the system back to freepages.high: 2.4.97, Rik van Riel.
* Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
* Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
* Multiqueue VM started 5.8.00, Rik van Riel.
*/
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <asm/pgalloc.h>
/*
* The swap-out functions return 1 if they successfully
* threw something out, and we got a free page. It returns
* zero if it couldn't do anything, and any other value
* indicates it decreased rss, but the page was shared.
*
* NOTE! If it sleeps, it *must* return 1 to make sure we
* don't continue with the swap-out. Otherwise we may be
* using a process that no longer actually exists (it might
* have died while we slept).
*/
38 static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
pte_t pte;
swp_entry_t entry;
struct page * page;
int onlist;
pte = *page_table;
46 if (!pte_present(pte))
47 goto out_failed;
page = pte_page(pte);
49 if ((!VALID_PAGE(page)) || PageReserved(page))
50 goto out_failed;
52 if (!mm->swap_cnt)
53 return 1;
mm->swap_cnt--;
onlist = PageActive(page);
/* Don't look at this pte if it's been accessed recently. */
59 if (ptep_test_and_clear_young(page_table)) {
age_page_up(page);
61 goto out_failed;
}
63 if (!onlist)
/* The page is still mapped, so it can't be freeable... */
age_page_down_ageonly(page);
/*
* If the page is in active use by us, or if the page
* is in active use by others, don't unmap it or
* (worse) start unneeded IO.
*/
72 if (page->age > 0)
73 goto out_failed;
75 if (TryLockPage(page))
76 goto out_failed;
/* From this point on, the odds are that we're going to
* nuke this pte, so read and clear the pte. This hook
* is needed on CPUs which update the accessed and dirty
* bits in hardware.
*/
pte = ptep_get_and_clear(page_table);
flush_tlb_page(vma, address);
/*
* Is the page already in the swap cache? If so, then
* we can just drop our reference to it without doing
* any IO - it's already up-to-date on disk.
*
* Return 0, as we didn't actually free any real
* memory, and we should just continue our scan.
*/
94 if (PageSwapCache(page)) {
entry.val = page->index;
96 if (pte_dirty(pte))
set_page_dirty(page);
set_swap_pte:
swap_duplicate(entry);
set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
102 UnlockPage(page);
mm->rss--;
deactivate_page(page);
page_cache_release(page);
out_failed:
107 return 0;
}
/*
* Is it a clean page? Then it must be recoverable
* by just paging it in again, and we can just drop
* it..
*
* However, this won't actually free any real
* memory, as the page will just be in the page cache
* somewhere, and as such we should just continue
* our scan.
*
* Basically, this just makes it possible for us to do
* some real work in the future in "refill_inactive()".
*/
123 flush_cache_page(vma, address);
124 if (!pte_dirty(pte))
125 goto drop_pte;
/*
* Ok, it's really dirty. That means that
* we should either create a new swap cache
* entry for it, or we should write it back
* to its own backing store.
*/
133 if (page->mapping) {
set_page_dirty(page);
135 goto drop_pte;
}
/*
* This is a dirty, swappable page. First of all,
* get a suitable swap entry for it, and make sure
* we have the swap cache set up to associate the
* page with that swap entry.
*/
entry = get_swap_page();
145 if (!entry.val)
146 goto out_unlock_restore; /* No swap space left */
/* Add it to the swap cache and mark it dirty */
add_to_swap_cache(page, entry);
set_page_dirty(page);
151 goto set_swap_pte;
out_unlock_restore:
set_pte(page_table, pte);
155 UnlockPage(page);
156 return 0;
}
/*
* A new implementation of swap_out(). We do not swap complete processes,
* but only a small number of blocks, before we continue with the next
* process. The number of blocks actually swapped is determined on the
* number of page faults, that this process actually had in the last time,
* so we won't swap heavily used processes all the time ...
*
* Note: the priority argument is a hint on how much CPU to waste with the
* swap block search, not a hint of how many blocks to swap with
* each process.
*
* (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
*/
173 static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
pte_t * pte;
unsigned long pmd_end;
178 if (pmd_none(*dir))
179 return 0;
180 if (pmd_bad(*dir)) {
pmd_ERROR(*dir);
182 pmd_clear(dir);
183 return 0;
}
pte = pte_offset(dir, address);
pmd_end = (address + PMD_SIZE) & PMD_MASK;
189 if (end > pmd_end)
end = pmd_end;
192 do {
int result;
mm->swap_address = address + PAGE_SIZE;
result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
196 if (result)
197 return result;
address += PAGE_SIZE;
pte++;
200 } while (address && (address < end));
201 return 0;
}
204 static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
pmd_t * pmd;
unsigned long pgd_end;
209 if (pgd_none(*dir))
210 return 0;
211 if (pgd_bad(*dir)) {
pgd_ERROR(*dir);
213 pgd_clear(dir);
214 return 0;
}
pmd = pmd_offset(dir, address);
pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
220 if (pgd_end && (end > pgd_end))
end = pgd_end;
223 do {
int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);
225 if (result)
226 return result;
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
229 } while (address && (address < end));
230 return 0;
}
233 static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask)
{
pgd_t *pgdir;
unsigned long end;
/* Don't swap out areas which are locked down */
239 if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
240 return 0;
pgdir = pgd_offset(mm, address);
end = vma->vm_end;
245 if (address >= end)
246 BUG();
247 do {
int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);
249 if (result)
250 return result;
address = (address + PGDIR_SIZE) & PGDIR_MASK;
pgdir++;
253 } while (address && (address < end));
254 return 0;
}
257 static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
{
int result = 0;
unsigned long address;
struct vm_area_struct* vma;
/*
* Go through process' page directory.
*/
/*
* Find the proper vm-area after freezing the vma chain
* and ptes.
*/
spin_lock(&mm->page_table_lock);
address = mm->swap_address;
vma = find_vma(mm, address);
274 if (vma) {
275 if (address < vma->vm_start)
address = vma->vm_start;
278 for (;;) {
result = swap_out_vma(mm, vma, address, gfp_mask);
280 if (result)
281 goto out_unlock;
vma = vma->vm_next;
283 if (!vma)
284 break;
address = vma->vm_start;
}
}
/* Reset to 0 when we reach the end of address space */
mm->swap_address = 0;
mm->swap_cnt = 0;
out_unlock:
293 spin_unlock(&mm->page_table_lock);
294 return result;
}
/*
* Select the task with maximal swap_cnt and try to swap out a page.
* N.B. This function returns only 0 or 1. Return values != 1 from
* the lower level routines result in continued processing.
*/
#define SWAP_SHIFT 5
#define SWAP_MIN 8
305 static int swap_out(unsigned int priority, int gfp_mask)
{
int counter;
int __ret = 0;
/*
* We make one or two passes through the task list, indexed by
* assign = {0, 1}:
* Pass 1: select the swappable task with maximal RSS that has
* not yet been swapped out.
* Pass 2: re-assign rss swap_cnt values, then select as above.
*
* With this approach, there's no need to remember the last task
* swapped out. If the swap-out fails, we clear swap_cnt so the
* task won't be selected again until all others have been tried.
*
* Think of swap_cnt as a "shadow rss" - it tells us which process
* we want to page out (always try largest first).
*/
counter = (nr_threads << SWAP_SHIFT) >> priority;
325 if (counter < 1)
counter = 1;
328 for (; counter >= 0; counter--) {
struct list_head *p;
unsigned long max_cnt = 0;
struct mm_struct *best = NULL;
int assign = 0;
int found_task = 0;
select:
spin_lock(&mmlist_lock);
p = init_mm.mmlist.next;
337 for (; p != &init_mm.mmlist; p = p->next) {
struct mm_struct *mm = list_entry(p, struct mm_struct, mmlist);
339 if (mm->rss <= 0)
340 continue;
found_task++;
/* Refresh swap_cnt? */
343 if (assign == 1) {
mm->swap_cnt = (mm->rss >> SWAP_SHIFT);
345 if (mm->swap_cnt < SWAP_MIN)
mm->swap_cnt = SWAP_MIN;
}
348 if (mm->swap_cnt > max_cnt) {
max_cnt = mm->swap_cnt;
best = mm;
}
}
/* Make sure it doesn't disappear */
355 if (best)
atomic_inc(&best->mm_users);
357 spin_unlock(&mmlist_lock);
/*
* We have dropped the tasklist_lock, but we
* know that "mm" still exists: we are running
* with the big kernel lock, and exit_mm()
* cannot race with us.
*/
365 if (!best) {
366 if (!assign && found_task > 0) {
assign = 1;
368 goto select;
}
370 break;
371 } else {
__ret = swap_out_mm(best, gfp_mask);
mmput(best);
374 break;
}
}
377 return __ret;
}
/**
* reclaim_page - reclaims one page from the inactive_clean list
* @zone: reclaim a page from this zone
*
* The pages on the inactive_clean can be instantly reclaimed.
* The tests look impressive, but most of the time we'll grab
* the first page of the list and exit successfully.
*/
389 struct page * reclaim_page(zone_t * zone)
{
struct page * page = NULL;
struct list_head * page_lru;
int maxscan;
/*
* We only need the pagemap_lru_lock if we don't reclaim the page,
* but we have to grab the pagecache_lock before the pagemap_lru_lock
* to avoid deadlocks and most of the time we'll succeed anyway.
*/
spin_lock(&pagecache_lock);
spin_lock(&pagemap_lru_lock);
maxscan = zone->inactive_clean_pages;
while ((page_lru = zone->inactive_clean_list.prev) !=
404 &zone->inactive_clean_list && maxscan--) {
page = list_entry(page_lru, struct page, lru);
/* Wrong page on list?! (list corruption, should not happen) */
408 if (!PageInactiveClean(page)) {
printk("VM: reclaim_page, wrong page on list.\n");
list_del(page_lru);
page->zone->inactive_clean_pages--;
412 continue;
}
/* Page is or was in use? Move it to the active list. */
if (PageTestandClearReferenced(page) || page->age > 0 ||
417 (!page->buffers && page_count(page) > 1)) {
418 del_page_from_inactive_clean_list(page);
419 add_page_to_active_list(page);
420 continue;
}
/* The page is dirty, or locked, move to inactive_dirty list. */
424 if (page->buffers || PageDirty(page) || TryLockPage(page)) {
425 del_page_from_inactive_clean_list(page);
426 add_page_to_inactive_dirty_list(page);
427 continue;
}
/* OK, remove the page from the caches. */
431 if (PageSwapCache(page)) {
__delete_from_swap_cache(page);
433 goto found_page;
}
436 if (page->mapping) {
__remove_inode_page(page);
438 goto found_page;
}
/* We should never ever get here. */
printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
list_del(page_lru);
zone->inactive_clean_pages--;
445 UnlockPage(page);
}
/* Reset page pointer, maybe we encountered an unfreeable page. */
page = NULL;
449 goto out;
found_page:
452 del_page_from_inactive_clean_list(page);
453 UnlockPage(page);
page->age = PAGE_AGE_START;
455 if (page_count(page) != 1)
printk("VM: reclaim_page, found page with count %d!\n",
page_count(page));
out:
459 spin_unlock(&pagemap_lru_lock);
460 spin_unlock(&pagecache_lock);
memory_pressure++;
462 return page;
}
/**
* page_launder - clean dirty inactive pages, move to inactive_clean list
* @gfp_mask: what operations we are allowed to do
* @sync: should we wait synchronously for the cleaning of pages
*
* When this function is called, we are most likely low on free +
* inactive_clean pages. Since we want to refill those pages as
* soon as possible, we'll make two loops over the inactive list,
* one to move the already cleaned pages to the inactive_clean lists
* and one to (often asynchronously) clean the dirty inactive pages.
*
* In situations where kswapd cannot keep up, user processes will
* end up calling this function. Since the user process needs to
* have a page before it can continue with its allocation, we'll
* do synchronous page flushing in that case.
*
* This code is heavily inspired by the FreeBSD source code. Thanks
* go out to Matthew Dillon.
*/
#define MAX_LAUNDER (4 * (1 << page_cluster))
485 int page_launder(int gfp_mask, int sync)
{
int launder_loop, maxscan, cleaned_pages, maxlaunder;
int can_get_io_locks;
struct list_head * page_lru;
struct page * page;
/*
* We can only grab the IO locks (eg. for flushing dirty
* buffers to disk) if __GFP_IO is set.
*/
can_get_io_locks = gfp_mask & __GFP_IO;
launder_loop = 0;
maxlaunder = 0;
cleaned_pages = 0;
dirty_page_rescan:
spin_lock(&pagemap_lru_lock);
maxscan = nr_inactive_dirty_pages;
while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
506 maxscan-- > 0) {
page = list_entry(page_lru, struct page, lru);
/* Wrong page on list?! (list corruption, should not happen) */
510 if (!PageInactiveDirty(page)) {
printk("VM: page_launder, wrong page on list.\n");
list_del(page_lru);
nr_inactive_dirty_pages--;
page->zone->inactive_dirty_pages--;
515 continue;
}
/* Page is or was in use? Move it to the active list. */
if (PageTestandClearReferenced(page) || page->age > 0 ||
(!page->buffers && page_count(page) > 1) ||
521 page_ramdisk(page)) {
522 del_page_from_inactive_dirty_list(page);
523 add_page_to_active_list(page);
524 continue;
}
/*
* The page is locked. IO in progress?
* Move it to the back of the list.
*/
531 if (TryLockPage(page)) {
list_del(page_lru);
list_add(page_lru, &inactive_dirty_list);
534 continue;
}
/*
* Dirty swap-cache page? Write it out if
* last copy..
*/
541 if (PageDirty(page)) {
int (*writepage)(struct page *) = page->mapping->a_ops->writepage;
int result;
545 if (!writepage)
546 goto page_active;
/* First time through? Move it to the back of the list */
549 if (!launder_loop) {
list_del(page_lru);
list_add(page_lru, &inactive_dirty_list);
552 UnlockPage(page);
553 continue;
}
/* OK, do a physical asynchronous write to swap. */
ClearPageDirty(page);
page_cache_get(page);
559 spin_unlock(&pagemap_lru_lock);
result = writepage(page);
page_cache_release(page);
/* And re-start the thing.. */
spin_lock(&pagemap_lru_lock);
566 if (result != 1)
567 continue;
/* writepage refused to do anything */
set_page_dirty(page);
570 goto page_active;
}
/*
* If the page has buffers, try to free the buffer mappings
* associated with this page. If we succeed we either free
* the page (in case it was a buffercache only page) or we
* move the page to the inactive_clean list.
*
* On the first round, we should free all previously cleaned
* buffer pages
*/
582 if (page->buffers) {
int wait, clearedbuf;
int freed_page = 0;
/*
* Since we might be doing disk IO, we have to
* drop the spinlock and take an extra reference
* on the page so it doesn't go away from under us.
*/
590 del_page_from_inactive_dirty_list(page);
page_cache_get(page);
592 spin_unlock(&pagemap_lru_lock);
/* Will we do (asynchronous) IO? */
595 if (launder_loop && maxlaunder == 0 && sync)
wait = 2; /* Synchrounous IO */
597 else if (launder_loop && maxlaunder-- > 0)
wait = 1; /* Async IO */
599 else
wait = 0; /* No IO */
/* Try to free the page buffers. */
clearedbuf = try_to_free_buffers(page, wait);
/*
* Re-take the spinlock. Note that we cannot
* unlock the page yet since we're still
* accessing the page_struct here...
*/
spin_lock(&pagemap_lru_lock);
/* The buffers were not freed. */
613 if (!clearedbuf) {
614 add_page_to_inactive_dirty_list(page);
/* The page was only in the buffer cache. */
617 } else if (!page->mapping) {
atomic_dec(&buffermem_pages);
freed_page = 1;
cleaned_pages++;
/* The page has more users besides the cache and us. */
623 } else if (page_count(page) > 2) {
624 add_page_to_active_list(page);
/* OK, we "created" a freeable page. */
627 } else /* page->mapping && page_count(page) == 2 */ {
628 add_page_to_inactive_clean_list(page);
cleaned_pages++;
}
/*
* Unlock the page and drop the extra reference.
* We can only do it here because we ar accessing
* the page struct above.
*/
637 UnlockPage(page);
page_cache_release(page);
/*
* If we're freeing buffer cache pages, stop when
* we've got enough free memory.
*/
644 if (freed_page && !free_shortage())
645 break;
646 continue;
647 } else if (page->mapping && !PageDirty(page)) {
/*
* If a page had an extra reference in
* deactivate_page(), we will find it here.
* Now the page is really freeable, so we
* move it to the inactive_clean list.
*/
654 del_page_from_inactive_dirty_list(page);
655 add_page_to_inactive_clean_list(page);
656 UnlockPage(page);
cleaned_pages++;
658 } else {
page_active:
/*
* OK, we don't know what to do with the page.
* It's no use keeping it here, so we move it to
* the active list.
*/
665 del_page_from_inactive_dirty_list(page);
666 add_page_to_active_list(page);
667 UnlockPage(page);
}
}
670 spin_unlock(&pagemap_lru_lock);
/*
* If we don't have enough free pages, we loop back once
* to queue the dirty pages for writeout. When we were called
* by a user process (that /needs/ a free page) and we didn't
* free anything yet, we wait synchronously on the writeout of
* MAX_SYNC_LAUNDER pages.
*
* We also wake up bdflush, since bdflush should, under most
* loads, flush out the dirty pages before we have to wait on
* IO.
*/
683 if (can_get_io_locks && !launder_loop && free_shortage()) {
launder_loop = 1;
/* If we cleaned pages, never do synchronous IO. */
686 if (cleaned_pages)
sync = 0;
/* We only do a few "out of order" flushes. */
maxlaunder = MAX_LAUNDER;
/* Kflushd takes care of the rest. */
wakeup_bdflush(0);
692 goto dirty_page_rescan;
}
/* Return the number of pages moved to the inactive_clean list. */
696 return cleaned_pages;
}
/**
* refill_inactive_scan - scan the active list and find pages to deactivate
* @priority: the priority at which to scan
* @oneshot: exit after deactivating one page
*
* This function will scan a portion of the active list to find
* unused pages, those pages will then be moved to the inactive list.
*/
707 int refill_inactive_scan(unsigned int priority, int oneshot)
{
struct list_head * page_lru;
struct page * page;
int maxscan, page_active = 0;
int ret = 0;
/* Take the lock while messing with the list... */
spin_lock(&pagemap_lru_lock);
maxscan = nr_active_pages >> priority;
717 while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
page = list_entry(page_lru, struct page, lru);
/* Wrong page on list?! (list corruption, should not happen) */
721 if (!PageActive(page)) {
printk("VM: refill_inactive, wrong page on list.\n");
list_del(page_lru);
nr_active_pages--;
725 continue;
}
/* Do aging on the pages. */
729 if (PageTestandClearReferenced(page)) {
age_page_up_nolock(page);
page_active = 1;
732 } else {
age_page_down_ageonly(page);
/*
* Since we don't hold a reference on the page
* ourselves, we have to do our test a bit more
* strict then deactivate_page(). This is needed
* since otherwise the system could hang shuffling
* unfreeable pages from the active list to the
* inactive_dirty list and back again...
*
* SUBTLE: we can have buffer pages with count 1.
*/
if (page->age == 0 && page_count(page) <=
745 (page->buffers ? 2 : 1)) {
deactivate_page_nolock(page);
page_active = 0;
748 } else {
page_active = 1;
}
}
/*
* If the page is still on the active list, move it
* to the other end of the list. Otherwise it was
* deactivated by age_page_down and we exit successfully.
*/
757 if (page_active || PageActive(page)) {
list_del(page_lru);
list_add(page_lru, &active_list);
760 } else {
ret = 1;
762 if (oneshot)
763 break;
}
}
766 spin_unlock(&pagemap_lru_lock);
768 return ret;
}
/*
* Check if there are zones with a severe shortage of free pages,
* or if all zones have a minor shortage.
*/
775 int free_shortage(void)
{
pg_data_t *pgdat = pgdat_list;
int sum = 0;
int freeable = nr_free_pages() + nr_inactive_clean_pages();
int freetarget = freepages.high + inactive_target / 3;
/* Are we low on free pages globally? */
783 if (freeable < freetarget)
784 return freetarget - freeable;
/* If not, are we very low on any particular zone? */
787 do {
int i;
789 for(i = 0; i < MAX_NR_ZONES; i++) {
zone_t *zone = pgdat->node_zones+ i;
if (zone->size && (zone->inactive_clean_pages +
792 zone->free_pages < zone->pages_min+1)) {
/* + 1 to have overlap with alloc_pages() !! */
sum += zone->pages_min + 1;
sum -= zone->free_pages;
sum -= zone->inactive_clean_pages;
}
}
pgdat = pgdat->node_next;
800 } while (pgdat);
802 return sum;
}
/*
* How many inactive pages are we short?
*/
808 int inactive_shortage(void)
{
int shortage = 0;
shortage += freepages.high;
shortage += inactive_target;
shortage -= nr_free_pages();
shortage -= nr_inactive_clean_pages();
shortage -= nr_inactive_dirty_pages;
818 if (shortage > 0)
819 return shortage;
821 return 0;
}
/*
* We need to make the locks finer granularity, but right
* now we need this so that we can do page allocations
* without holding the kernel lock etc.
*
* We want to try to free "count" pages, and we want to
* cluster them so that we get good swap-out behaviour.
*
* OTOH, if we're a user process (and not kswapd), we
* really care about latency. In that case we don't try
* to free too many pages.
*/
836 static int refill_inactive(unsigned int gfp_mask, int user)
{
int priority, count, start_count, made_progress;
count = inactive_shortage() + free_shortage();
841 if (user)
count = (1 << page_cluster);
start_count = count;
/* Always trim SLAB caches when memory gets low. */
kmem_cache_reap(gfp_mask);
priority = 6;
849 do {
made_progress = 0;
852 if (current->need_resched) {
853 __set_current_state(TASK_RUNNING);
schedule();
}
857 while (refill_inactive_scan(priority, 1)) {
made_progress = 1;
859 if (--count <= 0)
860 goto done;
}
/*
* don't be too light against the d/i cache since
* refill_inactive() almost never fail when there's
* really plenty of memory free.
*/
shrink_dcache_memory(priority, gfp_mask);
shrink_icache_memory(priority, gfp_mask);
/*
* Then, try to page stuff out..
*/
874 while (swap_out(priority, gfp_mask)) {
made_progress = 1;
876 if (--count <= 0)
877 goto done;
}
/*
* If we either have enough free memory, or if
* page_launder() will be able to make enough
* free memory, then stop.
*/
885 if (!inactive_shortage() || !free_shortage())
886 goto done;
/*
* Only switch to a lower "priority" if we
* didn't make any useful progress in the
* last loop.
*/
893 if (!made_progress)
priority--;
895 } while (priority >= 0);
/* Always end on a refill_inactive.., may sleep... */
898 while (refill_inactive_scan(0, 1)) {
899 if (--count <= 0)
900 goto done;
}
done:
904 return (count < start_count);
}
907 static int do_try_to_free_pages(unsigned int gfp_mask, int user)
{
int ret = 0;
/*
* If we're low on free pages, move pages from the
* inactive_dirty list to the inactive_clean list.
*
* Usually bdflush will have pre-cleaned the pages
* before we get around to moving them to the other
* list, so this is a relatively cheap operation.
*/
if (free_shortage() || nr_inactive_dirty_pages > nr_free_pages() +
920 nr_inactive_clean_pages())
ret += page_launder(gfp_mask, user);
/*
* If needed, we move pages from the active list
* to the inactive list. We also "eat" pages from
* the inode and dentry cache whenever we do this.
*/
928 if (free_shortage() || inactive_shortage()) {
shrink_dcache_memory(6, gfp_mask);
shrink_icache_memory(6, gfp_mask);
ret += refill_inactive(gfp_mask, user);
932 } else {
/*
* Reclaim unused slab cache memory.
*/
kmem_cache_reap(gfp_mask);
ret = 1;
}
940 return ret;
}
/* kswapd() sleeps on this queue between passes; wakeup_kswapd() wakes it. */
DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
/*
 * Tasks that called wakeup_kswapd() with block set wait here; kswapd()
 * wakes all of them on every pass of its main loop.
 */
DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
/* Task running kswapd(); set by kswapd() itself when it starts. */
struct task_struct *kswapd_task;
/*
* The background pageout daemon, started as a kernel thread
* from the init process.
*
* This basically trickles out pages so that we have _some_
* free memory available even if there is no other activity
* that frees anything up. This is needed for things like routing
* etc, where we otherwise might have all activity going on in
* asynchronous contexts that cannot page things out.
*
* If there are applications that are active memory-allocators
* (most normal use), this basically shouldn't matter.
*/
960 int kswapd(void *unused)
{
struct task_struct *tsk = current;
tsk->session = 1;
tsk->pgrp = 1;
strcpy(tsk->comm, "kswapd");
sigfillset(&tsk->blocked);
kswapd_task = tsk;
/*
* Tell the memory management that we're a "memory allocator",
* and that if we need more memory we should get access to it
* regardless (see "__alloc_pages()"). "kswapd" should
* never get caught in the normal page freeing logic.
*
* (Kswapd normally doesn't need memory anyway, but sometimes
* you need a small amount of memory in order to be able to
* page out something else, and this flag essentially protects
* us from recursively trying to free more memory as we're
* trying to free the first piece of memory in the first place).
*/
tsk->flags |= PF_MEMALLOC;
/*
* Kswapd main loop.
*/
987 for (;;) {
static int recalc = 0;
/* If needed, try to free some memory. */
991 if (inactive_shortage() || free_shortage()) {
int wait = 0;
/* Do we need to do some synchronous flushing? */
994 if (waitqueue_active(&kswapd_done))
wait = 1;
do_try_to_free_pages(GFP_KSWAPD, wait);
}
/*
* Do some (very minimal) background scanning. This
* will scan all pages on the active list once
* every minute. This clears old referenced bits
* and moves unused pages to the inactive list.
*/
refill_inactive_scan(6, 0);
/* Once a second, recalculate some VM stats. */
1008 if (time_after(jiffies, recalc + HZ)) {
recalc = jiffies;
recalculate_vm_stats();
}
/*
* Wake up everybody waiting for free memory
* and unplug the disk queue.
*/
wake_up_all(&kswapd_done);
run_task_queue(&tq_disk);
/*
* We go to sleep if either the free page shortage
* or the inactive page shortage is gone. We do this
* because:
* 1) we need no more free pages or
* 2) the inactive pages need to be flushed to disk,
* it wouldn't help to eat CPU time now ...
*
* We go to sleep for one second, but if it's needed
* we'll be woken up earlier...
*/
1031 if (!free_shortage() || !inactive_shortage()) {
interruptible_sleep_on_timeout(&kswapd_wait, HZ);
/*
* If we couldn't free enough memory, we see if it was
* due to the system just not having enough memory.
* If that is the case, the only solution is to kill
* a process (the alternative is enternal deadlock).
*
* If there still is enough memory around, we just loop
* and try free some more memory...
*/
1042 } else if (out_of_memory()) {
oom_kill();
}
}
}
1048 void wakeup_kswapd(int block)
{
DECLARE_WAITQUEUE(wait, current);
1052 if (current == kswapd_task)
1053 return;
1055 if (!block) {
1056 if (waitqueue_active(&kswapd_wait))
wake_up(&kswapd_wait);
1058 return;
}
/*
* Kswapd could wake us up before we get a chance
* to sleep, so we have to be very careful here to
* prevent SMP races...
*/
1066 __set_current_state(TASK_UNINTERRUPTIBLE);
add_wait_queue(&kswapd_done, &wait);
1069 if (waitqueue_active(&kswapd_wait))
wake_up(&kswapd_wait);
schedule();
remove_wait_queue(&kswapd_done, &wait);
1074 __set_current_state(TASK_RUNNING);
}
/*
* Called by non-kswapd processes when they want more
* memory but are unable to sleep on kswapd because
* they might be holding some IO locks ...
*/
1082 int try_to_free_pages(unsigned int gfp_mask)
{
int ret = 1;
1086 if (gfp_mask & __GFP_WAIT) {
current->flags |= PF_MEMALLOC;
ret = do_try_to_free_pages(gfp_mask, 1);
current->flags &= ~PF_MEMALLOC;
}
1092 return ret;
}
DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait);
/*
* Kreclaimd will move pages from the inactive_clean list to the
* free list, in order to keep atomic allocations possible under
* all circumstances. Even when kswapd is blocked on IO.
*/
1101 int kreclaimd(void *unused)
{
struct task_struct *tsk = current;
pg_data_t *pgdat;
tsk->session = 1;
tsk->pgrp = 1;
strcpy(tsk->comm, "kreclaimd");
sigfillset(&tsk->blocked);
current->flags |= PF_MEMALLOC;
1112 while (1) {
/*
* We sleep until someone wakes us up from
* page_alloc.c::__alloc_pages().
*/
interruptible_sleep_on(&kreclaimd_wait);
/*
* Move some pages from the inactive_clean lists to
* the free lists, if it is needed.
*/
pgdat = pgdat_list;
1125 do {
int i;
1127 for(i = 0; i < MAX_NR_ZONES; i++) {
zone_t *zone = pgdat->node_zones + i;
1129 if (!zone->size)
1130 continue;
1132 while (zone->free_pages < zone->pages_low) {
struct page * page;
page = reclaim_page(zone);
1135 if (!page)
1136 break;
__free_page(page);
}
}
pgdat = pgdat->node_next;
1141 } while (pgdat);
}
}
1146 static int __init kswapd_init(void)
{
printk("Starting kswapd v1.8\n");
swap_setup();
kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
1152 return 0;
}
module_init(kswapd_init)