/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */

#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>

#include <asm/pgalloc.h>

/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page. It returns
 * zero if it couldn't do anything, and any other value
 * indicates it decreased rss, but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */

/*
 * try_to_swap_out - try to unmap and free one page mapped by a pte.
 * @mm:         mm_struct owning the mapping (page_table_lock held by caller)
 * @vma:        vma covering @address
 * @address:    user virtual address being scanned
 * @page_table: pte mapping @address
 * @gfp_mask:   allocation constraints for this scan (currently unused here)
 *
 * Returns 1 when the per-mm swap quota (mm->swap_cnt) is exhausted, so the
 * caller stops scanning this mm; returns 0 in all other cases so the scan
 * continues with the next pte.
 *
 * Note the label flow: set_swap_pte/drop_pte/out_failed sit inside the
 * PageSwapCache branch but are entered by goto from the later dirty-page
 * handling as well — do not reorder.
 */
static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
	pte_t pte;
	swp_entry_t entry;
	struct page * page;
	int onlist;

	pte = *page_table;
	if (!pte_present(pte))
		goto out_failed;
	page = pte_page(pte);
	/* Skip invalid pfns and reserved (non-swappable) pages. */
	if ((!VALID_PAGE(page)) || PageReserved(page))
		goto out_failed;

	/* Quota for this mm used up: tell the caller to move on. */
	if (!mm->swap_cnt)
		return 1;

	mm->swap_cnt--;

	onlist = PageActive(page);
	/* Don't look at this pte if it's been accessed recently. */
	if (ptep_test_and_clear_young(page_table)) {
		age_page_up(page);
		goto out_failed;
	}
	if (!onlist)
		/* The page is still mapped, so it can't be freeable... */
		age_page_down_ageonly(page);

	/*
	 * If the page is in active use by us, or if the page
	 * is in active use by others, don't unmap it or
	 * (worse) start unneeded IO.
	 */
	if (page->age > 0)
		goto out_failed;

	/* Page locked by someone else (likely IO in flight): skip it. */
	if (TryLockPage(page))
		goto out_failed;

	/* From this point on, the odds are that we're going to
	 * nuke this pte, so read and clear the pte.  This hook
	 * is needed on CPUs which update the accessed and dirty
	 * bits in hardware.
	 */
	pte = ptep_get_and_clear(page_table);
	flush_tlb_page(vma, address);

	/*
	 * Is the page already in the swap cache? If so, then
	 * we can just drop our reference to it without doing
	 * any IO - it's already up-to-date on disk.
	 *
	 * Return 0, as we didn't actually free any real
	 * memory, and we should just continue our scan.
	 */
	if (PageSwapCache(page)) {
		/* page->index holds the swp_entry_t for swap-cache pages. */
		entry.val = page->index;
		if (pte_dirty(pte))
			set_page_dirty(page);
set_swap_pte:
		/* Take a reference on the swap entry for the pte we install. */
		swap_duplicate(entry);
		set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
		UnlockPage(page);
		mm->rss--;
		deactivate_page(page);
		page_cache_release(page);
out_failed:
		return 0;
	}

	/*
	 * Is it a clean page? Then it must be recoverable
	 * by just paging it in again, and we can just drop
	 * it..
	 *
	 * However, this won't actually free any real
	 * memory, as the page will just be in the page cache
	 * somewhere, and as such we should just continue
	 * our scan.
	 *
	 * Basically, this just makes it possible for us to do
	 * some real work in the future in "refill_inactive()".
	 */
	flush_cache_page(vma, address);
	if (!pte_dirty(pte))
		goto drop_pte;

	/*
	 * Ok, it's really dirty. That means that
	 * we should either create a new swap cache
	 * entry for it, or we should write it back
	 * to its own backing store.
	 */
	if (page->mapping) {
		set_page_dirty(page);
		goto drop_pte;
	}

	/*
	 * This is a dirty, swappable page.  First of all,
	 * get a suitable swap entry for it, and make sure
	 * we have the swap cache set up to associate the
	 * page with that swap entry.
	 */
	entry = get_swap_page();
	if (!entry.val)
		goto out_unlock_restore; /* No swap space left */

	/* Add it to the swap cache and mark it dirty */
	add_to_swap_cache(page, entry);
	set_page_dirty(page);
	goto set_swap_pte;

out_unlock_restore:
	/* Could not swap: put the original pte back and give up. */
	set_pte(page_table, pte);
	UnlockPage(page);
	return 0;
}

/*
 * A new implementation of swap_out().  We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process.  The number of blocks actually swapped is determined by the
 * number of page faults that this process had recently, so we won't swap
 * heavily used processes all the time ...
 *
 * Note: the priority argument is a hint on how much CPU to waste with the
 * swap block search, not a hint on how many blocks to swap with
 * each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */

/*
 * swap_out_pmd - walk the ptes covered by one pmd entry, swapping pages out.
 * Updates mm->swap_address ahead of each pte so a later scan resumes past
 * any page we may sleep on.  Returns non-zero to stop the walk (propagated
 * from try_to_swap_out), 0 to continue with the next pmd.
 */
static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pte_t * pte;
	unsigned long pmd_end;

	if (pmd_none(*dir))
		return 0;
	if (pmd_bad(*dir)) {
		pmd_ERROR(*dir);
		pmd_clear(dir);
		return 0;
	}

	pte = pte_offset(dir, address);

	/* Clamp the scan to the end of this pmd's coverage. */
	pmd_end = (address + PMD_SIZE) & PMD_MASK;
	if (end > pmd_end)
		end = pmd_end;

	do {
		int result;
		/* Remember where to resume before we potentially sleep. */
		mm->swap_address = address + PAGE_SIZE;
		result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
		if (result)
			return result;
		address += PAGE_SIZE;
		pte++;
	} while (address && (address < end));	/* "address &&": stop on wrap to 0 */
	return 0;
}

/*
 * swap_out_pgd - walk the pmds covered by one pgd entry.
 * Same return convention as swap_out_pmd.
 */
static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pmd_t * pmd;
	unsigned long pgd_end;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		pgd_ERROR(*dir);
		pgd_clear(dir);
		return 0;
	}

	pmd = pmd_offset(dir, address);

	/* pgd_end == 0 means the range wraps the top of the address space. */
	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (pgd_end && (end > pgd_end))
		end = pgd_end;

	do {
		int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);
		if (result)
			return result;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address && (address < end));
	return 0;
}

/*
 * swap_out_vma - scan one vma from @address to its end, swapping pages out.
 * Locked-down (VM_LOCKED) and device-reserved (VM_RESERVED) areas are
 * never touched.
 */
static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask)
{
	pgd_t *pgdir;
	unsigned long end;

	/* Don't swap out areas which are locked down */
	if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
		return 0;

	pgdir = pgd_offset(mm, address);

	end = vma->vm_end;
	if (address >= end)
		BUG();
	do {
		int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);
		if (result)
			return result;
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	} while (address && (address < end));
	return 0;
}

/*
 * swap_out_mm - resume scanning an mm at mm->swap_address and try to
 * swap pages out of its vmas.  Returns non-zero when a page was unmapped
 * (or the mm's quota ran out), 0 when the whole address space was covered.
 */
static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
{
	int result = 0;
	unsigned long address;
	struct vm_area_struct* vma;

	/*
	 * Go through process' page directory.
	 */

	/*
	 * Find the proper vm-area after freezing the vma chain
	 * and ptes.
	 */
	spin_lock(&mm->page_table_lock);
	address = mm->swap_address;
	vma = find_vma(mm, address);
	if (vma) {
		if (address < vma->vm_start)
			address = vma->vm_start;

		for (;;) {
			result = swap_out_vma(mm, vma, address, gfp_mask);
			if (result)
				goto out_unlock;
			vma = vma->vm_next;
			if (!vma)
				break;
			address = vma->vm_start;
		}
	}
	/* Reset to 0 when we reach the end of address space */
	mm->swap_address = 0;
	mm->swap_cnt = 0;

out_unlock:
	spin_unlock(&mm->page_table_lock);
	return result;
}

/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B. This function returns only 0 or 1.  Return values != 1 from
 * the lower level routines result in continued processing.
 */
#define SWAP_SHIFT 5
#define SWAP_MIN 8

static int swap_out(unsigned int priority, int gfp_mask)
{
	int counter;
	int __ret = 0;

	/*
	 * We make one or two passes through the task list, indexed by
	 * assign = {0, 1}:
	 *   Pass 1: select the swappable task with maximal RSS that has
	 *           not yet been swapped out.
	 *   Pass 2: re-assign rss swap_cnt values, then select as above.
	 *
	 * With this approach, there's no need to remember the last task
	 * swapped out.  If the swap-out fails, we clear swap_cnt so the
	 * task won't be selected again until all others have been tried.
	 *
	 * Think of swap_cnt as a "shadow rss" - it tells us which process
	 * we want to page out (always try largest first).
	 */
	counter = (nr_threads << SWAP_SHIFT) >> priority;
	if (counter < 1)
		counter = 1;

	for (; counter >= 0; counter--) {
		struct list_head *p;
		unsigned long max_cnt = 0;
		struct mm_struct *best = NULL;
		int assign = 0;
		int found_task = 0;
	select:
		spin_lock(&mmlist_lock);
		p = init_mm.mmlist.next;
		for (; p != &init_mm.mmlist; p = p->next) {
			struct mm_struct *mm = list_entry(p, struct mm_struct, mmlist);
			if (mm->rss <= 0)
				continue;
			found_task++;
			/* Refresh swap_cnt? (second pass only) */
			if (assign == 1) {
				mm->swap_cnt = (mm->rss >> SWAP_SHIFT);
				if (mm->swap_cnt < SWAP_MIN)
					mm->swap_cnt = SWAP_MIN;
			}
			if (mm->swap_cnt > max_cnt) {
				max_cnt = mm->swap_cnt;
				best = mm;
			}
		}

		/* Make sure it doesn't disappear */
		if (best)
			atomic_inc(&best->mm_users);
		spin_unlock(&mmlist_lock);

		/*
		 * We have dropped the tasklist_lock, but we
		 * know that "mm" still exists: we are running
		 * with the big kernel lock, and exit_mm()
		 * cannot race with us.
		 */
		if (!best) {
			/* Nothing selected: re-run with refreshed quotas once. */
			if (!assign && found_task > 0) {
				assign = 1;
				goto select;
			}
			break;
		} else {
			__ret = swap_out_mm(best, gfp_mask);
			mmput(best);
			break;
		}
	}
	return __ret;
}


/**
 * reclaim_page -	reclaims one page from the inactive_clean list
 * @zone: reclaim a page from this zone
 *
 * The pages on the inactive_clean can be instantly reclaimed.
 * The tests look impressive, but most of the time we'll grab
 * the first page of the list and exit successfully.
 */
struct page * reclaim_page(zone_t * zone)
{
	struct page * page = NULL;
	struct list_head * page_lru;
	int maxscan;

	/*
	 * We only need the pagemap_lru_lock if we don't reclaim the page,
	 * but we have to grab the pagecache_lock before the pagemap_lru_lock
	 * to avoid deadlocks and most of the time we'll succeed anyway.
	 */
	spin_lock(&pagecache_lock);
	spin_lock(&pagemap_lru_lock);
	maxscan = zone->inactive_clean_pages;
	/* Scan from the tail (oldest) of the list. */
	while ((page_lru = zone->inactive_clean_list.prev) !=
			&zone->inactive_clean_list && maxscan--) {
		page = list_entry(page_lru, struct page, lru);

		/* Wrong page on list?! (list corruption, should not happen) */
		if (!PageInactiveClean(page)) {
			printk("VM: reclaim_page, wrong page on list.\n");
			list_del(page_lru);
			page->zone->inactive_clean_pages--;
			continue;
		}

		/* Page is or was in use?  Move it to the active list. */
		if (PageTestandClearReferenced(page) || page->age > 0 ||
				(!page->buffers && page_count(page) > 1)) {
			del_page_from_inactive_clean_list(page);
			add_page_to_active_list(page);
			continue;
		}

		/* The page is dirty, or locked, move to inactive_dirty list. */
		if (page->buffers || PageDirty(page) || TryLockPage(page)) {
			del_page_from_inactive_clean_list(page);
			add_page_to_inactive_dirty_list(page);
			continue;
		}

		/* OK, remove the page from the caches. */
		if (PageSwapCache(page)) {
			__delete_from_swap_cache(page);
			goto found_page;
		}

		if (page->mapping) {
			__remove_inode_page(page);
			goto found_page;
		}

		/* We should never ever get here. */
		printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
		list_del(page_lru);
		zone->inactive_clean_pages--;
		UnlockPage(page);
	}
	/* Reset page pointer, maybe we encountered an unfreeable page. */
	page = NULL;
	goto out;

found_page:
	del_page_from_inactive_clean_list(page);
	UnlockPage(page);
	page->age = PAGE_AGE_START;
	if (page_count(page) != 1)
		printk("VM: reclaim_page, found page with count %d!\n",
				page_count(page));
out:
	spin_unlock(&pagemap_lru_lock);
	spin_unlock(&pagecache_lock);
	memory_pressure++;
	return page;
}

/**
 * page_launder - clean dirty inactive pages, move to inactive_clean list
 * @gfp_mask: what operations we are allowed to do
 * @sync: should we wait synchronously for the cleaning of pages
 *
 * When this function is called, we are most likely low on free +
 * inactive_clean pages. Since we want to refill those pages as
 * soon as possible, we'll make two loops over the inactive list,
 * one to move the already cleaned pages to the inactive_clean lists
 * and one to (often asynchronously) clean the dirty inactive pages.
 *
 * In situations where kswapd cannot keep up, user processes will
 * end up calling this function. Since the user process needs to
 * have a page before it can continue with its allocation, we'll
 * do synchronous page flushing in that case.
 *
 * This code is heavily inspired by the FreeBSD source code. Thanks
 * go out to Matthew Dillon.
 */
#define MAX_LAUNDER		(4 * (1 << page_cluster))
int page_launder(int gfp_mask, int sync)
{
	int launder_loop, maxscan, cleaned_pages, maxlaunder;
	int can_get_io_locks;
	struct list_head * page_lru;
	struct page * page;

	/*
	 * We can only grab the IO locks (eg. for flushing dirty
	 * buffers to disk) if __GFP_IO is set.
	 */
	can_get_io_locks = gfp_mask & __GFP_IO;

	launder_loop = 0;
	maxlaunder = 0;
	cleaned_pages = 0;

dirty_page_rescan:
	spin_lock(&pagemap_lru_lock);
	maxscan = nr_inactive_dirty_pages;
	while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
				maxscan-- > 0) {
		page = list_entry(page_lru, struct page, lru);

		/* Wrong page on list?! (list corruption, should not happen) */
		if (!PageInactiveDirty(page)) {
			printk("VM: page_launder, wrong page on list.\n");
			list_del(page_lru);
			nr_inactive_dirty_pages--;
			page->zone->inactive_dirty_pages--;
			continue;
		}

		/* Page is or was in use?  Move it to the active list. */
		if (PageTestandClearReferenced(page) || page->age > 0 ||
				(!page->buffers && page_count(page) > 1) ||
				page_ramdisk(page)) {
			del_page_from_inactive_dirty_list(page);
			add_page_to_active_list(page);
			continue;
		}

		/*
		 * The page is locked. IO in progress?
		 * Move it to the back of the list.
		 */
		if (TryLockPage(page)) {
			list_del(page_lru);
			list_add(page_lru, &inactive_dirty_list);
			continue;
		}

		/*
		 * Dirty swap-cache page? Write it out if
		 * last copy..
		 *
		 * NOTE(review): this dereferences page->mapping without a
		 * NULL check — presumably every PageDirty page here has a
		 * mapping; confirm against set_page_dirty() callers.
		 */
		if (PageDirty(page)) {
			int (*writepage)(struct page *) = page->mapping->a_ops->writepage;
			int result;

			if (!writepage)
				goto page_active;

			/* First time through? Move it to the back of the list */
			if (!launder_loop) {
				list_del(page_lru);
				list_add(page_lru, &inactive_dirty_list);
				UnlockPage(page);
				continue;
			}

			/* OK, do a physical asynchronous write to swap.  */
			ClearPageDirty(page);
			page_cache_get(page);
			spin_unlock(&pagemap_lru_lock);

			result = writepage(page);
			page_cache_release(page);

			/* And re-start the thing.. */
			spin_lock(&pagemap_lru_lock);
			if (result != 1)
				continue;
			/* writepage refused to do anything */
			set_page_dirty(page);
			goto page_active;
		}

		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we either free
		 * the page (in case it was a buffercache only page) or we
		 * move the page to the inactive_clean list.
		 *
		 * On the first round, we should free all previously cleaned
		 * buffer pages
		 */
		if (page->buffers) {
			int wait, clearedbuf;
			int freed_page = 0;
			/*
			 * Since we might be doing disk IO, we have to
			 * drop the spinlock and take an extra reference
			 * on the page so it doesn't go away from under us.
			 */
			del_page_from_inactive_dirty_list(page);
			page_cache_get(page);
			spin_unlock(&pagemap_lru_lock);

			/* Will we do (asynchronous) IO? */
			if (launder_loop && maxlaunder == 0 && sync)
				wait = 2;	/* Synchronous IO */
			else if (launder_loop && maxlaunder-- > 0)
				wait = 1;	/* Async IO */
			else
				wait = 0;	/* No IO */

			/* Try to free the page buffers. */
			clearedbuf = try_to_free_buffers(page, wait);

			/*
			 * Re-take the spinlock. Note that we cannot
			 * unlock the page yet since we're still
			 * accessing the page_struct here...
			 */
			spin_lock(&pagemap_lru_lock);

			/* The buffers were not freed. */
			if (!clearedbuf) {
				add_page_to_inactive_dirty_list(page);

			/* The page was only in the buffer cache. */
			} else if (!page->mapping) {
				atomic_dec(&buffermem_pages);
				freed_page = 1;
				cleaned_pages++;

			/* The page has more users besides the cache and us. */
			} else if (page_count(page) > 2) {
				add_page_to_active_list(page);

			/* OK, we "created" a freeable page. */
			} else /* page->mapping && page_count(page) == 2 */ {
				add_page_to_inactive_clean_list(page);
				cleaned_pages++;
			}

			/*
			 * Unlock the page and drop the extra reference.
			 * We can only do it here because we are accessing
			 * the page struct above.
			 */
			UnlockPage(page);
			page_cache_release(page);

			/*
			 * If we're freeing buffer cache pages, stop when
			 * we've got enough free memory.
			 */
			if (freed_page && !free_shortage())
				break;
			continue;
		} else if (page->mapping && !PageDirty(page)) {
			/*
			 * If a page had an extra reference in
			 * deactivate_page(), we will find it here.
			 * Now the page is really freeable, so we
			 * move it to the inactive_clean list.
			 */
			del_page_from_inactive_dirty_list(page);
			add_page_to_inactive_clean_list(page);
			UnlockPage(page);
			cleaned_pages++;
		} else {
page_active:
			/*
			 * OK, we don't know what to do with the page.
			 * It's no use keeping it here, so we move it to
			 * the active list.
			 */
			del_page_from_inactive_dirty_list(page);
			add_page_to_active_list(page);
			UnlockPage(page);
		}
	}
	spin_unlock(&pagemap_lru_lock);

	/*
	 * If we don't have enough free pages, we loop back once
	 * to queue the dirty pages for writeout. When we were called
	 * by a user process (that /needs/ a free page) and we didn't
	 * free anything yet, we wait synchronously on the writeout of
	 * MAX_SYNC_LAUNDER pages.
	 *
	 * We also wake up bdflush, since bdflush should, under most
	 * loads, flush out the dirty pages before we have to wait on
	 * IO.
	 */
	if (can_get_io_locks && !launder_loop && free_shortage()) {
		launder_loop = 1;
		/* If we cleaned pages, never do synchronous IO. */
		if (cleaned_pages)
			sync = 0;
		/* We only do a few "out of order" flushes. */
		maxlaunder = MAX_LAUNDER;
		/* Kflushd takes care of the rest. */
		wakeup_bdflush(0);
		goto dirty_page_rescan;
	}

	/* Return the number of pages moved to the inactive_clean list. */
	return cleaned_pages;
}

/**
 * refill_inactive_scan - scan the active list and find pages to deactivate
 * @priority: the priority at which to scan
 * @oneshot: exit after deactivating one page
 *
 * This function will scan a portion of the active list to find
 * unused pages, those pages will then be moved to the inactive list.
 */
int refill_inactive_scan(unsigned int priority, int oneshot)
{
	struct list_head * page_lru;
	struct page * page;
	int maxscan, page_active = 0;
	int ret = 0;

	/* Take the lock while messing with the list... */
	spin_lock(&pagemap_lru_lock);
	/* Higher priority => smaller fraction of the list scanned. */
	maxscan = nr_active_pages >> priority;
	while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
		page = list_entry(page_lru, struct page, lru);

		/* Wrong page on list?! (list corruption, should not happen) */
		if (!PageActive(page)) {
			printk("VM: refill_inactive, wrong page on list.\n");
			list_del(page_lru);
			nr_active_pages--;
			continue;
		}

		/* Do aging on the pages. */
		if (PageTestandClearReferenced(page)) {
			age_page_up_nolock(page);
			page_active = 1;
		} else {
			age_page_down_ageonly(page);
			/*
			 * Since we don't hold a reference on the page
			 * ourselves, we have to do our test a bit more
			 * strict then deactivate_page(). This is needed
			 * since otherwise the system could hang shuffling
			 * unfreeable pages from the active list to the
			 * inactive_dirty list and back again...
			 *
			 * SUBTLE: we can have buffer pages with count 1.
			 */
			if (page->age == 0 && page_count(page) <=
						(page->buffers ? 2 : 1)) {
				deactivate_page_nolock(page);
				page_active = 0;
			} else {
				page_active = 1;
			}
		}
		/*
		 * If the page is still on the active list, move it
		 * to the other end of the list. Otherwise it was
		 * deactivated by age_page_down and we exit successfully.
		 */
		if (page_active || PageActive(page)) {
			list_del(page_lru);
			list_add(page_lru, &active_list);
		} else {
			ret = 1;
			if (oneshot)
				break;
		}
	}
	spin_unlock(&pagemap_lru_lock);

	return ret;
}

/*
 * Check if there are zones with a severe shortage of free pages,
 * or if all zones have a minor shortage.
 * Returns the (approximate) number of pages we are short, 0 if none.
 */
int free_shortage(void)
{
	pg_data_t *pgdat = pgdat_list;
	int sum = 0;
	int freeable = nr_free_pages() + nr_inactive_clean_pages();
	int freetarget = freepages.high + inactive_target / 3;

	/* Are we low on free pages globally? */
	if (freeable < freetarget)
		return freetarget - freeable;

	/* If not, are we very low on any particular zone? */
	do {
		int i;
		for(i = 0; i < MAX_NR_ZONES; i++) {
			zone_t *zone = pgdat->node_zones+ i;
			if (zone->size && (zone->inactive_clean_pages +
					zone->free_pages < zone->pages_min+1)) {
				/* + 1 to have overlap with alloc_pages() !! */
				sum += zone->pages_min + 1;
				sum -= zone->free_pages;
				sum -= zone->inactive_clean_pages;
			}
		}
		pgdat = pgdat->node_next;
	} while (pgdat);

	return sum;
}

/*
 * How many inactive pages are we short?
 */
int inactive_shortage(void)
{
	int shortage = 0;

	/* Target: enough free + inactive pages to cover both watermarks. */
	shortage += freepages.high;
	shortage += inactive_target;
	shortage -= nr_free_pages();
	shortage -= nr_inactive_clean_pages();
	shortage -= nr_inactive_dirty_pages;

	if (shortage > 0)
		return shortage;

	return 0;
}

/*
 * We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * We want to try to free "count" pages, and we want to
 * cluster them so that we get good swap-out behaviour.
 *
 * OTOH, if we're a user process (and not kswapd), we
 * really care about latency. In that case we don't try
 * to free too many pages.
 */
static int refill_inactive(unsigned int gfp_mask, int user)
{
	int priority, count, start_count, made_progress;

	count = inactive_shortage() + free_shortage();
	if (user)
		count = (1 << page_cluster);
	start_count = count;

	/* Always trim SLAB caches when memory gets low. */
	kmem_cache_reap(gfp_mask);

	priority = 6;
	do {
		made_progress = 0;

		/* Be nice: yield the CPU if someone else needs it. */
		if (current->need_resched) {
			__set_current_state(TASK_RUNNING);
			schedule();
		}

		while (refill_inactive_scan(priority, 1)) {
			made_progress = 1;
			if (--count <= 0)
				goto done;
		}

		/*
		 * don't be too light against the d/i cache since
		 * refill_inactive() almost never fails when there's
		 * really plenty of memory free.
		 */
		shrink_dcache_memory(priority, gfp_mask);
		shrink_icache_memory(priority, gfp_mask);

		/*
		 * Then, try to page stuff out..
		 */
		while (swap_out(priority, gfp_mask)) {
			made_progress = 1;
			if (--count <= 0)
				goto done;
		}

		/*
		 * If we either have enough free memory, or if
		 * page_launder() will be able to make enough
		 * free memory, then stop.
		 */
		if (!inactive_shortage() || !free_shortage())
			goto done;

		/*
		 * Only switch to a lower "priority" if we
		 * didn't make any useful progress in the
		 * last loop.
		 */
		if (!made_progress)
			priority--;
	} while (priority >= 0);

	/* Always end on a refill_inactive.., may sleep... */
	while (refill_inactive_scan(0, 1)) {
		if (--count <= 0)
			goto done;
	}

done:
	/* Non-zero iff we deactivated at least one page. */
	return (count < start_count);
}

static int do_try_to_free_pages(unsigned int gfp_mask, int user)
{
	int ret = 0;

	/*
	 * If we're low on free pages, move pages from the
	 * inactive_dirty list to the inactive_clean list.
	 *
	 * Usually bdflush will have pre-cleaned the pages
	 * before we get around to moving them to the other
	 * list, so this is a relatively cheap operation.
	 */
	if (free_shortage() || nr_inactive_dirty_pages > nr_free_pages() +
			nr_inactive_clean_pages())
		ret += page_launder(gfp_mask, user);

	/*
	 * If needed, we move pages from the active list
	 * to the inactive list. We also "eat" pages from
	 * the inode and dentry cache whenever we do this.
	 */
	if (free_shortage() || inactive_shortage()) {
		shrink_dcache_memory(6, gfp_mask);
		shrink_icache_memory(6, gfp_mask);
		ret += refill_inactive(gfp_mask, user);
	} else {
		/*
		 * Reclaim unused slab cache memory.
		 */
		kmem_cache_reap(gfp_mask);
		ret = 1;
	}

	return ret;
}

DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
struct task_struct *kswapd_task;

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
{
	struct task_struct *tsk = current;

	tsk->session = 1;
	tsk->pgrp = 1;
	strcpy(tsk->comm, "kswapd");
	sigfillset(&tsk->blocked);
	kswapd_task = tsk;

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC;

	/*
	 * Kswapd main loop.
	 */
	for (;;) {
		/*
		 * NOTE(review): recalc is an int but stores jiffies
		 * (unsigned long) — verify this is harmless on 64-bit
		 * or when jiffies wraps.
		 */
		static int recalc = 0;

		/* If needed, try to free some memory. */
		if (inactive_shortage() || free_shortage()) {
			int wait = 0;
			/* Do we need to do some synchronous flushing? */
			if (waitqueue_active(&kswapd_done))
				wait = 1;
			do_try_to_free_pages(GFP_KSWAPD, wait);
		}

		/*
		 * Do some (very minimal) background scanning. This
		 * will scan all pages on the active list once
		 * every minute. This clears old referenced bits
		 * and moves unused pages to the inactive list.
		 */
		refill_inactive_scan(6, 0);

		/* Once a second, recalculate some VM stats. */
		if (time_after(jiffies, recalc + HZ)) {
			recalc = jiffies;
			recalculate_vm_stats();
		}

		/*
		 * Wake up everybody waiting for free memory
		 * and unplug the disk queue.
		 */
		wake_up_all(&kswapd_done);
		run_task_queue(&tq_disk);

		/*
		 * We go to sleep if either the free page shortage
		 * or the inactive page shortage is gone. We do this
		 * because:
		 * 1) we need no more free pages   or
		 * 2) the inactive pages need to be flushed to disk,
		 *    it wouldn't help to eat CPU time now ...
		 *
		 * We go to sleep for one second, but if it's needed
		 * we'll be woken up earlier...
		 */
		if (!free_shortage() || !inactive_shortage()) {
			interruptible_sleep_on_timeout(&kswapd_wait, HZ);
		/*
		 * If we couldn't free enough memory, we see if it was
		 * due to the system just not having enough memory.
		 * If that is the case, the only solution is to kill
		 * a process (the alternative is eternal deadlock).
		 *
		 * If there still is enough memory around, we just loop
		 * and try free some more memory...
		 */
		} else if (out_of_memory()) {
			oom_kill();
		}
	}
}

/*
 * wakeup_kswapd - wake the pageout daemon, optionally waiting for it
 * @block: non-zero to sleep on kswapd_done until kswapd has done a pass
 */
void wakeup_kswapd(int block)
{
	DECLARE_WAITQUEUE(wait, current);

	/* kswapd must never wait on itself. */
	if (current == kswapd_task)
		return;

	if (!block) {
		if (waitqueue_active(&kswapd_wait))
			wake_up(&kswapd_wait);
		return;
	}

	/*
	 * Kswapd could wake us up before we get a chance
	 * to sleep, so we have to be very careful here to
	 * prevent SMP races...
	 */
	__set_current_state(TASK_UNINTERRUPTIBLE);
	add_wait_queue(&kswapd_done, &wait);

	if (waitqueue_active(&kswapd_wait))
		wake_up(&kswapd_wait);
	schedule();

	remove_wait_queue(&kswapd_done, &wait);
	__set_current_state(TASK_RUNNING);
}

/*
 * Called by non-kswapd processes when they want more
 * memory but are unable to sleep on kswapd because
 * they might be holding some IO locks ...
 */
int try_to_free_pages(unsigned int gfp_mask)
{
	int ret = 1;

	if (gfp_mask & __GFP_WAIT) {
		/* PF_MEMALLOC keeps us out of recursive reclaim. */
		current->flags |= PF_MEMALLOC;
		ret = do_try_to_free_pages(gfp_mask, 1);
		current->flags &= ~PF_MEMALLOC;
	}

	return ret;
}

DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait);
/*
 * Kreclaimd will move pages from the inactive_clean list to the
 * free list, in order to keep atomic allocations possible under
 * all circumstances. Even when kswapd is blocked on IO.
 */
int kreclaimd(void *unused)
{
	struct task_struct *tsk = current;
	pg_data_t *pgdat;

	tsk->session = 1;
	tsk->pgrp = 1;
	strcpy(tsk->comm, "kreclaimd");
	sigfillset(&tsk->blocked);
	current->flags |= PF_MEMALLOC;

	while (1) {

		/*
		 * We sleep until someone wakes us up from
		 * page_alloc.c::__alloc_pages().
		 */
		interruptible_sleep_on(&kreclaimd_wait);

		/*
		 * Move some pages from the inactive_clean lists to
		 * the free lists, if it is needed.
		 */
		pgdat = pgdat_list;
		do {
			int i;
			for(i = 0; i < MAX_NR_ZONES; i++) {
				zone_t *zone = pgdat->node_zones + i;
				if (!zone->size)
					continue;

				while (zone->free_pages < zone->pages_low) {
					struct page * page;
					page = reclaim_page(zone);
					if (!page)
						break;
					__free_page(page);
				}
			}
			pgdat = pgdat->node_next;
		} while (pgdat);
	}
}


/*
 * kswapd_init - start the pageout daemons (kswapd and kreclaimd).
 */
static int __init kswapd_init(void)
{
	printk("Starting kswapd v1.8\n");
	swap_setup();
	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
	kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
	return 0;
}

module_init(kswapd_init)