/*
       *  linux/mm/page_alloc.c
       *
       *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
       *  Swap reorganised 29.12.95, Stephen Tweedie
       *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
       *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
       *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
       *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
       */
      
      #include <linux/config.h>
      #include <linux/mm.h>
      #include <linux/swap.h>
      #include <linux/swapctl.h>
      #include <linux/interrupt.h>
      #include <linux/pagemap.h>
      #include <linux/bootmem.h>
      
      int nr_swap_pages;
      int nr_active_pages;
      int nr_inactive_dirty_pages;
      pg_data_t *pgdat_list;
      
      static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
      static int zone_balance_ratio[MAX_NR_ZONES] = { 32, 128, 128, };
      static int zone_balance_min[MAX_NR_ZONES] = { 10 , 10, 10, };
      static int zone_balance_max[MAX_NR_ZONES] = { 255 , 255, 255, };
      
      struct list_head active_list;
      struct list_head inactive_dirty_list;
      /*
       * Free_page() adds the page to the free lists. This is optimized for
       * fast normal cases (no error jumps taken normally).
       *
       * The way to optimize jumps for gcc-2.2.2 is to:
       *  - select the "normal" case and put it inside the if () { XXX }
       *  - no else-statements if you can avoid them
       *
       * With the above two rules, you get a straight-line execution path
       * for the normal case, giving better asm-code.
       */
      
      #define memlist_init(x) INIT_LIST_HEAD(x)
      #define memlist_add_head list_add
      #define memlist_add_tail list_add_tail
      #define memlist_del list_del
      #define memlist_entry list_entry
      #define memlist_next(x) ((x)->next)
      #define memlist_prev(x) ((x)->prev)
      
      /*
       * Temporary debugging check.
       */
      #define BAD_RANGE(zone,x) (((zone) != (x)->zone) || (((x)-mem_map) < (zone)->offset) || (((x)-mem_map) >= (zone)->offset+(zone)->size))
      
      /*
       * Buddy system. Hairy. You really aren't expected to understand this
       *
       * Hint: -mask = 1+~mask
       */
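/*
 * Worked example (illustration only, not part of the allocator): since
 * mask = ~0UL << order, the hint above gives -mask = 1UL << order, i.e.
 * minus-mask is simply the block size in pages.  For order = 2:
 *
 *	mask			= ...11111100
 *	-mask			= ...00000100	(4 pages)
 *	page_idx ^ -mask	flips bit 2 and yields the buddy block
 *	page_idx & mask		rounds down to the start of the merged block
 *
 * The coalescing loop below stops once mask has been shifted up to
 * -(1UL << (MAX_ORDER-1)), which is exactly when
 * mask + (1 << (MAX_ORDER-1)) becomes zero.
 */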
      
      static void FASTCALL(__free_pages_ok (struct page *page, unsigned long order));
static void __free_pages_ok (struct page *page, unsigned long order)
      {
      	unsigned long index, page_idx, mask, flags;
      	free_area_t *area;
      	struct page *base;
      	zone_t *zone;
      
	if (page->buffers)
		BUG();
	if (page->mapping)
		BUG();
	if (!VALID_PAGE(page))
		BUG();
	if (PageSwapCache(page))
		BUG();
	if (PageLocked(page))
		BUG();
	if (PageDecrAfter(page))
		BUG();
	if (PageActive(page))
		BUG();
	if (PageInactiveDirty(page))
		BUG();
	if (PageInactiveClean(page))
		BUG();
      
      	page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
      	page->age = PAGE_AGE_START;
      	
      	zone = page->zone;
      
      	mask = (~0UL) << order;
      	base = mem_map + zone->offset;
      	page_idx = page - base;
	if (page_idx & ~mask)
		BUG();
	index = page_idx >> (1 + order);

	area = zone->free_area + order;

	spin_lock_irqsave(&zone->lock, flags);

	zone->free_pages -= mask;

	while (mask + (1 << (MAX_ORDER-1))) {
		struct page *buddy1, *buddy2;

		if (area >= zone->free_area + MAX_ORDER)
			BUG();
		if (!test_and_change_bit(index, area->map))
			/*
			 * the buddy page is still allocated.
			 */
			break;
      		/*
      		 * Move the buddy up one level.
      		 */
      		buddy1 = base + (page_idx ^ -mask);
      		buddy2 = base + page_idx;
		if (BAD_RANGE(zone,buddy1))
			BUG();
		if (BAD_RANGE(zone,buddy2))
			BUG();

		memlist_del(&buddy1->list);
		mask <<= 1;
		area++;
		index >>= 1;
		page_idx &= mask;
	}
	memlist_add_head(&(base + page_idx)->list, &area->free_list);

	spin_unlock_irqrestore(&zone->lock, flags);

	/*
	 * We don't want to protect this variable from race conditions
	 * since it's nothing important, but we do want to make sure
	 * it never gets negative.
	 */
	if (memory_pressure > NR_CPUS)
      		memory_pressure--;
      }
      
      #define MARK_USED(index, order, area) \
      	change_bit((index) >> (1+(order)), (area)->map)
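
/*
 * A note on the free_area bitmaps (descriptive only): at each order the
 * map keeps one bit per pair of buddies, indexed by page_idx >> (1+order).
 * Every allocation or free of either buddy toggles that bit, so the bit is
 * set exactly when one buddy of the pair is free.  This is why
 * __free_pages_ok() can merge when test_and_change_bit() returns 1 (the
 * buddy was the free one) and must stop when it returns 0 (the buddy is
 * still allocated).
 */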
      
static inline struct page * expand (zone_t *zone, struct page *page,
	 unsigned long index, int low, int high, free_area_t * area)
{
	unsigned long size = 1 << high;

	while (high > low) {
		if (BAD_RANGE(zone,page))
			BUG();
		area--;
		high--;
		size >>= 1;
		memlist_add_head(&(page)->list, &(area)->free_list);
		MARK_USED(index, high, area);
		index += size;
		page += size;
	}
	if (BAD_RANGE(zone,page))
		BUG();
	return page;
      }
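
/*
 * Example of a split (illustration only): if rmqueue() finds nothing below
 * order 3 for an order-0 request, expand() takes the order-3 block and
 * hands its pieces back one order at a time: the first 4 pages go onto the
 * order-2 free list, the next 2 onto the order-1 list, the next 1 onto the
 * order-0 list, and the last page of the original block is returned to the
 * caller.  MARK_USED() keeps the buddy bitmap at each of those orders in
 * sync with the pieces that were given back.
 */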
      
      static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order));
static struct page * rmqueue(zone_t *zone, unsigned long order)
{
	free_area_t * area = zone->free_area + order;
	unsigned long curr_order = order;
	struct list_head *head, *curr;
	unsigned long flags;
	struct page *page;

	spin_lock_irqsave(&zone->lock, flags);
	do {
		head = &area->free_list;
		curr = memlist_next(head);

		if (curr != head) {
			unsigned int index;

			page = memlist_entry(curr, struct page, list);
			if (BAD_RANGE(zone,page))
				BUG();
			memlist_del(curr);
			index = (page - mem_map) - zone->offset;
			MARK_USED(index, curr_order, area);
			zone->free_pages -= 1 << order;

			page = expand(zone, page, index, order, curr_order, area);
			spin_unlock_irqrestore(&zone->lock, flags);

			set_page_count(page, 1);
			if (BAD_RANGE(zone,page))
				BUG();
			DEBUG_ADD_PAGE
			return page;
		}
		curr_order++;
		area++;
	} while (curr_order < MAX_ORDER);
	spin_unlock_irqrestore(&zone->lock, flags);

	return NULL;
      }
      
      #define PAGES_MIN	0
      #define PAGES_LOW	1
      #define PAGES_HIGH	2
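
/*
 * These limits select the per-zone watermarks computed in
 * free_area_init_core(): pages_min = realsize / zone_balance_ratio
 * (clamped to [zone_balance_min, zone_balance_max]), pages_low = 2 * pages_min
 * and pages_high = 3 * pages_min.  Worked example (assuming 4kB pages): a
 * 16MB ZONE_DMA has 4096 pages, 4096 / 32 = 128, so pages_min = 128,
 * pages_low = 256 and pages_high = 384.
 */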
      
      /*
       * This function does the dirty work for __alloc_pages
       * and is separated out to keep the code size smaller.
       * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM)
       */
static struct page * __alloc_pages_limit(zonelist_t *zonelist,
			unsigned long order, int limit, int direct_reclaim)
{
	zone_t **zone = zonelist->zones;

	for (;;) {
		zone_t *z = *(zone++);
		unsigned long water_mark;

		if (!z)
			break;
		if (!z->size)
			BUG();

		/*
		 * We allocate if the number of free + inactive_clean
		 * pages is above the watermark.
		 */
		switch (limit) {
			default:
			case PAGES_MIN:
				water_mark = z->pages_min;
				break;
			case PAGES_LOW:
				water_mark = z->pages_low;
				break;
			case PAGES_HIGH:
				water_mark = z->pages_high;
		}

		if (z->free_pages + z->inactive_clean_pages > water_mark) {
			struct page *page = NULL;
			/* If possible, reclaim a page directly. */
			if (direct_reclaim && z->free_pages < z->pages_min + 8)
				page = reclaim_page(z);
			/* If that fails, fall back to rmqueue. */
			if (!page)
				page = rmqueue(z, order);
			if (page)
				return page;
		}
	}

	/* Found nothing. */
	return NULL;
      }
      
      
      /*
       * This is the 'heart' of the zoned buddy allocator:
       */
struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
      {
      	zone_t **zone;
      	int direct_reclaim = 0;
      	unsigned int gfp_mask = zonelist->gfp_mask;
      	struct page * page;
      
      	/*
      	 * Allocations put pressure on the VM subsystem.
      	 */
      	memory_pressure++;
      
      	/*
	 * (If anyone calls gfp from interrupts non-atomically then it
	 * will sooner or later be tripped up by a schedule().)
	 *
	 * We fall back to lower zones if allocation
	 * in a higher zone fails.
      	 */
      
      	/*
      	 * Can we take pages directly from the inactive_clean
      	 * list?
      	 */
      	if (order == 0 && (gfp_mask & __GFP_WAIT) &&
			!(current->flags & PF_MEMALLOC))
      		direct_reclaim = 1;
      
      	/*
      	 * If we are about to get low on free pages and we also have
      	 * an inactive page shortage, wake up kswapd.
      	 */
	if (inactive_shortage() > inactive_target / 2 && free_shortage())
      		wakeup_kswapd(0);
      	/*
      	 * If we are about to get low on free pages and cleaning
      	 * the inactive_dirty pages would fix the situation,
      	 * wake up bdflush.
      	 */
      	else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()
			&& nr_inactive_dirty_pages >= freepages.high)
      		wakeup_bdflush(0);
      
      try_again:
      	/*
      	 * First, see if we have any zones with lots of free memory.
      	 *
      	 * We allocate free memory first because it doesn't contain
      	 * any data ... DUH!
      	 */
      	zone = zonelist->zones;
	for (;;) {
		zone_t *z = *(zone++);
		if (!z)
			break;
		if (!z->size)
			BUG();

		if (z->free_pages >= z->pages_low) {
			page = rmqueue(z, order);
			if (page)
				return page;
		} else if (z->free_pages < z->pages_min &&
					waitqueue_active(&kreclaimd_wait)) {
      				wake_up_interruptible(&kreclaimd_wait);
      		}
      	}
      
      	/*
      	 * Try to allocate a page from a zone with a HIGH
      	 * amount of free + inactive_clean pages.
      	 *
      	 * If there is a lot of activity, inactive_target
      	 * will be high and we'll have a good chance of
      	 * finding a page using the HIGH limit.
      	 */
      	page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);
	if (page)
		return page;
      
      	/*
      	 * Then try to allocate a page from a zone with more
      	 * than zone->pages_low free + inactive_clean pages.
      	 *
      	 * When the working set is very large and VM activity
      	 * is low, we're most likely to have our allocation
      	 * succeed here.
      	 */
      	page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);
	if (page)
		return page;
      
      	/*
      	 * OK, none of the zones on our zonelist has lots
      	 * of pages free.
      	 *
      	 * We wake up kswapd, in the hope that kswapd will
      	 * resolve this situation before memory gets tight.
      	 *
      	 * We also yield the CPU, because that:
      	 * - gives kswapd a chance to do something
      	 * - slows down allocations, in particular the
      	 *   allocations from the fast allocator that's
      	 *   causing the problems ...
      	 * - ... which minimises the impact the "bad guys"
      	 *   have on the rest of the system
      	 * - if we don't have __GFP_IO set, kswapd may be
      	 *   able to free some memory we can't free ourselves
      	 */
      	wakeup_kswapd(0);
	if (gfp_mask & __GFP_WAIT) {
		__set_current_state(TASK_RUNNING);
      		current->policy |= SCHED_YIELD;
      		schedule();
      	}
      
      	/*
      	 * After waking up kswapd, we try to allocate a page
      	 * from any zone which isn't critical yet.
      	 *
      	 * Kswapd should, in most situations, bring the situation
      	 * back to normal in no time.
      	 */
      	page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);
	if (page)
		return page;
      
      	/*
      	 * Damn, we didn't succeed.
      	 *
      	 * This can be due to 2 reasons:
      	 * - we're doing a higher-order allocation
      	 * 	--> move pages to the free list until we succeed
      	 * - we're /really/ tight on memory
      	 * 	--> wait on the kswapd waitqueue until memory is freed
      	 */
	if (!(current->flags & PF_MEMALLOC)) {
      		/*
      		 * Are we dealing with a higher order allocation?
      		 *
      		 * Move pages from the inactive_clean to the free list
      		 * in the hope of creating a large, physically contiguous
      		 * piece of free memory.
      		 */
		if (order > 0 && (gfp_mask & __GFP_WAIT)) {
      			zone = zonelist->zones;
      			/* First, clean some dirty pages. */
      			current->flags |= PF_MEMALLOC;
      			page_launder(gfp_mask, 1);
      			current->flags &= ~PF_MEMALLOC;
			for (;;) {
				zone_t *z = *(zone++);
				if (!z)
					break;
				if (!z->size)
					continue;
				while (z->inactive_clean_pages) {
					struct page * page;
					/* Move one page to the free list. */
					page = reclaim_page(z);
					if (!page)
						break;
					__free_page(page);
					/* See if the allocation succeeds now. */
					page = rmqueue(z, order);
					if (page)
						return page;
      				}
      			}
      		}
      		/*
      		 * When we arrive here, we are really tight on memory.
      		 *
      		 * We wake up kswapd and sleep until kswapd wakes us
      		 * up again. After that we loop back to the start.
      		 *
      		 * We have to do this because something else might eat
      		 * the memory kswapd frees for us and we need to be
      		 * reliable. Note that we don't loop back for higher
      		 * order allocations since it is possible that kswapd
      		 * simply cannot free a large enough contiguous area
      		 * of memory *ever*.
      		 */
		if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) {
			wakeup_kswapd(1);
			memory_pressure++;
			if (!order)
				goto try_again;
      		/*
      		 * If __GFP_IO isn't set, we can't wait on kswapd because
      		 * kswapd just might need some IO locks /we/ are holding ...
      		 *
      		 * SUBTLE: The scheduling point above makes sure that
      		 * kswapd does get the chance to free memory we can't
      		 * free ourselves...
      		 */
		} else if (gfp_mask & __GFP_WAIT) {
			try_to_free_pages(gfp_mask);
			memory_pressure++;
			if (!order)
				goto try_again;
      		}
      
      	}
      
      	/*
      	 * Final phase: allocate anything we can!
      	 *
      	 * Higher order allocations, GFP_ATOMIC allocations and
      	 * recursive allocations (PF_MEMALLOC) end up here.
      	 *
      	 * Only recursive allocations can use the very last pages
      	 * in the system, otherwise it would be just too easy to
      	 * deadlock the system...
      	 */
      	zone = zonelist->zones;
	for (;;) {
		zone_t *z = *(zone++);
		struct page * page = NULL;
		if (!z)
			break;
		if (!z->size)
			BUG();
      
      		/*
      		 * SUBTLE: direct_reclaim is only possible if the task
      		 * becomes PF_MEMALLOC while looping above. This will
      		 * happen when the OOM killer selects this task for
      		 * instant execution...
      		 */
		if (direct_reclaim) {
			page = reclaim_page(z);
			if (page)
				return page;
		}

		/* XXX: is pages_min/4 a good amount to reserve for this? */
		if (z->free_pages < z->pages_min / 4 &&
				!(current->flags & PF_MEMALLOC))
			continue;
		page = rmqueue(z, order);
		if (page)
			return page;
      	}
      
      	/* No luck.. */
      	printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order);
	return NULL;
      }
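
/*
 * Summary of the fallback ladder above (descriptive only): first any zone
 * with free_pages >= pages_low is tried directly, then the zones are
 * retried against the PAGES_HIGH and PAGES_LOW free + inactive_clean
 * watermarks, then kswapd is woken and __GFP_WAIT callers yield the CPU,
 * then PAGES_MIN is tried.  After that the caller either wakes kswapd
 * synchronously or calls try_to_free_pages() itself and, for order-0
 * requests, loops back to try_again.  The final loop hands out whatever is
 * left, but zones below pages_min / 4 are reserved for PF_MEMALLOC
 * (recursive) allocations.
 */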
      
      /*
       * Common helper functions.
       */
unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
	struct page * page;

	page = alloc_pages(gfp_mask, order);
	if (!page)
		return 0;
	return (unsigned long) page_address(page);
}

unsigned long get_zeroed_page(int gfp_mask)
      {
      	struct page * page;
      
      	page = alloc_pages(gfp_mask, 0);
	if (page) {
		void *address = page_address(page);
		clear_page(address);
		return (unsigned long) address;
	}
	return 0;
}

void __free_pages(struct page *page, unsigned long order)
{
	if (!PageReserved(page) && put_page_testzero(page))
		__free_pages_ok(page, order);
}

void free_pages(unsigned long addr, unsigned long order)
      {
      	struct page *fpage;
      
      #ifdef CONFIG_DISCONTIGMEM
      	if (addr == 0) return;
      #endif
      	fpage = virt_to_page(addr);
	if (VALID_PAGE(fpage))
      		__free_pages(fpage, order);
      }
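
#if 0
/*
 * Usage sketch (not compiled): how callers typically use the helpers
 * above.  GFP_KERNEL allocations may sleep, GFP_ATOMIC ones may not.
 */
static void page_alloc_usage_sketch(void)
{
	/* Two physically contiguous pages (order 1), may sleep. */
	unsigned long addr = __get_free_pages(GFP_KERNEL, 1);
	/* A single zero-filled page. */
	unsigned long zeroed = get_zeroed_page(GFP_KERNEL);
	/* A struct page directly; no sleeping allowed with GFP_ATOMIC. */
	struct page *page = alloc_pages(GFP_ATOMIC, 0);

	if (addr)
		free_pages(addr, 1);
	if (zeroed)
		free_page(zeroed);
	if (page)
		__free_pages(page, 0);
}
#endif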
      
      /*
       * Total amount of free (allocatable) RAM:
       */
unsigned int nr_free_pages (void)
{
	unsigned int sum;
	zone_t *zone;
	pg_data_t *pgdat = pgdat_list;

	sum = 0;
	while (pgdat) {
		for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++)
			sum += zone->free_pages;
		pgdat = pgdat->node_next;
	}
	return sum;
      }
      
      /*
       * Total amount of inactive_clean (allocatable) RAM:
       */
unsigned int nr_inactive_clean_pages (void)
{
	unsigned int sum;
	zone_t *zone;
	pg_data_t *pgdat = pgdat_list;

	sum = 0;
	while (pgdat) {
		for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++)
			sum += zone->inactive_clean_pages;
		pgdat = pgdat->node_next;
	}
	return sum;
      }
      
      /*
       * Amount of free RAM allocatable as buffer memory:
       */
unsigned int nr_free_buffer_pages (void)
      {
      	unsigned int sum;
      
      	sum = nr_free_pages();
      	sum += nr_inactive_clean_pages();
      	sum += nr_inactive_dirty_pages;
      
      	/*
      	 * Keep our write behind queue filled, even if
      	 * kswapd lags a bit right now.
      	 */
	if (sum < freepages.high + inactive_target)
      		sum = freepages.high + inactive_target;
      	/*
      	 * We don't want dirty page writebehind to put too
      	 * much pressure on the working set, but we want it
      	 * to be possible to have some dirty pages in the
      	 * working set without upsetting the writebehind logic.
      	 */
      	sum += nr_active_pages >> 4;
      
	return sum;
      }
      
      #if CONFIG_HIGHMEM
      unsigned int nr_free_highpages (void)
      {
      	pg_data_t *pgdat = pgdat_list;
      	unsigned int pages = 0;
      
      	while (pgdat) {
      		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
      		pgdat = pgdat->node_next;
      	}
      	return pages;
      }
      #endif
      
      /*
       * Show free area list (used inside shift_scroll-lock stuff)
       * We also calculate the percentage fragmentation. We do this by counting the
       * memory on each free list with the exception of the first item on the list.
       */
void show_free_areas_core(pg_data_t *pgdat)
      {
       	unsigned long order;
      	unsigned type;
      
      	printk("Free pages:      %6dkB (%6dkB HighMem)\n",
      		nr_free_pages() << (PAGE_SHIFT-10),
      		nr_free_highpages() << (PAGE_SHIFT-10));
      
      	printk("( Active: %d, inactive_dirty: %d, inactive_clean: %d, free: %d (%d %d %d) )\n",
      		nr_active_pages,
      		nr_inactive_dirty_pages,
      		nr_inactive_clean_pages(),
      		nr_free_pages(),
      		freepages.min,
      		freepages.low,
      		freepages.high);
      
	for (type = 0; type < MAX_NR_ZONES; type++) {
		struct list_head *head, *curr;
		zone_t *zone = pgdat->node_zones + type;
		unsigned long nr, total, flags;

		total = 0;
		if (zone->size) {
			spin_lock_irqsave(&zone->lock, flags);
			for (order = 0; order < MAX_ORDER; order++) {
				head = &(zone->free_area + order)->free_list;
				curr = head;
				nr = 0;
				for (;;) {
					curr = memlist_next(curr);
					if (curr == head)
						break;
					nr++;
				}
				total += nr * (1 << order);
				printk("%lu*%lukB ", nr,
						(PAGE_SIZE>>10) << order);
			}
			spin_unlock_irqrestore(&zone->lock, flags);
      		}
      		printk("= %lukB)\n", total * (PAGE_SIZE>>10));
      	}
      
      #ifdef SWAP_CACHE_INFO
      	show_swap_cache_info();
      #endif	
      }
      
void show_free_areas(void)
      {
      	show_free_areas_core(pgdat_list);
      }
      
      /*
       * Builds allocation fallback zone lists.
       */
static inline void build_zonelists(pg_data_t *pgdat)
{
	int i, j, k;

	for (i = 0; i < NR_GFPINDEX; i++) {
      		zonelist_t *zonelist;
      		zone_t *zone;
      
      		zonelist = pgdat->node_zonelists + i;
      		memset(zonelist, 0, sizeof(*zonelist));
      
      		zonelist->gfp_mask = i;
      		j = 0;
      		k = ZONE_NORMAL;
		if (i & __GFP_HIGHMEM)
			k = ZONE_HIGHMEM;
		if (i & __GFP_DMA)
			k = ZONE_DMA;

		switch (k) {
			default:
				BUG();
			/*
			 * fallthrough:
			 */
			case ZONE_HIGHMEM:
				zone = pgdat->node_zones + ZONE_HIGHMEM;
				if (zone->size) {
#ifndef CONFIG_HIGHMEM
					BUG();
#endif
					zonelist->zones[j++] = zone;
				}
			case ZONE_NORMAL:
				zone = pgdat->node_zones + ZONE_NORMAL;
				if (zone->size)
					zonelist->zones[j++] = zone;
			case ZONE_DMA:
				zone = pgdat->node_zones + ZONE_DMA;
				if (zone->size)
					zonelist->zones[j++] = zone;
      		}
      		zonelist->zones[j++] = NULL;
      	} 
      }
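
/*
 * The fallthrough above yields the classic fallback orderings
 * (illustration): a __GFP_DMA list contains only ZONE_DMA, a plain
 * GFP_KERNEL-style list falls back ZONE_NORMAL -> ZONE_DMA, and a
 * __GFP_HIGHMEM list falls back ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA,
 * with empty zones skipped in each case.
 */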
      
      #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
      
      /*
       * Set up the zone data structures:
       *   - mark all pages reserved
       *   - mark all memory queues empty
       *   - clear the memory bitmaps
       */
void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
      	unsigned long *zones_size, unsigned long zone_start_paddr, 
      	unsigned long *zholes_size, struct page *lmem_map)
      {
      	struct page *p;
      	unsigned long i, j;
      	unsigned long map_size;
      	unsigned long totalpages, offset, realtotalpages;
      	unsigned int cumulative = 0;
      
      	totalpages = 0;
	for (i = 0; i < MAX_NR_ZONES; i++) {
		unsigned long size = zones_size[i];
		totalpages += size;
	}
	realtotalpages = totalpages;
	if (zholes_size)
		for (i = 0; i < MAX_NR_ZONES; i++)
			realtotalpages -= zholes_size[i];

	printk("On node %d totalpages: %lu\n", nid, realtotalpages);

	memlist_init(&active_list);
	memlist_init(&inactive_dirty_list);
      
      	/*
	 * Some architectures (with lots of memory and discontiguous memory
	 * maps) have to search for a good mem_map area:
	 * For discontigmem, the conceptual mem_map array starts from
	 * PAGE_OFFSET, so we need to align the actual array onto a mem_map
	 * boundary, so that MAP_NR works.
      	 */
      	map_size = (totalpages + 1)*sizeof(struct page);
	if (lmem_map == (struct page *)0) {
      		lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
      		lmem_map = (struct page *)(PAGE_OFFSET + 
      			MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
      	}
      	*gmap = pgdat->node_mem_map = lmem_map;
      	pgdat->node_size = totalpages;
      	pgdat->node_start_paddr = zone_start_paddr;
      	pgdat->node_start_mapnr = (lmem_map - mem_map);
      
      	/*
      	 * Initially all pages are reserved - free ones are freed
      	 * up by free_all_bootmem() once the early boot process is
      	 * done.
      	 */
	for (p = lmem_map; p < lmem_map + totalpages; p++) {
		set_page_count(p, 0);
		SetPageReserved(p);
		init_waitqueue_head(&p->wait);
		memlist_init(&p->list);
	}

	offset = lmem_map - mem_map;
	for (j = 0; j < MAX_NR_ZONES; j++) {
      		zone_t *zone = pgdat->node_zones + j;
      		unsigned long mask;
      		unsigned long size, realsize;
      
      		realsize = size = zones_size[j];
		if (zholes_size)
      			realsize -= zholes_size[j];
      
      		printk("zone(%lu): %lu pages.\n", j, size);
      		zone->size = size;
      		zone->name = zone_names[j];
      		zone->lock = SPIN_LOCK_UNLOCKED;
      		zone->zone_pgdat = pgdat;
      		zone->free_pages = 0;
      		zone->inactive_clean_pages = 0;
      		zone->inactive_dirty_pages = 0;
		memlist_init(&zone->inactive_clean_list);
		if (!size)
			continue;

		zone->offset = offset;
		cumulative += size;
		mask = (realsize / zone_balance_ratio[j]);
		if (mask < zone_balance_min[j])
			mask = zone_balance_min[j];
		else if (mask > zone_balance_max[j])
      			mask = zone_balance_max[j];
      		zone->pages_min = mask;
      		zone->pages_low = mask*2;
      		zone->pages_high = mask*3;
      		/*
      		 * Add these free targets to the global free target;
      		 * we have to be SURE that freepages.high is higher
      		 * than SUM [zone->pages_min] for all zones, otherwise
      		 * we may have bad bad problems.
      		 *
      		 * This means we cannot make the freepages array writable
      		 * in /proc, but have to add a separate extra_free_target
      		 * for people who require it to catch load spikes in eg.
      		 * gigabit ethernet routing...
      		 */
      		freepages.min += mask;
      		freepages.low += mask*2;
      		freepages.high += mask*3;
      		zone->zone_mem_map = mem_map + offset;
      		zone->zone_start_mapnr = offset;
      		zone->zone_start_paddr = zone_start_paddr;
      
		for (i = 0; i < size; i++) {
			struct page *page = mem_map + offset + i;
			page->zone = zone;
			if (j != ZONE_HIGHMEM) {
				page->virtual = __va(zone_start_paddr);
				zone_start_paddr += PAGE_SIZE;
			}
		}

		offset += size;
		mask = -1;
		for (i = 0; i < MAX_ORDER; i++) {
			unsigned long bitmap_size;

			memlist_init(&zone->free_area[i].free_list);
      			mask += mask;
      			size = (size + ~mask) & mask;
      			bitmap_size = size >> i;
      			bitmap_size = (bitmap_size + 7) >> 3;
      			bitmap_size = LONG_ALIGN(bitmap_size);
      			zone->free_area[i].map = 
      			  (unsigned int *) alloc_bootmem_node(pgdat, bitmap_size);
      		}
      	}
      	build_zonelists(pgdat);
      }
      
void __init free_area_init(unsigned long *zones_size)
{
	free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
}

static int __init setup_mem_frac(char *str)
{
	int j = 0;

	while (get_option(&str, &zone_balance_ratio[j++]) == 2);
	printk("setup_mem_frac: ");
	for (j = 0; j < MAX_NR_ZONES; j++) printk("%d  ", zone_balance_ratio[j]);
	printk("\n");
	return 1;
      }
      
      __setup("memfrac=", setup_mem_frac);
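
/*
 * Example (assuming the default three zones): booting with
 * "memfrac=32,256,256" keeps the DMA zone balanced at 1/32nd of its size
 * while using a 1/256th ratio for the Normal and HighMem zones; values are
 * parsed left to right, so "memfrac=16" only overrides the DMA ratio.
 */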