~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

Linux Cross Reference
Linux/mm/page_alloc.c

Version: ~ [ 2.4.0 ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /*
  2  *  linux/mm/page_alloc.c
  3  *
  4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
  5  *  Swap reorganised 29.12.95, Stephen Tweedie
  6  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
  7  *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
  8  *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
  9  *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 10  */
 11 
 12 #include <linux/config.h>
 13 #include <linux/mm.h>
 14 #include <linux/swap.h>
 15 #include <linux/swapctl.h>
 16 #include <linux/interrupt.h>
 17 #include <linux/pagemap.h>
 18 #include <linux/bootmem.h>
 19 
    /*
     * Global page counters and the head of the node list.
     *
     * Within this file nr_active_pages and nr_inactive_dirty_pages are only
     * read (__alloc_pages(), nr_free_buffer_pages(), show_free_areas_core()),
     * and pgdat_list is walked through ->node_next by the nr_*_pages()
     * helpers.  nr_swap_pages is defined here but not referenced in this file.
     */
 20 int nr_swap_pages;
 21 int nr_active_pages;
 22 int nr_inactive_dirty_pages;
 23 pg_data_t *pgdat_list;
 24 
    /*
     * Per-zone tuning tables; free_area_init_core() indexes all four with the
     * same zone number:
     *  - zone_names:           stored in zone->name
     *  - zone_balance_ratio:   the zone size (minus holes) is divided by this
     *                          to obtain pages_min; can be overridden with the
     *                          "memfrac=" boot option (setup_mem_frac())
     *  - zone_balance_min/max: lower and upper clamp applied to that quotient
     */
 25 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
 26 static int zone_balance_ratio[MAX_NR_ZONES] = { 32, 128, 128, };
 27 static int zone_balance_min[MAX_NR_ZONES] = { 10 , 10, 10, };
 28 static int zone_balance_max[MAX_NR_ZONES] = { 255 , 255, 255, };
 29 
    /*
     * Global page lists.  This file only initialises them to empty, in
     * free_area_init_core().
     */
 30 struct list_head active_list;
 31 struct list_head inactive_dirty_list;
 32 /*
 33  * Free_page() adds the page to the free lists. This is optimized for
 34  * fast normal cases (no error jumps taken normally).
 35  *
 36  * The way to optimize jumps for gcc-2.2.2 is to:
 37  *  - select the "normal" case and put it inside the if () { XXX }
 38  *  - no else-statements if you can avoid them
 39  *
 40  * With the above two rules, you get a straight-line execution path
 41  * for the normal case, giving better asm-code.
 42  */
 43 
    /*
     * The allocator's free lists are manipulated through these memlist_*
     * names, which are thin aliases for the generic list primitives.
     */
 44 #define memlist_init(x) INIT_LIST_HEAD(x)
 45 #define memlist_add_head list_add
 46 #define memlist_add_tail list_add_tail
 47 #define memlist_del list_del
 48 #define memlist_entry list_entry
 49 #define memlist_next(x) ((x)->next)
 50 #define memlist_prev(x) ((x)->prev)
 51 
 52 /*
 53  * Temporary debugging check.
     *
     * BAD_RANGE(zone, x) is true when page x does not belong to zone: either
     * x->zone is a different zone, or x's index in mem_map lies outside
     * [zone->offset, zone->offset + zone->size).  Both arguments are
     * evaluated more than once.
 54  */
 55 #define BAD_RANGE(zone,x) (((zone) != (x)->zone) || (((x)-mem_map) < (zone)->offset) || (((x)-mem_map) >= (zone)->offset+(zone)->size))
 56 
 57 /*
 58  * Buddy system. Hairy. You really aren't expected to understand this
 59  *
 60  * Hint: -mask = 1+~mask
     *
     * __free_pages_ok() puts the block of 2^order pages starting at 'page'
     * back on its zone's free lists, merging it with its buddy block for as
     * long as the bitmap says the buddy is free as well.
     *
     * With mask = ~0UL << order:
     *   -mask              == 1UL << order, the block size in pages
     *   page_idx ^ -mask   flips bit 'order' of the index: the buddy's index
     *   page_idx & mask    (after mask <<= 1) is the start of the merged block
     *
     * Each free_area bitmap holds one bit per pair of buddies, hence the
     * index of page_idx >> (1 + order).
 61  */
 62 
 63 static void FASTCALL(__free_pages_ok (struct page *page, unsigned long order));
 64 static void __free_pages_ok (struct page *page, unsigned long order)
 65 {
 66         unsigned long index, page_idx, mask, flags;
 67         free_area_t *area;
 68         struct page *base;
 69         zone_t *zone;
 70 
            /*
             * Sanity checks: a page handed back to the allocator must not
             * have buffers or a mapping attached, must be a valid page, and
             * must not carry any of the flags tested below.
             */
 71         if (page->buffers)
 72                 BUG();
 73         if (page->mapping)
 74                 BUG();
 75         if (!VALID_PAGE(page))
 76                 BUG();
 77         if (PageSwapCache(page))
 78                 BUG();
 79         if (PageLocked(page))
 80                 BUG();
 81         if (PageDecrAfter(page))
 82                 BUG();
 83         if (PageActive(page))
 84                 BUG();
 85         if (PageInactiveDirty(page))
 86                 BUG();
 87         if (PageInactiveClean(page))
 88                 BUG();
 89 
            /* Reset the referenced/dirty bits and the page age for the next user. */
 90         page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
 91         page->age = PAGE_AGE_START;
 92         
 93         zone = page->zone;
 94 
 95         mask = (~0UL) << order;
 96         base = mem_map + zone->offset;
 97         page_idx = page - base;
            /* The block must start on a 2^order boundary within the zone. */
 98         if (page_idx & ~mask)
 99                 BUG();
100         index = page_idx >> (1 + order);
101 
102         area = zone->free_area + order;
103 
104         spin_lock_irqsave(&zone->lock, flags);
105 
            /* mask == -(1UL << order), so this adds 2^order free pages. */
106         zone->free_pages -= mask;
107 
            /*
             * The sum below becomes zero once mask has been shifted up to
             * ~0UL << (MAX_ORDER-1), so merging stops at the highest order.
             */
108         while (mask + (1 << (MAX_ORDER-1))) {
109                 struct page *buddy1, *buddy2;
110 
111                 if (area >= zone->free_area + MAX_ORDER)
112                         BUG();
                    /* Toggle the pair's bit; an old value of 0 ends the merge. */
113                 if (!test_and_change_bit(index, area->map))
114                         /*
115                          * the buddy page is still allocated.
116                          */
117                         break;
118                 /*
119                  * Move the buddy up one level.
120                  */
121                 buddy1 = base + (page_idx ^ -mask);
122                 buddy2 = base + page_idx;
123                 if (BAD_RANGE(zone,buddy1))
124                         BUG();
125                 if (BAD_RANGE(zone,buddy2))
126                         BUG();
127 
                    /*
                     * Take the buddy off this order's free list, then step
                     * everything up to the next order; page_idx becomes the
                     * start of the merged block.
                     */
128                 memlist_del(&buddy1->list);
129                 mask <<= 1;
130                 area++;
131                 index >>= 1;
132                 page_idx &= mask;
133         }
            /* Queue the (possibly merged) block on the free list of its final order. */
134         memlist_add_head(&(base + page_idx)->list, &area->free_list);
135 
136         spin_unlock_irqrestore(&zone->lock, flags);
137 
138         /*
139          * We don't want to protect this variable from race conditions
140          * since it's nothing important, but we do want to make sure
141          * it never gets negative.
142          */
143         if (memory_pressure > NR_CPUS)
144                 memory_pressure--;
145 }
146 
147 #define MARK_USED(index, order, area) \
148         change_bit((index) >> (1+(order)), (area)->map)
149 
150 static inline struct page * expand (zone_t *zone, struct page *page,
151          unsigned long index, int low, int high, free_area_t * area)
152 {
153         unsigned long size = 1 << high;
154 
155         while (high > low) {
156                 if (BAD_RANGE(zone,page))
157                         BUG();
158                 area--;
159                 high--;
160                 size >>= 1;
161                 memlist_add_head(&(page)->list, &(area)->free_list);
162                 MARK_USED(index, high, area);
163                 index += size;
164                 page += size;
165         }
166         if (BAD_RANGE(zone,page))
167                 BUG();
168         return page;
169 }
170 
171 static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order));
172 static struct page * rmqueue(zone_t *zone, unsigned long order)
173 {
174         free_area_t * area = zone->free_area + order;
175         unsigned long curr_order = order;
176         struct list_head *head, *curr;
177         unsigned long flags;
178         struct page *page;
179 
180         spin_lock_irqsave(&zone->lock, flags);
181         do {
182                 head = &area->free_list;
183                 curr = memlist_next(head);
184 
185                 if (curr != head) {
186                         unsigned int index;
187 
188                         page = memlist_entry(curr, struct page, list);
189                         if (BAD_RANGE(zone,page))
190                                 BUG();
191                         memlist_del(curr);
192                         index = (page - mem_map) - zone->offset;
193                         MARK_USED(index, curr_order, area);
194                         zone->free_pages -= 1 << order;
195 
196                         page = expand(zone, page, index, order, curr_order, area);
197                         spin_unlock_irqrestore(&zone->lock, flags);
198 
199                         set_page_count(page, 1);
200                         if (BAD_RANGE(zone,page))
201                                 BUG();
202                         DEBUG_ADD_PAGE
203                         return page;    
204                 }
205                 curr_order++;
206                 area++;
207         } while (curr_order < MAX_ORDER);
208         spin_unlock_irqrestore(&zone->lock, flags);
209 
210         return NULL;
211 }
212 
213 #define PAGES_MIN       0
214 #define PAGES_LOW       1
215 #define PAGES_HIGH      2
216 
217 /*
218  * This function does the dirty work for __alloc_pages
219  * and is separated out to keep the code size smaller.
220  * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM)
221  */
222 static struct page * __alloc_pages_limit(zonelist_t *zonelist,
223                         unsigned long order, int limit, int direct_reclaim)
224 {
225         zone_t **zone = zonelist->zones;
226 
227         for (;;) {
228                 zone_t *z = *(zone++);
229                 unsigned long water_mark;
230 
231                 if (!z)
232                         break;
233                 if (!z->size)
234                         BUG();
235 
236                 /*
237                  * We allocate if the number of free + inactive_clean
238                  * pages is above the watermark.
239                  */
240                 switch (limit) {
241                         default:
242                         case PAGES_MIN:
243                                 water_mark = z->pages_min;
244                                 break;
245                         case PAGES_LOW:
246                                 water_mark = z->pages_low;
247                                 break;
248                         case PAGES_HIGH:
249                                 water_mark = z->pages_high;
250                 }
251 
252                 if (z->free_pages + z->inactive_clean_pages > water_mark) {
253                         struct page *page = NULL;
254                         /* If possible, reclaim a page directly. */
255                         if (direct_reclaim && z->free_pages < z->pages_min + 8)
256                                 page = reclaim_page(z);
257                         /* If that fails, fall back to rmqueue. */
258                         if (!page)
259                                 page = rmqueue(z, order);
260                         if (page)
261                                 return page;
262                 }
263         }
264 
265         /* Found nothing. */
266         return NULL;
267 }
268 
269 
270 /*
271  * This is the 'heart' of the zoned buddy allocator:
     *
     * Tries to allocate a block of 2^order pages from the zones listed in
     * 'zonelist', in list order, going through progressively more desperate
     * phases:
     *   1. any zone with free_pages >= pages_low (plain rmqueue());
     *   2. __alloc_pages_limit() against the HIGH, then the LOW watermark;
     *   3. wake kswapd, yield the CPU if __GFP_WAIT is set, then retry
     *      against the MIN watermark;
     *   4. unless the task is PF_MEMALLOC: for order > 0 launder pages and
     *      move inactive_clean pages to the free lists one at a time; then
     *      wake kswapd or call try_to_free_pages() and, for order-0 requests
     *      only, jump back to try_again;
     *   5. final phase: take pages from any zone, except that the last
     *      pages_min/4 free pages of a zone are left to PF_MEMALLOC tasks.
     *
     * Returns the first page of the block, or NULL (after logging an error)
     * when every phase failed.
272  */
273 struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
274 {
275         zone_t **zone;
276         int direct_reclaim = 0;
277         unsigned int gfp_mask = zonelist->gfp_mask;
278         struct page * page;
279 
280         /*
281          * Allocations put pressure on the VM subsystem.
282          */
283         memory_pressure++;
284 
285         /*
286          * (If anyone calls gfp from interrupts nonatomically then it
287          * will sooner or later be tripped up by a schedule().)
288          *
289          * We are falling back to lower-level zones if allocation
290          * in a higher zone fails.
291          */
292 
293         /*
294          * Can we take pages directly from the inactive_clean
295          * list?
296          */
297         if (order == 0 && (gfp_mask & __GFP_WAIT) &&
298                         !(current->flags & PF_MEMALLOC))
299                 direct_reclaim = 1;
300 
301         /*
302          * If we are about to get low on free pages and we also have
303          * an inactive page shortage, wake up kswapd.
304          */
305         if (inactive_shortage() > inactive_target / 2 && free_shortage())
306                 wakeup_kswapd(0);
307         /*
308          * If we are about to get low on free pages and cleaning
309          * the inactive_dirty pages would fix the situation,
310          * wake up bdflush.
311          */
312         else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()
313                         && nr_inactive_dirty_pages >= freepages.high)
314                 wakeup_bdflush(0);
315 
316 try_again:
317         /*
318          * First, see if we have any zones with lots of free memory.
319          *
320          * We allocate free memory first because it doesn't contain
321          * any data ... DUH!
322          */
323         zone = zonelist->zones;
324         for (;;) {
325                 zone_t *z = *(zone++);
326                 if (!z)
327                         break;
328                 if (!z->size)
329                         BUG();
330 
331                 if (z->free_pages >= z->pages_low) {
332                         page = rmqueue(z, order);
333                         if (page)
334                                 return page;
                    /*
                     * Zone is below pages_min: wake anything sleeping on
                     * kreclaimd_wait, if there is a sleeper.
                     */
335                 } else if (z->free_pages < z->pages_min &&
336                                         waitqueue_active(&kreclaimd_wait)) {
337                                 wake_up_interruptible(&kreclaimd_wait);
338                 }
339         }
340 
341         /*
342          * Try to allocate a page from a zone with a HIGH
343          * amount of free + inactive_clean pages.
344          *
345          * If there is a lot of activity, inactive_target
346          * will be high and we'll have a good chance of
347          * finding a page using the HIGH limit.
348          */
349         page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);
350         if (page)
351                 return page;
352 
353         /*
354          * Then try to allocate a page from a zone with more
355          * than zone->pages_low free + inactive_clean pages.
356          *
357          * When the working set is very large and VM activity
358          * is low, we're most likely to have our allocation
359          * succeed here.
360          */
361         page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);
362         if (page)
363                 return page;
364 
365         /*
366          * OK, none of the zones on our zonelist has lots
367          * of pages free.
368          *
369          * We wake up kswapd, in the hope that kswapd will
370          * resolve this situation before memory gets tight.
371          *
372          * We also yield the CPU, because that:
373          * - gives kswapd a chance to do something
374          * - slows down allocations, in particular the
375          *   allocations from the fast allocator that's
376          *   causing the problems ...
377          * - ... which minimises the impact the "bad guys"
378          *   have on the rest of the system
379          * - if we don't have __GFP_IO set, kswapd may be
380          *   able to free some memory we can't free ourselves
381          */
382         wakeup_kswapd(0);
383         if (gfp_mask & __GFP_WAIT) {
384                 __set_current_state(TASK_RUNNING);
385                 current->policy |= SCHED_YIELD;
386                 schedule();
387         }
388 
389         /*
390          * After waking up kswapd, we try to allocate a page
391          * from any zone which isn't critical yet.
392          *
393          * Kswapd should, in most situations, bring the situation
394          * back to normal in no time.
395          */
396         page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);
397         if (page)
398                 return page;
399 
400         /*
401          * Damn, we didn't succeed.
402          *
403          * This can be due to 2 reasons:
404          * - we're doing a higher-order allocation
405          *      --> move pages to the free list until we succeed
406          * - we're /really/ tight on memory
407          *      --> wait on the kswapd waitqueue until memory is freed
408          */
409         if (!(current->flags & PF_MEMALLOC)) {
410                 /*
411                  * Are we dealing with a higher order allocation?
412                  *
413                  * Move pages from the inactive_clean to the free list
414                  * in the hope of creating a large, physically contiguous
415                  * piece of free memory.
416                  */
417                 if (order > 0 && (gfp_mask & __GFP_WAIT)) {
418                         zone = zonelist->zones;
419                         /* First, clean some dirty pages. */
                            /* PF_MEMALLOC is set only for the duration of page_launder(). */
420                         current->flags |= PF_MEMALLOC;
421                         page_launder(gfp_mask, 1);
422                         current->flags &= ~PF_MEMALLOC;
423                         for (;;) {
424                                 zone_t *z = *(zone++);
425                                 if (!z)
426                                         break;
                                    /*
                                     * NOTE(review): unlike the other zonelist
                                     * walks in this function, an empty zone is
                                     * skipped here instead of hitting BUG().
                                     */
427                                 if (!z->size)
428                                         continue;
429                                 while (z->inactive_clean_pages) {
                                            /* NOTE: shadows the function-level 'page'. */
430                                         struct page * page;
431                                         /* Move one page to the free list. */
432                                         page = reclaim_page(z);
433                                         if (!page)
434                                                 break;
435                                         __free_page(page);
436                                         /* Try if the allocation succeeds. */
437                                         page = rmqueue(z, order);
438                                         if (page)
439                                                 return page;
440                                 }
441                         }
442                 }
443                 /*
444                  * When we arrive here, we are really tight on memory.
445                  *
446                  * We wake up kswapd and sleep until kswapd wakes us
447                  * up again. After that we loop back to the start.
448                  *
449                  * We have to do this because something else might eat
450                  * the memory kswapd frees for us and we need to be
451                  * reliable. Note that we don't loop back for higher
452                  * order allocations since it is possible that kswapd
453                  * simply cannot free a large enough contiguous area
454                  * of memory *ever*.
455                  */
456                 if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) {
457                         wakeup_kswapd(1);
458                         memory_pressure++;
459                         if (!order)
460                                 goto try_again;
461                 /*
462                  * If __GFP_IO isn't set, we can't wait on kswapd because
463                  * kswapd just might need some IO locks /we/ are holding ...
464                  *
465                  * SUBTLE: The scheduling point above makes sure that
466                  * kswapd does get the chance to free memory we can't
467                  * free ourselves...
468                  */
469                 } else if (gfp_mask & __GFP_WAIT) {
470                         try_to_free_pages(gfp_mask);
471                         memory_pressure++;
472                         if (!order)
473                                 goto try_again;
474                 }
475 
476         }
477 
478         /*
479          * Final phase: allocate anything we can!
480          *
481          * Higher order allocations, GFP_ATOMIC allocations and
482          * recursive allocations (PF_MEMALLOC) end up here.
483          *
484          * Only recursive allocations can use the very last pages
485          * in the system, otherwise it would be just too easy to
486          * deadlock the system...
487          */
488         zone = zonelist->zones;
489         for (;;) {
490                 zone_t *z = *(zone++);
                    /* NOTE: shadows the function-level 'page'. */
491                 struct page * page = NULL;
492                 if (!z)
493                         break;
494                 if (!z->size)
495                         BUG();
496 
497                 /*
498                  * SUBTLE: direct_reclaim is only possible if the task
499                  * becomes PF_MEMALLOC while looping above. This will
500                  * happen when the OOM killer selects this task for
501                  * instant execution...
502                  */
503                 if (direct_reclaim) {
504                         page = reclaim_page(z);
505                         if (page)
506                                 return page;
507                 }
508 
509                 /* XXX: is pages_min/4 a good amount to reserve for this? */
510                 if (z->free_pages < z->pages_min / 4 &&
511                                 !(current->flags & PF_MEMALLOC))
512                         continue;
513                 page = rmqueue(z, order);
514                 if (page)
515                         return page;
516         }
517 
518         /* No luck.. */
519         printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order);
520         return NULL;
521 }
522 
523 /*
524  * Common helper functions.
525  */
/*
 * Allocate 2^order pages with alloc_pages() and return the address given by
 * page_address() for the first page, or 0 if the allocation failed.
 */
unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
	struct page *pg = alloc_pages(gfp_mask, order);

	return pg ? (unsigned long) page_address(pg) : 0;
}
535 
/*
 * Allocate a single page, clear it with clear_page() and return its address
 * (as given by page_address()), or 0 if the allocation failed.
 */
unsigned long get_zeroed_page(int gfp_mask)
{
	struct page *pg = alloc_pages(gfp_mask, 0);
	void *kaddr;

	if (!pg)
		return 0;

	kaddr = page_address(pg);
	clear_page(kaddr);
	return (unsigned long) kaddr;
}
548 
/*
 * Drop one reference on a block of 2^order pages and hand it to
 * __free_pages_ok() when put_page_testzero() reports that the count reached
 * zero.  Reserved pages are left alone: their count is not even touched.
 */
void __free_pages(struct page *page, unsigned long order)
{
	if (PageReserved(page))
		return;
	if (!put_page_testzero(page))
		return;
	__free_pages_ok(page, order);
}
554 
/*
 * Address-based counterpart of __free_pages(): translate 'addr' to its
 * struct page with virt_to_page() and release the block if the page is
 * valid.  With CONFIG_DISCONTIGMEM a zero address is ignored up front.
 */
void free_pages(unsigned long addr, unsigned long order)
{
	struct page *pg;

#ifdef CONFIG_DISCONTIGMEM
	if (!addr)
		return;
#endif
	pg = virt_to_page(addr);
	if (!VALID_PAGE(pg))
		return;
	__free_pages(pg, order);
}
566 
567 /*
568  * Total amount of free (allocatable) RAM:
569  */
570 unsigned int nr_free_pages (void)
571 {
572         unsigned int sum;
573         zone_t *zone;
574         pg_data_t *pgdat = pgdat_list;
575 
576         sum = 0;
577         while (pgdat) {
578                 for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++)
579                         sum += zone->free_pages;
580                 pgdat = pgdat->node_next;
581         }
582         return sum;
583 }
584 
585 /*
586  * Total amount of inactive_clean (allocatable) RAM:
587  */
588 unsigned int nr_inactive_clean_pages (void)
589 {
590         unsigned int sum;
591         zone_t *zone;
592         pg_data_t *pgdat = pgdat_list;
593 
594         sum = 0;
595         while (pgdat) {
596                 for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++)
597                         sum += zone->inactive_clean_pages;
598                 pgdat = pgdat->node_next;
599         }
600         return sum;
601 }
602 
603 /*
604  * Amount of free RAM allocatable as buffer memory:
605  */
606 unsigned int nr_free_buffer_pages (void)
607 {
608         unsigned int sum;
609 
610         sum = nr_free_pages();
611         sum += nr_inactive_clean_pages();
612         sum += nr_inactive_dirty_pages;
613 
614         /*
615          * Keep our write behind queue filled, even if
616          * kswapd lags a bit right now.
617          */
618         if (sum < freepages.high + inactive_target)
619                 sum = freepages.high + inactive_target;
620         /*
621          * We don't want dirty page writebehind to put too
622          * much pressure on the working set, but we want it
623          * to be possible to have some dirty pages in the
624          * working set without upsetting the writebehind logic.
625          */
626         sum += nr_active_pages >> 4;
627 
628         return sum;
629 }
630 
#if CONFIG_HIGHMEM
/*
 * Sum of free_pages over the ZONE_HIGHMEM zone of every node on pgdat_list.
 */
unsigned int nr_free_highpages (void)
{
	unsigned int total = 0;
	pg_data_t *node;

	for (node = pgdat_list; node; node = node->node_next)
		total += node->node_zones[ZONE_HIGHMEM].free_pages;

	return total;
}
#endif
644 
645 /*
646  * Show free area list (used inside shift_scroll-lock stuff)
647  * We also calculate the percentage fragmentation. We do this by counting the
648  * memory on each free list with the exception of the first item on the list.
649  */
650 void show_free_areas_core(pg_data_t *pgdat)
651 {
652         unsigned long order;
653         unsigned type;
654 
655         printk("Free pages:      %6dkB (%6dkB HighMem)\n",
656                 nr_free_pages() << (PAGE_SHIFT-10),
657                 nr_free_highpages() << (PAGE_SHIFT-10));
658 
659         printk("( Active: %d, inactive_dirty: %d, inactive_clean: %d, free: %d (%d %d %d) )\n",
660                 nr_active_pages,
661                 nr_inactive_dirty_pages,
662                 nr_inactive_clean_pages(),
663                 nr_free_pages(),
664                 freepages.min,
665                 freepages.low,
666                 freepages.high);
667 
668         for (type = 0; type < MAX_NR_ZONES; type++) {
669                 struct list_head *head, *curr;
670                 zone_t *zone = pgdat->node_zones + type;
671                 unsigned long nr, total, flags;
672 
673                 total = 0;
674                 if (zone->size) {
675                         spin_lock_irqsave(&zone->lock, flags);
676                         for (order = 0; order < MAX_ORDER; order++) {
677                                 head = &(zone->free_area + order)->free_list;
678                                 curr = head;
679                                 nr = 0;
680                                 for (;;) {
681                                         curr = memlist_next(curr);
682                                         if (curr == head)
683                                                 break;
684                                         nr++;
685                                 }
686                                 total += nr * (1 << order);
687                                 printk("%lu*%lukB ", nr,
688                                                 (PAGE_SIZE>>10) << order);
689                         }
690                         spin_unlock_irqrestore(&zone->lock, flags);
691                 }
692                 printk("= %lukB)\n", total * (PAGE_SIZE>>10));
693         }
694 
695 #ifdef SWAP_CACHE_INFO
696         show_swap_cache_info();
697 #endif  
698 }
699 
700 void show_free_areas(void)
701 {
702         show_free_areas_core(pgdat_list);
703 }
704 
705 /*
706  * Builds allocation fallback zone lists.
707  */
708 static inline void build_zonelists(pg_data_t *pgdat)
709 {
710         int i, j, k;
711 
712         for (i = 0; i < NR_GFPINDEX; i++) {
713                 zonelist_t *zonelist;
714                 zone_t *zone;
715 
716                 zonelist = pgdat->node_zonelists + i;
717                 memset(zonelist, 0, sizeof(*zonelist));
718 
719                 zonelist->gfp_mask = i;
720                 j = 0;
721                 k = ZONE_NORMAL;
722                 if (i & __GFP_HIGHMEM)
723                         k = ZONE_HIGHMEM;
724                 if (i & __GFP_DMA)
725                         k = ZONE_DMA;
726 
727                 switch (k) {
728                         default:
729                                 BUG();
730                         /*
731                          * fallthrough:
732                          */
733                         case ZONE_HIGHMEM:
734                                 zone = pgdat->node_zones + ZONE_HIGHMEM;
735                                 if (zone->size) {
736 #ifndef CONFIG_HIGHMEM
737                                         BUG();
738 #endif
739                                         zonelist->zones[j++] = zone;
740                                 }
741                         case ZONE_NORMAL:
742                                 zone = pgdat->node_zones + ZONE_NORMAL;
743                                 if (zone->size)
744                                         zonelist->zones[j++] = zone;
745                         case ZONE_DMA:
746                                 zone = pgdat->node_zones + ZONE_DMA;
747                                 if (zone->size)
748                                         zonelist->zones[j++] = zone;
749                 }
750                 zonelist->zones[j++] = NULL;
751         } 
752 }
753 
/* Round x up to the next multiple of sizeof(long). */
#define LONG_ALIGN(x) (((x) + sizeof(long) - 1) & ~(sizeof(long) - 1))
755 
756 /*
757  * Set up the zone data structures:
758  *   - mark all pages reserved
759  *   - mark all memory queues empty
760  *   - clear the memory bitmaps
     *
     * nid:              node id, only used in the boot message
     * pgdat:            node descriptor to fill in
     * gmap:             receives the address of the node's page array
     * zones_size:       number of pages per zone (MAX_NR_ZONES entries)
     * zone_start_paddr: stored in pgdat->node_start_paddr, then advanced by
     *                   PAGE_SIZE for every non-HIGHMEM page as the pages'
     *                   ->virtual addresses are assigned
     * zholes_size:      optional (may be 0) per-zone hole sizes; subtracted
     *                   for the printed total and for the watermark
     *                   calculation only
     * lmem_map:         page array to use, or 0 to allocate one from bootmem
761  */
762 void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
763         unsigned long *zones_size, unsigned long zone_start_paddr, 
764         unsigned long *zholes_size, struct page *lmem_map)
765 {
766         struct page *p;
767         unsigned long i, j;
768         unsigned long map_size;
769         unsigned long totalpages, offset, realtotalpages;
            /* NOTE(review): 'cumulative' is accumulated below but never read here. */
770         unsigned int cumulative = 0;
771 
            /* totalpages spans all zones including holes; realtotalpages excludes them. */
772         totalpages = 0;
773         for (i = 0; i < MAX_NR_ZONES; i++) {
774                 unsigned long size = zones_size[i];
775                 totalpages += size;
776         }
777         realtotalpages = totalpages;
778         if (zholes_size)
779                 for (i = 0; i < MAX_NR_ZONES; i++)
780                         realtotalpages -= zholes_size[i];
781                         
782         printk("On node %d totalpages: %lu\n", nid, realtotalpages);
783 
784         memlist_init(&active_list);
785         memlist_init(&inactive_dirty_list);
786 
787         /*
788          * Some architectures (with lots of mem and discontiguous memory
789          * maps) have to search for a good mem_map area:
790          * For discontigmem, the conceptual mem map array starts from 
791          * PAGE_OFFSET, we need to align the actual array onto a mem map 
792          * boundary, so that MAP_NR works.
793          */
794         map_size = (totalpages + 1)*sizeof(struct page);
795         if (lmem_map == (struct page *)0) {
796                 lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
797                 lmem_map = (struct page *)(PAGE_OFFSET + 
798                         MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
799         }
800         *gmap = pgdat->node_mem_map = lmem_map;
801         pgdat->node_size = totalpages;
802         pgdat->node_start_paddr = zone_start_paddr;
803         pgdat->node_start_mapnr = (lmem_map - mem_map);
804 
805         /*
806          * Initially all pages are reserved - free ones are freed
807          * up by free_all_bootmem() once the early boot process is
808          * done.
809          */
810         for (p = lmem_map; p < lmem_map + totalpages; p++) {
811                 set_page_count(p, 0);
812                 SetPageReserved(p);
813                 init_waitqueue_head(&p->wait);
814                 memlist_init(&p->list);
815         }
816 
            /* 'offset' is the mem_map index of the first page of the current zone. */
817         offset = lmem_map - mem_map;    
818         for (j = 0; j < MAX_NR_ZONES; j++) {
819                 zone_t *zone = pgdat->node_zones + j;
820                 unsigned long mask;
821                 unsigned long size, realsize;
822 
823                 realsize = size = zones_size[j];
824                 if (zholes_size)
825                         realsize -= zholes_size[j];
826 
827                 printk("zone(%lu): %lu pages.\n", j, size);
828                 zone->size = size;
829                 zone->name = zone_names[j];
830                 zone->lock = SPIN_LOCK_UNLOCKED;
831                 zone->zone_pgdat = pgdat;
832                 zone->free_pages = 0;
833                 zone->inactive_clean_pages = 0;
834                 zone->inactive_dirty_pages = 0;
835                 memlist_init(&zone->inactive_clean_list);
                    /* Empty zones keep only the basic fields set above. */
836                 if (!size)
837                         continue;
838 
839                 zone->offset = offset;
840                 cumulative += size;
                    /*
                     * Watermarks: pages_min is realsize / zone_balance_ratio[j],
                     * clamped to [zone_balance_min[j], zone_balance_max[j]];
                     * pages_low and pages_high are two and three times that.
                     */
841                 mask = (realsize / zone_balance_ratio[j]);
842                 if (mask < zone_balance_min[j])
843                         mask = zone_balance_min[j];
844                 else if (mask > zone_balance_max[j])
845                         mask = zone_balance_max[j];
846                 zone->pages_min = mask;
847                 zone->pages_low = mask*2;
848                 zone->pages_high = mask*3;
849                 /*
850                  * Add these free targets to the global free target;
851                  * we have to be SURE that freepages.high is higher
852                  * than SUM [zone->pages_min] for all zones, otherwise
853                  * we may have bad bad problems.
854                  *
855                  * This means we cannot make the freepages array writable
856                  * in /proc, but have to add a separate extra_free_target
857                  * for people who require it to catch load spikes in eg.
858                  * gigabit ethernet routing...
859                  */
860                 freepages.min += mask;
861                 freepages.low += mask*2;
862                 freepages.high += mask*3;
863                 zone->zone_mem_map = mem_map + offset;
864                 zone->zone_start_mapnr = offset;
865                 zone->zone_start_paddr = zone_start_paddr;
866 
                    /*
                     * Attach every page to its zone.  Pages outside
                     * ZONE_HIGHMEM also get their ->virtual address, derived
                     * from the running physical address.
                     */
867                 for (i = 0; i < size; i++) {
868                         struct page *page = mem_map + offset + i;
869                         page->zone = zone;
870                         if (j != ZONE_HIGHMEM) {
871                                 page->virtual = __va(zone_start_paddr);
872                                 zone_start_paddr += PAGE_SIZE;
873                         }
874                 }
875 
876                 offset += size;
                    /*
                     * Allocate one bitmap per order.  'mask' starts as all
                     * ones and is doubled on each pass (~0UL << (i+1)), so
                     * 'size' is rounded up to a multiple of 2^(i+1) pages.
                     * The bitmap gets size >> i bits, converted to bytes
                     * (rounded up) and padded to a multiple of sizeof(long).
                     */
877                 mask = -1;
878                 for (i = 0; i < MAX_ORDER; i++) {
879                         unsigned long bitmap_size;
880 
881                         memlist_init(&zone->free_area[i].free_list);
882                         mask += mask;
883                         size = (size + ~mask) & mask;
884                         bitmap_size = size >> i;
885                         bitmap_size = (bitmap_size + 7) >> 3;
886                         bitmap_size = LONG_ALIGN(bitmap_size);
887                         zone->free_area[i].map = 
888                           (unsigned int *) alloc_bootmem_node(pgdat, bitmap_size);
889                 }
890         }
891         build_zonelists(pgdat);
892 }
893 
894 void __init free_area_init(unsigned long *zones_size)
895 {
896         free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
897 }
898 
899 static int __init setup_mem_frac(char *str)
900 {
901         int j = 0;
902 
903         while (get_option(&str, &zone_balance_ratio[j++]) == 2);
904         printk("setup_mem_frac: ");
905         for (j = 0; j < MAX_NR_ZONES; j++) printk("%d  ", zone_balance_ratio[j]);
906         printk("\n");
907         return 1;
908 }
909 
910 __setup("memfrac=", setup_mem_frac);
911 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.