~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

Linux Cross Reference
Linux/mm/vmscan.c

Version: ~ [ 2.4.0 ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /*
  2  *  linux/mm/vmscan.c
  3  *
  4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
  5  *
  6  *  Swap reorganised 29.12.95, Stephen Tweedie.
  7  *  kswapd added: 7.1.96  sct
  8  *  Removed kswapd_ctl limits, and swap out as many pages as needed
  9  *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 10  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 11  *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 12  *  Multiqueue VM started 5.8.00, Rik van Riel.
 13  */
 14 
 15 #include <linux/slab.h>
 16 #include <linux/kernel_stat.h>
 17 #include <linux/swap.h>
 18 #include <linux/swapctl.h>
 19 #include <linux/smp_lock.h>
 20 #include <linux/pagemap.h>
 21 #include <linux/init.h>
 22 #include <linux/highmem.h>
 23 #include <linux/file.h>
 24 
 25 #include <asm/pgalloc.h>
 26 
 27 /*
 28  * The swap-out functions return 1 if they successfully
 29  * threw something out, and we got a free page. It returns
 30  * zero if it couldn't do anything, and any other value
 31  * indicates it decreased rss, but the page was shared.
 32  *
 33  * NOTE! If it sleeps, it *must* return 1 to make sure we
 34  * don't continue with the swap-out. Otherwise we may be
 35  * using a process that no longer actually exists (it might
 36  * have died while we slept).
 37  */
 38 static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
 39 {
 40         pte_t pte;
 41         swp_entry_t entry;
 42         struct page * page;
 43         int onlist;
 44 
 45         pte = *page_table;
 46         if (!pte_present(pte))
 47                 goto out_failed;
 48         page = pte_page(pte);
 49         if ((!VALID_PAGE(page)) || PageReserved(page))
 50                 goto out_failed;
 51 
 52         if (!mm->swap_cnt)
 53                 return 1;
 54 
 55         mm->swap_cnt--;
 56 
 57         onlist = PageActive(page);
 58         /* Don't look at this pte if it's been accessed recently. */
 59         if (ptep_test_and_clear_young(page_table)) {
 60                 age_page_up(page);
 61                 goto out_failed;
 62         }
 63         if (!onlist)
 64                 /* The page is still mapped, so it can't be freeable... */
 65                 age_page_down_ageonly(page);
 66 
 67         /*
 68          * If the page is in active use by us, or if the page
 69          * is in active use by others, don't unmap it or
 70          * (worse) start unneeded IO.
 71          */
 72         if (page->age > 0)
 73                 goto out_failed;
 74 
 75         if (TryLockPage(page))
 76                 goto out_failed;
 77 
 78         /* From this point on, the odds are that we're going to
 79          * nuke this pte, so read and clear the pte.  This hook
 80          * is needed on CPUs which update the accessed and dirty
 81          * bits in hardware.
 82          */
 83         pte = ptep_get_and_clear(page_table);
 84         flush_tlb_page(vma, address);
 85 
 86         /*
 87          * Is the page already in the swap cache? If so, then
 88          * we can just drop our reference to it without doing
 89          * any IO - it's already up-to-date on disk.
 90          *
 91          * Return 0, as we didn't actually free any real
 92          * memory, and we should just continue our scan.
 93          */
 94         if (PageSwapCache(page)) {
 95                 entry.val = page->index;
 96                 if (pte_dirty(pte))
 97                         set_page_dirty(page);
 98 set_swap_pte:
 99                 swap_duplicate(entry);
100                 set_pte(page_table, swp_entry_to_pte(entry));
101 drop_pte:
102                 UnlockPage(page);
103                 mm->rss--;
104                 deactivate_page(page);
105                 page_cache_release(page);
106 out_failed:
107                 return 0;
108         }
109 
110         /*
111          * Is it a clean page? Then it must be recoverable
112          * by just paging it in again, and we can just drop
113          * it..
114          *
115          * However, this won't actually free any real
116          * memory, as the page will just be in the page cache
117          * somewhere, and as such we should just continue
118          * our scan.
119          *
120          * Basically, this just makes it possible for us to do
121          * some real work in the future in "refill_inactive()".
122          */
123         flush_cache_page(vma, address);
124         if (!pte_dirty(pte))
125                 goto drop_pte;
126 
127         /*
128          * Ok, it's really dirty. That means that
129          * we should either create a new swap cache
130          * entry for it, or we should write it back
131          * to its own backing store.
132          */
133         if (page->mapping) {
134                 set_page_dirty(page);
135                 goto drop_pte;
136         }
137 
138         /*
139          * This is a dirty, swappable page.  First of all,
140          * get a suitable swap entry for it, and make sure
141          * we have the swap cache set up to associate the
142          * page with that swap entry.
143          */
144         entry = get_swap_page();
145         if (!entry.val)
146                 goto out_unlock_restore; /* No swap space left */
147 
148         /* Add it to the swap cache and mark it dirty */
149         add_to_swap_cache(page, entry);
150         set_page_dirty(page);
151         goto set_swap_pte;
152 
153 out_unlock_restore:
154         set_pte(page_table, pte);
155         UnlockPage(page);
156         return 0;
157 }
158 
159 /*
160  * A new implementation of swap_out().  We do not swap complete processes,
161  * but only a small number of blocks, before we continue with the next
162  * process.  The number of blocks actually swapped is determined on the
163  * number of page faults, that this process actually had in the last time,
164  * so we won't swap heavily used processes all the time ...
165  *
166  * Note: the priority argument is a hint on much CPU to waste with the
167  *       swap block search, not a hint, of how much blocks to swap with
168  *       each process.
169  *
170  * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
171  */
172 
173 static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
174 {
175         pte_t * pte;
176         unsigned long pmd_end;
177 
178         if (pmd_none(*dir))
179                 return 0;
180         if (pmd_bad(*dir)) {
181                 pmd_ERROR(*dir);
182                 pmd_clear(dir);
183                 return 0;
184         }
185         
186         pte = pte_offset(dir, address);
187         
188         pmd_end = (address + PMD_SIZE) & PMD_MASK;
189         if (end > pmd_end)
190                 end = pmd_end;
191 
192         do {
193                 int result;
194                 mm->swap_address = address + PAGE_SIZE;
195                 result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
196                 if (result)
197                         return result;
198                 address += PAGE_SIZE;
199                 pte++;
200         } while (address && (address < end));
201         return 0;
202 }
203 
204 static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
205 {
206         pmd_t * pmd;
207         unsigned long pgd_end;
208 
209         if (pgd_none(*dir))
210                 return 0;
211         if (pgd_bad(*dir)) {
212                 pgd_ERROR(*dir);
213                 pgd_clear(dir);
214                 return 0;
215         }
216 
217         pmd = pmd_offset(dir, address);
218 
219         pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;  
220         if (pgd_end && (end > pgd_end))
221                 end = pgd_end;
222         
223         do {
224                 int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask);
225                 if (result)
226                         return result;
227                 address = (address + PMD_SIZE) & PMD_MASK;
228                 pmd++;
229         } while (address && (address < end));
230         return 0;
231 }
232 
233 static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask)
234 {
235         pgd_t *pgdir;
236         unsigned long end;
237 
238         /* Don't swap out areas which are locked down */
239         if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
240                 return 0;
241 
242         pgdir = pgd_offset(mm, address);
243 
244         end = vma->vm_end;
245         if (address >= end)
246                 BUG();
247         do {
248                 int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask);
249                 if (result)
250                         return result;
251                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
252                 pgdir++;
253         } while (address && (address < end));
254         return 0;
255 }
256 
257 static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
258 {
259         int result = 0;
260         unsigned long address;
261         struct vm_area_struct* vma;
262 
263         /*
264          * Go through process' page directory.
265          */
266 
267         /*
268          * Find the proper vm-area after freezing the vma chain 
269          * and ptes.
270          */
271         spin_lock(&mm->page_table_lock);
272         address = mm->swap_address;
273         vma = find_vma(mm, address);
274         if (vma) {
275                 if (address < vma->vm_start)
276                         address = vma->vm_start;
277 
278                 for (;;) {
279                         result = swap_out_vma(mm, vma, address, gfp_mask);
280                         if (result)
281                                 goto out_unlock;
282                         vma = vma->vm_next;
283                         if (!vma)
284                                 break;
285                         address = vma->vm_start;
286                 }
287         }
288         /* Reset to 0 when we reach the end of address space */
289         mm->swap_address = 0;
290         mm->swap_cnt = 0;
291 
292 out_unlock:
293         spin_unlock(&mm->page_table_lock);
294         return result;
295 }
296 
297 /*
298  * Select the task with maximal swap_cnt and try to swap out a page.
299  * N.B. This function returns only 0 or 1.  Return values != 1 from
300  * the lower level routines result in continued processing.
301  */
302 #define SWAP_SHIFT 5
303 #define SWAP_MIN 8
304 
305 static int swap_out(unsigned int priority, int gfp_mask)
306 {
307         int counter;
308         int __ret = 0;
309 
310         /* 
311          * We make one or two passes through the task list, indexed by 
312          * assign = {0, 1}:
313          *   Pass 1: select the swappable task with maximal RSS that has
314          *         not yet been swapped out. 
315          *   Pass 2: re-assign rss swap_cnt values, then select as above.
316          *
317          * With this approach, there's no need to remember the last task
318          * swapped out.  If the swap-out fails, we clear swap_cnt so the 
319          * task won't be selected again until all others have been tried.
320          *
321          * Think of swap_cnt as a "shadow rss" - it tells us which process
322          * we want to page out (always try largest first).
323          */
324         counter = (nr_threads << SWAP_SHIFT) >> priority;
325         if (counter < 1)
326                 counter = 1;
327 
328         for (; counter >= 0; counter--) {
329                 struct list_head *p;
330                 unsigned long max_cnt = 0;
331                 struct mm_struct *best = NULL;
332                 int assign = 0;
333                 int found_task = 0;
334         select:
335                 spin_lock(&mmlist_lock);
336                 p = init_mm.mmlist.next;
337                 for (; p != &init_mm.mmlist; p = p->next) {
338                         struct mm_struct *mm = list_entry(p, struct mm_struct, mmlist);
339                         if (mm->rss <= 0)
340                                 continue;
341                         found_task++;
342                         /* Refresh swap_cnt? */
343                         if (assign == 1) {
344                                 mm->swap_cnt = (mm->rss >> SWAP_SHIFT);
345                                 if (mm->swap_cnt < SWAP_MIN)
346                                         mm->swap_cnt = SWAP_MIN;
347                         }
348                         if (mm->swap_cnt > max_cnt) {
349                                 max_cnt = mm->swap_cnt;
350                                 best = mm;
351                         }
352                 }
353 
354                 /* Make sure it doesn't disappear */
355                 if (best)
356                         atomic_inc(&best->mm_users);
357                 spin_unlock(&mmlist_lock);
358 
359                 /*
360                  * We have dropped the tasklist_lock, but we
361                  * know that "mm" still exists: we are running
362                  * with the big kernel lock, and exit_mm()
363                  * cannot race with us.
364                  */
365                 if (!best) {
366                         if (!assign && found_task > 0) {
367                                 assign = 1;
368                                 goto select;
369                         }
370                         break;
371                 } else {
372                         __ret = swap_out_mm(best, gfp_mask);
373                         mmput(best);
374                         break;
375                 }
376         }
377         return __ret;
378 }
379 
380 
381 /**
382  * reclaim_page -       reclaims one page from the inactive_clean list
383  * @zone: reclaim a page from this zone
384  *
385  * The pages on the inactive_clean can be instantly reclaimed.
386  * The tests look impressive, but most of the time we'll grab
387  * the first page of the list and exit successfully.
388  */
389 struct page * reclaim_page(zone_t * zone)
390 {
391         struct page * page = NULL;
392         struct list_head * page_lru;
393         int maxscan;
394 
395         /*
396          * We only need the pagemap_lru_lock if we don't reclaim the page,
397          * but we have to grab the pagecache_lock before the pagemap_lru_lock
398          * to avoid deadlocks and most of the time we'll succeed anyway.
399          */
400         spin_lock(&pagecache_lock);
401         spin_lock(&pagemap_lru_lock);
402         maxscan = zone->inactive_clean_pages;
403         while ((page_lru = zone->inactive_clean_list.prev) !=
404                         &zone->inactive_clean_list && maxscan--) {
405                 page = list_entry(page_lru, struct page, lru);
406 
407                 /* Wrong page on list?! (list corruption, should not happen) */
408                 if (!PageInactiveClean(page)) {
409                         printk("VM: reclaim_page, wrong page on list.\n");
410                         list_del(page_lru);
411                         page->zone->inactive_clean_pages--;
412                         continue;
413                 }
414 
415                 /* Page is or was in use?  Move it to the active list. */
416                 if (PageTestandClearReferenced(page) || page->age > 0 ||
417                                 (!page->buffers && page_count(page) > 1)) {
418                         del_page_from_inactive_clean_list(page);
419                         add_page_to_active_list(page);
420                         continue;
421                 }
422 
423                 /* The page is dirty, or locked, move to inactive_dirty list. */
424                 if (page->buffers || PageDirty(page) || TryLockPage(page)) {
425                         del_page_from_inactive_clean_list(page);
426                         add_page_to_inactive_dirty_list(page);
427                         continue;
428                 }
429 
430                 /* OK, remove the page from the caches. */
431                 if (PageSwapCache(page)) {
432                         __delete_from_swap_cache(page);
433                         goto found_page;
434                 }
435 
436                 if (page->mapping) {
437                         __remove_inode_page(page);
438                         goto found_page;
439                 }
440 
441                 /* We should never ever get here. */
442                 printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
443                 list_del(page_lru);
444                 zone->inactive_clean_pages--;
445                 UnlockPage(page);
446         }
447         /* Reset page pointer, maybe we encountered an unfreeable page. */
448         page = NULL;
449         goto out;
450 
451 found_page:
452         del_page_from_inactive_clean_list(page);
453         UnlockPage(page);
454         page->age = PAGE_AGE_START;
455         if (page_count(page) != 1)
456                 printk("VM: reclaim_page, found page with count %d!\n",
457                                 page_count(page));
458 out:
459         spin_unlock(&pagemap_lru_lock);
460         spin_unlock(&pagecache_lock);
461         memory_pressure++;
462         return page;
463 }
464 
465 /**
466  * page_launder - clean dirty inactive pages, move to inactive_clean list
467  * @gfp_mask: what operations we are allowed to do
468  * @sync: should we wait synchronously for the cleaning of pages
469  *
470  * When this function is called, we are most likely low on free +
471  * inactive_clean pages. Since we want to refill those pages as
472  * soon as possible, we'll make two loops over the inactive list,
473  * one to move the already cleaned pages to the inactive_clean lists
474  * and one to (often asynchronously) clean the dirty inactive pages.
475  *
476  * In situations where kswapd cannot keep up, user processes will
477  * end up calling this function. Since the user process needs to
478  * have a page before it can continue with its allocation, we'll
479  * do synchronous page flushing in that case.
480  *
481  * This code is heavily inspired by the FreeBSD source code. Thanks
482  * go out to Matthew Dillon.
483  */
484 #define MAX_LAUNDER             (4 * (1 << page_cluster))
485 int page_launder(int gfp_mask, int sync)
486 {
487         int launder_loop, maxscan, cleaned_pages, maxlaunder;
488         int can_get_io_locks;
489         struct list_head * page_lru;
490         struct page * page;
491 
492         /*
493          * We can only grab the IO locks (eg. for flushing dirty
494          * buffers to disk) if __GFP_IO is set.
495          */
496         can_get_io_locks = gfp_mask & __GFP_IO;
497 
498         launder_loop = 0;
499         maxlaunder = 0;
500         cleaned_pages = 0;
501 
502 dirty_page_rescan:
503         spin_lock(&pagemap_lru_lock);
504         maxscan = nr_inactive_dirty_pages;
505         while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
506                                 maxscan-- > 0) {
507                 page = list_entry(page_lru, struct page, lru);
508 
509                 /* Wrong page on list?! (list corruption, should not happen) */
510                 if (!PageInactiveDirty(page)) {
511                         printk("VM: page_launder, wrong page on list.\n");
512                         list_del(page_lru);
513                         nr_inactive_dirty_pages--;
514                         page->zone->inactive_dirty_pages--;
515                         continue;
516                 }
517 
518                 /* Page is or was in use?  Move it to the active list. */
519                 if (PageTestandClearReferenced(page) || page->age > 0 ||
520                                 (!page->buffers && page_count(page) > 1) ||
521                                 page_ramdisk(page)) {
522                         del_page_from_inactive_dirty_list(page);
523                         add_page_to_active_list(page);
524                         continue;
525                 }
526 
527                 /*
528                  * The page is locked. IO in progress?
529                  * Move it to the back of the list.
530                  */
531                 if (TryLockPage(page)) {
532                         list_del(page_lru);
533                         list_add(page_lru, &inactive_dirty_list);
534                         continue;
535                 }
536 
537                 /*
538                  * Dirty swap-cache page? Write it out if
539                  * last copy..
540                  */
541                 if (PageDirty(page)) {
542                         int (*writepage)(struct page *) = page->mapping->a_ops->writepage;
543                         int result;
544 
545                         if (!writepage)
546                                 goto page_active;
547 
548                         /* First time through? Move it to the back of the list */
549                         if (!launder_loop) {
550                                 list_del(page_lru);
551                                 list_add(page_lru, &inactive_dirty_list);
552                                 UnlockPage(page);
553                                 continue;
554                         }
555 
556                         /* OK, do a physical asynchronous write to swap.  */
557                         ClearPageDirty(page);
558                         page_cache_get(page);
559                         spin_unlock(&pagemap_lru_lock);
560 
561                         result = writepage(page);
562                         page_cache_release(page);
563 
564                         /* And re-start the thing.. */
565                         spin_lock(&pagemap_lru_lock);
566                         if (result != 1)
567                                 continue;
568                         /* writepage refused to do anything */
569                         set_page_dirty(page);
570                         goto page_active;
571                 }
572 
573                 /*
574                  * If the page has buffers, try to free the buffer mappings
575                  * associated with this page. If we succeed we either free
576                  * the page (in case it was a buffercache only page) or we
577                  * move the page to the inactive_clean list.
578                  *
579                  * On the first round, we should free all previously cleaned
580                  * buffer pages
581                  */
582                 if (page->buffers) {
583                         int wait, clearedbuf;
584                         int freed_page = 0;
585                         /*
586                          * Since we might be doing disk IO, we have to
587                          * drop the spinlock and take an extra reference
588                          * on the page so it doesn't go away from under us.
589                          */
590                         del_page_from_inactive_dirty_list(page);
591                         page_cache_get(page);
592                         spin_unlock(&pagemap_lru_lock);
593 
594                         /* Will we do (asynchronous) IO? */
595                         if (launder_loop && maxlaunder == 0 && sync)
596                                 wait = 2;       /* Synchrounous IO */
597                         else if (launder_loop && maxlaunder-- > 0)
598                                 wait = 1;       /* Async IO */
599                         else
600                                 wait = 0;       /* No IO */
601 
602                         /* Try to free the page buffers. */
603                         clearedbuf = try_to_free_buffers(page, wait);
604 
605                         /*
606                          * Re-take the spinlock. Note that we cannot
607                          * unlock the page yet since we're still
608                          * accessing the page_struct here...
609                          */
610                         spin_lock(&pagemap_lru_lock);
611 
612                         /* The buffers were not freed. */
613                         if (!clearedbuf) {
614                                 add_page_to_inactive_dirty_list(page);
615 
616                         /* The page was only in the buffer cache. */
617                         } else if (!page->mapping) {
618                                 atomic_dec(&buffermem_pages);
619                                 freed_page = 1;
620                                 cleaned_pages++;
621 
622                         /* The page has more users besides the cache and us. */
623                         } else if (page_count(page) > 2) {
624                                 add_page_to_active_list(page);
625 
626                         /* OK, we "created" a freeable page. */
627                         } else /* page->mapping && page_count(page) == 2 */ {
628                                 add_page_to_inactive_clean_list(page);
629                                 cleaned_pages++;
630                         }
631 
632                         /*
633                          * Unlock the page and drop the extra reference.
634                          * We can only do it here because we ar accessing
635                          * the page struct above.
636                          */
637                         UnlockPage(page);
638                         page_cache_release(page);
639 
640                         /* 
641                          * If we're freeing buffer cache pages, stop when
642                          * we've got enough free memory.
643                          */
644                         if (freed_page && !free_shortage())
645                                 break;
646                         continue;
647                 } else if (page->mapping && !PageDirty(page)) {
648                         /*
649                          * If a page had an extra reference in
650                          * deactivate_page(), we will find it here.
651                          * Now the page is really freeable, so we
652                          * move it to the inactive_clean list.
653                          */
654                         del_page_from_inactive_dirty_list(page);
655                         add_page_to_inactive_clean_list(page);
656                         UnlockPage(page);
657                         cleaned_pages++;
658                 } else {
659 page_active:
660                         /*
661                          * OK, we don't know what to do with the page.
662                          * It's no use keeping it here, so we move it to
663                          * the active list.
664                          */
665                         del_page_from_inactive_dirty_list(page);
666                         add_page_to_active_list(page);
667                         UnlockPage(page);
668                 }
669         }
670         spin_unlock(&pagemap_lru_lock);
671 
672         /*
673          * If we don't have enough free pages, we loop back once
674          * to queue the dirty pages for writeout. When we were called
675          * by a user process (that /needs/ a free page) and we didn't
676          * free anything yet, we wait synchronously on the writeout of
677          * MAX_SYNC_LAUNDER pages.
678          *
679          * We also wake up bdflush, since bdflush should, under most
680          * loads, flush out the dirty pages before we have to wait on
681          * IO.
682          */
683         if (can_get_io_locks && !launder_loop && free_shortage()) {
684                 launder_loop = 1;
685                 /* If we cleaned pages, never do synchronous IO. */
686                 if (cleaned_pages)
687                         sync = 0;
688                 /* We only do a few "out of order" flushes. */
689                 maxlaunder = MAX_LAUNDER;
690                 /* Kflushd takes care of the rest. */
691                 wakeup_bdflush(0);
692                 goto dirty_page_rescan;
693         }
694 
695         /* Return the number of pages moved to the inactive_clean list. */
696         return cleaned_pages;
697 }
698 
699 /**
700  * refill_inactive_scan - scan the active list and find pages to deactivate
701  * @priority: the priority at which to scan
702  * @oneshot: exit after deactivating one page
703  *
704  * This function will scan a portion of the active list to find
705  * unused pages, those pages will then be moved to the inactive list.
706  */
707 int refill_inactive_scan(unsigned int priority, int oneshot)
708 {
709         struct list_head * page_lru;
710         struct page * page;
711         int maxscan, page_active = 0;
712         int ret = 0;
713 
714         /* Take the lock while messing with the list... */
715         spin_lock(&pagemap_lru_lock);
716         maxscan = nr_active_pages >> priority;
717         while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
718                 page = list_entry(page_lru, struct page, lru);
719 
720                 /* Wrong page on list?! (list corruption, should not happen) */
721                 if (!PageActive(page)) {
722                         printk("VM: refill_inactive, wrong page on list.\n");
723                         list_del(page_lru);
724                         nr_active_pages--;
725                         continue;
726                 }
727 
728                 /* Do aging on the pages. */
729                 if (PageTestandClearReferenced(page)) {
730                         age_page_up_nolock(page);
731                         page_active = 1;
732                 } else {
733                         age_page_down_ageonly(page);
734                         /*
735                          * Since we don't hold a reference on the page
736                          * ourselves, we have to do our test a bit more
737                          * strict then deactivate_page(). This is needed
738                          * since otherwise the system could hang shuffling
739                          * unfreeable pages from the active list to the
740                          * inactive_dirty list and back again...
741                          *
742                          * SUBTLE: we can have buffer pages with count 1.
743                          */
744                         if (page->age == 0 && page_count(page) <=
745                                                 (page->buffers ? 2 : 1)) {
746                                 deactivate_page_nolock(page);
747                                 page_active = 0;
748                         } else {
749                                 page_active = 1;
750                         }
751                 }
752                 /*
753                  * If the page is still on the active list, move it
754                  * to the other end of the list. Otherwise it was
755                  * deactivated by age_page_down and we exit successfully.
756                  */
757                 if (page_active || PageActive(page)) {
758                         list_del(page_lru);
759                         list_add(page_lru, &active_list);
760                 } else {
761                         ret = 1;
762                         if (oneshot)
763                                 break;
764                 }
765         }
766         spin_unlock(&pagemap_lru_lock);
767 
768         return ret;
769 }
770 
771 /*
772  * Check if there are zones with a severe shortage of free pages, or if the system as a whole has a minor shortage.
773  * Returns the shortage in pages: the global deficit if there is one, otherwise the summed per-zone deficits (0 if none).
774  */
775 int free_shortage(void)
776 {
777         pg_data_t *pgdat = pgdat_list;  /* NOTE(review): dereferenced before any NULL test below -- assumes at least one node */
778         int sum = 0;    /* accumulated per-zone deficit */
779         int freeable = nr_free_pages() + nr_inactive_clean_pages();     /* free pages plus inactive_clean pages */
780         int freetarget = freepages.high + inactive_target / 3;
781 
782         /* Are we low on free pages globally? If so that deficit is returned and the zones are not examined. */
783         if (freeable < freetarget)
784                 return freetarget - freeable;
785 
786         /* If not, are we very low on any particular zone? Walk every zone of every node. */
787         do {
788                 int i;
789                 for(i = 0; i < MAX_NR_ZONES; i++) {
790                         zone_t *zone = pgdat->node_zones+ i;
791                         if (zone->size && (zone->inactive_clean_pages +     /* zones with size 0 are skipped */
792                                         zone->free_pages < zone->pages_min+1)) {
793                                 /* + 1 to have overlap with alloc_pages() !! */
794                                 sum += zone->pages_min + 1;     /* these three lines add (pages_min + 1) - free - inactive_clean */
795                                 sum -= zone->free_pages;
796                                 sum -= zone->inactive_clean_pages;
797                         }
798                 }
799                 pgdat = pgdat->node_next;       /* the walk ends when node_next is NULL */
800         } while (pgdat);
801 
802         return sum;
803 }
804 
805 /*
806  * How many inactive pages are we short? Returns (freepages.high + inactive_target) minus the free, inactive_clean and inactive_dirty page counts, or 0 if that is not positive.
807  */
808 int inactive_shortage(void)
809 {
810         int shortage = 0;
811 
812         shortage += freepages.high;             /* what we want to have ... */
813         shortage += inactive_target;
814         shortage -= nr_free_pages();            /* ... minus what we already have */
815         shortage -= nr_inactive_clean_pages();
816         shortage -= nr_inactive_dirty_pages;
817 
818         if (shortage > 0)
819                 return shortage;
820         /* A surplus is reported as "no shortage". */
821         return 0;
822 }
823 
824 /*
825  * We need to make the locks finer granularity, but right
826  * now we need this so that we can do page allocations
827  * without holding the kernel lock etc.
828  *
829  * We want to try to free "count" pages (inactive_shortage() + free_shortage()), and we want to
830  * cluster them so that we get good swap-out behaviour.
831  *
832  * OTOH, if we're a user process (and not kswapd), we
833  * really care about latency. In that case we don't try
834  * to free too many pages: count is set to 1 << page_cluster instead. Returns 1 if any progress was made, else 0.
835  */
836 static int refill_inactive(unsigned int gfp_mask, int user)
837 {
838         int priority, count, start_count, made_progress;
839 
840         count = inactive_shortage() + free_shortage();
841         if (user)
842                 count = (1 << page_cluster);    /* replaces the shortage-based target computed above */
843         start_count = count;
844 
845         /* Always trim SLAB caches when memory gets low. */
846         kmem_cache_reap(gfp_mask);
847 
848         priority = 6;   /* decremented only after a pass that made no progress; the loop ends once it drops below 0 */
849         do {
850                 made_progress = 0;
851 
852                 if (current->need_resched) {    /* a reschedule was requested: give up the CPU before the next pass */
853                         __set_current_state(TASK_RUNNING);
854                         schedule();
855                 }
856 
857                 while (refill_inactive_scan(priority, 1)) {     /* every nonzero return counts as one unit of progress */
858                         made_progress = 1;
859                         if (--count <= 0)
860                                 goto done;
861                 }
862 
863                 /*
864                  * don't be too light against the d/i cache since
865                  * refill_inactive() almost never fails when there's
866                  * really plenty of memory free.
867                  */
868                 shrink_dcache_memory(priority, gfp_mask);
869                 shrink_icache_memory(priority, gfp_mask);
870 
871                 /*
872                  * Then, try to page stuff out..
873                  */
874                 while (swap_out(priority, gfp_mask)) {  /* counted the same way as the scan loop above */
875                         made_progress = 1;
876                         if (--count <= 0)
877                                 goto done;
878                 }
879 
880                 /*
881                  * If we either have enough free memory, or if
882                  * page_launder() will be able to make enough
883                  * free memory, then stop (either shortage being gone is enough).
884                  */
885                 if (!inactive_shortage() || !free_shortage())
886                         goto done;
887 
888                 /*
889                  * Only switch to a lower "priority" if we
890                  * didn't make any useful progress in the
891                  * last loop.
892                  */
893                 if (!made_progress)
894                         priority--;
895         } while (priority >= 0);
896 
897         /* Always end on a refill_inactive.., may sleep... */
898         while (refill_inactive_scan(0, 1)) {    /* final scan at priority 0 */
899                 if (--count <= 0)
900                         goto done;
901         }
902 
903 done:
904         return (count < start_count);   /* count is only ever decremented, so this is 1 iff something succeeded */
905 }
906 /* One reclaim pass: launder inactive_dirty pages if needed, then refill the inactive list if a shortage remains. Returns page_launder() + refill_inactive() results, or 1 when no refill was needed. */
907 static int do_try_to_free_pages(unsigned int gfp_mask, int user)
908 {
909         int ret = 0;
910 
911         /*
912          * If we're low on free pages, move pages from the
913          * inactive_dirty list to the inactive_clean list.
914          *
915          * Usually bdflush will have pre-cleaned the pages
916          * before we get around to moving them to the other
917          * list, so this is a relatively cheap operation.
918          */
919         if (free_shortage() || nr_inactive_dirty_pages > nr_free_pages() +      /* also launder when dirty pages outnumber free + clean */
920                         nr_inactive_clean_pages())
921                 ret += page_launder(gfp_mask, user);
922 
923         /*
924          * If needed, we move pages from the active list
925          * to the inactive list. We also "eat" pages from
926          * the inode and dentry cache whenever we do this.
927          */
928         if (free_shortage() || inactive_shortage()) {   /* shortages are re-evaluated after the laundering above */
929                 shrink_dcache_memory(6, gfp_mask);
930                 shrink_icache_memory(6, gfp_mask);
931                 ret += refill_inactive(gfp_mask, user);
932         } else {
933                 /*
934                  * Reclaim unused slab cache memory.
935                  */
936                 kmem_cache_reap(gfp_mask);
937                 ret = 1;        /* no shortage left: report success whatever page_launder() returned */
938         }
939 
940         return ret;
941 }
942 
943 DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);   /* kswapd sleeps on this queue between passes */
944 DECLARE_WAIT_QUEUE_HEAD(kswapd_done);   /* blocking wakeup_kswapd() callers wait here; kswapd wakes them all after each pass */
945 struct task_struct *kswapd_task;        /* set by kswapd() to its own task; lets wakeup_kswapd() recognise kswapd */
946 
947 /*
948  * The background pageout daemon, started as a kernel thread
949  * from the init process. 
950  *
951  * This basically trickles out pages so that we have _some_
952  * free memory available even if there is no other activity
953  * that frees anything up. This is needed for things like routing
954  * etc, where we otherwise might have all activity going on in
955  * asynchronous contexts that cannot page things out.
956  *
957  * If there are applications that are active memory-allocators
958  * (most normal use), this basically shouldn't matter. The argument is unused and the main loop never exits.
959  */
960 int kswapd(void *unused)
961 {
962         struct task_struct *tsk = current;
963 
964         tsk->session = 1;
965         tsk->pgrp = 1;
966         strcpy(tsk->comm, "kswapd");
967         sigfillset(&tsk->blocked);      /* block every signal */
968         kswapd_task = tsk;              /* published so wakeup_kswapd() can detect calls made by kswapd itself */
969         
970         /*
971          * Tell the memory management that we're a "memory allocator",
972          * and that if we need more memory we should get access to it
973          * regardless (see "__alloc_pages()"). "kswapd" should
974          * never get caught in the normal page freeing logic.
975          *
976          * (Kswapd normally doesn't need memory anyway, but sometimes
977          * you need a small amount of memory in order to be able to
978          * page out something else, and this flag essentially protects
979          * us from recursively trying to free more memory as we're
980          * trying to free the first piece of memory in the first place).
981          */
982         tsk->flags |= PF_MEMALLOC;
983 
984         /*
985          * Kswapd main loop.
986          */
987         for (;;) {
988                 static unsigned long recalc = 0;        /* jiffies stamp of the last stats update; must have jiffies' type so time_after() stays correct when the counter grows or wraps */
989 
990                 /* If needed, try to free some memory. */
991                 if (inactive_shortage() || free_shortage()) {
992                         int wait = 0;
993                         /* Do we need to do some synchronous flushing? Yes if somebody is blocked waiting for us. */
994                         if (waitqueue_active(&kswapd_done))
995                                 wait = 1;
996                         do_try_to_free_pages(GFP_KSWAPD, wait);
997                 }
998 
999                 /*
1000                  * Do some (very minimal) background scanning. This
1001                  * will scan all pages on the active list once
1002                  * every minute. This clears old referenced bits
1003                  * and moves unused pages to the inactive list.
1004                  */
1005                 refill_inactive_scan(6, 0);
1006 
1007                 /* Once a second, recalculate some VM stats. */
1008                 if (time_after(jiffies, recalc + HZ)) {
1009                         recalc = jiffies;
1010                         recalculate_vm_stats();
1011                 }
1012 
1013                 /*
1014                  * Wake up everybody waiting for free memory
1015                  * and unplug the disk queue.
1016                  */
1017                 wake_up_all(&kswapd_done);
1018                 run_task_queue(&tq_disk);
1019 
1020                 /* 
1021                  * We go to sleep if either the free page shortage
1022                  * or the inactive page shortage is gone. We do this
1023                  * because:
1024                  * 1) we need no more free pages   or
1025                  * 2) the inactive pages need to be flushed to disk,
1026                  *    it wouldn't help to eat CPU time now ...
1027                  *
1028                  * We go to sleep for one second, but if it's needed
1029                  * we'll be woken up earlier...
1030                  */
1031                 if (!free_shortage() || !inactive_shortage()) {
1032                         interruptible_sleep_on_timeout(&kswapd_wait, HZ);
1033                 /*
1034                  * If we couldn't free enough memory, we see if it was
1035                  * due to the system just not having enough memory.
1036                  * If that is the case, the only solution is to kill
1037                  * a process (the alternative is eternal deadlock).
1038                  *
1039                  * If there still is enough memory around, we just loop
1040                  * and try free some more memory...
1041                  */
1042                 } else if (out_of_memory()) {
1043                         oom_kill();
1044                 }
1045         }
1046 }
1047 /* Wake up kswapd. If block is nonzero the caller also sleeps (uninterruptibly) on kswapd_done until it is woken. Does nothing when called by kswapd itself. */
1048 void wakeup_kswapd(int block)
1049 {
1050         DECLARE_WAITQUEUE(wait, current);
1051 
1052         if (current == kswapd_task)     /* kswapd must not wait for itself */
1053                 return;
1054 
1055         if (!block) {   /* non-blocking: wake kswapd only if something sleeps on kswapd_wait, then return */
1056                 if (waitqueue_active(&kswapd_wait))
1057                         wake_up(&kswapd_wait);
1058                 return;
1059         }
1060 
1061         /*
1062          * Kswapd could wake us up before we get a chance
1063          * to sleep, so we have to be very careful here to
1064          * prevent SMP races...
1065          */
1066         __set_current_state(TASK_UNINTERRUPTIBLE);      /* state and queue entry are set up before kswapd is woken */
1067         add_wait_queue(&kswapd_done, &wait);
1068 
1069         if (waitqueue_active(&kswapd_wait))
1070                 wake_up(&kswapd_wait);
1071         schedule();     /* sleep here; kswapd's main loop calls wake_up_all(&kswapd_done) */
1072 
1073         remove_wait_queue(&kswapd_done, &wait);
1074         __set_current_state(TASK_RUNNING);
1075 }
1076 
1077 /*
1078  * Called by non-kswapd processes when they want more
1079  * memory but are unable to sleep on kswapd because
1080  * they might be holding some IO locks ... Returns 1 without doing anything if gfp_mask lacks __GFP_WAIT, otherwise the result of do_try_to_free_pages(gfp_mask, 1).
1081  */
1082 int try_to_free_pages(unsigned int gfp_mask)
1083 {
1084         int ret = 1;    /* value returned when the caller is not allowed to wait */
1085 
1086         if (gfp_mask & __GFP_WAIT) {
1087                 current->flags |= PF_MEMALLOC;  /* set only for the duration of the call (kswapd sets the same flag on itself) */
1088                 ret = do_try_to_free_pages(gfp_mask, 1);        /* user == 1 */
1089                 current->flags &= ~PF_MEMALLOC; /* NOTE(review): cleared unconditionally, even if it was already set on entry -- confirm callers never hold it */
1090         }
1091 
1092         return ret;
1093 }
1094 
1095 DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait);        /* kreclaimd sleeps on this queue between passes */
1096 /*
1097  * Kreclaimd will move pages from the inactive_clean list to the
1098  * free list, in order to keep atomic allocations possible under
1099  * all circumstances. Even when kswapd is blocked on IO. The argument is unused and the loop never exits.
1100  */
1101 int kreclaimd(void *unused)
1102 {
1103         struct task_struct *tsk = current;
1104         pg_data_t *pgdat;
1105 
1106         tsk->session = 1;
1107         tsk->pgrp = 1;
1108         strcpy(tsk->comm, "kreclaimd");
1109         sigfillset(&tsk->blocked);      /* block every signal */
1110         current->flags |= PF_MEMALLOC;  /* same flag kswapd sets on itself */
1111 
1112         while (1) {
1113 
1114                 /*
1115                  * We sleep until someone wakes us up from
1116                  * page_alloc.c::__alloc_pages().
1117                  */
1118                 interruptible_sleep_on(&kreclaimd_wait);
1119 
1120                 /*
1121                  * Move some pages from the inactive_clean lists to
1122                  * the free lists, if it is needed.
1123                  */
1124                 pgdat = pgdat_list;     /* every wake-up restarts the walk at the first node */
1125                 do {
1126                         int i;
1127                         for(i = 0; i < MAX_NR_ZONES; i++) {
1128                                 zone_t *zone = pgdat->node_zones + i;
1129                                 if (!zone->size)        /* skip zones with size 0 */
1130                                         continue;
1131 
1132                                 while (zone->free_pages < zone->pages_low) {    /* top the zone up to pages_low ... */
1133                                         struct page * page;
1134                                         page = reclaim_page(zone);
1135                                         if (!page)      /* ... or stop as soon as reclaim_page() returns nothing */
1136                                                 break;
1137                                         __free_page(page);
1138                                 }
1139                         }
1140                         pgdat = pgdat->node_next;
1141                 } while (pgdat);
1142         }
1143 }
1144 
1145 /* Init hook registered through module_init(): prints the banner, calls swap_setup() and starts the kswapd and kreclaimd kernel threads. Always returns 0; the kernel_thread() results are not checked. */
1146 static int __init kswapd_init(void)
1147 {
1148         printk("Starting kswapd v1.8\n");
1149         swap_setup();
1150         kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
1151         kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
1152         return 0;
1153 }
1154 
1155 module_init(kswapd_init)
1156 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.