~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

Linux Cross Reference
Linux/mm/memory.c

Version: ~ [ 2.4.0 ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /*
  2  *  linux/mm/memory.c
  3  *
  4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
  5  */
  6 
  7 /*
  8  * demand-loading started 01.12.91 - seems it is high on the list of
  9  * things wanted, and it should be easy to implement. - Linus
 10  */
 11 
 12 /*
 13  * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 14  * pages started 02.12.91, seems to work. - Linus.
 15  *
 16  * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 17  * would have taken more than the 6M I have free, but it worked well as
 18  * far as I could see.
 19  *
 20  * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 21  */
 22 
 23 /*
 24  * Real VM (paging to/from disk) started 18.12.91. Much more work and
 25  * thought has to go into this. Oh, well..
 26  * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 27  *              Found it. Everything seems to work now.
 28  * 20.12.91  -  Ok, making the swap-device changeable like the root.
 29  */
 30 
 31 /*
 32  * 05.04.94  -  Multi-page memory management added for v1.1.
 33  *              Idea by Alex Bligh (alex@cconcepts.co.uk)
 34  *
 35  * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 36  *              (Gerhard.Wichert@pdb.siemens.de)
 37  */
 38 
 39 #include <linux/mm.h>
 40 #include <linux/mman.h>
 41 #include <linux/swap.h>
 42 #include <linux/smp_lock.h>
 43 #include <linux/swapctl.h>
 44 #include <linux/iobuf.h>
 45 #include <asm/uaccess.h>
 46 #include <asm/pgalloc.h>
 47 #include <linux/highmem.h>
 48 #include <linux/pagemap.h>
 49 
 50 
 51 unsigned long max_mapnr;        /* defined here; not referenced in the visible part of this file */
 52 unsigned long num_physpages;    /* NOTE(review): presumably the count of physical pages - confirm at its initialiser */
 53 void * high_memory;             /* NOTE(review): presumably the end of directly mapped memory - confirm */
 54 struct page *highmem_start_page;        /* NOTE(review): presumably the first highmem struct page - confirm */
 55 
 56 /*
 57  * We special-case the C-O-W ZERO_PAGE, because it's such
 58  * a common occurrence (no need to read the page to know
 59  * that it's zero - better for the cache and memory subsystem).
 60  */
 61 static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address)
 62 {
 63         if (from == ZERO_PAGE(address)) {
 64                 clear_user_highpage(to, address);
 65                 return;
 66         }
 67         copy_user_highpage(to, from, address);
 68 }
 69 
 70 mem_map_t * mem_map;    /* defined here; NOTE(review): presumably the base of the struct page array - confirm */
 71 
 72 /*
 73  * Note: this doesn't free the actual pages themselves. That
 74  * has been handled earlier when unmapping all the memory regions.
 75  */
 76 static inline void free_one_pmd(pmd_t * dir)
 77 {
 78         pte_t * pte;
 79 
 80         if (pmd_none(*dir))
 81                 return;
 82         if (pmd_bad(*dir)) {
 83                 pmd_ERROR(*dir);
 84                 pmd_clear(dir);
 85                 return;
 86         }
 87         pte = pte_offset(dir, 0);
 88         pmd_clear(dir);
 89         pte_free(pte);
 90 }
 91 
 92 static inline void free_one_pgd(pgd_t * dir)
 93 {
 94         int j;
 95         pmd_t * pmd;
 96 
 97         if (pgd_none(*dir))
 98                 return;
 99         if (pgd_bad(*dir)) {
100                 pgd_ERROR(*dir);
101                 pgd_clear(dir);
102                 return;
103         }
104         pmd = pmd_offset(dir, 0);
105         pgd_clear(dir);
106         for (j = 0; j < PTRS_PER_PMD ; j++)
107                 free_one_pmd(pmd+j);
108         pmd_free(pmd);
109 }
110 
111 /* Low and high watermarks for page table cache.
112    The system should try to have
       pgt_cache_water[0] <= cache elements <= pgt_cache_water[1].
       check_pgt_cache() hands both values to do_check_pgt_cache().
113  */
114 int pgt_cache_water[2] = { 25, 50 };
115 
116 /* Returns the number of pages freed */
117 int check_pgt_cache(void)
118 {
119         return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]);
120 }
121 
122 
123 /*
124  * This function clears all user-level page tables of a process - this
125  * is needed by execve(), so that old pages aren't in the way.
126  */
127 void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
128 {
129         pgd_t * page_dir = mm->pgd;
130 
131         page_dir += first;
132         do {
133                 free_one_pgd(page_dir);
134                 page_dir++;
135         } while (--nr);
136 
137         /* keep the page table cache within bounds */
138         check_pgt_cache();
139 }
140 
141 #define PTE_TABLE_MASK  ((PTRS_PER_PTE-1) * sizeof(pte_t))
142 #define PMD_TABLE_MASK  ((PMD_TABLE_MASK_DUMMY_NEVER_USED, PTRS_PER_PMD-1) * sizeof(pmd_t))
143 
144 /*
145  * copy one vm_area from one task to the other. Assumes the page tables
146  * already present in the new task to be cleared in the whole range
147  * covered by this vma.
148  *
149  * 08Jan98 Merged into one routine from several inline routines to reduce
150  *         variable count and make things faster. -jj
     *
     * The range [vma->vm_start, vma->vm_end) of 'src' is walked pgd by pgd,
     * pmd by pmd and pte by pte; missing pmd/pte tables are allocated in
     * 'dst' on the way.  The three nested loops of the former helper
     * routines are marked by the copy_pmd_range / copy_pte_range /
     * copy_one_pte comments.
     *
     * Returns 0 on success, -ENOMEM if a pmd or pte table for 'dst' could
     * not be allocated.
     * NOTE(review): on -ENOMEM nothing already copied into 'dst' is undone
     * here - presumably the caller tears the new mm down; confirm.
151  */
152 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
153                         struct vm_area_struct *vma)
154 {
155         pgd_t * src_pgd, * dst_pgd;
156         unsigned long address = vma->vm_start;
157         unsigned long end = vma->vm_end;
            /* true for a private mapping that may be written: VM_MAYWRITE set, VM_SHARED clear */
158         unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
159 
            /* start one slot early: the loop below increments before it looks */
160         src_pgd = pgd_offset(src, address)-1;
161         dst_pgd = pgd_offset(dst, address)-1;
162         
163         for (;;) {
164                 pmd_t * src_pmd, * dst_pmd;
165 
166                 src_pgd++; dst_pgd++;
167                 
168                 /* copy_pmd_range */
169                 
170                 if (pgd_none(*src_pgd))
171                         goto skip_copy_pmd_range;
172                 if (pgd_bad(*src_pgd)) {
173                         pgd_ERROR(*src_pgd);
174                         pgd_clear(src_pgd);
                            /* empty and bad source entries both land here: advance to the next pgd boundary */
175 skip_copy_pmd_range:    address = (address + PGDIR_SIZE) & PGDIR_MASK;
                            /* !address catches wrap-around past the top of the address space */
176                         if (!address || (address >= end))
177                                 goto out;
178                         continue;
179                 }
180                 if (pgd_none(*dst_pgd)) {
181                         if (!pmd_alloc(dst_pgd, 0))
182                                 goto nomem;
183                 }
184                 
185                 src_pmd = pmd_offset(src_pgd, address);
186                 dst_pmd = pmd_offset(dst_pgd, address);
187 
188                 do {
189                         pte_t * src_pte, * dst_pte;
190                 
191                         /* copy_pte_range */
192                 
193                         if (pmd_none(*src_pmd))
194                                 goto skip_copy_pte_range;
195                         if (pmd_bad(*src_pmd)) {
196                                 pmd_ERROR(*src_pmd);
197                                 pmd_clear(src_pmd);
                                    /*
                                     * empty and bad source entries both land here.
                                     * NOTE(review): unlike the pgd level there is no
                                     * !address wrap-around test on this path.
                                     */
198 skip_copy_pte_range:            address = (address + PMD_SIZE) & PMD_MASK;
199                                 if (address >= end)
200                                         goto out;
201                                 goto cont_copy_pmd_range;
202                         }
203                         if (pmd_none(*dst_pmd)) {
204                                 if (!pte_alloc(dst_pmd, 0))
205                                         goto nomem;
206                         }
207                         
208                         src_pte = pte_offset(src_pmd, address);
209                         dst_pte = pte_offset(dst_pmd, address);
210                         
211                         do {
212                                 pte_t pte = *src_pte;
213                                 struct page *ptepage;
214                                 
215                                 /* copy_one_pte */
216 
                                    /* nothing mapped: leave the destination entry untouched */
217                                 if (pte_none(pte))
218                                         goto cont_copy_pte_range_noset;
                                    /* not present but not empty: a swap entry - take another reference and copy it as is */
219                                 if (!pte_present(pte)) {
220                                         swap_duplicate(pte_to_swp_entry(pte));
221                                         goto cont_copy_pte_range;
222                                 }
223                                 ptepage = pte_page(pte);
                                    /* invalid or reserved pages are copied as is, without taking a page reference */
224                                 if ((!VALID_PAGE(ptepage)) || 
225                                     PageReserved(ptepage))
226                                         goto cont_copy_pte_range;
227 
228                                 /* If it's a COW mapping, write protect it both in the parent and the child */
229                                 if (cow) {
230                                         ptep_set_wrprotect(src_pte);
                                            /* re-read so the child gets the write-protected value */
231                                         pte = *src_pte;
232                                 }
233 
234                                 /* If it's a shared mapping, mark it clean in the child */
235                                 if (vma->vm_flags & VM_SHARED)
236                                         pte = pte_mkclean(pte);
237                                 pte = pte_mkold(pte);
                                    /* the new mapping in 'dst' holds its own reference on the page */
238                                 get_page(ptepage);
239 
240 cont_copy_pte_range:            set_pte(dst_pte, pte);
241 cont_copy_pte_range_noset:      address += PAGE_SIZE;
242                                 if (address >= end)
243                                         goto out;
244                                 src_pte++;
245                                 dst_pte++;
                            /* stop once src_pte has stepped onto the first slot of the next pte table */
246                         } while ((unsigned long)src_pte & PTE_TABLE_MASK);
247                 
248 cont_copy_pmd_range:    src_pmd++;
249                         dst_pmd++;
                    /* same trick one level up: leave when src_pmd wraps to a new pmd table */
250                 } while ((unsigned long)src_pmd & PMD_TABLE_MASK);
251         }
252 out:
253         return 0;
254 
255 nomem:
256         return -ENOMEM;
257 }
258 
259 /*
260  * Return indicates whether a page was freed so caller can adjust rss
261  */
262 static inline int free_pte(pte_t pte)
263 {
264         if (pte_present(pte)) {
265                 struct page *page = pte_page(pte);
266                 if ((!VALID_PAGE(page)) || PageReserved(page))
267                         return 0;
268                 /* 
269                  * free_page() used to be able to clear swap cache
270                  * entries.  We may now have to do it manually.  
271                  */
272                 if (pte_dirty(pte) && page->mapping)
273                         set_page_dirty(page);
274                 free_page_and_swap_cache(page);
275                 return 1;
276         }
277         swap_free(pte_to_swp_entry(pte));
278         return 0;
279 }
280 
281 static inline void forget_pte(pte_t page)
282 {
283         if (!pte_none(page)) {
284                 printk("forget_pte: old mapping existed!\n");
285                 free_pte(page);
286         }
287 }
288 
289 static inline int zap_pte_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size)
290 {
291         pte_t * pte;
292         int freed;
293 
294         if (pmd_none(*pmd))
295                 return 0;
296         if (pmd_bad(*pmd)) {
297                 pmd_ERROR(*pmd);
298                 pmd_clear(pmd);
299                 return 0;
300         }
301         pte = pte_offset(pmd, address);
302         address &= ~PMD_MASK;
303         if (address + size > PMD_SIZE)
304                 size = PMD_SIZE - address;
305         size >>= PAGE_SHIFT;
306         freed = 0;
307         for (;;) {
308                 pte_t page;
309                 if (!size)
310                         break;
311                 page = ptep_get_and_clear(pte);
312                 pte++;
313                 size--;
314                 if (pte_none(page))
315                         continue;
316                 freed += free_pte(page);
317         }
318         return freed;
319 }
320 
321 static inline int zap_pmd_range(struct mm_struct *mm, pgd_t * dir, unsigned long address, unsigned long size)
322 {
323         pmd_t * pmd;
324         unsigned long end;
325         int freed;
326 
327         if (pgd_none(*dir))
328                 return 0;
329         if (pgd_bad(*dir)) {
330                 pgd_ERROR(*dir);
331                 pgd_clear(dir);
332                 return 0;
333         }
334         pmd = pmd_offset(dir, address);
335         address &= ~PGDIR_MASK;
336         end = address + size;
337         if (end > PGDIR_SIZE)
338                 end = PGDIR_SIZE;
339         freed = 0;
340         do {
341                 freed += zap_pte_range(mm, pmd, address, end - address);
342                 address = (address + PMD_SIZE) & PMD_MASK; 
343                 pmd++;
344         } while (address < end);
345         return freed;
346 }
347 
348 /*
349  * remove user pages in a given range.
350  */
351 void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
352 {
353         pgd_t * dir;
354         unsigned long end = address + size;
355         int freed = 0;
356 
357         dir = pgd_offset(mm, address);
358 
359         /*
360          * This is a long-lived spinlock. That's fine.
361          * There's no contention, because the page table
362          * lock only protects against kswapd anyway, and
363          * even if kswapd happened to be looking at this
364          * process we _want_ it to get stuck.
365          */
366         if (address >= end)
367                 BUG();
368         spin_lock(&mm->page_table_lock);
369         do {
370                 freed += zap_pmd_range(mm, dir, address, end - address);
371                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
372                 dir++;
373         } while (address && (address < end));
374         spin_unlock(&mm->page_table_lock);
375         /*
376          * Update rss for the mm_struct (not necessarily current->mm)
377          * Notice that rss is an unsigned long.
378          */
379         if (mm->rss > freed)
380                 mm->rss -= freed;
381         else
382                 mm->rss = 0;
383 }
384 
385 
386 /*
387  * Do a quick page-table lookup for a single page. 
388  */
389 static struct page * follow_page(unsigned long address) 
390 {
391         pgd_t *pgd;
392         pmd_t *pmd;
393 
394         pgd = pgd_offset(current->mm, address);
395         pmd = pmd_offset(pgd, address);
396         if (pmd) {
397                 pte_t * pte = pte_offset(pmd, address);
398                 if (pte && pte_present(*pte))
399                         return pte_page(*pte);
400         }
401         
402         return NULL;
403 }
404 
405 /* 
406  * Given a physical address, is there a useful struct page pointing to
407  * it?  This may become more complex in the future if we start dealing
408  * with IO-aperture pages in kiobufs.
409  */
410 
411 static inline struct page * get_page_map(struct page *page)
412 {
413         if (!VALID_PAGE(page))
414                 return 0;
415         return page;
416 }
417 
418 /*
419  * Force in an entire range of pages from the current process's user VA,
420  * and pin them in physical memory.  
     *
     * rw:    READ means the pages will be written to, so the vma must be
     *        writable and the fault is taken as a write fault
     * iobuf: kiobuf to fill; must not already hold pages
     * va:    start of the user range (need not be page aligned)
     * len:   length of the range in bytes
     *
     * Returns 0 on success.  Errors: -EINVAL if the iobuf is already
     * mapped, whatever expand_kiobuf() returns, -EACCES if a vma lacks the
     * needed permission, -EFAULT for everything else.  On the error paths
     * taken after mmap_sem was acquired, pages pinned so far are released
     * through unmap_kiobuf().
     * NOTE(review): va + len is not checked for overflow here.
421  */
422 
    /* debug output is compiled out: dprintk() expands to nothing */
423 #define dprintk(x...)
424 int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len)
425 {
426         unsigned long           ptr, end;
427         int                     err;
428         struct mm_struct *      mm;
429         struct vm_area_struct * vma = 0;
430         struct page *           map;
431         int                     i;
432         int                     datain = (rw == READ);
433         
434         /* Make sure the iobuf is not already mapped somewhere. */
435         if (iobuf->nr_pages)
436                 return -EINVAL;
437 
438         mm = current->mm;
439         dprintk ("map_user_kiobuf: begin\n");
440         
            /* round the range out to whole pages */
441         ptr = va & PAGE_MASK;
442         end = (va + len + PAGE_SIZE - 1) & PAGE_MASK;
443         err = expand_kiobuf(iobuf, (end - ptr) >> PAGE_SHIFT);
444         if (err)
445                 return err;
446 
447         down(&mm->mmap_sem);
448 
            /* default error for every "goto out_unlock" that does not set its own */
449         err = -EFAULT;
450         iobuf->locked = 0;
451         iobuf->offset = va & ~PAGE_MASK;
452         iobuf->length = len;
453         
454         i = 0;
455         
456         /* 
457          * First of all, try to fault in all of the necessary pages
458          */
459         while (ptr < end) {
                    /* look up a new vma only when the cached one does not cover ptr */
460                 if (!vma || ptr >= vma->vm_end) {
461                         vma = find_vma(current->mm, ptr);
462                         if (!vma) 
463                                 goto out_unlock;
                            /* ptr lies below the vma: only acceptable if the vma can grow down to it */
464                         if (vma->vm_start > ptr) {
465                                 if (!(vma->vm_flags & VM_GROWSDOWN))
466                                         goto out_unlock;
467                                 if (expand_stack(vma, ptr))
468                                         goto out_unlock;
469                         }
                            /* VM_READ is always required; VM_WRITE as well when data comes in */
470                         if (((datain) && (!(vma->vm_flags & VM_WRITE))) ||
471                                         (!(vma->vm_flags & VM_READ))) {
472                                 err = -EACCES;
473                                 goto out_unlock;
474                         }
475                 }
                    /* a result <= 0 is treated as failure to bring the page in */
476                 if (handle_mm_fault(current->mm, vma, ptr, datain) <= 0) 
477                         goto out_unlock;
478                 spin_lock(&mm->page_table_lock);
479                 map = follow_page(ptr);
480                 if (!map) {
481                         spin_unlock(&mm->page_table_lock);
482                         dprintk (KERN_ERR "Missing page in map_user_kiobuf\n");
483                         goto out_unlock;
484                 }
485                 map = get_page_map(map);
486                 if (map) {
487                         flush_dcache_page(map);
                            /* pin: this reference is dropped again by unmap_kiobuf() */
488                         atomic_inc(&map->count);
489                 } else
490                         printk (KERN_INFO "Mapped page missing [%d]\n", i);
491                 spin_unlock(&mm->page_table_lock);
                    /* a page without a valid struct page is recorded as a NULL slot */
492                 iobuf->maplist[i] = map;
                    /* kept current on every pass so the error path unmaps exactly what was recorded */
493                 iobuf->nr_pages = ++i;
494                 
495                 ptr += PAGE_SIZE;
496         }
497 
498         up(&mm->mmap_sem);
499         dprintk ("map_user_kiobuf: end OK\n");
500         return 0;
501 
502  out_unlock:
503         up(&mm->mmap_sem);
504         unmap_kiobuf(iobuf);
505         dprintk ("map_user_kiobuf: end %d\n", err);
506         return err;
507 }
508 
509 
510 /*
511  * Unmap all of the pages referenced by a kiobuf.  We release the pages,
512  * and unlock them if they were locked. 
513  */
514 
515 void unmap_kiobuf (struct kiobuf *iobuf) 
516 {
517         int i;
518         struct page *map;
519         
520         for (i = 0; i < iobuf->nr_pages; i++) {
521                 map = iobuf->maplist[i];
522                 if (map) {
523                         if (iobuf->locked)
524                                 UnlockPage(map);
525                         __free_page(map);
526                 }
527         }
528         
529         iobuf->nr_pages = 0;
530         iobuf->locked = 0;
531 }
532 
533 
534 /*
535  * Lock down all of the pages of a kiovec for IO.
536  *
537  * If any page is mapped twice in the kiovec, we return the error -EINVAL.
538  *
539  * The optional wait parameter causes the lock call to block until all
540  * pages can be locked if set.  If wait==0, the lock operation is
541  * aborted if any locked pages are found and -EAGAIN is returned.
542  */
543 
544 int lock_kiovec(int nr, struct kiobuf *iovec[], int wait)
545 {
546         struct kiobuf *iobuf;
547         int i, j;
548         struct page *page, **ppage;
            /* counts the times the contended page turned out unlocked after our own unlock pass */
549         int doublepage = 0;
            /* counts failed passes; bounds the total number of attempts */
550         int repeat = 0;
551         
552  repeat:
553         
554         for (i = 0; i < nr; i++) {
555                 iobuf = iovec[i];
556 
                    /* kiobufs already marked locked are skipped entirely */
557                 if (iobuf->locked)
558                         continue;
                    /*
                     * NOTE(review): the kiobuf is marked locked before any of
                     * its pages are.  If a page below is contended, the
                     * unlock_kiovec() call under "retry" therefore also calls
                     * UnlockPage() on pages of this kiobuf that this pass never
                     * locked, including the contended page itself - which is
                     * what the PageLocked() test further down probes for.
                     */
559                 iobuf->locked = 1;
560 
561                 ppage = iobuf->maplist;
562                 for (j = 0; j < iobuf->nr_pages; ppage++, j++) {
563                         page = *ppage;
564                         if (!page)
565                                 continue;
566                         
                            /* non-zero: the page could not be locked by us */
567                         if (TryLockPage(page))
568                                 goto retry;
569                 }
570         }
571 
572         return 0;
573         
574  retry:
575         
576         /* 
577          * We couldn't lock one of the pages.  Undo the locking so far,
578          * wait on the page we got to, and try again.  
579          */
580         
581         unlock_kiovec(nr, iovec);
582         if (!wait)
583                 return -EAGAIN;
584         
585         /* 
586          * Did the release also unlock the page we got stuck on?
587          */
588         if (!PageLocked(page)) {
589                 /* 
590                  * If so, we may well have the page mapped twice
591                  * in the IO address range.  Bad news.  Of
592                  * course, it _might_ just be a coincidence,
593                  * but if it happens more than once, chances
594                  * are we have a double-mapped page. 
595                  */
596                 if (++doublepage >= 3) 
597                         return -EINVAL;
598                 
599                 /* Try again...  */
                    /* NOTE(review): wait_on_page() is only reached here, where the page was just seen unlocked */
600                 wait_on_page(page);
601         }
602         
            /* give up with -EAGAIN after 16 failed passes, even when wait is set */
603         if (++repeat < 16)
604                 goto repeat;
605         return -EAGAIN;
606 }
607 
608 /*
609  * Unlock all of the pages of a kiovec after IO.
610  */
611 
612 int unlock_kiovec(int nr, struct kiobuf *iovec[])
613 {
614         struct kiobuf *iobuf;
615         int i, j;
616         struct page *page, **ppage;
617         
618         for (i = 0; i < nr; i++) {
619                 iobuf = iovec[i];
620 
621                 if (!iobuf->locked)
622                         continue;
623                 iobuf->locked = 0;
624                 
625                 ppage = iobuf->maplist;
626                 for (j = 0; j < iobuf->nr_pages; ppage++, j++) {
627                         page = *ppage;
628                         if (!page)
629                                 continue;
630                         UnlockPage(page);
631                 }
632         }
633         return 0;
634 }
635 
636 static inline void zeromap_pte_range(pte_t * pte, unsigned long address,
637                                      unsigned long size, pgprot_t prot)
638 {
639         unsigned long end;
640 
641         address &= ~PMD_MASK;
642         end = address + size;
643         if (end > PMD_SIZE)
644                 end = PMD_SIZE;
645         do {
646                 pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot));
647                 pte_t oldpage = ptep_get_and_clear(pte);
648                 set_pte(pte, zero_pte);
649                 forget_pte(oldpage);
650                 address += PAGE_SIZE;
651                 pte++;
652         } while (address && (address < end));
653 }
654 
655 static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address,
656                                     unsigned long size, pgprot_t prot)
657 {
658         unsigned long end;
659 
660         address &= ~PGDIR_MASK;
661         end = address + size;
662         if (end > PGDIR_SIZE)
663                 end = PGDIR_SIZE;
664         do {
665                 pte_t * pte = pte_alloc(pmd, address);
666                 if (!pte)
667                         return -ENOMEM;
668                 zeromap_pte_range(pte, address, end - address, prot);
669                 address = (address + PMD_SIZE) & PMD_MASK;
670                 pmd++;
671         } while (address && (address < end));
672         return 0;
673 }
674 
675 int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
676 {
677         int error = 0;
678         pgd_t * dir;
679         unsigned long beg = address;
680         unsigned long end = address + size;
681 
682         dir = pgd_offset(current->mm, address);
683         flush_cache_range(current->mm, beg, end);
684         if (address >= end)
685                 BUG();
686         do {
687                 pmd_t *pmd = pmd_alloc(dir, address);
688                 error = -ENOMEM;
689                 if (!pmd)
690                         break;
691                 error = zeromap_pmd_range(pmd, address, end - address, prot);
692                 if (error)
693                         break;
694                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
695                 dir++;
696         } while (address && (address < end));
697         flush_tlb_range(current->mm, beg, end);
698         return error;
699 }
700 
701 /*
702  * maps a range of physical memory into the requested pages. the old
703  * mappings are removed. any references to nonexistent pages results
704  * in null mappings (currently treated as "copy-on-access")
705  */
706 static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
707         unsigned long phys_addr, pgprot_t prot)
708 {
709         unsigned long end;
710 
711         address &= ~PMD_MASK;
712         end = address + size;
713         if (end > PMD_SIZE)
714                 end = PMD_SIZE;
715         do {
716                 struct page *page;
717                 pte_t oldpage;
718                 oldpage = ptep_get_and_clear(pte);
719 
720                 page = virt_to_page(__va(phys_addr));
721                 if ((!VALID_PAGE(page)) || PageReserved(page))
722                         set_pte(pte, mk_pte_phys(phys_addr, prot));
723                 forget_pte(oldpage);
724                 address += PAGE_SIZE;
725                 phys_addr += PAGE_SIZE;
726                 pte++;
727         } while (address && (address < end));
728 }
729 
730 static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size,
731         unsigned long phys_addr, pgprot_t prot)
732 {
733         unsigned long end;
734 
735         address &= ~PGDIR_MASK;
736         end = address + size;
737         if (end > PGDIR_SIZE)
738                 end = PGDIR_SIZE;
739         phys_addr -= address;
740         do {
741                 pte_t * pte = pte_alloc(pmd, address);
742                 if (!pte)
743                         return -ENOMEM;
744                 remap_pte_range(pte, address, end - address, address + phys_addr, prot);
745                 address = (address + PMD_SIZE) & PMD_MASK;
746                 pmd++;
747         } while (address && (address < end));
748         return 0;
749 }
750 
751 /*  Note: this is only safe if the mm semaphore is held when called. */
752 int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
753 {
754         int error = 0;
755         pgd_t * dir;
756         unsigned long beg = from;
757         unsigned long end = from + size;
758 
759         phys_addr -= from;
760         dir = pgd_offset(current->mm, from);
761         flush_cache_range(current->mm, beg, end);
762         if (from >= end)
763                 BUG();
764         do {
765                 pmd_t *pmd = pmd_alloc(dir, from);
766                 error = -ENOMEM;
767                 if (!pmd)
768                         break;
769                 error = remap_pmd_range(pmd, from, end - from, phys_addr + from, prot);
770                 if (error)
771                         break;
772                 from = (from + PGDIR_SIZE) & PGDIR_MASK;
773                 dir++;
774         } while (from && (from < end));
775         flush_tlb_range(current->mm, beg, end);
776         return error;
777 }
778 
779 /*
780  * Establish a new mapping:
781  *  - flush the old one
782  *  - update the page tables
783  *  - inform the TLB about the new one
     *
     * In the code below the pte is written first, then the TLB entry for
     * 'address' in 'vma' is flushed, and finally update_mmu_cache() is
     * called with the new entry.
784  */
785 static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry)
786 {
787         set_pte(page_table, entry);             /* install the new entry */
788         flush_tlb_page(vma, address);           /* drop any stale translation for this page */
789         update_mmu_cache(vma, address, entry);  /* hand the new entry to the arch MMU hook */
790 }
791 
792 static inline void break_cow(struct vm_area_struct * vma, struct page * old_page, struct page * new_page, unsigned long address, 
793                 pte_t *page_table)
794 {
795         copy_cow_page(old_page,new_page,address);
796         flush_page_to_ram(new_page);
797         flush_cache_page(vma, address);
798         establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
799 }
800 
801 /*
802  * This routine handles present pages, when users try to write
803  * to a shared page. It is done by copying the page to a new address
804  * and decrementing the shared-page counter for the old page.
805  *
806  * Goto-purists beware: the only reason for goto's here is that it results
807  * in better assembly code.. The "default" path will see no jumps at all.
808  *
809  * Note that this routine assumes that the protection checks have been
810  * done by the caller (the low-level page fault routine in most cases).
811  * Thus we can safely just mark it writable once we've done any necessary
812  * COW.
813  *
814  * We also mark the page dirty at this point even though the page will
815  * change only once the write actually happens. This avoids a few races,
816  * and potentially makes it more efficient.
817  *
818  * We enter with mm->page_table_lock (a spinlock) held, and need to exit
819  * without it.
820  */
821 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
822         unsigned long address, pte_t *page_table, pte_t pte)
823 {
            /*
             * Returns 1 (minor fault) when the pte was made writable or the
             * page was copied, -1 when the pte refers to an invalid page or
             * no new page could be allocated.  mm->page_table_lock is held
             * on entry and has been released on every return path.
             */
824         struct page *old_page, *new_page;
825 
826         old_page = pte_page(pte);
827         if (!VALID_PAGE(old_page))
828                 goto bad_wp_page;
829         
830         /*
831          * We can avoid the copy if:
832          * - we're the only user (count == 1)
833          * - the only other user is the swap cache,
834          *   and the only swap cache user is itself,
835          *   in which case we can just continue to
836          *   use the same swap cache (it will be
837          *   marked dirty).
838          */
839         switch (page_count(old_page)) {
840         case 2:
841                 /*
842                  * Lock the page so that no one can look it up from
843                  * the swap cache, grab a reference and start using it.
844                  * Can not do lock_page, holding page_table_lock.
845                  */
                    /* not a swap cache page, or lock contended: fall back to copying */
846                 if (!PageSwapCache(old_page) || TryLockPage(old_page))
847                         break;
848                 if (is_page_shared(old_page)) {
849                         UnlockPage(old_page);
850                         break;
851                 }
852                 UnlockPage(old_page);
853                 /* FallThrough */
854         case 1:
                    /* reuse the page in place: just make the existing pte writable, dirty and young */
855                 flush_cache_page(vma, address);
856                 establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
857                 spin_unlock(&mm->page_table_lock);
858                 return 1;       /* Minor fault */
859         }
860 
861         /*
862          * Ok, we need to copy. Oh, well..
863          */
            /* the lock is dropped around the allocation and retaken afterwards */
864         spin_unlock(&mm->page_table_lock);
865         new_page = page_cache_alloc();
866         if (!new_page)
867                 return -1;
868         spin_lock(&mm->page_table_lock);
869 
870         /*
871          * Re-check the pte - we dropped the lock
872          */
873         if (pte_same(*page_table, pte)) {
                    /* NOTE(review): presumably reserved pages were not counted in rss, so the private copy adds one - confirm */
874                 if (PageReserved(old_page))
875                         ++mm->rss;
876                 break_cow(vma, old_page, new_page, address, page_table);
877 
878                 /* Free the old page.. */
879                 new_page = old_page;
880         }
881         spin_unlock(&mm->page_table_lock);
            /* releases old_page after a copy, or the unused new page if the pte had changed */
882         page_cache_release(new_page);
883         return 1;       /* Minor fault */
884 
885 bad_wp_page:
886         spin_unlock(&mm->page_table_lock);
887         printk("do_wp_page: bogus page at address %08lx (page 0x%lx)\n",address,(unsigned long)old_page);
888         return -1;
889 }
890 
891 static void vmtruncate_list(struct vm_area_struct *mpnt,
892                             unsigned long pgoff, unsigned long partial)
893 {
            /*
             * Unmap, from every vma on a share list, the pages that lie at
             * or beyond page offset "pgoff" of the underlying object.
             *
             * mpnt:    first vma of the list; it is dereferenced before any
             *          NULL test, so the caller must pass a non-NULL head.
             *          The list is followed through ->vm_next_share.
             * pgoff:   first page index that is no longer valid, compared
             *          directly against each vma's ->vm_pgoff.
             * partial: not read anywhere in this function.
             *
             * Each vma falls into one of three cases: entirely at or past
             * pgoff (the whole range is zapped), entirely below pgoff (left
             * alone), or straddling it (only the tail is zapped).
             *
             * NOTE(review): the caller in this file derives pgoff with
             * PAGE_CACHE_SHIFT while the arithmetic here uses PAGE_SHIFT;
             * presumably the two are equal - confirm.
             */
894         do {
895                 struct mm_struct *mm = mpnt->vm_mm;
896                 unsigned long start = mpnt->vm_start;
897                 unsigned long end = mpnt->vm_end;
898                 unsigned long len = end - start;       /* bytes, for now */
899                 unsigned long diff;
900 
901                 /* mapping wholly truncated? */
902                 if (mpnt->vm_pgoff >= pgoff) {
903                         flush_cache_range(mm, start, end);
904                         zap_page_range(mm, start, len);
905                         flush_tlb_range(mm, start, end);
906                         continue;       /* goes to the while() test, which advances mpnt */
907                 }
908 
909                 /* mapping wholly unaffected? */
910                 len = len >> PAGE_SHIFT;        /* len is now in pages */
911                 diff = pgoff - mpnt->vm_pgoff;  /* pages of this vma that survive */
912                 if (diff >= len)
913                         continue;
914 
915                 /* Ok, partially affected.. */
916                 start += diff << PAGE_SHIFT;    /* first address to drop */
917                 len = (len - diff) << PAGE_SHIFT;       /* back to bytes: start..end */
918                 flush_cache_range(mm, start, end);
919                 zap_page_range(mm, start, len);
920                 flush_tlb_range(mm, start, end);
921         } while ((mpnt = mpnt->vm_next_share) != NULL);
922 }
923                               
924 
925 /*
926  * Handle all mappings that got truncated by a "truncate()"
927  * system call.
928  *
929  * NOTE! We have to be ready to update the memory sharing
930  * between the file and the memory map for a potential last
931  * incomplete page.  Ugly, but necessary.
     *
     * inode:  inode whose size is being changed.
     * offset: requested new size in bytes.
     *
     * If offset is not larger than the current i_size the "shrink" path
     * runs: i_size is updated, truncate_inode_pages() is called for the
     * mapping, and every vma on the i_mmap and i_mmap_shared lists is
     * trimmed under i_shared_lock.  Otherwise the "expand" path applies the
     * caller's RLIMIT_FSIZE soft limit before updating i_size.
     *
     * In both paths the inode's ->truncate operation is invoked when one is
     * provided.  The function returns void, so neither a hit limit nor
     * anything ->truncate does is reported to the caller.
932  */
933 void vmtruncate(struct inode * inode, loff_t offset)
934 {
935         unsigned long partial, pgoff;
936         struct address_space *mapping = inode->i_mapping;
937         unsigned long limit;
938 
939         if (inode->i_size < offset)
940                 goto do_expand;
941         inode->i_size = offset;
942         truncate_inode_pages(mapping, offset);
943         spin_lock(&mapping->i_shared_lock);
            /* Nothing is mapped: skip the vma walk but still run ->truncate. */
944         if (!mapping->i_mmap && !mapping->i_mmap_shared)
945                 goto out_unlock;
946 
            /*
             * pgoff is the index of the first page lying wholly beyond the
             * new size (offset rounded up to a page); partial is the byte
             * offset inside the last page.  partial is handed on, but
             * vmtruncate_list() as written in this file does not read it.
             */
947         pgoff = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
948         partial = (unsigned long)offset & (PAGE_CACHE_SIZE - 1);
949 
950         if (mapping->i_mmap != NULL)
951                 vmtruncate_list(mapping->i_mmap, pgoff, partial);
952         if (mapping->i_mmap_shared != NULL)
953                 vmtruncate_list(mapping->i_mmap_shared, pgoff, partial);
954 
955 out_unlock:
956         spin_unlock(&mapping->i_shared_lock);
957         /* this should go into ->truncate */
            /* Same value as stored before truncate_inode_pages(); offset is unchanged here. */
958         inode->i_size = offset;
959         if (inode->i_op && inode->i_op->truncate)
960                 inode->i_op->truncate(inode);
961         return;
962 
963 do_expand:
964         limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
965         if (limit != RLIM_INFINITY) {
                    /* Already at or over the limit: signal and leave the size alone. */
966                 if (inode->i_size >= limit) {
967                         send_sig(SIGXFSZ, current, 0);
968                         goto out;
969                 }
                    /* Request exceeds the limit: signal, then grow only up to the limit. */
970                 if (offset > limit) {
971                         send_sig(SIGXFSZ, current, 0);
972                         offset = limit;
973                 }
974         }
975         inode->i_size = offset;
976         if (inode->i_op && inode->i_op->truncate)
977                 inode->i_op->truncate(inode);
978 out:
979         return;
980 }
981 
982 
983 
984 /*
985  * Primitive swap readahead code. We simply read an aligned block of
986  * (1 << page_cluster) entries in the swap area. This method is chosen
987  * because it doesn't cost us any seek time.  We also make sure to queue
988  * the 'original' request together with the readahead ones...  
     *
     * entry: the swap entry that faulted; only its swap type is used here,
     *        the offsets to read come back from valid_swaphandles().
     *
     * Every handle counted by valid_swaphandles() is dropped with
     * swap_free() exactly once on every path through this function: either
     * right after its read has been issued, or in the drain loop that runs
     * when too many async pages are already in flight.
989  */
990 void swapin_readahead(swp_entry_t entry)
991 {
992         int i, num;
993         struct page *new_page;
994         unsigned long offset;
995 
996         /*
997          * Get the number of handles we should do readahead io to. Also,
998          * grab temporary references on them, releasing them as io completes.
999          */
1000         num = valid_swaphandles(entry, &offset);
1001         for (i = 0; i < num; offset++, i++) {
1002                 /* Don't block on I/O for read-ahead */
1003                 if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster
1004                                 * (1 << page_cluster)) {
                             /*
                              * Give back the references for the current and
                              * all remaining offsets (num - i of them), then
                              * stop; the for-loop increment is skipped by the
                              * break.
                              */
1005                         while (i++ < num)
1006                                 swap_free(SWP_ENTRY(SWP_TYPE(entry), offset++));
1007                         break;
1008                 }
1009                 /* Ok, do the async read-ahead now */
                     /* NOTE(review): the 0 argument presumably means "do not wait" - confirm. */
1010                 new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0);
                     /* We do not use the page here, so release the reference we were handed. */
1011                 if (new_page != NULL)
1012                         page_cache_release(new_page);
1013                 swap_free(SWP_ENTRY(SWP_TYPE(entry), offset));
1014         }
1015         return;
1016 }
1017 
1018 static int do_swap_page(struct mm_struct * mm,
1019         struct vm_area_struct * vma, unsigned long address,
1020         pte_t * page_table, swp_entry_t entry, int write_access)
1021 {
             /*
              * Resolve a fault on a pte that holds a swap entry: find the
              * page (swap cache first, otherwise read it in), map it at
              * page_table and drop the swap reference the pte was holding.
              *
              * The caller in this file, handle_pte_fault(), releases
              * mm->page_table_lock before calling here.
              *
              * Returns 1 on success and -1 when read_swap_cache() yields no
              * page.
              *
              * NOTE(review): 1 is returned even when the page had to be read
              * in via read_swap_cache(), whereas do_no_page() returns 2 for
              * its "major" case - confirm this is intended.
              * NOTE(review): *page_table is not re-examined after the read,
              * which is done with the page table lock dropped; the comment
              * above handle_pte_fault() argues the mm semaphore makes this
              * safe - confirm.
              */
1022         struct page *page = lookup_swap_cache(entry);
1023         pte_t pte;
1024 
1025         if (!page) {
                     /* Not in the swap cache: start readahead, then fetch the page itself. */
1026                 lock_kernel();
1027                 swapin_readahead(entry);
1028                 page = read_swap_cache(entry);
1029                 unlock_kernel();
1030                 if (!page)
1031                         return -1;
1032 
1033                 flush_page_to_ram(page);
1034                 flush_icache_page(vma, page);
1035         }
1036 
1037         mm->rss++;
1038 
1039         pte = mk_pte(page, vma->vm_page_prot);
1040 
1041         /*
1042          * Freeze the "shared"ness of the page, ie page_count + swap_count.
1043          * Must lock page before transferring our swap count to already
1044          * obtained page count.
1045          */
1046         lock_page(page);
1047         swap_free(entry);
             /* A write fault on a page nobody else shares can be made writable right away. */
1048         if (write_access && !is_page_shared(page))
1049                 pte = pte_mkwrite(pte_mkdirty(pte));
1050         UnlockPage(page);
1051 
1052         set_pte(page_table, pte);
1053         /* No need to invalidate - it was non-present before */
1054         update_mmu_cache(vma, address, pte);
1055         return 1;       /* Minor fault */
1056 }
1057 
1058 /*
1059  * This only needs the MM semaphore
      *
      * Install a pte for a not-yet-populated anonymous address.
      *
      * A read fault maps ZERO_PAGE(addr) write-protected and allocates
      * nothing; mm->rss is left untouched in that case.  A write fault
      * allocates a page with GFP_HIGHUSER, clears it, maps it writable and
      * dirty, and increments mm->rss.
      *
      * Returns 1 on success, -1 if the page allocation fails.
1060  */
1061 static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
1062 {
1063         struct page *page = NULL;
             /* Default for the read case: the zero page, mapped read-only. */
1064         pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
1065         if (write_access) {
1066                 page = alloc_page(GFP_HIGHUSER);
1067                 if (!page)
1068                         return -1;
1069                 clear_user_highpage(page, addr);
1070                 entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
1071                 mm->rss++;
1072                 flush_page_to_ram(page);
1073         }
1074         set_pte(page_table, entry);
1075         /* No need to invalidate - it was non-present before */
1076         update_mmu_cache(vma, addr, entry);
1077         return 1;       /* Minor fault */
1078 }
1079 
1080 /*
1081  * do_no_page() tries to create a new page mapping. It aggressively
1082  * tries to share with existing pages, but makes a separate copy if
1083  * the "write_access" parameter is true in order to avoid the next
1084  * page fault.
1085  *
1086  * As this is called only for pages that do not currently exist, we
1087  * do not need to flush old virtual caches or the TLB.
1088  *
1089  * This is called with the MM semaphore held.
      *
      * Return values:
      *   whatever do_anonymous_page() returns, when the vma has no
      *     ->nopage operation;
      *   0   when ->nopage returns NULL;
      *   -1  when ->nopage returns NOPAGE_OOM;
      *   2   when a page was obtained from ->nopage and mapped.
1090  */
1091 static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
1092         unsigned long address, int write_access, pte_t *page_table)
1093 {
1094         struct page * new_page;
1095         pte_t entry;
1096 
1097         if (!vma->vm_ops || !vma->vm_ops->nopage)
1098                 return do_anonymous_page(mm, vma, page_table, write_access, address);
1099 
1100         /*
1101          * The third argument is "no_share", which tells the low-level code
1102          * to copy, not share the page even if sharing is possible.  It's
1103          * essentially an early COW detection.
              *
              * It is forced to 0 for VM_SHARED mappings and otherwise mirrors
              * write_access.
1104          */
1105         new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);
1106         if (new_page == NULL)   /* no page was available -- SIGBUS */
1107                 return 0;
1108         if (new_page == NOPAGE_OOM)
1109                 return -1;
1110         ++mm->rss;
1111         /*
1112          * This silly early PAGE_DIRTY setting removes a race
1113          * due to the bad i386 page protection. But it's valid
1114          * for other architectures too.
1115          *
1116          * Note that if write_access is true, we either now have
1117          * an exclusive copy of the page, or this is a shared mapping,
1118          * so we can make it writable and dirty to avoid having to
1119          * handle that later.
1120          */
1121         flush_page_to_ram(new_page);
1122         flush_icache_page(vma, new_page);
1123         entry = mk_pte(new_page, vma->vm_page_prot);
1124         if (write_access) {
1125                 entry = pte_mkwrite(pte_mkdirty(entry));
             /* Read fault on a private mapping whose page has other users: map it read-only. */
1126         } else if (page_count(new_page) > 1 &&
1127                    !(vma->vm_flags & VM_SHARED))
1128                 entry = pte_wrprotect(entry);
1129         set_pte(page_table, entry);
1130         /* no need to invalidate: a not-present page shouldn't be cached */
1131         update_mmu_cache(vma, address, entry);
1132         return 2;       /* Major fault */
1133 }
1134 
1135 /*
1136  * These routines also need to handle stuff like marking pages dirty
1137  * and/or accessed for architectures that don't do it in hardware (most
1138  * RISC architectures).  The early dirtying is also good on the i386.
1139  *
1140  * There is also a hook called "update_mmu_cache()" that architectures
1141  * with external mmu caches can use to update those (ie the Sparc or
1142  * PowerPC hashed page tables that act as extended TLBs).
1143  *
1144  * Note the "page_table_lock". It is to protect against kswapd removing
1145  * pages from under us. Note that kswapd only ever _removes_ pages, never
1146  * adds them. As such, once we have noticed that the page is not present,
1147  * we can drop the lock early.
1148  *
1149  * The adding of pages is protected by the MM semaphore (which we hold),
1150  * so we don't need to worry about a page suddenly being added into
1151  * our VM.
      *
      * Dispatch on the state of *pte:
      *   not present, empty      -> do_no_page()
      *   not present, swap entry -> do_swap_page()
      *   present, write fault on a read-only pte -> do_wp_page()
      *   otherwise -> mark the pte young (and dirty on a write) and return 1.
      *
      * The return value of the chosen handler is passed straight through.
1152  */
1153 static inline int handle_pte_fault(struct mm_struct *mm,
1154         struct vm_area_struct * vma, unsigned long address,
1155         int write_access, pte_t * pte)
1156 {
1157         pte_t entry;
1158 
1159         /*
1160          * We need the page table lock to synchronize with kswapd
1161          * and the SMP-safe atomic PTE updates.
1162          */
1163         spin_lock(&mm->page_table_lock);
1164         entry = *pte;   /* snapshot taken under the lock; decisions below use the copy */
1165         if (!pte_present(entry)) {
1166                 /*
1167                  * If it truly wasn't present, we know that kswapd
1168                  * and the PTE updates will not touch it later. So
1169                  * drop the lock.
1170                  */
1171                 spin_unlock(&mm->page_table_lock);
1172                 if (pte_none(entry))
1173                         return do_no_page(mm, vma, address, write_access, pte);
1174                 return do_swap_page(mm, vma, address, pte, pte_to_swp_entry(entry), write_access);
1175         }
1176 
1177         if (write_access) {
                     /*
                      * page_table_lock is still held here and is not released
                      * in this function on this path; do_wp_page() is expected
                      * to drop it.
                      */
1178                 if (!pte_write(entry))
1179                         return do_wp_page(mm, vma, address, pte, entry);
1180 
1181                 entry = pte_mkdirty(entry);
1182         }
1183         entry = pte_mkyoung(entry);
1184         establish_pte(vma, address, pte, entry);
1185         spin_unlock(&mm->page_table_lock);
1186         return 1;
1187 }
1188 
1189 /*
1190  * By the time we get here, we already hold the mm semaphore
      *
      * Walk (allocating where needed) the pgd -> pmd -> pte levels for
      * "address" in "mm" and hand the pte to handle_pte_fault().
      *
      * Returns handle_pte_fault()'s result, or -1 when pmd_alloc() or
      * pte_alloc() returns NULL.
1191  */
1192 int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
1193         unsigned long address, int write_access)
1194 {
1195         int ret = -1;   /* reported unchanged if either allocation below fails */
1196         pgd_t *pgd;
1197         pmd_t *pmd;
1198 
1199         pgd = pgd_offset(mm, address);
1200         pmd = pmd_alloc(pgd, address);
1201 
1202         if (pmd) {
1203                 pte_t * pte = pte_alloc(pmd, address);
1204                 if (pte)
1205                         ret = handle_pte_fault(mm, vma, address, write_access, pte);
1206         }
1207         return ret;
1208 }
1209 
1210 /*
1211  * Simplistic page force-in..
      *
      * Run handle_mm_fault() on current->mm for every PAGE_SIZE step from
      * addr up to (but not including) end.  A write fault is requested when
      * the vma found for addr has VM_WRITE set.
      *
      * Returns 0 when every call returned a non-negative value, -1 as soon
      * as one returns a negative value.  BUG()s if addr >= end.
      *
      * NOTE(review): the find_vma() result is dereferenced without a NULL
      * check, and that happens before the addr >= end sanity test.
      * NOTE(review): the vma looked up for addr is reused for the whole
      * range, so callers presumably guarantee [addr, end) lies inside one
      * vma - confirm.
      * NOTE(review): a 0 return from handle_mm_fault() (do_no_page() returns
      * 0 when ->nopage has no page) is not treated as a failure here.
1212  */
1213 int make_pages_present(unsigned long addr, unsigned long end)
1214 {
1215         int write;
1216         struct mm_struct *mm = current->mm;
1217         struct vm_area_struct * vma;
1218 
1219         vma = find_vma(mm, addr);
1220         write = (vma->vm_flags & VM_WRITE) != 0;
1221         if (addr >= end)
1222                 BUG();
1223         do {
1224                 if (handle_mm_fault(mm, vma, addr, write) < 0)
1225                         return -1;
1226                 addr += PAGE_SIZE;
1227         } while (addr < end);
1228         return 0;
1229 }
1230 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.