~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

Linux Cross Reference
Linux/mm/filemap.c

Version: ~ [ 2.4.0 ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /*
  2  *      linux/mm/filemap.c
  3  *
  4  * Copyright (C) 1994-1999  Linus Torvalds
  5  */
  6 
  7 /*
  8  * This file handles the generic file mmap semantics used by
  9  * most "normal" filesystems (but you don't /have/ to use this:
 10  * the NFS filesystem used to do this differently, for example)
 11  */
 12 #include <linux/malloc.h>
 13 #include <linux/shm.h>
 14 #include <linux/mman.h>
 15 #include <linux/locks.h>
 16 #include <linux/pagemap.h>
 17 #include <linux/swap.h>
 18 #include <linux/smp_lock.h>
 19 #include <linux/blkdev.h>
 20 #include <linux/file.h>
 21 #include <linux/swapctl.h>
 22 #include <linux/slab.h>
 23 #include <linux/init.h>
 24 #include <linux/mm.h>
 25 
 26 #include <asm/pgalloc.h>
 27 #include <asm/uaccess.h>
 28 #include <asm/mman.h>
 29 
 30 #include <linux/highmem.h>
 31 
 32 /*
 33  * Shared mappings implemented 30.11.1994. It's not fully working yet,
 34  * though.
 35  *
 36  * Shared mappings now work. 15.8.1995  Bruno.
 37  *
 38  * finished 'unifying' the page and buffer cache and SMP-threaded the
 39  * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 40  *
 41  * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 42  */
 43 
 44 atomic_t page_cache_size = ATOMIC_INIT(0);
 45 unsigned int page_hash_bits;
 46 struct page **page_hash_table;
 47 
 48 spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
 49 /*
 50  * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
 51  *       the pagemap_lru_lock held.
 52  */
 53 spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
 54 
 55 #define CLUSTER_PAGES           (1 << page_cluster)
 56 #define CLUSTER_OFFSET(x)       (((x) >> page_cluster) << page_cluster)
 57 
 58 static void add_page_to_hash_queue(struct page * page, struct page **p)
 59 {
 60         struct page *next = *p;
 61 
 62         *p = page;
 63         page->next_hash = next;
 64         page->pprev_hash = p;
 65         if (next)
 66                 next->pprev_hash = &page->next_hash;
 67         if (page->buffers)
 68                 PAGE_BUG(page);
 69         atomic_inc(&page_cache_size);
 70 }
 71 
 72 static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page)
 73 {
 74         struct list_head *head = &mapping->clean_pages;
 75 
 76         mapping->nrpages++;
 77         list_add(&page->list, head);
 78         page->mapping = mapping;
 79 }
 80 
 81 static inline void remove_page_from_inode_queue(struct page * page)
 82 {
 83         struct address_space * mapping = page->mapping;
 84 
 85         mapping->nrpages--;
 86         list_del(&page->list);
 87         page->mapping = NULL;
 88 }
 89 
 90 static inline void remove_page_from_hash_queue(struct page * page)
 91 {
 92         struct page *next = page->next_hash;
 93         struct page **pprev = page->pprev_hash;
 94 
 95         if (next)
 96                 next->pprev_hash = pprev;
 97         *pprev = next;
 98         page->pprev_hash = NULL;
 99         atomic_dec(&page_cache_size);
100 }
101 
102 /*
103  * Remove a page from the page cache and free it. Caller has to make
104  * sure the page is locked and that nobody else uses it - or that usage
105  * is safe.
106  */
107 void __remove_inode_page(struct page *page)
108 {
109         if (PageDirty(page)) BUG();
110         remove_page_from_inode_queue(page);
111         remove_page_from_hash_queue(page);
112         page->mapping = NULL;
113 }
114 
115 void remove_inode_page(struct page *page)
116 {
117         if (!PageLocked(page))
118                 PAGE_BUG(page);
119 
120         spin_lock(&pagecache_lock);
121         __remove_inode_page(page);
122         spin_unlock(&pagecache_lock);
123 }
124 
125 static inline int sync_page(struct page *page)
126 {
127         struct address_space *mapping = page->mapping;
128 
129         if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
130                 return mapping->a_ops->sync_page(page);
131         return 0;
132 }
133 
134 /*
135  * Add a page to the dirty page list.
136  */
137 void __set_page_dirty(struct page *page)
138 {
139         struct address_space *mapping = page->mapping;
140 
141         spin_lock(&pagecache_lock);
142         list_del(&page->list);
143         list_add(&page->list, &mapping->dirty_pages);
144         spin_unlock(&pagecache_lock);
145 
146         mark_inode_dirty_pages(mapping->host);
147 }
148 
149 /**
150  * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
151  * @inode: the inode which pages we want to invalidate
152  *
153  * This function only removes the unlocked pages, if you want to
154  * remove all the pages of one inode, you must call truncate_inode_pages.
155  */
156 
157 void invalidate_inode_pages(struct inode * inode)
158 {
159         struct list_head *head, *curr;
160         struct page * page;
161 
162         head = &inode->i_mapping->clean_pages;
163 
164         spin_lock(&pagecache_lock);
165         spin_lock(&pagemap_lru_lock);
166         curr = head->next;
167 
168         while (curr != head) {
169                 page = list_entry(curr, struct page, list);
170                 curr = curr->next;
171 
172                 /* We cannot invalidate something in use.. */
173                 if (page_count(page) != 1)
174                         continue;
175 
176                 /* ..or dirty.. */
177                 if (PageDirty(page))
178                         continue;
179 
180                 /* ..or locked */
181                 if (TryLockPage(page))
182                         continue;
183 
184                 __lru_cache_del(page);
185                 __remove_inode_page(page);
186                 UnlockPage(page);
187                 page_cache_release(page);
188         }
189 
190         spin_unlock(&pagemap_lru_lock);
191         spin_unlock(&pagecache_lock);
192 }
193 
194 static inline void truncate_partial_page(struct page *page, unsigned partial)
195 {
196         memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
197                                 
198         if (page->buffers)
199                 block_flushpage(page, partial);
200 
201 }
202 
203 static inline void truncate_complete_page(struct page *page)
204 {
205         /* Leave it on the LRU if it gets converted into anonymous buffers */
206         if (!page->buffers || block_flushpage(page, 0))
207                 lru_cache_del(page);
208 
209         /*
210          * We remove the page from the page cache _after_ we have
211          * destroyed all buffer-cache references to it. Otherwise some
212          * other process might think this inode page is not in the
213          * page cache and creates a buffer-cache alias to it causing
214          * all sorts of fun problems ...  
215          */
216         ClearPageDirty(page);
217         ClearPageUptodate(page);
218         remove_inode_page(page);
219         page_cache_release(page);
220 }
221 
222 static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *));
223 static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial)
224 {
225         struct list_head *curr;
226         struct page * page;
227 
228         curr = head->next;
229         while (curr != head) {
230                 unsigned long offset;
231 
232                 page = list_entry(curr, struct page, list);
233                 curr = curr->next;
234                 offset = page->index;
235 
236                 /* Is one of the pages to truncate? */
237                 if ((offset >= start) || (*partial && (offset + 1) == start)) {
238                         if (TryLockPage(page)) {
239                                 page_cache_get(page);
240                                 spin_unlock(&pagecache_lock);
241                                 wait_on_page(page);
242                                 page_cache_release(page);
243                                 return 1;
244                         }
245                         page_cache_get(page);
246                         spin_unlock(&pagecache_lock);
247 
248                         if (*partial && (offset + 1) == start) {
249                                 truncate_partial_page(page, *partial);
250                                 *partial = 0;
251                         } else 
252                                 truncate_complete_page(page);
253 
254                         UnlockPage(page);
255                         page_cache_release(page);
256                         return 1;
257                 }
258         }
259         return 0;
260 }
261 
262 
263 /**
264  * truncate_inode_pages - truncate *all* the pages from an offset
265  * @mapping: mapping to truncate
266  * @lstart: offset from with to truncate
267  *
268  * Truncate the page cache at a set offset, removing the pages
269  * that are beyond that offset (and zeroing out partial pages).
270  * If any page is locked we wait for it to become unlocked.
271  */
272 void truncate_inode_pages(struct address_space * mapping, loff_t lstart) 
273 {
274         unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
275         unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
276 
277 repeat:
278         spin_lock(&pagecache_lock);
279         if (truncate_list_pages(&mapping->clean_pages, start, &partial))
280                 goto repeat;
281         if (truncate_list_pages(&mapping->dirty_pages, start, &partial))
282                 goto repeat;
283         if (truncate_list_pages(&mapping->locked_pages, start, &partial))
284                 goto repeat;
285         spin_unlock(&pagecache_lock);
286 }
287 
288 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
289 {
290         goto inside;
291 
292         for (;;) {
293                 page = page->next_hash;
294 inside:
295                 if (!page)
296                         goto not_found;
297                 if (page->mapping != mapping)
298                         continue;
299                 if (page->index == offset)
300                         break;
301         }
302         /*
303          * Touching the page may move it to the active list.
304          * If we end up with too few inactive pages, we wake
305          * up kswapd.
306          */
307         age_page_up(page);
308         if (inactive_shortage() > inactive_target / 2 && free_shortage())
309                         wakeup_kswapd(0);
310 not_found:
311         return page;
312 }
313 
314 /*
315  * By the time this is called, the page is locked and
316  * we don't have to worry about any races any more.
317  *
318  * Start the IO..
319  */
320 static int writeout_one_page(struct page *page)
321 {
322         struct buffer_head *bh, *head = page->buffers;
323 
324         bh = head;
325         do {
326                 if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
327                         continue;
328 
329                 bh->b_flushtime = jiffies;
330                 ll_rw_block(WRITE, 1, &bh);     
331         } while ((bh = bh->b_this_page) != head);
332         return 0;
333 }
334 
335 static int waitfor_one_page(struct page *page)
336 {
337         int error = 0;
338         struct buffer_head *bh, *head = page->buffers;
339 
340         bh = head;
341         do {
342                 wait_on_buffer(bh);
343                 if (buffer_req(bh) && !buffer_uptodate(bh))
344                         error = -EIO;
345         } while ((bh = bh->b_this_page) != head);
346         return error;
347 }
348 
349 static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *))
350 {
351         struct list_head *curr;
352         struct page *page;
353         int retval = 0;
354 
355         spin_lock(&pagecache_lock);
356         curr = head->next;
357         while (curr != head) {
358                 page = list_entry(curr, struct page, list);
359                 curr = curr->next;
360                 if (!page->buffers)
361                         continue;
362                 if (page->index >= end)
363                         continue;
364                 if (page->index < start)
365                         continue;
366 
367                 page_cache_get(page);
368                 spin_unlock(&pagecache_lock);
369                 lock_page(page);
370 
371                 /* The buffers could have been free'd while we waited for the page lock */
372                 if (page->buffers)
373                         retval |= fn(page);
374 
375                 UnlockPage(page);
376                 spin_lock(&pagecache_lock);
377                 curr = page->list.next;
378                 page_cache_release(page);
379         }
380         spin_unlock(&pagecache_lock);
381 
382         return retval;
383 }
384 
385 /*
386  * Two-stage data sync: first start the IO, then go back and
387  * collect the information..
388  */
389 int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
390 {
391         int retval;
392 
393         /* writeout dirty buffers on pages from both clean and dirty lists */
394         retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page);
395         retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page);
396         retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page);
397 
398         /* now wait for locked buffers on pages from both clean and dirty lists */
399         retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page);
400         retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page);
401         retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page);
402 
403         return retval;
404 }
405 
406 /**
407  *      filemap_fdatasync - walk the list of dirty pages of the given address space
408  *      and writepage() all of them.
409  * 
410  *      @mapping: address space structure to write
411  *
412  */
413 void filemap_fdatasync(struct address_space * mapping)
414 {
415         int (*writepage)(struct page *) = mapping->a_ops->writepage;
416 
417         spin_lock(&pagecache_lock);
418 
419         while (!list_empty(&mapping->dirty_pages)) {
420                 struct page *page = list_entry(mapping->dirty_pages.next, struct page, list);
421 
422                 list_del(&page->list);
423                 list_add(&page->list, &mapping->locked_pages);
424 
425                 if (!PageDirty(page))
426                         continue;
427 
428                 page_cache_get(page);
429                 spin_unlock(&pagecache_lock);
430 
431                 lock_page(page);
432 
433                 if (PageDirty(page)) {
434                         ClearPageDirty(page);
435                         writepage(page);
436                 } else
437                         UnlockPage(page);
438 
439                 page_cache_release(page);
440                 spin_lock(&pagecache_lock);
441         }
442         spin_unlock(&pagecache_lock);
443 }
444 
445 /**
446  *      filemap_fdatawait - walk the list of locked pages of the given address space
447  *      and wait for all of them.
448  * 
449  *      @mapping: address space structure to wait for
450  *
451  */
452 void filemap_fdatawait(struct address_space * mapping)
453 {
454         spin_lock(&pagecache_lock);
455 
456         while (!list_empty(&mapping->locked_pages)) {
457                 struct page *page = list_entry(mapping->locked_pages.next, struct page, list);
458 
459                 list_del(&page->list);
460                 list_add(&page->list, &mapping->clean_pages);
461 
462                 if (!PageLocked(page))
463                         continue;
464 
465                 page_cache_get(page);
466                 spin_unlock(&pagecache_lock);
467 
468                 ___wait_on_page(page);
469 
470                 page_cache_release(page);
471                 spin_lock(&pagecache_lock);
472         }
473         spin_unlock(&pagecache_lock);
474 }
475 
476 /*
477  * Add a page to the inode page cache.
478  *
479  * The caller must have locked the page and 
480  * set all the page flags correctly..
481  */
482 void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
483 {
484         if (!PageLocked(page))
485                 BUG();
486 
487         page_cache_get(page);
488         spin_lock(&pagecache_lock);
489         page->index = index;
490         add_page_to_inode_queue(mapping, page);
491         add_page_to_hash_queue(page, page_hash(mapping, index));
492         lru_cache_add(page);
493         spin_unlock(&pagecache_lock);
494 }
495 
496 /*
497  * This adds a page to the page cache, starting out as locked,
498  * owned by us, but unreferenced, not uptodate and with no errors.
499  */
500 static inline void __add_to_page_cache(struct page * page,
501         struct address_space *mapping, unsigned long offset,
502         struct page **hash)
503 {
504         unsigned long flags;
505 
506         if (PageLocked(page))
507                 BUG();
508 
509         flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced) | (1 << PG_arch_1));
510         page->flags = flags | (1 << PG_locked);
511         page_cache_get(page);
512         page->index = offset;
513         add_page_to_inode_queue(mapping, page);
514         add_page_to_hash_queue(page, hash);
515         lru_cache_add(page);
516 }
517 
518 void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
519 {
520         spin_lock(&pagecache_lock);
521         __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
522         spin_unlock(&pagecache_lock);
523 }
524 
525 static int add_to_page_cache_unique(struct page * page,
526         struct address_space *mapping, unsigned long offset,
527         struct page **hash)
528 {
529         int err;
530         struct page *alias;
531 
532         spin_lock(&pagecache_lock);
533         alias = __find_page_nolock(mapping, offset, *hash);
534 
535         err = 1;
536         if (!alias) {
537                 __add_to_page_cache(page,mapping,offset,hash);
538                 err = 0;
539         }
540 
541         spin_unlock(&pagecache_lock);
542         return err;
543 }
544 
545 /*
546  * This adds the requested page to the page cache if it isn't already there,
547  * and schedules an I/O to read in its contents from disk.
548  */
549 static inline int page_cache_read(struct file * file, unsigned long offset) 
550 {
551         struct inode *inode = file->f_dentry->d_inode;
552         struct address_space *mapping = inode->i_mapping;
553         struct page **hash = page_hash(mapping, offset);
554         struct page *page; 
555 
556         spin_lock(&pagecache_lock);
557         page = __find_page_nolock(mapping, offset, *hash); 
558         spin_unlock(&pagecache_lock);
559         if (page)
560                 return 0;
561 
562         page = page_cache_alloc();
563         if (!page)
564                 return -ENOMEM;
565 
566         if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
567                 int error = mapping->a_ops->readpage(file, page);
568                 page_cache_release(page);
569                 return error;
570         }
571         /*
572          * We arrive here in the unlikely event that someone 
573          * raced with us and added our page to the cache first.
574          */
575         page_cache_free(page);
576         return 0;
577 }
578 
579 /*
580  * Read in an entire cluster at once.  A cluster is usually a 64k-
581  * aligned block that includes the page requested in "offset."
582  */
583 static int read_cluster_nonblocking(struct file * file, unsigned long offset,
584         unsigned long filesize)
585 {
586         unsigned long pages = CLUSTER_PAGES;
587 
588         offset = CLUSTER_OFFSET(offset);
589         while ((pages-- > 0) && (offset < filesize)) {
590                 int error = page_cache_read(file, offset);
591                 if (error < 0)
592                         return error;
593                 offset ++;
594         }
595 
596         return 0;
597 }
598 
599 /* 
600  * Wait for a page to get unlocked.
601  *
602  * This must be called with the caller "holding" the page,
603  * ie with increased "page->count" so that the page won't
604  * go away during the wait..
605  */
606 void ___wait_on_page(struct page *page)
607 {
608         struct task_struct *tsk = current;
609         DECLARE_WAITQUEUE(wait, tsk);
610 
611         add_wait_queue(&page->wait, &wait);
612         do {
613                 sync_page(page);
614                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
615                 if (!PageLocked(page))
616                         break;
617                 run_task_queue(&tq_disk);
618                 schedule();
619         } while (PageLocked(page));
620         tsk->state = TASK_RUNNING;
621         remove_wait_queue(&page->wait, &wait);
622 }
623 
624 /*
625  * Get a lock on the page, assuming we need to sleep
626  * to get it..
627  */
628 static void __lock_page(struct page *page)
629 {
630         struct task_struct *tsk = current;
631         DECLARE_WAITQUEUE(wait, tsk);
632 
633         add_wait_queue_exclusive(&page->wait, &wait);
634         for (;;) {
635                 sync_page(page);
636                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
637                 if (PageLocked(page)) {
638                         run_task_queue(&tq_disk);
639                         schedule();
640                         continue;
641                 }
642                 if (!TryLockPage(page))
643                         break;
644         }
645         tsk->state = TASK_RUNNING;
646         remove_wait_queue(&page->wait, &wait);
647 }
648         
649 
/*
 * Get an exclusive lock on the page, optimistically
 * assuming it's not locked..  Only when the quick attempt fails do we
 * fall back to the sleeping path in __lock_page().
 */
void lock_page(struct page *page)
{
	if (!TryLockPage(page))
		return;

	__lock_page(page);
}
659 
660 /*
661  * a rather lightweight function, finding and getting a reference to a
662  * hashed page atomically, waiting for it if it's locked.
663  */
664 struct page * __find_get_page(struct address_space *mapping,
665                               unsigned long offset, struct page **hash)
666 {
667         struct page *page;
668 
669         /*
670          * We scan the hash list read-only. Addition to and removal from
671          * the hash-list needs a held write-lock.
672          */
673         spin_lock(&pagecache_lock);
674         page = __find_page_nolock(mapping, offset, *hash);
675         if (page)
676                 page_cache_get(page);
677         spin_unlock(&pagecache_lock);
678         return page;
679 }
680 
681 /*
682  * Get the lock to a page atomically.
683  */
684 struct page * __find_lock_page (struct address_space *mapping,
685                                 unsigned long offset, struct page **hash)
686 {
687         struct page *page;
688 
689         /*
690          * We scan the hash list read-only. Addition to and removal from
691          * the hash-list needs a held write-lock.
692          */
693 repeat:
694         spin_lock(&pagecache_lock);
695         page = __find_page_nolock(mapping, offset, *hash);
696         if (page) {
697                 page_cache_get(page);
698                 spin_unlock(&pagecache_lock);
699 
700                 lock_page(page);
701 
702                 /* Is the page still hashed? Ok, good.. */
703                 if (page->mapping)
704                         return page;
705 
706                 /* Nope: we raced. Release and try again.. */
707                 UnlockPage(page);
708                 page_cache_release(page);
709                 goto repeat;
710         }
711         spin_unlock(&pagecache_lock);
712         return NULL;
713 }
714 
715 #if 0
716 #define PROFILE_READAHEAD
717 #define DEBUG_READAHEAD
718 #endif
719 
720 /*
721  * We combine this with read-ahead to deactivate pages when we
722  * think there's sequential IO going on. Note that this is
723  * harmless since we don't actually evict the pages from memory
724  * but just move them to the inactive list.
725  *
726  * TODO:
727  * - make the readahead code smarter
728  * - move readahead to the VMA level so we can do the same
729  *   trick with mmap()
730  *
731  * Rik van Riel, 2000
732  */
733 static void drop_behind(struct file * file, unsigned long index)
734 {
735         struct inode *inode = file->f_dentry->d_inode;
736         struct address_space *mapping = inode->i_mapping;
737         struct page **hash;
738         struct page *page;
739         unsigned long start;
740 
741         /* Nothing to drop-behind if we're on the first page. */
742         if (!index)
743                 return;
744 
745         if (index > file->f_rawin)
746                 start = index - file->f_rawin;
747         else
748                 start = 0;
749 
750         /*
751          * Go backwards from index-1 and drop all pages in the
752          * readahead window. Since the readahead window may have
753          * been increased since the last time we were called, we
754          * stop when the page isn't there.
755          */
756         spin_lock(&pagecache_lock);
757         while (--index >= start) {
758                 hash = page_hash(mapping, index);
759                 page = __find_page_nolock(mapping, index, *hash);
760                 if (!page)
761                         break;
762                 deactivate_page(page);
763         }
764         spin_unlock(&pagecache_lock);
765 }
766 
767 /*
768  * Read-ahead profiling information
769  * --------------------------------
770  * Every PROFILE_MAXREADCOUNT, the following information is written 
771  * to the syslog:
772  *   Percentage of asynchronous read-ahead.
773  *   Average of read-ahead fields context value.
774  * If DEBUG_READAHEAD is defined, a snapshot of these fields is written 
775  * to the syslog.
776  */
777 
#ifdef PROFILE_READAHEAD

#define PROFILE_MAXREADCOUNT 1000

/* Running totals since the last report. */
static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

/*
 * Account one read-ahead decision for @filp (@async non-zero for an
 * asynchronous one).  Once more than PROFILE_MAXREADCOUNT samples have
 * been collected, print the averages and reset the totals.
 */
static void profile_readahead(int async, struct file *filp)
{
	unsigned long flags;

	++total_reada;
	if (async)
		++total_async;

	total_ramax	+= filp->f_ramax;
	total_ralen	+= filp->f_ralen;
	total_rawin	+= filp->f_rawin;

	if (total_reada <= PROFILE_MAXREADCOUNT)
		return;

	save_flags(flags);
	cli();

	/* Re-check with interrupts off: the totals may have just been reset. */
	if (total_reada > PROFILE_MAXREADCOUNT) {
		printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
			total_ramax/total_reada,
			total_ralen/total_reada,
			total_rawin/total_reada,
			(total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
		printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
			filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
#endif

		total_reada	= 0;
		total_async	= 0;
		total_ramax	= 0;
		total_ralen	= 0;
		total_rawin	= 0;
	}

	restore_flags(flags);
}
#endif  /* defined PROFILE_READAHEAD */
828 
829 /*
830  * Read-ahead context:
831  * -------------------
832  * The read ahead context fields of the "struct file" are the following:
833  * - f_raend : position of the first byte after the last page we tried to
834  *             read ahead.
835  * - f_ramax : current read-ahead maximum size.
836  * - f_ralen : length of the current IO read block we tried to read-ahead.
837  * - f_rawin : length of the current read-ahead window.
838  *              if last read-ahead was synchronous then
839  *                      f_rawin = f_ralen
840  *              otherwise (was asynchronous)
841  *                      f_rawin = previous value of f_ralen + f_ralen
842  *
843  * Read-ahead limits:
844  * ------------------
845  * MIN_READAHEAD   : minimum read-ahead size when read-ahead.
846  * MAX_READAHEAD   : maximum read-ahead size when read-ahead.
847  *
848  * Synchronous read-ahead benefits:
849  * --------------------------------
850  * Using reasonable IO xfer length from peripheral devices increase system 
851  * performances.
852  * Reasonable means, in this context, not too large but not too small.
853  * The actual maximum value is:
854  *      MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
855  *      and 32K if defined (4K page size assumed).
856  *
857  * Asynchronous read-ahead benefits:
858  * ---------------------------------
859  * Overlapping next read request and user process execution increase system 
860  * performance.
861  *
862  * Read-ahead risks:
863  * -----------------
864  * We have to guess which further data are needed by the user process.
865  * If these data are often not really needed, it's bad for system 
866  * performances.
867  * However, we know that files are often accessed sequentially by 
868  * application programs and it seems that it is possible to have some good 
869  * strategy in that guessing.
870  * We only try to read-ahead files that seems to be read sequentially.
871  *
872  * Asynchronous read-ahead risks:
873  * ------------------------------
874  * In order to maximize overlapping, we must start some asynchronous read 
875  * request from the device, as soon as possible.
876  * We must be very careful about:
877  * - The number of effective pending IO read requests.
878  *   ONE seems to be the only reasonable value.
879  * - The total memory pool usage for the file access stream.
880  *   This maximum memory usage is implicitly 2 IO read chunks:
881  *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
882  *   64k if defined (4K page size assumed).
883  */
884 
885 static inline int get_max_readahead(struct inode * inode)
886 {
887         if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
888                 return MAX_READAHEAD;
889         return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
890 }
891 
892 static void generic_file_readahead(int reada_ok,
893         struct file * filp, struct inode * inode,
894         struct page * page)
895 {
            /*
             * Issue read-ahead for the pages following "page" and maintain the
             * per-file read-ahead context kept in filp (f_raend, f_ralen,
             * f_rawin, f_ramax).
             *
             * reada_ok: nonzero when the caller considers the access pattern
             *           sequential enough for asynchronous read-ahead; it is
             *           set to 2 locally once an asynchronous read-ahead has
             *           been decided.
             * filp:     file whose read-ahead context is read and updated.
             * inode:    supplies i_size (to bound the read-ahead) and the
             *           device used by get_max_readahead().
             * page:     the page currently being read; only its index and its
             *           locked state are examined here.
             */
896         unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
897         unsigned long index = page->index;
898         unsigned long max_ahead, ahead;
899         unsigned long raend;
            /* local cap; note it shares its name with the table used by get_max_readahead() */
900         int max_readahead = get_max_readahead(inode);
901
902         raend = filp->f_raend;
903         max_ahead = 0;
904
905 /*
906  * The current page is locked.
907  * If the current position is inside the previous read IO request, do not
908  * try to reread previously read ahead pages.
909  * Otherwise decide or not to read ahead some pages synchronously.
910  * If we are not going to read ahead, set the read ahead context for this 
911  * page only.
912  */
913         if (PageLocked(page)) {
914                 if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) {
915                         raend = index;
916                         if (raend < end_index)
917                                 max_ahead = filp->f_ramax;
918                         filp->f_rawin = 0;
919                         filp->f_ralen = 1;
920                         if (!max_ahead) {
921                                 filp->f_raend  = index + filp->f_ralen;
922                                 filp->f_rawin += filp->f_ralen;
923                         }
924                 }
925         }
926 /*
927  * The current page is not locked.
928  * If we were reading ahead and,
929  * if the current max read ahead size is not zero and,
930  * if the current position is inside the last read-ahead IO request,
931  *   it is the moment to try to read ahead asynchronously.
932  * We will later force unplug device in order to force asynchronous read IO.
933  */
934         else if (reada_ok && filp->f_ramax && raend >= 1 &&
935                  index <= raend && index + filp->f_ralen >= raend) {
936 /*
937  * Add ONE page to max_ahead in order to try to have about the same IO max size
938  * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
939  * Compute the position of the last page we have tried to read in order to 
940  * begin to read ahead just at the next page.
941  */
942                 raend -= 1;
943                 if (raend < end_index)
944                         max_ahead = filp->f_ramax + 1;
945
946                 if (max_ahead) {
947                         filp->f_rawin = filp->f_ralen;
948                         filp->f_ralen = 0;
949                         reada_ok      = 2;
950                 }
951         }
952 /*
953  * Try to read ahead pages.
954  * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the
955  * scheduler, will work enough for us to avoid too bad actuals IO requests.
956  */
957         ahead = 0;
958         while (ahead < max_ahead) {
                    /*
                     * "ahead" is incremented before the checks below, so it is
                     * nonzero after the loop whenever max_ahead was nonzero,
                     * even if the first iteration breaks out.
                     */
959                 ahead ++;
960                 if ((raend + ahead) >= end_index)
961                         break;
962                 if (page_cache_read(filp, raend + ahead) < 0)
963                         break;
964         }
965 /*
966  * If we tried to read ahead some pages,
967  * If we tried to read ahead asynchronously,
968  *   Try to force unplug of the device in order to start an asynchronous
969  *   read IO request.
970  * Update the read-ahead context.
971  * Store the length of the current read-ahead window.
972  * Double the current max read ahead size.
973  *   That heuristic avoid to do some large IO for files that are not really
974  *   accessed sequentially.
975  */
976         if (ahead) {
977                 if (reada_ok == 2) {
978                         run_task_queue(&tq_disk);
979                 }
980
981                 filp->f_ralen += ahead;
982                 filp->f_rawin += filp->f_ralen;
983                 filp->f_raend = raend + ahead + 1;
984
985                 filp->f_ramax += filp->f_ramax;
986
                    /* clamp the doubled value to the per-device limit */
987                 if (filp->f_ramax > max_readahead)
988                         filp->f_ramax = max_readahead;
989
990                 /*
991                  * Move the pages that have already been passed
992                  * to the inactive list.
993                  */
994                 drop_behind(filp, index);
995
996 #ifdef PROFILE_READAHEAD
997                 profile_readahead((reada_ok == 2), filp);
998 #endif
999         }
1000
1001         return;
1002 }
1003 
1004 
1005 /*
1006  * This is a generic file read routine, and uses the
1007  * inode->i_op->readpage() function for the actual low-level
1008  * stuff.
1009  *
1010  * This is really ugly. But the goto's actually try to clarify some
1011  * of the logic when it comes to error handling etc.
      *
      * filp:  open file; its read-ahead state (f_raend, f_ralen, f_ramax,
      *        f_rawin) is updated here and f_reada is set to 1 on return.
      * ppos:  in/out byte position; on return it points just past the data
      *        the actor consumed.
      * desc:  read descriptor; desc->count is the number of bytes still
      *        wanted, and desc->error is set here to -ENOMEM, to the error
      *        returned by readpage(), or to -EIO.
      * actor: called as actor(desc, page, offset, nr) for each up-to-date
      *        page; its return value is the number of bytes it consumed.
      *
      * The inode access time is updated via UPDATE_ATIME() before returning.
1012  */
1013 void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
1014 {
1015         struct inode *inode = filp->f_dentry->d_inode;
1016         struct address_space *mapping = inode->i_mapping;
1017         unsigned long index, offset;
1018         struct page *cached_page;
1019         int reada_ok;
1020         int error;
1021         int max_readahead = get_max_readahead(inode);
1022
             /* split the byte position into a page index and an offset within that page */
1023         cached_page = NULL;
1024         index = *ppos >> PAGE_CACHE_SHIFT;
1025         offset = *ppos & ~PAGE_CACHE_MASK;
1026
1027 /*
1028  * If the current position is outside the previous read-ahead window, 
1029  * we reset the current read-ahead context and set read ahead max to zero
1030  * (will be set to just needed value later),
1031  * otherwise, we assume that the file accesses are sequential enough to
1032  * continue read-ahead.
1033  */
1034         if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
1035                 reada_ok = 0;
1036                 filp->f_raend = 0;
1037                 filp->f_ralen = 0;
1038                 filp->f_ramax = 0;
1039                 filp->f_rawin = 0;
1040         } else {
1041                 reada_ok = 1;
1042         }
1043 /*
1044  * Adjust the current value of read-ahead max.
1045  * If the read operation stay in the first half page, force no readahead.
1046  * Otherwise try to increase read ahead max just enough to do the read request.
1047  * Then, at least MIN_READAHEAD if read ahead is ok,
1048  * and at most MAX_READAHEAD in all cases.
1049  */
1050         if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
1051                 filp->f_ramax = 0;
1052         } else {
1053                 unsigned long needed;
1054
1055                 needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;
1056
1057                 if (filp->f_ramax < needed)
1058                         filp->f_ramax = needed;
1059
1060                 if (reada_ok && filp->f_ramax < MIN_READAHEAD)
1061                                 filp->f_ramax = MIN_READAHEAD;
1062                 if (filp->f_ramax > max_readahead)
1063                         filp->f_ramax = max_readahead;
1064         }
1065
             /* one iteration per page handed to the actor (or per retry after a page was unhashed) */
1066         for (;;) {
1067                 struct page *page, **hash;
1068                 unsigned long end_index, nr;
1069
                     /*
                      * i_size is re-read on every pass.  nr becomes the number of
                      * bytes available in this page starting at "offset"; the
                      * last page of the file may be only partially valid.
                      */
1070                 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1071                 if (index > end_index)
1072                         break;
1073                 nr = PAGE_CACHE_SIZE;
1074                 if (index == end_index) {
1075                         nr = inode->i_size & ~PAGE_CACHE_MASK;
1076                         if (nr <= offset)
1077                                 break;
1078                 }
1079
1080                 nr = nr - offset;
1081
1082                 /*
1083                  * Try to find the data in the page cache..
1084                  */
1085                 hash = page_hash(mapping, index);
1086
1087                 spin_lock(&pagecache_lock);
1088                 page = __find_page_nolock(mapping, index, *hash);
1089                 if (!page)
1090                         goto no_cached_page;
                     /* found_page is always entered with pagecache_lock held */
1091 found_page:
1092                 page_cache_get(page);
1093                 spin_unlock(&pagecache_lock);
1094
1095                 if (!Page_Uptodate(page))
1096                         goto page_not_up_to_date;
1097                 generic_file_readahead(reada_ok, filp, inode, page);
1098 page_ok:
1099                 /* If users can be writing to this page using arbitrary
1100                  * virtual addresses, take care about potential aliasing
1101                  * before reading the page on the kernel side.
1102                  */
1103                 if (mapping->i_mmap_shared != NULL)
1104                         flush_dcache_page(page);
1105
1106                 /*
1107                  * Ok, we have the page, and it's up-to-date, so
1108                  * now we can copy it to user space...
1109                  *
1110                  * The actor routine returns how many bytes were actually used..
1111                  * NOTE! This may not be the same as how much of a user buffer
1112                  * we filled up (we may be padding etc), so we can only update
1113                  * "pos" here (the actor routine has to update the user buffer
1114                  * pointers and the remaining count).
1115                  */
1116                 nr = actor(desc, page, offset, nr);
1117                 offset += nr;
1118                 index += offset >> PAGE_CACHE_SHIFT;
1119                 offset &= ~PAGE_CACHE_MASK;
1120
1121                 page_cache_release(page);
                     /* continue only while the actor made progress and more bytes are wanted */
1122                 if (nr && desc->count)
1123                         continue;
1124                 break;
1125
1126 /*
1127  * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1128  */
1129 page_not_up_to_date:
1130                 generic_file_readahead(reada_ok, filp, inode, page);
1131
1132                 if (Page_Uptodate(page))
1133                         goto page_ok;
1134
1135                 /* Get exclusive access to the page ... */
1136                 lock_page(page);
1137
1138                 /* Did it get unhashed before we got the lock? */
1139                 if (!page->mapping) {
1140                         UnlockPage(page);
1141                         page_cache_release(page);
1142                         continue;
1143                 }
1144
1145                 /* Did somebody else fill it already? */
1146                 if (Page_Uptodate(page)) {
1147                         UnlockPage(page);
1148                         goto page_ok;
1149                 }
1150
1151 readpage:
1152                 /* ... and start the actual read. The read will unlock the page. */
1153                 error = mapping->a_ops->readpage(filp, page);
1154
1155                 if (!error) {
1156                         if (Page_Uptodate(page))
1157                                 goto page_ok;
1158
1159                         /* Again, try some read-ahead while waiting for the page to finish.. */
1160                         generic_file_readahead(reada_ok, filp, inode, page);
1161                         wait_on_page(page);
1162                         if (Page_Uptodate(page))
1163                                 goto page_ok;
                             /* readpage() reported success but the page never became up to date */
1164                         error = -EIO;
1165                 }
1166
1167                 /* UHHUH! A synchronous read error occurred. Report it */
1168                 desc->error = error;
1169                 page_cache_release(page);
1170                 break;
1171
1172 no_cached_page:
1173                 /*
1174                  * Ok, it wasn't cached, so we need to create a new
1175                  * page..
1176                  *
1177                  * We get here with the page cache lock held.
                      *
                      * cached_page survives across loop iterations: a page
                      * allocated here but not inserted (because someone else
                      * added one meanwhile) is reused on a later miss or
                      * freed after the loop.
1178                  */
1179                 if (!cached_page) {
1180                         spin_unlock(&pagecache_lock);
1181                         cached_page = page_cache_alloc();
1182                         if (!cached_page) {
1183                                 desc->error = -ENOMEM;
1184                                 break;
1185                         }
1186
1187                         /*
1188                          * Somebody may have added the page while we
1189                          * dropped the page cache lock. Check for that.
1190                          */
1191                         spin_lock(&pagecache_lock);
1192                         page = __find_page_nolock(mapping, index, *hash);
1193                         if (page)
1194                                 goto found_page;
1195                 }
1196
1197                 /*
1198                  * Ok, add the new page to the hash-queues...
1199                  */
1200                 page = cached_page;
1201                 __add_to_page_cache(page, mapping, index, hash);
1202                 spin_unlock(&pagecache_lock);
1203                 cached_page = NULL;
1204
1205                 goto readpage;
1206         }
1207
             /* publish the new position and release the spare page, if any */
1208         *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1209         filp->f_reada = 1;
1210         if (cached_page)
1211                 page_cache_free(cached_page);
1212         UPDATE_ATIME(inode);
1213 }
1214 
1215 static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1216 {
1217         char *kaddr;
1218         unsigned long left, count = desc->count;
1219 
1220         if (size > count)
1221                 size = count;
1222 
1223         kaddr = kmap(page);
1224         left = __copy_to_user(desc->buf, kaddr + offset, size);
1225         kunmap(page);
1226         
1227         if (left) {
1228                 size -= left;
1229                 desc->error = -EFAULT;
1230         }
1231         desc->count = count - size;
1232         desc->written += size;
1233         desc->buf += size;
1234         return size;
1235 }
1236 
1237 /*
1238  * This is the "read()" routine for all filesystems
1239  * that can use the page cache directly.
1240  */
1241 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1242 {
1243         ssize_t retval;
1244 
1245         retval = -EFAULT;
1246         if (access_ok(VERIFY_WRITE, buf, count)) {
1247                 retval = 0;
1248 
1249                 if (count) {
1250                         read_descriptor_t desc;
1251 
1252                         desc.written = 0;
1253                         desc.count = count;
1254                         desc.buf = buf;
1255                         desc.error = 0;
1256                         do_generic_file_read(filp, ppos, &desc, file_read_actor);
1257 
1258                         retval = desc.written;
1259                         if (!retval)
1260                                 retval = desc.error;
1261                 }
1262         }
1263         return retval;
1264 }
1265 
1266 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
1267 {
1268         char *kaddr;
1269         ssize_t written;
1270         unsigned long count = desc->count;
1271         struct file *file = (struct file *) desc->buf;
1272         mm_segment_t old_fs;
1273 
1274         if (size > count)
1275                 size = count;
1276         old_fs = get_fs();
1277         set_fs(KERNEL_DS);
1278 
1279         kaddr = kmap(page);
1280         written = file->f_op->write(file, kaddr + offset, size, &file->f_pos);
1281         kunmap(page);
1282         set_fs(old_fs);
1283         if (written < 0) {
1284                 desc->error = written;
1285                 written = 0;
1286         }
1287         desc->count = count - written;
1288         desc->written += written;
1289         return written;
1290 }
1291 
1292 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1293 {
             /*
              * Copy up to "count" bytes from in_fd to out_fd by running
              * do_generic_file_read() on the input file with file_send_actor.
              *
              * offset: optional user pointer.  When non-NULL the start position
              *         is read from it, the input file's f_pos is left alone
              *         and the final position is written back to it.  When
              *         NULL the input file's own f_pos is used and advanced.
              *
              * Returns the number of bytes written; if none were written, the
              * error recorded in the descriptor.  Returns -EBADF, -EINVAL,
              * -EFAULT or the locks_verify_area() result when validation fails.
              */
1294         ssize_t retval;
1295         struct file * in_file, * out_file;
1296         struct inode * in_inode, * out_inode;
1297
1298         /*
1299          * Get input file, and verify that it is ok..
1300          */
1301         retval = -EBADF;
1302         in_file = fget(in_fd);
1303         if (!in_file)
1304                 goto out;
1305         if (!(in_file->f_mode & FMODE_READ))
1306                 goto fput_in;
1307         retval = -EINVAL;
1308         in_inode = in_file->f_dentry->d_inode;
1309         if (!in_inode)
1310                 goto fput_in;
             /* the input must be readable through the page cache */
1311         if (!in_inode->i_mapping->a_ops->readpage)
1312                 goto fput_in;
1313         retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
1314         if (retval)
1315                 goto fput_in;
1316
1317         /*
1318          * Get output file, and verify that it is ok..
1319          */
1320         retval = -EBADF;
1321         out_file = fget(out_fd);
1322         if (!out_file)
1323                 goto fput_in;
1324         if (!(out_file->f_mode & FMODE_WRITE))
1325                 goto fput_out;
1326         retval = -EINVAL;
1327         if (!out_file->f_op || !out_file->f_op->write)
1328                 goto fput_out;
1329         out_inode = out_file->f_dentry->d_inode;
1330         retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
1331         if (retval)
1332                 goto fput_out;
1333
1334         retval = 0;
1335         if (count) {
1336                 read_descriptor_t desc;
1337                 loff_t pos = 0, *ppos;
1338
1339                 retval = -EFAULT;
1340                 ppos = &in_file->f_pos;
1341                 if (offset) {
1342                         if (get_user(pos, offset))
1343                                 goto fput_out;
1344                         ppos = &pos;
1345                 }
1346
                     /* desc.buf carries the output file pointer for file_send_actor */
1347                 desc.written = 0;
1348                 desc.count = count;
1349                 desc.buf = (char *) out_file;
1350                 desc.error = 0;
1351                 do_generic_file_read(in_file, ppos, &desc, file_send_actor);
1352
1353                 retval = desc.written;
1354                 if (!retval)
1355                         retval = desc.error;
                     /* NOTE(review): the put_user() result is not checked */
1356                 if (offset)
1357                         put_user(pos, offset);
1358         }
1359
             /* cleanup ladder: each label releases what was acquired before the jump */
1360 fput_out:
1361         fput(out_file);
1362 fput_in:
1363         fput(in_file);
1364 out:
1365         return retval;
1366 }
1367 
1368 /*
1369  * Read-ahead and flush behind for MADV_SEQUENTIAL areas.  Since we are
1370  * sure this is sequential access, we don't need a flexible read-ahead
1371  * window size -- we can always use a large fixed size window.
1372  */
1373 static void nopage_sequential_readahead(struct vm_area_struct * vma,
1374         unsigned long pgoff, unsigned long filesize)
1375 {
1376         unsigned long ra_window;
1377 
1378         ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
1379         ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
1380 
1381         /* vm_raend is zero if we haven't read ahead in this area yet.  */
1382         if (vma->vm_raend == 0)
1383                 vma->vm_raend = vma->vm_pgoff + ra_window;
1384 
1385         /*
1386          * If we've just faulted the page half-way through our window,
1387          * then schedule reads for the next window, and release the
1388          * pages in the previous window.
1389          */
1390         if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
1391                 unsigned long start = vma->vm_pgoff + vma->vm_raend;
1392                 unsigned long end = start + ra_window;
1393 
1394                 if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
1395                         end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
1396                 if (start > end)
1397                         return;
1398 
1399                 while ((start < end) && (start < filesize)) {
1400                         if (read_cluster_nonblocking(vma->vm_file,
1401                                                         start, filesize) < 0)
1402                                 break;
1403                         start += CLUSTER_PAGES;
1404                 }
1405                 run_task_queue(&tq_disk);
1406 
1407                 /* if we're far enough past the beginning of this area,
1408                    recycle pages that are in the previous window. */
1409                 if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
1410                         unsigned long window = ra_window << PAGE_SHIFT;
1411 
1412                         end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
1413                         end -= window + window;
1414                         filemap_sync(vma, end - window, window, MS_INVALIDATE);
1415                 }
1416 
1417                 vma->vm_raend += ra_window;
1418         }
1419 
1420         return;
1421 }
1422 
1423 /*
1424  * filemap_nopage() is invoked via the vma operations vector for a
1425  * mapped memory region to read in file data during a page fault.
1426  *
1427  * The goto's are kind of ugly, but this streamlines the normal case of having
1428  * it in the page cache, and handles the special cases reasonably without
1429  * having a lot of duplicated code.
      *
      * area:     the vma being faulted on.
      * address:  the faulting virtual address.
      * no_share: if nonzero, a newly allocated copy of the page is returned
      *           instead of the page-cache page itself.
      *
      * Returns the page to map, NOPAGE_OOM when a page could not be
      * allocated, or NULL when the page could not be read or lies beyond
      * the end of the file for the mm that owns the mapping.
1430  */
1431 struct page * filemap_nopage(struct vm_area_struct * area,
1432         unsigned long address, int no_share)
1433 {
1434         int error;
1435         struct file *file = area->vm_file;
1436         struct inode *inode = file->f_dentry->d_inode;
1437         struct address_space *mapping = inode->i_mapping;
1438         struct page *page, **hash, *old_page;
1439         unsigned long size, pgoff;
1440
             /* file page index of the faulting address */
1441         pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1442
1443 retry_all:
1444         /*
1445          * An external ptracer can access pages that normally aren't
1446          * accessible..
1447          */
             /* size is the file size in pages, rounded up; recomputed on every retry */
1448         size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1449         if ((pgoff >= size) && (area->vm_mm == current->mm))
1450                 return NULL;
1451
1452         /*
1453          * Do we have something in the page cache already?
1454          */
1455         hash = page_hash(mapping, pgoff);
1456 retry_find:
1457         page = __find_get_page(mapping, pgoff, hash);
1458         if (!page)
1459                 goto no_cached_page;
1460
1461         /*
1462          * Ok, found a page in the page cache, now we need to check
1463          * that it's up-to-date.
1464          */
1465         if (!Page_Uptodate(page))
1466                 goto page_not_uptodate;
1467
1468 success:
1469         /*
1470          * Try read-ahead for sequential areas.
1471          */
1472         if (VM_SequentialReadHint(area))
1473                 nopage_sequential_readahead(area, pgoff, size);
1474
1475         /*
1476          * Found the page and have a reference on it, need to check sharing
1477          * and possibly copy it over to another page..
1478          */
1479         old_page = page;
1480         if (no_share) {
1481                 struct page *new_page = page_cache_alloc();
1482
1483                 if (new_page) {
1484                         copy_user_highpage(new_page, old_page, address);
1485                         flush_page_to_ram(new_page);
1486                 } else
1487                         new_page = NOPAGE_OOM;
                     /* the page-cache page is released whether or not the copy succeeded */
1488                 page_cache_release(page);
1489                 return new_page;
1490         }
1491
1492         flush_page_to_ram(old_page);
1493         return old_page;
1494
1495 no_cached_page:
1496         /*
1497          * If the requested offset is within our file, try to read a whole 
1498          * cluster of pages at once.
1499          *
1500          * Otherwise, we're off the end of a privately mapped file,
1501          * so we need to map a zero page.
1502          */
1503         if ((pgoff < size) && !VM_RandomReadHint(area))
1504                 error = read_cluster_nonblocking(file, pgoff, size);
1505         else
1506                 error = page_cache_read(file, pgoff);
1507
1508         /*
1509          * The page we want has now been added to the page cache.
1510          * In the unlikely event that someone removed it in the
1511          * meantime, we'll just come back here and read it again.
1512          */
1513         if (error >= 0)
1514                 goto retry_find;
1515
1516         /*
1517          * An error return from page_cache_read can result if the
1518          * system is low on memory, or a problem occurs while trying
1519          * to schedule I/O.
1520          */
1521         if (error == -ENOMEM)
1522                 return NOPAGE_OOM;
1523         return NULL;
1524
1525 page_not_uptodate:
1526         lock_page(page);
1527
1528         /* Did it get unhashed while we waited for it? */
1529         if (!page->mapping) {
1530                 UnlockPage(page);
1531                 page_cache_release(page);
1532                 goto retry_all;
1533         }
1534
1535         /* Did somebody else get it up-to-date? */
1536         if (Page_Uptodate(page)) {
1537                 UnlockPage(page);
1538                 goto success;
1539         }
1540
             /* first attempt: start the read on the locked page and wait for it */
1541         if (!mapping->a_ops->readpage(file, page)) {
1542                 wait_on_page(page);
1543                 if (Page_Uptodate(page))
1544                         goto success;
1545         }
1546
1547         /*
1548          * Umm, take care of errors if the page isn't up-to-date.
1549          * Try to re-read it _once_. We do this synchronously,
1550          * because there really aren't any performance issues here
1551          * and we need to check for errors.
1552          */
1553         lock_page(page);
1554
1555         /* Somebody truncated the page on us? */
1556         if (!page->mapping) {
1557                 UnlockPage(page);
1558                 page_cache_release(page);
1559                 goto retry_all;
1560         }
1561
1562         /* Somebody else successfully read it in? */
1563         if (Page_Uptodate(page)) {
1564                 UnlockPage(page);
1565                 goto success;
1566         }
             /* second and last attempt, after clearing the page's error flag */
1567         ClearPageError(page);
1568         if (!mapping->a_ops->readpage(file, page)) {
1569                 wait_on_page(page);
1570                 if (Page_Uptodate(page))
1571                         goto success;
1572         }
1573
1574         /*
1575          * Things didn't work out. Return zero to tell the
1576          * mm layer so, possibly freeing the page cache page first.
1577          */
1578         page_cache_release(page);
1579         return NULL;
1580 }
1581 
1582 /* Called with mm->page_table_lock held to protect against other
1583  * threads/the swapper from ripping pte's out from under us.
1584  */
1585 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1586         unsigned long address, unsigned int flags)
1587 {
1588         pte_t pte = *ptep;
1589 
1590         if (pte_present(pte) && ptep_test_and_clear_dirty(ptep)) {
1591                 struct page *page = pte_page(pte);
1592                 flush_tlb_page(vma, address);
1593                 set_page_dirty(page);
1594         }
1595         return 0;
1596 }
1597 
1598 static inline int filemap_sync_pte_range(pmd_t * pmd,
1599         unsigned long address, unsigned long size, 
1600         struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1601 {
             /*
              * Run filemap_sync_pte() over every pte under "pmd" that covers
              * [address, address + size), clamped to this pmd.
              *
              * "offset" is the part of the virtual address stripped off by
              * the caller; address + offset is the full virtual address
              * passed to filemap_sync_pte().
              * Returns the OR of the per-pte results; 0 for an empty or bad
              * pmd (a bad pmd is reported and cleared).
              */
1602         pte_t * pte;
1603         unsigned long end;
1604         int error;
1605
1606         if (pmd_none(*pmd))
1607                 return 0;
1608         if (pmd_bad(*pmd)) {
1609                 pmd_ERROR(*pmd);
1610                 pmd_clear(pmd);
1611                 return 0;
1612         }
1613         pte = pte_offset(pmd, address);
             /*
              * Move the bits of address selected by PMD_MASK into offset and
              * keep only the remainder in address, so the loop bound can be
              * compared against PMD_SIZE.
              */
1614         offset += address & PMD_MASK;
1615         address &= ~PMD_MASK;
1616         end = address + size;
1617         if (end > PMD_SIZE)
1618                 end = PMD_SIZE;
1619         error = 0;
1620         do {
1621                 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1622                 address += PAGE_SIZE;
1623                 pte++;
             /* the "address &&" test also stops the loop if address wraps to zero */
1624         } while (address && (address < end));
1625         return error;
1626 }
1627 
1628 static inline int filemap_sync_pmd_range(pgd_t * pgd,
1629         unsigned long address, unsigned long size, 
1630         struct vm_area_struct *vma, unsigned int flags)
1631 {
             /*
              * Run filemap_sync_pte_range() over every pmd under "pgd" that
              * covers [address, address + size), clamped to this pgd entry.
              *
              * Returns the OR of the per-pmd results; 0 for an empty or bad
              * pgd entry (a bad entry is reported and cleared).
              */
1632         pmd_t * pmd;
1633         unsigned long offset, end;
1634         int error;
1635
1636         if (pgd_none(*pgd))
1637                 return 0;
1638         if (pgd_bad(*pgd)) {
1639                 pgd_ERROR(*pgd);
1640                 pgd_clear(pgd);
1641                 return 0;
1642         }
1643         pmd = pmd_offset(pgd, address);
             /*
              * offset keeps the bits of address selected by PGDIR_MASK;
              * address keeps the remainder, which is compared against
              * PGDIR_SIZE below.
              */
1644         offset = address & PGDIR_MASK;
1645         address &= ~PGDIR_MASK;
1646         end = address + size;
1647         if (end > PGDIR_SIZE)
1648                 end = PGDIR_SIZE;
1649         error = 0;
1650         do {
1651                 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
                     /* advance to the start of the next pmd */
1652                 address = (address + PMD_SIZE) & PMD_MASK;
1653                 pmd++;
             /* the "address &&" test also stops the loop if address wraps to zero */
1654         } while (address && (address < end));
1655         return error;
1656 }
1657 
1658 int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1659         size_t size, unsigned int flags)
1660 {
             /*
              * Walk the page tables of vma->vm_mm for the virtual range
              * [address, address + size) and push pte dirty bits to their
              * pages (see filemap_sync_pte).  The cache is flushed before and
              * the TLB after the walk, all under page_table_lock.
              *
              * The range must be non-empty: address >= address + size hits
              * BUG().  "flags" is passed down the walk; filemap_sync_pte does
              * not use it.
              * Returns the OR of the results of the walk.
              */
1661         pgd_t * dir;
1662         unsigned long end = address + size;
1663         int error = 0;
1664
1665         /* Acquire the lock early; it may be possible to avoid dropping
1666          * and reacquiring it repeatedly.
1667          */
1668         spin_lock(&vma->vm_mm->page_table_lock);
1669
1670         dir = pgd_offset(vma->vm_mm, address);
             /* end - size is the original start address; "address" itself is advanced by the loop */
1671         flush_cache_range(vma->vm_mm, end - size, end);
1672         if (address >= end)
1673                 BUG();
1674         do {
1675                 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
                     /* advance to the start of the next page-directory entry */
1676                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1677                 dir++;
1678         } while (address && (address < end));
1679         flush_tlb_range(vma->vm_mm, end - size, end);
1680
1681         spin_unlock(&vma->vm_mm->page_table_lock);
1682
1683         return error;
1684 }
1685 
1686 /*
1687  * Shared mappings need to be able to do the right thing at
1688  * close/unmap/sync. They will also use the private file as
1689  * backing-store for swapping..
      *
      * generic_file_mmap() installs this table for mappings that are both
      * VM_SHARED and VM_MAYWRITE.
      * NOTE(review): only the nopage handler is set here, the same as in
      * file_private_mmap; no close/unmap/sync hooks are visible in this table.
1690  */
1691 static struct vm_operations_struct file_shared_mmap = {
1692         nopage:         filemap_nopage,
1693 };
1694 
1695 /*
1696  * Private mappings just need to be able to load in the map.
1697  *
1698  * (This is actually used for shared mappings as well, if we
1699  * know they can't ever get write permissions..)
      *
      * This is the default table chosen by generic_file_mmap(); page faults
      * are served by filemap_nopage().
1700  */
1701 static struct vm_operations_struct file_private_mmap = {
1702         nopage:         filemap_nopage,
1703 };
1704 
1705 /* This is used for a general mmap of a disk file */
1706 
1707 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1708 {
1709         struct vm_operations_struct * ops;
1710         struct inode *inode = file->f_dentry->d_inode;
1711 
1712         ops = &file_private_mmap;
1713         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1714                 if (!inode->i_mapping->a_ops->writepage)
1715                         return -EINVAL;
1716                 ops = &file_shared_mmap;
1717         }
1718         if (!inode->i_sb || !S_ISREG(inode->i_mode))
1719                 return -EACCES;
1720         if (!inode->i_mapping->a_ops->readpage)
1721                 return -ENOEXEC;
1722         UPDATE_ATIME(inode);
1723         vma->vm_ops = ops;
1724         return 0;
1725 }
1726 
1727 /*
1728  * The msync() system call.
1729  */
1730 
1731 static int msync_interval(struct vm_area_struct * vma,
1732         unsigned long start, unsigned long end, int flags)
1733 {
1734         struct file * file = vma->vm_file;
1735         if (file && (vma->vm_flags & VM_SHARED)) {
1736                 int error;
1737                 error = filemap_sync(vma, start, end-start, flags);
1738 
1739                 if (!error && (flags & MS_SYNC)) {
1740                         struct inode * inode = file->f_dentry->d_inode;
1741                         down(&inode->i_sem);
1742                         filemap_fdatasync(inode->i_mapping);
1743                         if (file->f_op && file->f_op->fsync)
1744                                 error = file->f_op->fsync(file, file->f_dentry, 1);
1745                         filemap_fdatawait(inode->i_mapping);
1746                         up(&inode->i_sem);
1747                 }
1748                 return error;
1749         }
1750         return 0;
1751 }
1752 
1753 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
1754 {
1755         unsigned long end;
1756         struct vm_area_struct * vma;
1757         int unmapped_error, error = -EINVAL;
1758 
1759         down(&current->mm->mmap_sem);
1760         if (start & ~PAGE_MASK)
1761                 goto out;
1762         len = (len + ~PAGE_MASK) & PAGE_MASK;
1763         end = start + len;
1764         if (end < start)
1765                 goto out;
1766         if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1767                 goto out;
1768         error = 0;
1769         if (end == start)
1770                 goto out;
1771         /*
1772          * If the interval [start,end) covers some unmapped address ranges,
1773          * just ignore them, but return -EFAULT at the end.
1774          */
1775         vma = find_vma(current->mm, start);
1776         unmapped_error = 0;
1777         for (;;) {
1778                 /* Still start < end. */
1779                 error = -EFAULT;
1780                 if (!vma)
1781                         goto out;
1782                 /* Here start < vma->vm_end. */
1783                 if (start < vma->vm_start) {
1784                         unmapped_error = -EFAULT;
1785                         start = vma->vm_start;
1786                 }
1787                 /* Here vma->vm_start <= start < vma->vm_end. */
1788                 if (end <= vma->vm_end) {
1789                         if (start < end) {
1790                                 error = msync_interval(vma, start, end, flags);
1791                                 if (error)
1792                                         goto out;
1793                         }
1794                         error = unmapped_error;
1795                         goto out;
1796                 }
1797                 /* Here vma->vm_start <= start < vma->vm_end < end. */
1798                 error = msync_interval(vma, start, vma->vm_end, flags);
1799                 if (error)
1800                         goto out;
1801                 start = vma->vm_end;
1802                 vma = vma->vm_next;
1803         }
1804 out:
1805         up(&current->mm->mmap_sem);
1806         return error;
1807 }
1808 
1809 static inline void setup_read_behavior(struct vm_area_struct * vma,
1810         int behavior)
1811 {
1812         VM_ClearReadHint(vma);
1813         switch(behavior) {
1814                 case MADV_SEQUENTIAL:
1815                         vma->vm_flags |= VM_SEQ_READ;
1816                         break;
1817                 case MADV_RANDOM:
1818                         vma->vm_flags |= VM_RAND_READ;
1819                         break;
1820                 default:
1821                         break;
1822         }
1823         return;
1824 }
1825 
1826 static long madvise_fixup_start(struct vm_area_struct * vma,
1827         unsigned long end, int behavior)
1828 {
1829         struct vm_area_struct * n;
1830 
1831         n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1832         if (!n)
1833                 return -EAGAIN;
1834         *n = *vma;
1835         n->vm_end = end;
1836         setup_read_behavior(n, behavior);
1837         n->vm_raend = 0;
1838         get_file(n->vm_file);
1839         if (n->vm_ops && n->vm_ops->open)
1840                 n->vm_ops->open(n);
1841         lock_vma_mappings(vma);
1842         spin_lock(&vma->vm_mm->page_table_lock);
1843         vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
1844         vma->vm_start = end;
1845         __insert_vm_struct(current->mm, n);
1846         spin_unlock(&vma->vm_mm->page_table_lock);
1847         unlock_vma_mappings(vma);
1848         return 0;
1849 }
1850 
1851 static long madvise_fixup_end(struct vm_area_struct * vma,
1852         unsigned long start, int behavior)
1853 {
1854         struct vm_area_struct * n;
1855 
1856         n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1857         if (!n)
1858                 return -EAGAIN;
1859         *n = *vma;
1860         n->vm_start = start;
1861         n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
1862         setup_read_behavior(n, behavior);
1863         n->vm_raend = 0;
1864         get_file(n->vm_file);
1865         if (n->vm_ops && n->vm_ops->open)
1866                 n->vm_ops->open(n);
1867         lock_vma_mappings(vma);
1868         spin_lock(&vma->vm_mm->page_table_lock);
1869         vma->vm_end = start;
1870         __insert_vm_struct(current->mm, n);
1871         spin_unlock(&vma->vm_mm->page_table_lock);
1872         unlock_vma_mappings(vma);
1873         return 0;
1874 }
1875 
1876 static long madvise_fixup_middle(struct vm_area_struct * vma,
1877         unsigned long start, unsigned long end, int behavior)
1878 {
1879         struct vm_area_struct * left, * right;
1880 
1881         left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1882         if (!left)
1883                 return -EAGAIN;
1884         right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1885         if (!right) {
1886                 kmem_cache_free(vm_area_cachep, left);
1887                 return -EAGAIN;
1888         }
1889         *left = *vma;
1890         *right = *vma;
1891         left->vm_end = start;
1892         right->vm_start = end;
1893         right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
1894         left->vm_raend = 0;
1895         right->vm_raend = 0;
1896         atomic_add(2, &vma->vm_file->f_count);
1897 
1898         if (vma->vm_ops && vma->vm_ops->open) {
1899                 vma->vm_ops->open(left);
1900                 vma->vm_ops->open(right);
1901         }
1902         lock_vma_mappings(vma);
1903         spin_lock(&vma->vm_mm->page_table_lock);
1904         vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
1905         vma->vm_start = start;
1906         vma->vm_end = end;
1907         setup_read_behavior(vma, behavior);
1908         vma->vm_raend = 0;
1909         __insert_vm_struct(current->mm, left);
1910         __insert_vm_struct(current->mm, right);
1911         spin_unlock(&vma->vm_mm->page_table_lock);
1912         unlock_vma_mappings(vma);
1913         return 0;
1914 }
1915 
1916 /*
1917  * We can potentially split a vm area into separate
1918  * areas, each area with its own behavior.
1919  */
1920 static long madvise_behavior(struct vm_area_struct * vma,
1921         unsigned long start, unsigned long end, int behavior)
1922 {
1923         int error = 0;
1924 
1925         /* This caps the number of vma's this process can own */
1926         if (vma->vm_mm->map_count > MAX_MAP_COUNT)
1927                 return -ENOMEM;
1928 
1929         if (start == vma->vm_start) {
1930                 if (end == vma->vm_end) {
1931                         setup_read_behavior(vma, behavior);
1932                         vma->vm_raend = 0;
1933                 } else
1934                         error = madvise_fixup_start(vma, end, behavior);
1935         } else {
1936                 if (end == vma->vm_end)
1937                         error = madvise_fixup_end(vma, start, behavior);
1938                 else
1939                         error = madvise_fixup_middle(vma, start, end, behavior);
1940         }
1941 
1942         return error;
1943 }
1944 
1945 /*
1946  * Schedule all required I/O operations, then run the disk queue
1947  * to make sure they are started.  Do not wait for completion.
1948  */
1949 static long madvise_willneed(struct vm_area_struct * vma,
1950         unsigned long start, unsigned long end)
1951 {
1952         long error = -EBADF;
1953         struct file * file;
1954         unsigned long size, rlim_rss;
1955 
1956         /* Doesn't work if there's no mapped file. */
1957         if (!vma->vm_file)
1958                 return error;
1959         file = vma->vm_file;
1960         size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
1961                                                         PAGE_CACHE_SHIFT;
1962 
1963         start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1964         if (end > vma->vm_end)
1965                 end = vma->vm_end;
1966         end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1967 
1968         /* Make sure this doesn't exceed the process's max rss. */
1969         error = -EIO;
1970         rlim_rss = current->rlim ?  current->rlim[RLIMIT_RSS].rlim_cur :
1971                                 LONG_MAX; /* default: see resource.h */
1972         if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
1973                 return error;
1974 
1975         /* round to cluster boundaries if this isn't a "random" area. */
1976         if (!VM_RandomReadHint(vma)) {
1977                 start = CLUSTER_OFFSET(start);
1978                 end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
1979 
1980                 while ((start < end) && (start < size)) {
1981                         error = read_cluster_nonblocking(file, start, size);
1982                         start += CLUSTER_PAGES;
1983                         if (error < 0)
1984                                 break;
1985                 }
1986         } else {
1987                 while ((start < end) && (start < size)) {
1988                         error = page_cache_read(file, start);
1989                         start++;
1990                         if (error < 0)
1991                                 break;
1992                 }
1993         }
1994 
1995         /* Don't wait for someone else to push these requests. */
1996         run_task_queue(&tq_disk);
1997 
1998         return error;
1999 }
2000 
2001 /*
2002  * Application no longer needs these pages.  If the pages are dirty,
2003  * it's OK to just throw them away.  The app will be more careful about
2004  * data it wants to keep.  Be sure to free swap resources too.  The
2005  * zap_page_range call sets things up for refill_inactive to actually free
2006  * these pages later if no one else has touched them in the meantime,
2007  * although we could add these pages to a global reuse list for
2008  * refill_inactive to pick up before reclaiming other pages.
2009  *
2010  * NB: This interface discards data rather than pushes it out to swap,
2011  * as some implementations do.  This has performance implications for
2012  * applications like large transactional databases which want to discard
2013  * pages in anonymous maps after committing to backing store the data
2014  * that was kept in them.  There is no reason to write this data out to
2015  * the swap area if the application is discarding it.
2016  *
2017  * An interface that causes the system to free clean pages and flush
2018  * dirty pages is already available as msync(MS_INVALIDATE).
2019  */
2020 static long madvise_dontneed(struct vm_area_struct * vma,
2021         unsigned long start, unsigned long end)
2022 {
2023         if (vma->vm_flags & VM_LOCKED)
2024                 return -EINVAL;
2025 
2026         flush_cache_range(vma->vm_mm, start, end);
2027         zap_page_range(vma->vm_mm, start, end - start);
2028         flush_tlb_range(vma->vm_mm, start, end);
2029         return 0;
2030 }
2031 
2032 static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
2033         unsigned long end, int behavior)
2034 {
2035         long error = -EBADF;
2036 
2037         switch (behavior) {
2038         case MADV_NORMAL:
2039         case MADV_SEQUENTIAL:
2040         case MADV_RANDOM:
2041                 error = madvise_behavior(vma, start, end, behavior);
2042                 break;
2043 
2044         case MADV_WILLNEED:
2045                 error = madvise_willneed(vma, start, end);
2046                 break;
2047 
2048         case MADV_DONTNEED:
2049                 error = madvise_dontneed(vma, start, end);
2050                 break;
2051 
2052         default:
2053                 error = -EINVAL;
2054                 break;
2055         }
2056                 
2057         return error;
2058 }
2059 
2060 /*
2061  * The madvise(2) system call.
2062  *
2063  * Applications can use madvise() to advise the kernel how it should
2064  * handle paging I/O in this VM area.  The idea is to help the kernel
2065  * use appropriate read-ahead and caching techniques.  The information
2066  * provided is advisory only, and can be safely disregarded by the
2067  * kernel without affecting the correct operation of the application.
2068  *
2069  * behavior values:
2070  *  MADV_NORMAL - the default behavior is to read clusters.  This
2071  *              results in some read-ahead and read-behind.
2072  *  MADV_RANDOM - the system should read the minimum amount of data
2073  *              on any access, since it is unlikely that the appli-
2074  *              cation will need more than what it asks for.
2075  *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
2076  *              once, so they can be aggressively read ahead, and
2077  *              can be freed soon after they are accessed.
2078  *  MADV_WILLNEED - the application is notifying the system to read
2079  *              some pages ahead.
2080  *  MADV_DONTNEED - the application is finished with the given range,
2081  *              so the kernel can free resources associated with it.
2082  *
2083  * return values:
2084  *  zero    - success
2085  *  -EINVAL - start + len < 0, start is not page-aligned,
2086  *              "behavior" is not a valid value, or application
2087  *              is attempting to release locked or shared pages.
2088  *  -ENOMEM - addresses in the specified range are not currently
2089  *              mapped, or are outside the AS of the process.
2090  *  -EIO    - an I/O error occurred while paging in data.
2091  *  -EBADF  - map exists, but area maps something that isn't a file.
2092  *  -EAGAIN - a kernel resource was temporarily unavailable.
2093  */
2094 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
2095 {
2096         unsigned long end;
2097         struct vm_area_struct * vma;
2098         int unmapped_error = 0;
2099         int error = -EINVAL;
2100 
2101         down(&current->mm->mmap_sem);
2102 
2103         if (start & ~PAGE_MASK)
2104                 goto out;
2105         len = (len + ~PAGE_MASK) & PAGE_MASK;
2106         end = start + len;
2107         if (end < start)
2108                 goto out;
2109 
2110         error = 0;
2111         if (end == start)
2112                 goto out;
2113 
2114         /*
2115          * If the interval [start,end) covers some unmapped address
2116          * ranges, just ignore them, but return -ENOMEM at the end.
2117          */
2118         vma = find_vma(current->mm, start);
2119         for (;;) {
2120                 /* Still start < end. */
2121                 error = -ENOMEM;
2122                 if (!vma)
2123                         goto out;
2124 
2125                 /* Here start < vma->vm_end. */
2126                 if (start < vma->vm_start) {
2127                         unmapped_error = -ENOMEM;
2128                         start = vma->vm_start;
2129                 }
2130 
2131                 /* Here vma->vm_start <= start < vma->vm_end. */
2132                 if (end <= vma->vm_end) {
2133                         if (start < end) {
2134                                 error = madvise_vma(vma, start, end,
2135                                                         behavior);
2136                                 if (error)
2137                                         goto out;
2138                         }
2139                         error = unmapped_error;
2140                         goto out;
2141                 }
2142 
2143                 /* Here vma->vm_start <= start < vma->vm_end < end. */
2144                 error = madvise_vma(vma, start, vma->vm_end, behavior);
2145                 if (error)
2146                         goto out;
2147                 start = vma->vm_end;
2148                 vma = vma->vm_next;
2149         }
2150 
2151 out:
2152         up(&current->mm->mmap_sem);
2153         return error;
2154 }
2155 
2156 /*
2157  * Later we can get more picky about what "in core" means precisely.
2158  * For now, simply check to see if the page is in the page cache,
2159  * and is up to date; i.e. that no page-in operation would be required
2160  * at this time if an application were to map and access this page.
2161  */
2162 static unsigned char mincore_page(struct vm_area_struct * vma,
2163         unsigned long pgoff)
2164 {
2165         unsigned char present = 0;
2166         struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data;
2167         struct page * page, ** hash = page_hash(as, pgoff);
2168 
2169         spin_lock(&pagecache_lock);
2170         page = __find_page_nolock(as, pgoff, *hash);
2171         if ((page) && (Page_Uptodate(page)))
2172                 present = 1;
2173         spin_unlock(&pagecache_lock);
2174 
2175         return present;
2176 }
2177 
2178 static long mincore_vma(struct vm_area_struct * vma,
2179         unsigned long start, unsigned long end, unsigned char * vec)
2180 {
2181         long error, i, remaining;
2182         unsigned char * tmp;
2183 
2184         error = -ENOMEM;
2185         if (!vma->vm_file)
2186                 return error;
2187 
2188         start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2189         if (end > vma->vm_end)
2190                 end = vma->vm_end;
2191         end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2192 
2193         error = -EAGAIN;
2194         tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
2195         if (!tmp)
2196                 return error;
2197 
2198         /* (end - start) is # of pages, and also # of bytes in "vec */
2199         remaining = (end - start),
2200 
2201         error = 0;
2202         for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
2203                 int j = 0;
2204                 long thispiece = (remaining < PAGE_SIZE) ?
2205                                                 remaining : PAGE_SIZE;
2206 
2207                 while (j < thispiece)
2208                         tmp[j++] = mincore_page(vma, start++);
2209 
2210                 if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
2211                         error = -EFAULT;
2212                         break;
2213                 }
2214         }
2215 
2216         free_page((unsigned long) tmp);
2217         return error;
2218 }
2219 
2220 /*
2221  * The mincore(2) system call.
2222  *
2223  * mincore() returns the memory residency status of the pages in the
2224  * current process's address space specified by [addr, addr + len).
2225  * The status is returned in a vector of bytes.  The least significant
2226  * bit of each byte is 1 if the referenced page is in memory, otherwise
2227  * it is zero.
2228  *
2229  * Because the status of a page can change after mincore() checks it
2230  * but before it returns to the application, the returned vector may
2231  * contain stale information.  Only locked pages are guaranteed to
2232  * remain in memory.
2233  *
2234  * return values:
2235  *  zero    - success
2236  *  -EFAULT - vec points to an illegal address
2237  *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
2238  *              or len has a nonpositive value
2239  *  -ENOMEM - Addresses in the range [addr, addr + len] are
2240  *              invalid for the address space of this process, or
2241  *              specify one or more pages which are not currently
2242  *              mapped
2243  *  -EAGAIN - A kernel resource was temporarily unavailable.
2244  */
2245 asmlinkage long sys_mincore(unsigned long start, size_t len,
2246         unsigned char * vec)
2247 {
2248         int index = 0;
2249         unsigned long end;
2250         struct vm_area_struct * vma;
2251         int unmapped_error = 0;
2252         long error = -EINVAL;
2253 
2254         down(&current->mm->mmap_sem);
2255 
2256         if (start & ~PAGE_CACHE_MASK)
2257                 goto out;
2258         len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;
2259         end = start + len;
2260         if (end < start)
2261                 goto out;
2262 
2263         error = 0;
2264         if (end == start)
2265                 goto out;
2266 
2267         /*
2268          * If the interval [start,end) covers some unmapped address
2269          * ranges, just ignore them, but return -ENOMEM at the end.
2270          */
2271         vma = find_vma(current->mm, start);
2272         for (;;) {
2273                 /* Still start < end. */
2274                 error = -ENOMEM;
2275                 if (!vma)
2276                         goto out;
2277 
2278                 /* Here start < vma->vm_end. */
2279                 if (start < vma->vm_start) {
2280                         unmapped_error = -ENOMEM;
2281                         start = vma->vm_start;
2282                 }
2283 
2284                 /* Here vma->vm_start <= start < vma->vm_end. */
2285                 if (end <= vma->vm_end) {
2286                         if (start < end) {
2287                                 error = mincore_vma(vma, start, end,
2288                                                         &vec[index]);
2289                                 if (error)
2290                                         goto out;
2291                         }
2292                         error = unmapped_error;
2293                         goto out;
2294                 }
2295 
2296                 /* Here vma->vm_start <= start < vma->vm_end < end. */
2297                 error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
2298                 if (error)
2299                         goto out;
2300                 index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
2301                 start = vma->vm_end;
2302                 vma = vma->vm_next;
2303         }
2304 
2305 out:
2306         up(&current->mm->mmap_sem);
2307         return error;
2308 }
2309 
2310 static inline
2311 struct page *__read_cache_page(struct address_space *mapping,
2312                                 unsigned long index,
2313                                 int (*filler)(void *,struct page*),
2314                                 void *data)
2315 {
2316         struct page **hash = page_hash(mapping, index);
2317         struct page *page, *cached_page = NULL;
2318         int err;
2319 repeat:
2320         page = __find_get_page(mapping, index, hash);
2321         if (!page) {
2322                 if (!cached_page) {
2323                         cached_page = page_cache_alloc();
2324                         if (!cached_page)
2325                                 return ERR_PTR(-ENOMEM);
2326                 }
2327                 page = cached_page;
2328                 if (add_to_page_cache_unique(page, mapping, index, hash))
2329                         goto repeat;
2330                 cached_page = NULL;
2331                 err = filler(data, page);
2332                 if (err < 0) {
2333                         page_cache_release(page);
2334                         page = ERR_PTR(err);
2335                 }
2336         }
2337         if (cached_page)
2338                 page_cache_free(cached_page);
2339         return page;
2340 }
2341 
2342 /*
2343  * Read into the page cache. If a page already exists,
2344  * and Page_Uptodate() is not set, try to fill the page.
2345  */
2346 struct page *read_cache_page(struct address_space *mapping,
2347                                 unsigned long index,
2348                                 int (*filler)(void *,struct page*),
2349                                 void *data)
2350 {
2351         struct page *page;
2352         int err;
2353 
2354 retry:
2355         page = __read_cache_page(mapping, index, filler, data);
2356         if (IS_ERR(page) || Page_Uptodate(page))
2357                 goto out;
2358 
2359         lock_page(page);
2360         if (!page->mapping) {
2361                 UnlockPage(page);
2362                 page_cache_release(page);
2363                 goto retry;
2364         }
2365         if (Page_Uptodate(page)) {
2366                 UnlockPage(page);
2367                 goto out;
2368         }
2369         err = filler(data, page);
2370         if (err < 0) {
2371                 page_cache_release(page);
2372                 page = ERR_PTR(err);
2373         }
2374  out:
2375         return page;
2376 }
2377 
2378 static inline struct page * __grab_cache_page(struct address_space *mapping,
2379                                 unsigned long index, struct page **cached_page)
2380 {
2381         struct page *page, **hash = page_hash(mapping, index);
2382 repeat:
2383         page = __find_lock_page(mapping, index, hash);
2384         if (!page) {
2385                 if (!*cached_page) {
2386                         *cached_page = page_cache_alloc();
2387                         if (!*cached_page)
2388                                 return NULL;
2389                 }
2390                 page = *cached_page;
2391                 if (add_to_page_cache_unique(page, mapping, index, hash))
2392                         goto repeat;
2393                 *cached_page = NULL;
2394         }
2395         return page;
2396 }
2397 
2398 /*
2399  * Returns locked page at given index in given cache, creating it if needed.
2400  */
2401 
2402 struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
2403 {
2404         struct page *cached_page = NULL;
2405         struct page *page = __grab_cache_page(mapping,index,&cached_page);
2406         if (cached_page)
2407                 page_cache_free(cached_page);
2408         return page;
2409 }
2410 
2411 static inline void remove_suid(struct inode *inode)
2412 {
2413         unsigned int mode;
2414 
2415         /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
2416         mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
2417 
2418         /* was any of the uid bits set? */
2419         mode &= inode->i_mode;
2420         if (mode && !capable(CAP_FSETID)) {
2421                 inode->i_mode &= ~mode;
2422                 mark_inode_dirty(inode);
2423         }
2424 }
2425 
2426 /*
2427  * Write to a file through the page cache. 
2428  *
2429  * We currently put everything into the page cache prior to writing it.
2430  * This is not a problem when writing full pages. With partial pages,
2431  * however, we first have to read the data into the cache, then
2432  * dirty the page, and finally schedule it for writing. Alternatively, we
2433  * could write-through just the portion of data that would go into that
2434  * page, but that would kill performance for applications that write data
2435  * line by line, and it's prone to race conditions.
2436  *
2437  * Note that this routine doesn't try to keep track of dirty pages. Each
2438  * file system has to do this all by itself, unfortunately.
2439  *                                                      okir@monad.swb.de
2440  */
2441 ssize_t
2442 generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
2443 {
2444         struct inode    *inode = file->f_dentry->d_inode; 
2445         struct address_space *mapping = inode->i_mapping;
2446         unsigned long   limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2447         loff_t          pos;
2448         struct page     *page, *cached_page;
2449         unsigned long   written;
2450         long            status;
2451         int             err;
2452 
2453         cached_page = NULL;
2454 
2455         down(&inode->i_sem);
2456 
2457         pos = *ppos;
2458         err = -EINVAL;
2459         if (pos < 0)
2460                 goto out;
2461 
2462         err = file->f_error;
2463         if (err) {
2464                 file->f_error = 0;
2465                 goto out;
2466         }
2467 
2468         written = 0;
2469 
2470         if (file->f_flags & O_APPEND)
2471                 pos = inode->i_size;
2472 
2473         /*
2474          * Check whether we've reached the file size limit.
2475          */
2476         err = -EFBIG;
2477         if (limit != RLIM_INFINITY) {
2478                 if (pos >= limit) {
2479                         send_sig(SIGXFSZ, current, 0);
2480                         goto out;
2481                 }
2482                 if (count > limit - pos) {
2483                         send_sig(SIGXFSZ, current, 0);
2484                         count = limit - pos;
2485                 }
2486         }
2487 
2488         status  = 0;
2489         if (count) {
2490                 remove_suid(inode);
2491                 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
2492                 mark_inode_dirty_sync(inode);
2493         }
2494 
2495         while (count) {
2496                 unsigned long bytes, index, offset;
2497                 char *kaddr;
2498                 int deactivate = 1;
2499 
2500                 /*
2501                  * Try to find the page in the cache. If it isn't there,
2502                  * allocate a free page.
2503                  */
2504                 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
2505                 index = pos >> PAGE_CACHE_SHIFT;
2506                 bytes = PAGE_CACHE_SIZE - offset;
2507                 if (bytes > count) {
2508                         bytes = count;
2509                         deactivate = 0;
2510                 }
2511 
2512                 /*
2513                  * Bring in the user page that we will copy from _first_.
2514                  * Otherwise there's a nasty deadlock on copying from the
2515                  * same page as we're writing to, without it being marked
2516                  * up-to-date.
2517                  */
2518                 { volatile unsigned char dummy;
2519                         __get_user(dummy, buf);
2520                         __get_user(dummy, buf+bytes-1);
2521                 }
2522 
2523                 status = -ENOMEM;       /* we'll assign it later anyway */
2524                 page = __grab_cache_page(mapping, index, &cached_page);
2525                 if (!page)
2526                         break;
2527 
2528                 /* We have exclusive IO access to the page.. */
2529                 if (!PageLocked(page)) {
2530                         PAGE_BUG(page);
2531                 }
2532 
2533                 status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
2534                 if (status)
2535                         goto unlock;
2536                 kaddr = page_address(page);
2537                 status = copy_from_user(kaddr+offset, buf, bytes);
2538                 flush_dcache_page(page);
2539                 if (status)
2540                         goto fail_write;
2541                 status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
2542                 if (!status)
2543                         status = bytes;
2544 
2545                 if (status >= 0) {
2546                         written += status;
2547                         count -= status;
2548                         pos += status;
2549                         buf += status;
2550                 }
2551 unlock:
2552                 /* Mark it unlocked again and drop the page.. */
2553                 UnlockPage(page);
2554                 if (deactivate)
2555                         deactivate_page(page);
2556                 page_cache_release(page);
2557 
2558                 if (status < 0)
2559                         break;
2560         }
2561         *ppos = pos;
2562 
2563         if (cached_page)
2564                 page_cache_free(cached_page);
2565 
2566         /* For now, when the user asks for O_SYNC, we'll actually
2567          * provide O_DSYNC. */
2568         if ((status >= 0) && (file->f_flags & O_SYNC))
2569                 status = generic_osync_inode(inode, 1); /* 1 means datasync */
2570         
2571         err = written ? written : status;
2572 out:
2573 
2574         up(&inode->i_sem);
2575         return err;
2576 fail_write:
2577         status = -EFAULT;
2578         ClearPageUptodate(page);
2579         kunmap(page);
2580         goto unlock;
2581 }
2582 
2583 void __init page_cache_init(unsigned long mempages)
2584 {
2585         unsigned long htable_size, order;
2586 
2587         htable_size = mempages;
2588         htable_size *= sizeof(struct page *);
2589         for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
2590                 ;
2591 
2592         do {
2593                 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
2594 
2595                 page_hash_bits = 0;
2596                 while((tmp >>= 1UL) != 0UL)
2597                         page_hash_bits++;
2598 
2599                 page_hash_table = (struct page **)
2600                         __get_free_pages(GFP_ATOMIC, order);
2601         } while(page_hash_table == NULL && --order > 0);
2602 
2603         printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
2604                (1 << page_hash_bits), order, (PAGE_SIZE << order));
2605         if (!page_hash_table)
2606                 panic("Failed to allocate page hash table\n");
2607         memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
2608 }
2609 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.