1 /*
2 * linux/mm/filemap.c
3 *
4 * Copyright (C) 1994-1999 Linus Torvalds
5 */
6
7 /*
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example)
11 */
12 #include <linux/malloc.h>
13 #include <linux/shm.h>
14 #include <linux/mman.h>
15 #include <linux/locks.h>
16 #include <linux/pagemap.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/blkdev.h>
20 #include <linux/file.h>
21 #include <linux/swapctl.h>
22 #include <linux/slab.h>
23 #include <linux/init.h>
24 #include <linux/mm.h>
25
26 #include <asm/pgalloc.h>
27 #include <asm/uaccess.h>
28 #include <asm/mman.h>
29
30 #include <linux/highmem.h>
31
32 /*
33 * Shared mappings implemented 30.11.1994. It's not fully working yet,
34 * though.
35 *
36 * Shared mappings now work. 15.8.1995 Bruno.
37 *
38 * finished 'unifying' the page and buffer cache and SMP-threaded the
39 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
40 *
41 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
42 */
43
/* Number of pages currently on the page hash; see the hash-queue helpers below. */
atomic_t page_cache_size = ATOMIC_INIT(0);
/* NOTE(review): presumably log2 of the hash table size - it is set up outside this view. */
unsigned int page_hash_bits;
/* NOTE(review): presumably the bucket array that page_hash() indexes - confirm in pagemap.h. */
struct page **page_hash_table;

/* Taken in this file around page-hash and per-mapping page-list manipulation. */
spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
/*
 * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
 * the pagemap_lru_lock held.
 */
spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;

/* Number of pages in one read cluster. */
#define CLUSTER_PAGES (1 << page_cluster)
/* Round page index x down to a multiple of CLUSTER_PAGES. */
#define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster)
57
58 static void add_page_to_hash_queue(struct page * page, struct page **p)
59 {
60 struct page *next = *p;
61
62 *p = page;
63 page->next_hash = next;
64 page->pprev_hash = p;
65 if (next)
66 next->pprev_hash = &page->next_hash;
67 if (page->buffers)
68 PAGE_BUG(page);
69 atomic_inc(&page_cache_size);
70 }
71
72 static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page)
73 {
74 struct list_head *head = &mapping->clean_pages;
75
76 mapping->nrpages++;
77 list_add(&page->list, head);
78 page->mapping = mapping;
79 }
80
81 static inline void remove_page_from_inode_queue(struct page * page)
82 {
83 struct address_space * mapping = page->mapping;
84
85 mapping->nrpages--;
86 list_del(&page->list);
87 page->mapping = NULL;
88 }
89
90 static inline void remove_page_from_hash_queue(struct page * page)
91 {
92 struct page *next = page->next_hash;
93 struct page **pprev = page->pprev_hash;
94
95 if (next)
96 next->pprev_hash = pprev;
97 *pprev = next;
98 page->pprev_hash = NULL;
99 atomic_dec(&page_cache_size);
100 }
101
102 /*
103 * Remove a page from the page cache and free it. Caller has to make
104 * sure the page is locked and that nobody else uses it - or that usage
105 * is safe.
106 */
107 void __remove_inode_page(struct page *page)
108 {
109 if (PageDirty(page)) BUG();
110 remove_page_from_inode_queue(page);
111 remove_page_from_hash_queue(page);
112 page->mapping = NULL;
113 }
114
/*
 * Locked variant of __remove_inode_page(): performs the unlink under
 * pagecache_lock.  The page must already be locked by the caller,
 * otherwise PAGE_BUG() fires.
 */
void remove_inode_page(struct page *page)
{
	if (!PageLocked(page))
		PAGE_BUG(page);

	spin_lock(&pagecache_lock);
	__remove_inode_page(page);
	spin_unlock(&pagecache_lock);
}
124
125 static inline int sync_page(struct page *page)
126 {
127 struct address_space *mapping = page->mapping;
128
129 if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
130 return mapping->a_ops->sync_page(page);
131 return 0;
132 }
133
134 /*
135 * Add a page to the dirty page list.
136 */
137 void __set_page_dirty(struct page *page)
138 {
139 struct address_space *mapping = page->mapping;
140
141 spin_lock(&pagecache_lock);
142 list_del(&page->list);
143 list_add(&page->list, &mapping->dirty_pages);
144 spin_unlock(&pagecache_lock);
145
146 mark_inode_dirty_pages(mapping->host);
147 }
148
149 /**
150 * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
151 * @inode: the inode which pages we want to invalidate
152 *
153 * This function only removes the unlocked pages, if you want to
154 * remove all the pages of one inode, you must call truncate_inode_pages.
155 */
156
157 void invalidate_inode_pages(struct inode * inode)
158 {
159 struct list_head *head, *curr;
160 struct page * page;
161
162 head = &inode->i_mapping->clean_pages;
163
164 spin_lock(&pagecache_lock);
165 spin_lock(&pagemap_lru_lock);
166 curr = head->next;
167
168 while (curr != head) {
169 page = list_entry(curr, struct page, list);
170 curr = curr->next;
171
172 /* We cannot invalidate something in use.. */
173 if (page_count(page) != 1)
174 continue;
175
176 /* ..or dirty.. */
177 if (PageDirty(page))
178 continue;
179
180 /* ..or locked */
181 if (TryLockPage(page))
182 continue;
183
184 __lru_cache_del(page);
185 __remove_inode_page(page);
186 UnlockPage(page);
187 page_cache_release(page);
188 }
189
190 spin_unlock(&pagemap_lru_lock);
191 spin_unlock(&pagecache_lock);
192 }
193
194 static inline void truncate_partial_page(struct page *page, unsigned partial)
195 {
196 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
197
198 if (page->buffers)
199 block_flushpage(page, partial);
200
201 }
202
203 static inline void truncate_complete_page(struct page *page)
204 {
205 /* Leave it on the LRU if it gets converted into anonymous buffers */
206 if (!page->buffers || block_flushpage(page, 0))
207 lru_cache_del(page);
208
209 /*
210 * We remove the page from the page cache _after_ we have
211 * destroyed all buffer-cache references to it. Otherwise some
212 * other process might think this inode page is not in the
213 * page cache and creates a buffer-cache alias to it causing
214 * all sorts of fun problems ...
215 */
216 ClearPageDirty(page);
217 ClearPageUptodate(page);
218 remove_inode_page(page);
219 page_cache_release(page);
220 }
221
/*
 * Scan one of a mapping's page lists for a page affected by a truncate
 * to page index @start, and process at most one such page.
 *
 * @head:    list to scan (pages are linked through page->list)
 * @start:   first page index that is removed completely
 * @partial: in/out; while non-zero, the page at index @start - 1 is
 *           zeroed from this byte offset on instead of being removed,
 *           and *partial is reset to 0 once that has happened
 *
 * Called with pagecache_lock held.  Returns 1 with the lock DROPPED after
 * truncating one page or after waiting on a locked one, so the caller
 * must retake the lock and rescan.  Returns 0 with the lock still held
 * when the list contains nothing to truncate.
 */
static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *));
static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial)
{
	struct list_head *curr;
	struct page * page;

	curr = head->next;
	while (curr != head) {
		unsigned long offset;

		page = list_entry(curr, struct page, list);
		curr = curr->next;
		offset = page->index;

		/* Is one of the pages to truncate? */
		if ((offset >= start) || (*partial && (offset + 1) == start)) {
			if (TryLockPage(page)) {
				/*
				 * Somebody else holds the page lock: pin the
				 * page, drop the cache lock, wait for the page
				 * to be unlocked and let the caller start over.
				 */
				page_cache_get(page);
				spin_unlock(&pagecache_lock);
				wait_on_page(page);
				page_cache_release(page);
				return 1;
			}
			/* We own the page lock; pin the page before dropping the cache lock. */
			page_cache_get(page);
			spin_unlock(&pagecache_lock);

			if (*partial && (offset + 1) == start) {
				truncate_partial_page(page, *partial);
				/* The partial page is handled only once. */
				*partial = 0;
			} else
				truncate_complete_page(page);

			UnlockPage(page);
			page_cache_release(page);
			return 1;
		}
	}
	return 0;
}
261
262
263 /**
264 * truncate_inode_pages - truncate *all* the pages from an offset
265 * @mapping: mapping to truncate
266 * @lstart: offset from with to truncate
267 *
268 * Truncate the page cache at a set offset, removing the pages
269 * that are beyond that offset (and zeroing out partial pages).
270 * If any page is locked we wait for it to become unlocked.
271 */
272 void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
273 {
274 unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
275 unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
276
277 repeat:
278 spin_lock(&pagecache_lock);
279 if (truncate_list_pages(&mapping->clean_pages, start, &partial))
280 goto repeat;
281 if (truncate_list_pages(&mapping->dirty_pages, start, &partial))
282 goto repeat;
283 if (truncate_list_pages(&mapping->locked_pages, start, &partial))
284 goto repeat;
285 spin_unlock(&pagecache_lock);
286 }
287
/*
 * Walk a page hash chain, starting at @page, looking for the page that
 * belongs to @mapping at index @offset.  Returns that page, or NULL when
 * the chain ends without a match.
 *
 * No lock is taken here; the callers in this file hold pagecache_lock
 * around the call.  A successful lookup also ages the page up and may
 * wake kswapd.
 */
static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
{
	/* Enter the loop at the test so the first candidate is @page itself. */
	goto inside;

	for (;;) {
		page = page->next_hash;
inside:
		if (!page)
			goto not_found;
		if (page->mapping != mapping)
			continue;
		if (page->index == offset)
			break;
	}
	/*
	 * Touching the page may move it to the active list.
	 * If we end up with too few inactive pages, we wake
	 * up kswapd.
	 */
	age_page_up(page);
	if (inactive_shortage() > inactive_target / 2 && free_shortage())
		wakeup_kswapd(0);
not_found:
	/* page is NULL here when the lookup failed. */
	return page;
}
313
314 /*
315 * By the time this is called, the page is locked and
316 * we don't have to worry about any races any more.
317 *
318 * Start the IO..
319 */
320 static int writeout_one_page(struct page *page)
321 {
322 struct buffer_head *bh, *head = page->buffers;
323
324 bh = head;
325 do {
326 if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
327 continue;
328
329 bh->b_flushtime = jiffies;
330 ll_rw_block(WRITE, 1, &bh);
331 } while ((bh = bh->b_this_page) != head);
332 return 0;
333 }
334
335 static int waitfor_one_page(struct page *page)
336 {
337 int error = 0;
338 struct buffer_head *bh, *head = page->buffers;
339
340 bh = head;
341 do {
342 wait_on_buffer(bh);
343 if (buffer_req(bh) && !buffer_uptodate(bh))
344 error = -EIO;
345 } while ((bh = bh->b_this_page) != head);
346 return error;
347 }
348
/*
 * Apply @fn to every page on the list @head that has buffers and whose
 * index lies in [@start, @end).  Each such page is pinned and locked
 * around the call, with pagecache_lock dropped in the meantime.
 *
 * Returns the bitwise OR of all @fn results.
 */
static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *))
{
	struct list_head *curr;
	struct page *page;
	int retval = 0;

	spin_lock(&pagecache_lock);
	curr = head->next;
	while (curr != head) {
		page = list_entry(curr, struct page, list);
		curr = curr->next;
		if (!page->buffers)
			continue;
		if (page->index >= end)
			continue;
		if (page->index < start)
			continue;

		/* Pin the page so it cannot go away while we sleep on its lock. */
		page_cache_get(page);
		spin_unlock(&pagecache_lock);
		lock_page(page);

		/* The buffers could have been free'd while we waited for the page lock */
		if (page->buffers)
			retval |= fn(page);

		UnlockPage(page);
		spin_lock(&pagecache_lock);
		/*
		 * Continue from this page's current successor; our own
		 * reference is only dropped after reading the link.
		 */
		curr = page->list.next;
		page_cache_release(page);
	}
	spin_unlock(&pagecache_lock);

	return retval;
}
384
385 /*
386 * Two-stage data sync: first start the IO, then go back and
387 * collect the information..
388 */
389 int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
390 {
391 int retval;
392
393 /* writeout dirty buffers on pages from both clean and dirty lists */
394 retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page);
395 retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page);
396 retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page);
397
398 /* now wait for locked buffers on pages from both clean and dirty lists */
399 retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page);
400 retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page);
401 retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page);
402
403 return retval;
404 }
405
/**
 * filemap_fdatasync - walk the list of dirty pages of the given address space
 * and writepage() all of them.
 *
 * @mapping: address space structure to write
 *
 * Every page taken off dirty_pages is moved to locked_pages.  The return
 * value of writepage() is ignored here.
 */
void filemap_fdatasync(struct address_space * mapping)
{
	int (*writepage)(struct page *) = mapping->a_ops->writepage;

	spin_lock(&pagecache_lock);

	while (!list_empty(&mapping->dirty_pages)) {
		struct page *page = list_entry(mapping->dirty_pages.next, struct page, list);

		/* Moving the page first guarantees the loop makes progress. */
		list_del(&page->list);
		list_add(&page->list, &mapping->locked_pages);

		if (!PageDirty(page))
			continue;

		/* Pin the page; lock_page() may sleep, so drop the spinlock. */
		page_cache_get(page);
		spin_unlock(&pagecache_lock);

		lock_page(page);

		/* Re-test: the dirty bit may have been cleared while we waited. */
		if (PageDirty(page)) {
			ClearPageDirty(page);
			/* NOTE(review): presumably writepage() unlocks the page - it is not unlocked here. */
			writepage(page);
		} else
			UnlockPage(page);

		page_cache_release(page);
		spin_lock(&pagecache_lock);
	}
	spin_unlock(&pagecache_lock);
}
444
/**
 * filemap_fdatawait - walk the list of locked pages of the given address space
 * and wait for all of them.
 *
 * @mapping: address space structure to wait for
 *
 * Every page taken off locked_pages is moved to clean_pages, whether or
 * not it had to be waited for.
 */
void filemap_fdatawait(struct address_space * mapping)
{
	spin_lock(&pagecache_lock);

	while (!list_empty(&mapping->locked_pages)) {
		struct page *page = list_entry(mapping->locked_pages.next, struct page, list);

		/* Moving the page first guarantees the loop makes progress. */
		list_del(&page->list);
		list_add(&page->list, &mapping->clean_pages);

		if (!PageLocked(page))
			continue;

		/* Pin the page and drop the spinlock before sleeping. */
		page_cache_get(page);
		spin_unlock(&pagecache_lock);

		___wait_on_page(page);

		page_cache_release(page);
		spin_lock(&pagecache_lock);
	}
	spin_unlock(&pagecache_lock);
}
475
476 /*
477 * Add a page to the inode page cache.
478 *
479 * The caller must have locked the page and
480 * set all the page flags correctly..
481 */
482 void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
483 {
484 if (!PageLocked(page))
485 BUG();
486
487 page_cache_get(page);
488 spin_lock(&pagecache_lock);
489 page->index = index;
490 add_page_to_inode_queue(mapping, page);
491 add_page_to_hash_queue(page, page_hash(mapping, index));
492 lru_cache_add(page);
493 spin_unlock(&pagecache_lock);
494 }
495
496 /*
497 * This adds a page to the page cache, starting out as locked,
498 * owned by us, but unreferenced, not uptodate and with no errors.
499 */
500 static inline void __add_to_page_cache(struct page * page,
501 struct address_space *mapping, unsigned long offset,
502 struct page **hash)
503 {
504 unsigned long flags;
505
506 if (PageLocked(page))
507 BUG();
508
509 flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced) | (1 << PG_arch_1));
510 page->flags = flags | (1 << PG_locked);
511 page_cache_get(page);
512 page->index = offset;
513 add_page_to_inode_queue(mapping, page);
514 add_page_to_hash_queue(page, hash);
515 lru_cache_add(page);
516 }
517
518 void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
519 {
520 spin_lock(&pagecache_lock);
521 __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
522 spin_unlock(&pagecache_lock);
523 }
524
525 static int add_to_page_cache_unique(struct page * page,
526 struct address_space *mapping, unsigned long offset,
527 struct page **hash)
528 {
529 int err;
530 struct page *alias;
531
532 spin_lock(&pagecache_lock);
533 alias = __find_page_nolock(mapping, offset, *hash);
534
535 err = 1;
536 if (!alias) {
537 __add_to_page_cache(page,mapping,offset,hash);
538 err = 0;
539 }
540
541 spin_unlock(&pagecache_lock);
542 return err;
543 }
544
545 /*
546 * This adds the requested page to the page cache if it isn't already there,
547 * and schedules an I/O to read in its contents from disk.
548 */
549 static inline int page_cache_read(struct file * file, unsigned long offset)
550 {
551 struct inode *inode = file->f_dentry->d_inode;
552 struct address_space *mapping = inode->i_mapping;
553 struct page **hash = page_hash(mapping, offset);
554 struct page *page;
555
556 spin_lock(&pagecache_lock);
557 page = __find_page_nolock(mapping, offset, *hash);
558 spin_unlock(&pagecache_lock);
559 if (page)
560 return 0;
561
562 page = page_cache_alloc();
563 if (!page)
564 return -ENOMEM;
565
566 if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
567 int error = mapping->a_ops->readpage(file, page);
568 page_cache_release(page);
569 return error;
570 }
571 /*
572 * We arrive here in the unlikely event that someone
573 * raced with us and added our page to the cache first.
574 */
575 page_cache_free(page);
576 return 0;
577 }
578
579 /*
580 * Read in an entire cluster at once. A cluster is usually a 64k-
581 * aligned block that includes the page requested in "offset."
582 */
583 static int read_cluster_nonblocking(struct file * file, unsigned long offset,
584 unsigned long filesize)
585 {
586 unsigned long pages = CLUSTER_PAGES;
587
588 offset = CLUSTER_OFFSET(offset);
589 while ((pages-- > 0) && (offset < filesize)) {
590 int error = page_cache_read(file, offset);
591 if (error < 0)
592 return error;
593 offset ++;
594 }
595
596 return 0;
597 }
598
/*
 * Wait for a page to get unlocked.
 *
 * This must be called with the caller "holding" the page,
 * ie with increased "page->count" so that the page won't
 * go away during the wait..
 */
void ___wait_on_page(struct page *page)
{
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	add_wait_queue(&page->wait, &wait);
	do {
		sync_page(page);
		/*
		 * The task state is set before PageLocked() is re-tested;
		 * presumably so that a wakeup arriving between the test and
		 * schedule() is not lost.
		 */
		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		if (!PageLocked(page))
			break;
		/* Run the disk task queue before going to sleep. */
		run_task_queue(&tq_disk);
		schedule();
	} while (PageLocked(page));
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&page->wait, &wait);
}
623
/*
 * Get a lock on the page, assuming we need to sleep
 * to get it..
 *
 * Sleeps on the page's wait queue as an exclusive waiter and retries
 * TryLockPage() each time the page is seen unlocked.
 */
static void __lock_page(struct page *page)
{
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	add_wait_queue_exclusive(&page->wait, &wait);
	for (;;) {
		sync_page(page);
		/* State is set before the lock test, as in ___wait_on_page(). */
		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		if (PageLocked(page)) {
			run_task_queue(&tq_disk);
			schedule();
			continue;
		}
		/* Page looked unlocked: try to take it, loop again if we lost the race. */
		if (!TryLockPage(page))
			break;
	}
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&page->wait, &wait);
}
648
649
/*
 * Get an exclusive lock on the page.  The uncontended case is a single
 * TryLockPage(); only when that fails do we fall back to the sleeping
 * slow path in __lock_page().
 */
void lock_page(struct page *page)
{
	if (!TryLockPage(page))
		return;

	__lock_page(page);
}
659
660 /*
661 * a rather lightweight function, finding and getting a reference to a
662 * hashed page atomically, waiting for it if it's locked.
663 */
664 struct page * __find_get_page(struct address_space *mapping,
665 unsigned long offset, struct page **hash)
666 {
667 struct page *page;
668
669 /*
670 * We scan the hash list read-only. Addition to and removal from
671 * the hash-list needs a held write-lock.
672 */
673 spin_lock(&pagecache_lock);
674 page = __find_page_nolock(mapping, offset, *hash);
675 if (page)
676 page_cache_get(page);
677 spin_unlock(&pagecache_lock);
678 return page;
679 }
680
681 /*
682 * Get the lock to a page atomically.
683 */
684 struct page * __find_lock_page (struct address_space *mapping,
685 unsigned long offset, struct page **hash)
686 {
687 struct page *page;
688
689 /*
690 * We scan the hash list read-only. Addition to and removal from
691 * the hash-list needs a held write-lock.
692 */
693 repeat:
694 spin_lock(&pagecache_lock);
695 page = __find_page_nolock(mapping, offset, *hash);
696 if (page) {
697 page_cache_get(page);
698 spin_unlock(&pagecache_lock);
699
700 lock_page(page);
701
702 /* Is the page still hashed? Ok, good.. */
703 if (page->mapping)
704 return page;
705
706 /* Nope: we raced. Release and try again.. */
707 UnlockPage(page);
708 page_cache_release(page);
709 goto repeat;
710 }
711 spin_unlock(&pagecache_lock);
712 return NULL;
713 }
714
/*
 * Debug switches, disabled by default.  PROFILE_READAHEAD compiles in
 * profile_readahead(); DEBUG_READAHEAD adds a per-file snapshot to its
 * output.
 */
#if 0
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD
#endif
719
720 /*
721 * We combine this with read-ahead to deactivate pages when we
722 * think there's sequential IO going on. Note that this is
723 * harmless since we don't actually evict the pages from memory
724 * but just move them to the inactive list.
725 *
726 * TODO:
727 * - make the readahead code smarter
728 * - move readahead to the VMA level so we can do the same
729 * trick with mmap()
730 *
731 * Rik van Riel, 2000
732 */
733 static void drop_behind(struct file * file, unsigned long index)
734 {
735 struct inode *inode = file->f_dentry->d_inode;
736 struct address_space *mapping = inode->i_mapping;
737 struct page **hash;
738 struct page *page;
739 unsigned long start;
740
741 /* Nothing to drop-behind if we're on the first page. */
742 if (!index)
743 return;
744
745 if (index > file->f_rawin)
746 start = index - file->f_rawin;
747 else
748 start = 0;
749
750 /*
751 * Go backwards from index-1 and drop all pages in the
752 * readahead window. Since the readahead window may have
753 * been increased since the last time we were called, we
754 * stop when the page isn't there.
755 */
756 spin_lock(&pagecache_lock);
757 while (--index >= start) {
758 hash = page_hash(mapping, index);
759 page = __find_page_nolock(mapping, index, *hash);
760 if (!page)
761 break;
762 deactivate_page(page);
763 }
764 spin_unlock(&pagecache_lock);
765 }
766
767 /*
768 * Read-ahead profiling information
769 * --------------------------------
770 * Every PROFILE_MAXREADCOUNT, the following information is written
771 * to the syslog:
772 * Percentage of asynchronous read-ahead.
773 * Average of read-ahead fields context value.
774 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
775 * to the syslog.
776 */
777
#ifdef PROFILE_READAHEAD

/* Number of profile_readahead() calls between two reports. */
#define PROFILE_MAXREADCOUNT 1000

/* Running totals since the last report. */
static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

/*
 * Account one read-ahead pass for @filp (@async non-zero if it was an
 * asynchronous one) and, once more than PROFILE_MAXREADCOUNT passes
 * have accumulated, print the averages and reset the totals.
 */
static void profile_readahead(int async, struct file *filp)
{
	unsigned long flags;

	++total_reada;
	if (async)
		++total_async;

	total_ramax += filp->f_ramax;
	total_ralen += filp->f_ralen;
	total_rawin += filp->f_rawin;

	if (total_reada > PROFILE_MAXREADCOUNT) {
		save_flags(flags);
		cli();
		/* Re-check with interrupts off: the totals may have been reset meanwhile. */
		if (!(total_reada > PROFILE_MAXREADCOUNT)) {
			restore_flags(flags);
			return;
		}

		printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
			total_ramax/total_reada,
			total_ralen/total_reada,
			total_rawin/total_reada,
			(total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
		printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
			filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
#endif

		total_reada = 0;
		total_async = 0;
		total_ramax = 0;
		total_ralen = 0;
		total_rawin = 0;

		restore_flags(flags);
	}
}
#endif /* defined PROFILE_READAHEAD */
828
829 /*
830 * Read-ahead context:
831 * -------------------
832 * The read ahead context fields of the "struct file" are the following:
833 * - f_raend : position of the first byte after the last page we tried to
834 * read ahead.
835 * - f_ramax : current read-ahead maximum size.
836 * - f_ralen : length of the current IO read block we tried to read-ahead.
837 * - f_rawin : length of the current read-ahead window.
838 * if last read-ahead was synchronous then
839 * f_rawin = f_ralen
840 * otherwise (was asynchronous)
841 * f_rawin = previous value of f_ralen + f_ralen
842 *
843 * Read-ahead limits:
844 * ------------------
845 * MIN_READAHEAD : minimum read-ahead size when read-ahead.
846 * MAX_READAHEAD : maximum read-ahead size when read-ahead.
847 *
848 * Synchronous read-ahead benefits:
849 * --------------------------------
 * Using a reasonable IO transfer length from peripheral devices increases
 * system performance.
 * Reasonable means, in this context, not too large but not too small.
 * The actual maximum value is:
 *	MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
 *      and 32K if defined (4K page size assumed).
856 *
857 * Asynchronous read-ahead benefits:
858 * ---------------------------------
859 * Overlapping next read request and user process execution increase system
860 * performance.
861 *
862 * Read-ahead risks:
863 * -----------------
 * We have to guess which further data are needed by the user process.
 * If these data are often not really needed, it's bad for system
 * performance.
 * However, we know that files are often accessed sequentially by
 * application programs and it seems that it is possible to have some good
 * strategy in that guessing.
 * We only try to read-ahead files that seem to be read sequentially.
871 *
872 * Asynchronous read-ahead risks:
873 * ------------------------------
874 * In order to maximize overlapping, we must start some asynchronous read
875 * request from the device, as soon as possible.
876 * We must be very careful about:
877 * - The number of effective pending IO read requests.
878 * ONE seems to be the only reasonable value.
879 * - The total memory pool usage for the file access stream.
880 * This maximum memory usage is implicitly 2 IO read chunks:
881 * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
882 * 64k if defined (4K page size assumed).
883 */
884
885 static inline int get_max_readahead(struct inode * inode)
886 {
887 if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
888 return MAX_READAHEAD;
889 return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
890 }
891
/*
 * Decide whether to read ahead behind @page, issue the reads and update
 * the read-ahead context (f_raend, f_ralen, f_rawin, f_ramax) in @filp.
 *
 * @reada_ok: non-zero if the caller considers the access pattern
 *            sequential enough to continue read-ahead; set to 2
 *            internally once an asynchronous read-ahead is chosen
 * @filp:     file being read, carrier of the read-ahead context
 * @inode:    inode of the file (for i_size and the per-device limit)
 * @page:     page the caller is currently reading; its locked state
 *            selects between the synchronous and asynchronous case
 */
static void generic_file_readahead(int reada_ok,
	struct file * filp, struct inode * inode,
	struct page * page)
{
	unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
	unsigned long index = page->index;
	unsigned long max_ahead, ahead;
	unsigned long raend;
	/* Local limit; note that it shadows the global max_readahead[] table. */
	int max_readahead = get_max_readahead(inode);

	raend = filp->f_raend;
	max_ahead = 0;

/*
 * The current page is locked.
 * If the current position is inside the previous read IO request, do not
 * try to reread previously read ahead pages.
 * Otherwise decide or not to read ahead some pages synchronously.
 * If we are not going to read ahead, set the read ahead context for this
 * page only.
 */
	if (PageLocked(page)) {
		if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) {
			raend = index;
			if (raend < end_index)
				max_ahead = filp->f_ramax;
			filp->f_rawin = 0;
			filp->f_ralen = 1;
			if (!max_ahead) {
				filp->f_raend = index + filp->f_ralen;
				filp->f_rawin += filp->f_ralen;
			}
		}
	}
/*
 * The current page is not locked.
 * If we were reading ahead and,
 * if the current max read ahead size is not zero and,
 * if the current position is inside the last read-ahead IO request,
 *   it is the moment to try to read ahead asynchronously.
 * We will later force unplug device in order to force asynchronous read IO.
 */
	else if (reada_ok && filp->f_ramax && raend >= 1 &&
		 index <= raend && index + filp->f_ralen >= raend) {
/*
 * Add ONE page to max_ahead in order to try to have about the same IO max size
 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
 * Compute the position of the last page we have tried to read in order to
 * begin to read ahead just at the next page.
 */
		raend -= 1;
		if (raend < end_index)
			max_ahead = filp->f_ramax + 1;

		if (max_ahead) {
			filp->f_rawin = filp->f_ralen;
			filp->f_ralen = 0;
			reada_ok = 2;
		}
	}
/*
 * Try to read ahead pages.
 * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the
 * scheduler, will work enough for us to avoid too bad actuals IO requests.
 */
	ahead = 0;
	while (ahead < max_ahead) {
		ahead ++;
		/* Stop at end_index or at the first page_cache_read() failure. */
		if ((raend + ahead) >= end_index)
			break;
		if (page_cache_read(filp, raend + ahead) < 0)
			break;
	}
/*
 * If we tried to read ahead some pages,
 * If we tried to read ahead asynchronously,
 *   Try to force unplug of the device in order to start an asynchronous
 *   read IO request.
 * Update the read-ahead context.
 * Store the length of the current read-ahead window.
 * Double the current max read ahead size.
 *   That heuristic avoid to do some large IO for files that are not really
 *   accessed sequentially.
 */
	if (ahead) {
		if (reada_ok == 2) {
			run_task_queue(&tq_disk);
		}

		filp->f_ralen += ahead;
		filp->f_rawin += filp->f_ralen;
		filp->f_raend = raend + ahead + 1;

		filp->f_ramax += filp->f_ramax;

		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;

		/*
		 * Move the pages that have already been passed
		 * to the inactive list.
		 */
		drop_behind(filp, index);

#ifdef PROFILE_READAHEAD
		profile_readahead((reada_ok == 2), filp);
#endif
	}

	return;
}
1003
1004
1005 /*
1006 * This is a generic file read routine, and uses the
1007 * inode->i_op->readpage() function for the actual low-level
1008 * stuff.
1009 *
1010 * This is really ugly. But the goto's actually try to clarify some
1011 * of the logic when it comes to error handling etc.
1012 */
1013 void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
1014 {
1015 struct inode *inode = filp->f_dentry->d_inode;
1016 struct address_space *mapping = inode->i_mapping;
1017 unsigned long index, offset;
1018 struct page *cached_page;
1019 int reada_ok;
1020 int error;
1021 int max_readahead = get_max_readahead(inode);
1022
1023 cached_page = NULL;
1024 index = *ppos >> PAGE_CACHE_SHIFT;
1025 offset = *ppos & ~PAGE_CACHE_MASK;
1026
1027 /*
1028 * If the current position is outside the previous read-ahead window,
1029 * we reset the current read-ahead context and set read ahead max to zero
1030 * (will be set to just needed value later),
1031 * otherwise, we assume that the file accesses are sequential enough to
1032 * continue read-ahead.
1033 */
1034 if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
1035 reada_ok = 0;
1036 filp->f_raend = 0;
1037 filp->f_ralen = 0;
1038 filp->f_ramax = 0;
1039 filp->f_rawin = 0;
1040 } else {
1041 reada_ok = 1;
1042 }
1043 /*
1044 * Adjust the current value of read-ahead max.
1045 * If the read operation stay in the first half page, force no readahead.
1046 * Otherwise try to increase read ahead max just enough to do the read request.
1047 * Then, at least MIN_READAHEAD if read ahead is ok,
1048 * and at most MAX_READAHEAD in all cases.
1049 */
1050 if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
1051 filp->f_ramax = 0;
1052 } else {
1053 unsigned long needed;
1054
1055 needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;
1056
1057 if (filp->f_ramax < needed)
1058 filp->f_ramax = needed;
1059
1060 if (reada_ok && filp->f_ramax < MIN_READAHEAD)
1061 filp->f_ramax = MIN_READAHEAD;
1062 if (filp->f_ramax > max_readahead)
1063 filp->f_ramax = max_readahead;
1064 }
1065
1066 for (;;) {
1067 struct page *page, **hash;
1068 unsigned long end_index, nr;
1069
1070 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1071 if (index > end_index)
1072 break;
1073 nr = PAGE_CACHE_SIZE;
1074 if (index == end_index) {
1075 nr = inode->i_size & ~PAGE_CACHE_MASK;
1076 if (nr <= offset)
1077 break;
1078 }
1079
1080 nr = nr - offset;
1081
1082 /*
1083 * Try to find the data in the page cache..
1084 */
1085 hash = page_hash(mapping, index);
1086
1087 spin_lock(&pagecache_lock);
1088 page = __find_page_nolock(mapping, index, *hash);
1089 if (!page)
1090 goto no_cached_page;
1091 found_page:
1092 page_cache_get(page);
1093 spin_unlock(&pagecache_lock);
1094
1095 if (!Page_Uptodate(page))
1096 goto page_not_up_to_date;
1097 generic_file_readahead(reada_ok, filp, inode, page);
1098 page_ok:
1099 /* If users can be writing to this page using arbitrary
1100 * virtual addresses, take care about potential aliasing
1101 * before reading the page on the kernel side.
1102 */
1103 if (mapping->i_mmap_shared != NULL)
1104 flush_dcache_page(page);
1105
1106 /*
1107 * Ok, we have the page, and it's up-to-date, so
1108 * now we can copy it to user space...
1109 *
1110 * The actor routine returns how many bytes were actually used..
1111 * NOTE! This may not be the same as how much of a user buffer
1112 * we filled up (we may be padding etc), so we can only update
1113 * "pos" here (the actor routine has to update the user buffer
1114 * pointers and the remaining count).
1115 */
1116 nr = actor(desc, page, offset, nr);
1117 offset += nr;
1118 index += offset >> PAGE_CACHE_SHIFT;
1119 offset &= ~PAGE_CACHE_MASK;
1120
1121 page_cache_release(page);
1122 if (nr && desc->count)
1123 continue;
1124 break;
1125
1126 /*
1127 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1128 */
1129 page_not_up_to_date:
1130 generic_file_readahead(reada_ok, filp, inode, page);
1131
1132 if (Page_Uptodate(page))
1133 goto page_ok;
1134
1135 /* Get exclusive access to the page ... */
1136 lock_page(page);
1137
1138 /* Did it get unhashed before we got the lock? */
1139 if (!page->mapping) {
1140 UnlockPage(page);
1141 page_cache_release(page);
1142 continue;
1143 }
1144
1145 /* Did somebody else fill it already? */
1146 if (Page_Uptodate(page)) {
1147 UnlockPage(page);
1148 goto page_ok;
1149 }
1150
1151 readpage:
1152 /* ... and start the actual read. The read will unlock the page. */
1153 error = mapping->a_ops->readpage(filp, page);
1154
1155 if (!error) {
1156 if (Page_Uptodate(page))
1157 goto page_ok;
1158
1159 /* Again, try some read-ahead while waiting for the page to finish.. */
1160 generic_file_readahead(reada_ok, filp, inode, page);
1161 wait_on_page(page);
1162 if (Page_Uptodate(page))
1163 goto page_ok;
1164 error = -EIO;
1165 }
1166
1167 /* UHHUH! A synchronous read error occurred. Report it */
1168 desc->error = error;
1169 page_cache_release(page);
1170 break;
1171
1172 no_cached_page:
1173 /*
1174 * Ok, it wasn't cached, so we need to create a new
1175 * page..
1176 *
1177 * We get here with the page cache lock held.
1178 */
1179 if (!cached_page) {
1180 spin_unlock(&pagecache_lock);
1181 cached_page = page_cache_alloc();
1182 if (!cached_page) {
1183 desc->error = -ENOMEM;
1184 break;
1185 }
1186
1187 /*
1188 * Somebody may have added the page while we
1189 * dropped the page cache lock. Check for that.
1190 */
1191 spin_lock(&pagecache_lock);
1192 page = __find_page_nolock(mapping, index, *hash);
1193 if (page)
1194 goto found_page;
1195 }
1196
1197 /*
1198 * Ok, add the new page to the hash-queues...
1199 */
1200 page = cached_page;
1201 __add_to_page_cache(page, mapping, index, hash);
1202 spin_unlock(&pagecache_lock);
1203 cached_page = NULL;
1204
1205 goto readpage;
1206 }
1207
1208 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1209 filp->f_reada = 1;
1210 if (cached_page)
1211 page_cache_free(cached_page);
1212 UPDATE_ATIME(inode);
1213 }
1214
1215 static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1216 {
1217 char *kaddr;
1218 unsigned long left, count = desc->count;
1219
1220 if (size > count)
1221 size = count;
1222
1223 kaddr = kmap(page);
1224 left = __copy_to_user(desc->buf, kaddr + offset, size);
1225 kunmap(page);
1226
1227 if (left) {
1228 size -= left;
1229 desc->error = -EFAULT;
1230 }
1231 desc->count = count - size;
1232 desc->written += size;
1233 desc->buf += size;
1234 return size;
1235 }
1236
1237 /*
1238 * This is the "read()" routine for all filesystems
1239 * that can use the page cache directly.
1240 */
1241 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1242 {
1243 ssize_t retval;
1244
1245 retval = -EFAULT;
1246 if (access_ok(VERIFY_WRITE, buf, count)) {
1247 retval = 0;
1248
1249 if (count) {
1250 read_descriptor_t desc;
1251
1252 desc.written = 0;
1253 desc.count = count;
1254 desc.buf = buf;
1255 desc.error = 0;
1256 do_generic_file_read(filp, ppos, &desc, file_read_actor);
1257
1258 retval = desc.written;
1259 if (!retval)
1260 retval = desc.error;
1261 }
1262 }
1263 return retval;
1264 }
1265
1266 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
1267 {
1268 char *kaddr;
1269 ssize_t written;
1270 unsigned long count = desc->count;
1271 struct file *file = (struct file *) desc->buf;
1272 mm_segment_t old_fs;
1273
1274 if (size > count)
1275 size = count;
1276 old_fs = get_fs();
1277 set_fs(KERNEL_DS);
1278
1279 kaddr = kmap(page);
1280 written = file->f_op->write(file, kaddr + offset, size, &file->f_pos);
1281 kunmap(page);
1282 set_fs(old_fs);
1283 if (written < 0) {
1284 desc->error = written;
1285 written = 0;
1286 }
1287 desc->count = count - written;
1288 desc->written += written;
1289 return written;
1290 }
1291
1292 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1293 {
1294 ssize_t retval;
1295 struct file * in_file, * out_file;
1296 struct inode * in_inode, * out_inode;
1297
1298 /*
1299 * Get input file, and verify that it is ok..
1300 */
1301 retval = -EBADF;
1302 in_file = fget(in_fd);
1303 if (!in_file)
1304 goto out;
1305 if (!(in_file->f_mode & FMODE_READ))
1306 goto fput_in;
1307 retval = -EINVAL;
1308 in_inode = in_file->f_dentry->d_inode;
1309 if (!in_inode)
1310 goto fput_in;
1311 if (!in_inode->i_mapping->a_ops->readpage)
1312 goto fput_in;
1313 retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
1314 if (retval)
1315 goto fput_in;
1316
1317 /*
1318 * Get output file, and verify that it is ok..
1319 */
1320 retval = -EBADF;
1321 out_file = fget(out_fd);
1322 if (!out_file)
1323 goto fput_in;
1324 if (!(out_file->f_mode & FMODE_WRITE))
1325 goto fput_out;
1326 retval = -EINVAL;
1327 if (!out_file->f_op || !out_file->f_op->write)
1328 goto fput_out;
1329 out_inode = out_file->f_dentry->d_inode;
1330 retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
1331 if (retval)
1332 goto fput_out;
1333
1334 retval = 0;
1335 if (count) {
1336 read_descriptor_t desc;
1337 loff_t pos = 0, *ppos;
1338
1339 retval = -EFAULT;
1340 ppos = &in_file->f_pos;
1341 if (offset) {
1342 if (get_user(pos, offset))
1343 goto fput_out;
1344 ppos = &pos;
1345 }
1346
1347 desc.written = 0;
1348 desc.count = count;
1349 desc.buf = (char *) out_file;
1350 desc.error = 0;
1351 do_generic_file_read(in_file, ppos, &desc, file_send_actor);
1352
1353 retval = desc.written;
1354 if (!retval)
1355 retval = desc.error;
1356 if (offset)
1357 put_user(pos, offset);
1358 }
1359
1360 fput_out:
1361 fput(out_file);
1362 fput_in:
1363 fput(in_file);
1364 out:
1365 return retval;
1366 }
1367
1368 /*
1369 * Read-ahead and flush behind for MADV_SEQUENTIAL areas. Since we are
1370 * sure this is sequential access, we don't need a flexible read-ahead
1371 * window size -- we can always use a large fixed size window.
1372 */
1373 static void nopage_sequential_readahead(struct vm_area_struct * vma,
1374 unsigned long pgoff, unsigned long filesize)
1375 {
1376 unsigned long ra_window;
1377
1378 ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
1379 ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
1380
1381 /* vm_raend is zero if we haven't read ahead in this area yet. */
1382 if (vma->vm_raend == 0)
1383 vma->vm_raend = vma->vm_pgoff + ra_window;
1384
1385 /*
1386 * If we've just faulted the page half-way through our window,
1387 * then schedule reads for the next window, and release the
1388 * pages in the previous window.
1389 */
1390 if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
1391 unsigned long start = vma->vm_pgoff + vma->vm_raend;
1392 unsigned long end = start + ra_window;
1393
1394 if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
1395 end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
1396 if (start > end)
1397 return;
1398
1399 while ((start < end) && (start < filesize)) {
1400 if (read_cluster_nonblocking(vma->vm_file,
1401 start, filesize) < 0)
1402 break;
1403 start += CLUSTER_PAGES;
1404 }
1405 run_task_queue(&tq_disk);
1406
1407 /* if we're far enough past the beginning of this area,
1408 recycle pages that are in the previous window. */
1409 if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
1410 unsigned long window = ra_window << PAGE_SHIFT;
1411
1412 end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
1413 end -= window + window;
1414 filemap_sync(vma, end - window, window, MS_INVALIDATE);
1415 }
1416
1417 vma->vm_raend += ra_window;
1418 }
1419
1420 return;
1421 }
1422
1423 /*
1424 * filemap_nopage() is invoked via the vma operations vector for a
1425 * mapped memory region to read in file data during a page fault.
1426 *
1427 * The goto's are kind of ugly, but this streamlines the normal case of having
1428 * it in the page cache, and handles the special cases reasonably without
1429 * having a lot of duplicated code.
1430 */
1431 struct page * filemap_nopage(struct vm_area_struct * area,
1432 unsigned long address, int no_share)
1433 {
1434 int error;
1435 struct file *file = area->vm_file;
1436 struct inode *inode = file->f_dentry->d_inode;
1437 struct address_space *mapping = inode->i_mapping;
1438 struct page *page, **hash, *old_page;
1439 unsigned long size, pgoff;
1440
1441 pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1442
1443 retry_all:
1444 /*
1445 * An external ptracer can access pages that normally aren't
1446 * accessible..
1447 */
1448 size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1449 if ((pgoff >= size) && (area->vm_mm == current->mm))
1450 return NULL;
1451
1452 /*
1453 * Do we have something in the page cache already?
1454 */
1455 hash = page_hash(mapping, pgoff);
1456 retry_find:
1457 page = __find_get_page(mapping, pgoff, hash);
1458 if (!page)
1459 goto no_cached_page;
1460
1461 /*
1462 * Ok, found a page in the page cache, now we need to check
1463 * that it's up-to-date.
1464 */
1465 if (!Page_Uptodate(page))
1466 goto page_not_uptodate;
1467
1468 success:
1469 /*
1470 * Try read-ahead for sequential areas.
1471 */
1472 if (VM_SequentialReadHint(area))
1473 nopage_sequential_readahead(area, pgoff, size);
1474
1475 /*
1476 * Found the page and have a reference on it, need to check sharing
1477 * and possibly copy it over to another page..
1478 */
1479 old_page = page;
1480 if (no_share) {
1481 struct page *new_page = page_cache_alloc();
1482
1483 if (new_page) {
1484 copy_user_highpage(new_page, old_page, address);
1485 flush_page_to_ram(new_page);
1486 } else
1487 new_page = NOPAGE_OOM;
1488 page_cache_release(page);
1489 return new_page;
1490 }
1491
1492 flush_page_to_ram(old_page);
1493 return old_page;
1494
1495 no_cached_page:
1496 /*
1497 * If the requested offset is within our file, try to read a whole
1498 * cluster of pages at once.
1499 *
1500 * Otherwise, we're off the end of a privately mapped file,
1501 * so we need to map a zero page.
1502 */
1503 if ((pgoff < size) && !VM_RandomReadHint(area))
1504 error = read_cluster_nonblocking(file, pgoff, size);
1505 else
1506 error = page_cache_read(file, pgoff);
1507
1508 /*
1509 * The page we want has now been added to the page cache.
1510 * In the unlikely event that someone removed it in the
1511 * meantime, we'll just come back here and read it again.
1512 */
1513 if (error >= 0)
1514 goto retry_find;
1515
1516 /*
1517 * An error return from page_cache_read can result if the
1518 * system is low on memory, or a problem occurs while trying
1519 * to schedule I/O.
1520 */
1521 if (error == -ENOMEM)
1522 return NOPAGE_OOM;
1523 return NULL;
1524
1525 page_not_uptodate:
1526 lock_page(page);
1527
1528 /* Did it get unhashed while we waited for it? */
1529 if (!page->mapping) {
1530 UnlockPage(page);
1531 page_cache_release(page);
1532 goto retry_all;
1533 }
1534
1535 /* Did somebody else get it up-to-date? */
1536 if (Page_Uptodate(page)) {
1537 UnlockPage(page);
1538 goto success;
1539 }
1540
1541 if (!mapping->a_ops->readpage(file, page)) {
1542 wait_on_page(page);
1543 if (Page_Uptodate(page))
1544 goto success;
1545 }
1546
1547 /*
1548 * Umm, take care of errors if the page isn't up-to-date.
1549 * Try to re-read it _once_. We do this synchronously,
1550 * because there really aren't any performance issues here
1551 * and we need to check for errors.
1552 */
1553 lock_page(page);
1554
1555 /* Somebody truncated the page on us? */
1556 if (!page->mapping) {
1557 UnlockPage(page);
1558 page_cache_release(page);
1559 goto retry_all;
1560 }
1561
1562 /* Somebody else successfully read it in? */
1563 if (Page_Uptodate(page)) {
1564 UnlockPage(page);
1565 goto success;
1566 }
1567 ClearPageError(page);
1568 if (!mapping->a_ops->readpage(file, page)) {
1569 wait_on_page(page);
1570 if (Page_Uptodate(page))
1571 goto success;
1572 }
1573
1574 /*
1575 * Things didn't work out. Return zero to tell the
1576 * mm layer so, possibly freeing the page cache page first.
1577 */
1578 page_cache_release(page);
1579 return NULL;
1580 }
1581
1582 /* Called with mm->page_table_lock held to protect against other
1583 * threads/the swapper from ripping pte's out from under us.
1584 */
1585 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1586 unsigned long address, unsigned int flags)
1587 {
1588 pte_t pte = *ptep;
1589
1590 if (pte_present(pte) && ptep_test_and_clear_dirty(ptep)) {
1591 struct page *page = pte_page(pte);
1592 flush_tlb_page(vma, address);
1593 set_page_dirty(page);
1594 }
1595 return 0;
1596 }
1597
1598 static inline int filemap_sync_pte_range(pmd_t * pmd,
1599 unsigned long address, unsigned long size,
1600 struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1601 {
1602 pte_t * pte;
1603 unsigned long end;
1604 int error;
1605
1606 if (pmd_none(*pmd))
1607 return 0;
1608 if (pmd_bad(*pmd)) {
1609 pmd_ERROR(*pmd);
1610 pmd_clear(pmd);
1611 return 0;
1612 }
1613 pte = pte_offset(pmd, address);
1614 offset += address & PMD_MASK;
1615 address &= ~PMD_MASK;
1616 end = address + size;
1617 if (end > PMD_SIZE)
1618 end = PMD_SIZE;
1619 error = 0;
1620 do {
1621 error |= filemap_sync_pte(pte, vma, address + offset, flags);
1622 address += PAGE_SIZE;
1623 pte++;
1624 } while (address && (address < end));
1625 return error;
1626 }
1627
1628 static inline int filemap_sync_pmd_range(pgd_t * pgd,
1629 unsigned long address, unsigned long size,
1630 struct vm_area_struct *vma, unsigned int flags)
1631 {
1632 pmd_t * pmd;
1633 unsigned long offset, end;
1634 int error;
1635
1636 if (pgd_none(*pgd))
1637 return 0;
1638 if (pgd_bad(*pgd)) {
1639 pgd_ERROR(*pgd);
1640 pgd_clear(pgd);
1641 return 0;
1642 }
1643 pmd = pmd_offset(pgd, address);
1644 offset = address & PGDIR_MASK;
1645 address &= ~PGDIR_MASK;
1646 end = address + size;
1647 if (end > PGDIR_SIZE)
1648 end = PGDIR_SIZE;
1649 error = 0;
1650 do {
1651 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1652 address = (address + PMD_SIZE) & PMD_MASK;
1653 pmd++;
1654 } while (address && (address < end));
1655 return error;
1656 }
1657
1658 int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1659 size_t size, unsigned int flags)
1660 {
1661 pgd_t * dir;
1662 unsigned long end = address + size;
1663 int error = 0;
1664
1665 /* Aquire the lock early; it may be possible to avoid dropping
1666 * and reaquiring it repeatedly.
1667 */
1668 spin_lock(&vma->vm_mm->page_table_lock);
1669
1670 dir = pgd_offset(vma->vm_mm, address);
1671 flush_cache_range(vma->vm_mm, end - size, end);
1672 if (address >= end)
1673 BUG();
1674 do {
1675 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1676 address = (address + PGDIR_SIZE) & PGDIR_MASK;
1677 dir++;
1678 } while (address && (address < end));
1679 flush_tlb_range(vma->vm_mm, end - size, end);
1680
1681 spin_unlock(&vma->vm_mm->page_table_lock);
1682
1683 return error;
1684 }
1685
1686 /*
1687 * Shared mappings need to be able to do the right thing at
1688 * close/unmap/sync. They will also use the private file as
1689 * backing-store for swapping..
1690 */
1691 static struct vm_operations_struct file_shared_mmap = {
1692 nopage: filemap_nopage,
1693 };
1694
1695 /*
1696 * Private mappings just need to be able to load in the map.
1697 *
1698 * (This is actually used for shared mappings as well, if we
1699 * know they can't ever get write permissions..)
1700 */
1701 static struct vm_operations_struct file_private_mmap = {
1702 nopage: filemap_nopage,
1703 };
1704
1705 /* This is used for a general mmap of a disk file */
1706
1707 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1708 {
1709 struct vm_operations_struct * ops;
1710 struct inode *inode = file->f_dentry->d_inode;
1711
1712 ops = &file_private_mmap;
1713 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1714 if (!inode->i_mapping->a_ops->writepage)
1715 return -EINVAL;
1716 ops = &file_shared_mmap;
1717 }
1718 if (!inode->i_sb || !S_ISREG(inode->i_mode))
1719 return -EACCES;
1720 if (!inode->i_mapping->a_ops->readpage)
1721 return -ENOEXEC;
1722 UPDATE_ATIME(inode);
1723 vma->vm_ops = ops;
1724 return 0;
1725 }
1726
1727 /*
1728 * The msync() system call.
1729 */
1730
1731 static int msync_interval(struct vm_area_struct * vma,
1732 unsigned long start, unsigned long end, int flags)
1733 {
1734 struct file * file = vma->vm_file;
1735 if (file && (vma->vm_flags & VM_SHARED)) {
1736 int error;
1737 error = filemap_sync(vma, start, end-start, flags);
1738
1739 if (!error && (flags & MS_SYNC)) {
1740 struct inode * inode = file->f_dentry->d_inode;
1741 down(&inode->i_sem);
1742 filemap_fdatasync(inode->i_mapping);
1743 if (file->f_op && file->f_op->fsync)
1744 error = file->f_op->fsync(file, file->f_dentry, 1);
1745 filemap_fdatawait(inode->i_mapping);
1746 up(&inode->i_sem);
1747 }
1748 return error;
1749 }
1750 return 0;
1751 }
1752
1753 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
1754 {
1755 unsigned long end;
1756 struct vm_area_struct * vma;
1757 int unmapped_error, error = -EINVAL;
1758
1759 down(¤t->mm->mmap_sem);
1760 if (start & ~PAGE_MASK)
1761 goto out;
1762 len = (len + ~PAGE_MASK) & PAGE_MASK;
1763 end = start + len;
1764 if (end < start)
1765 goto out;
1766 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1767 goto out;
1768 error = 0;
1769 if (end == start)
1770 goto out;
1771 /*
1772 * If the interval [start,end) covers some unmapped address ranges,
1773 * just ignore them, but return -EFAULT at the end.
1774 */
1775 vma = find_vma(current->mm, start);
1776 unmapped_error = 0;
1777 for (;;) {
1778 /* Still start < end. */
1779 error = -EFAULT;
1780 if (!vma)
1781 goto out;
1782 /* Here start < vma->vm_end. */
1783 if (start < vma->vm_start) {
1784 unmapped_error = -EFAULT;
1785 start = vma->vm_start;
1786 }
1787 /* Here vma->vm_start <= start < vma->vm_end. */
1788 if (end <= vma->vm_end) {
1789 if (start < end) {
1790 error = msync_interval(vma, start, end, flags);
1791 if (error)
1792 goto out;
1793 }
1794 error = unmapped_error;
1795 goto out;
1796 }
1797 /* Here vma->vm_start <= start < vma->vm_end < end. */
1798 error = msync_interval(vma, start, vma->vm_end, flags);
1799 if (error)
1800 goto out;
1801 start = vma->vm_end;
1802 vma = vma->vm_next;
1803 }
1804 out:
1805 up(¤t->mm->mmap_sem);
1806 return error;
1807 }
1808
1809 static inline void setup_read_behavior(struct vm_area_struct * vma,
1810 int behavior)
1811 {
1812 VM_ClearReadHint(vma);
1813 switch(behavior) {
1814 case MADV_SEQUENTIAL:
1815 vma->vm_flags |= VM_SEQ_READ;
1816 break;
1817 case MADV_RANDOM:
1818 vma->vm_flags |= VM_RAND_READ;
1819 break;
1820 default:
1821 break;
1822 }
1823 return;
1824 }
1825
1826 static long madvise_fixup_start(struct vm_area_struct * vma,
1827 unsigned long end, int behavior)
1828 {
1829 struct vm_area_struct * n;
1830
1831 n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1832 if (!n)
1833 return -EAGAIN;
1834 *n = *vma;
1835 n->vm_end = end;
1836 setup_read_behavior(n, behavior);
1837 n->vm_raend = 0;
1838 get_file(n->vm_file);
1839 if (n->vm_ops && n->vm_ops->open)
1840 n->vm_ops->open(n);
1841 lock_vma_mappings(vma);
1842 spin_lock(&vma->vm_mm->page_table_lock);
1843 vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
1844 vma->vm_start = end;
1845 __insert_vm_struct(current->mm, n);
1846 spin_unlock(&vma->vm_mm->page_table_lock);
1847 unlock_vma_mappings(vma);
1848 return 0;
1849 }
1850
1851 static long madvise_fixup_end(struct vm_area_struct * vma,
1852 unsigned long start, int behavior)
1853 {
1854 struct vm_area_struct * n;
1855
1856 n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1857 if (!n)
1858 return -EAGAIN;
1859 *n = *vma;
1860 n->vm_start = start;
1861 n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
1862 setup_read_behavior(n, behavior);
1863 n->vm_raend = 0;
1864 get_file(n->vm_file);
1865 if (n->vm_ops && n->vm_ops->open)
1866 n->vm_ops->open(n);
1867 lock_vma_mappings(vma);
1868 spin_lock(&vma->vm_mm->page_table_lock);
1869 vma->vm_end = start;
1870 __insert_vm_struct(current->mm, n);
1871 spin_unlock(&vma->vm_mm->page_table_lock);
1872 unlock_vma_mappings(vma);
1873 return 0;
1874 }
1875
1876 static long madvise_fixup_middle(struct vm_area_struct * vma,
1877 unsigned long start, unsigned long end, int behavior)
1878 {
1879 struct vm_area_struct * left, * right;
1880
1881 left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1882 if (!left)
1883 return -EAGAIN;
1884 right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1885 if (!right) {
1886 kmem_cache_free(vm_area_cachep, left);
1887 return -EAGAIN;
1888 }
1889 *left = *vma;
1890 *right = *vma;
1891 left->vm_end = start;
1892 right->vm_start = end;
1893 right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
1894 left->vm_raend = 0;
1895 right->vm_raend = 0;
1896 atomic_add(2, &vma->vm_file->f_count);
1897
1898 if (vma->vm_ops && vma->vm_ops->open) {
1899 vma->vm_ops->open(left);
1900 vma->vm_ops->open(right);
1901 }
1902 lock_vma_mappings(vma);
1903 spin_lock(&vma->vm_mm->page_table_lock);
1904 vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
1905 vma->vm_start = start;
1906 vma->vm_end = end;
1907 setup_read_behavior(vma, behavior);
1908 vma->vm_raend = 0;
1909 __insert_vm_struct(current->mm, left);
1910 __insert_vm_struct(current->mm, right);
1911 spin_unlock(&vma->vm_mm->page_table_lock);
1912 unlock_vma_mappings(vma);
1913 return 0;
1914 }
1915
1916 /*
1917 * We can potentially split a vm area into separate
1918 * areas, each area with its own behavior.
1919 */
1920 static long madvise_behavior(struct vm_area_struct * vma,
1921 unsigned long start, unsigned long end, int behavior)
1922 {
1923 int error = 0;
1924
1925 /* This caps the number of vma's this process can own */
1926 if (vma->vm_mm->map_count > MAX_MAP_COUNT)
1927 return -ENOMEM;
1928
1929 if (start == vma->vm_start) {
1930 if (end == vma->vm_end) {
1931 setup_read_behavior(vma, behavior);
1932 vma->vm_raend = 0;
1933 } else
1934 error = madvise_fixup_start(vma, end, behavior);
1935 } else {
1936 if (end == vma->vm_end)
1937 error = madvise_fixup_end(vma, start, behavior);
1938 else
1939 error = madvise_fixup_middle(vma, start, end, behavior);
1940 }
1941
1942 return error;
1943 }
1944
1945 /*
1946 * Schedule all required I/O operations, then run the disk queue
1947 * to make sure they are started. Do not wait for completion.
1948 */
1949 static long madvise_willneed(struct vm_area_struct * vma,
1950 unsigned long start, unsigned long end)
1951 {
1952 long error = -EBADF;
1953 struct file * file;
1954 unsigned long size, rlim_rss;
1955
1956 /* Doesn't work if there's no mapped file. */
1957 if (!vma->vm_file)
1958 return error;
1959 file = vma->vm_file;
1960 size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
1961 PAGE_CACHE_SHIFT;
1962
1963 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1964 if (end > vma->vm_end)
1965 end = vma->vm_end;
1966 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1967
1968 /* Make sure this doesn't exceed the process's max rss. */
1969 error = -EIO;
1970 rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur :
1971 LONG_MAX; /* default: see resource.h */
1972 if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
1973 return error;
1974
1975 /* round to cluster boundaries if this isn't a "random" area. */
1976 if (!VM_RandomReadHint(vma)) {
1977 start = CLUSTER_OFFSET(start);
1978 end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
1979
1980 while ((start < end) && (start < size)) {
1981 error = read_cluster_nonblocking(file, start, size);
1982 start += CLUSTER_PAGES;
1983 if (error < 0)
1984 break;
1985 }
1986 } else {
1987 while ((start < end) && (start < size)) {
1988 error = page_cache_read(file, start);
1989 start++;
1990 if (error < 0)
1991 break;
1992 }
1993 }
1994
1995 /* Don't wait for someone else to push these requests. */
1996 run_task_queue(&tq_disk);
1997
1998 return error;
1999 }
2000
2001 /*
2002 * Application no longer needs these pages. If the pages are dirty,
2003 * it's OK to just throw them away. The app will be more careful about
2004 * data it wants to keep. Be sure to free swap resources too. The
2005 * zap_page_range call sets things up for refill_inactive to actually free
2006 * these pages later if no one else has touched them in the meantime,
2007 * although we could add these pages to a global reuse list for
2008 * refill_inactive to pick up before reclaiming other pages.
2009 *
2010 * NB: This interface discards data rather than pushes it out to swap,
2011 * as some implementations do. This has performance implications for
2012 * applications like large transactional databases which want to discard
2013 * pages in anonymous maps after committing to backing store the data
2014 * that was kept in them. There is no reason to write this data out to
2015 * the swap area if the application is discarding it.
2016 *
2017 * An interface that causes the system to free clean pages and flush
2018 * dirty pages is already available as msync(MS_INVALIDATE).
2019 */
2020 static long madvise_dontneed(struct vm_area_struct * vma,
2021 unsigned long start, unsigned long end)
2022 {
2023 if (vma->vm_flags & VM_LOCKED)
2024 return -EINVAL;
2025
2026 flush_cache_range(vma->vm_mm, start, end);
2027 zap_page_range(vma->vm_mm, start, end - start);
2028 flush_tlb_range(vma->vm_mm, start, end);
2029 return 0;
2030 }
2031
2032 static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
2033 unsigned long end, int behavior)
2034 {
2035 long error = -EBADF;
2036
2037 switch (behavior) {
2038 case MADV_NORMAL:
2039 case MADV_SEQUENTIAL:
2040 case MADV_RANDOM:
2041 error = madvise_behavior(vma, start, end, behavior);
2042 break;
2043
2044 case MADV_WILLNEED:
2045 error = madvise_willneed(vma, start, end);
2046 break;
2047
2048 case MADV_DONTNEED:
2049 error = madvise_dontneed(vma, start, end);
2050 break;
2051
2052 default:
2053 error = -EINVAL;
2054 break;
2055 }
2056
2057 return error;
2058 }
2059
2060 /*
2061 * The madvise(2) system call.
2062 *
2063 * Applications can use madvise() to advise the kernel how it should
2064 * handle paging I/O in this VM area. The idea is to help the kernel
2065 * use appropriate read-ahead and caching techniques. The information
2066 * provided is advisory only, and can be safely disregarded by the
2067 * kernel without affecting the correct operation of the application.
2068 *
2069 * behavior values:
2070 * MADV_NORMAL - the default behavior is to read clusters. This
2071 * results in some read-ahead and read-behind.
2072 * MADV_RANDOM - the system should read the minimum amount of data
2073 * on any access, since it is unlikely that the appli-
2074 * cation will need more than what it asks for.
2075 * MADV_SEQUENTIAL - pages in the given range will probably be accessed
2076 * once, so they can be aggressively read ahead, and
2077 * can be freed soon after they are accessed.
2078 * MADV_WILLNEED - the application is notifying the system to read
2079 * some pages ahead.
2080 * MADV_DONTNEED - the application is finished with the given range,
2081 * so the kernel can free resources associated with it.
2082 *
2083 * return values:
2084 * zero - success
2085 * -EINVAL - start + len < 0, start is not page-aligned,
2086 * "behavior" is not a valid value, or application
2087 * is attempting to release locked or shared pages.
2088 * -ENOMEM - addresses in the specified range are not currently
2089 * mapped, or are outside the AS of the process.
2090 * -EIO - an I/O error occurred while paging in data.
2091 * -EBADF - map exists, but area maps something that isn't a file.
2092 * -EAGAIN - a kernel resource was temporarily unavailable.
2093 */
2094 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
2095 {
2096 unsigned long end;
2097 struct vm_area_struct * vma;
2098 int unmapped_error = 0;
2099 int error = -EINVAL;
2100
2101 down(¤t->mm->mmap_sem);
2102
2103 if (start & ~PAGE_MASK)
2104 goto out;
2105 len = (len + ~PAGE_MASK) & PAGE_MASK;
2106 end = start + len;
2107 if (end < start)
2108 goto out;
2109
2110 error = 0;
2111 if (end == start)
2112 goto out;
2113
2114 /*
2115 * If the interval [start,end) covers some unmapped address
2116 * ranges, just ignore them, but return -ENOMEM at the end.
2117 */
2118 vma = find_vma(current->mm, start);
2119 for (;;) {
2120 /* Still start < end. */
2121 error = -ENOMEM;
2122 if (!vma)
2123 goto out;
2124
2125 /* Here start < vma->vm_end. */
2126 if (start < vma->vm_start) {
2127 unmapped_error = -ENOMEM;
2128 start = vma->vm_start;
2129 }
2130
2131 /* Here vma->vm_start <= start < vma->vm_end. */
2132 if (end <= vma->vm_end) {
2133 if (start < end) {
2134 error = madvise_vma(vma, start, end,
2135 behavior);
2136 if (error)
2137 goto out;
2138 }
2139 error = unmapped_error;
2140 goto out;
2141 }
2142
2143 /* Here vma->vm_start <= start < vma->vm_end < end. */
2144 error = madvise_vma(vma, start, vma->vm_end, behavior);
2145 if (error)
2146 goto out;
2147 start = vma->vm_end;
2148 vma = vma->vm_next;
2149 }
2150
2151 out:
2152 up(¤t->mm->mmap_sem);
2153 return error;
2154 }
2155
2156 /*
2157 * Later we can get more picky about what "in core" means precisely.
2158 * For now, simply check to see if the page is in the page cache,
2159 * and is up to date; i.e. that no page-in operation would be required
2160 * at this time if an application were to map and access this page.
2161 */
2162 static unsigned char mincore_page(struct vm_area_struct * vma,
2163 unsigned long pgoff)
2164 {
2165 unsigned char present = 0;
2166 struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data;
2167 struct page * page, ** hash = page_hash(as, pgoff);
2168
2169 spin_lock(&pagecache_lock);
2170 page = __find_page_nolock(as, pgoff, *hash);
2171 if ((page) && (Page_Uptodate(page)))
2172 present = 1;
2173 spin_unlock(&pagecache_lock);
2174
2175 return present;
2176 }
2177
2178 static long mincore_vma(struct vm_area_struct * vma,
2179 unsigned long start, unsigned long end, unsigned char * vec)
2180 {
2181 long error, i, remaining;
2182 unsigned char * tmp;
2183
2184 error = -ENOMEM;
2185 if (!vma->vm_file)
2186 return error;
2187
2188 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2189 if (end > vma->vm_end)
2190 end = vma->vm_end;
2191 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2192
2193 error = -EAGAIN;
2194 tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
2195 if (!tmp)
2196 return error;
2197
2198 /* (end - start) is # of pages, and also # of bytes in "vec */
2199 remaining = (end - start),
2200
2201 error = 0;
2202 for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
2203 int j = 0;
2204 long thispiece = (remaining < PAGE_SIZE) ?
2205 remaining : PAGE_SIZE;
2206
2207 while (j < thispiece)
2208 tmp[j++] = mincore_page(vma, start++);
2209
2210 if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
2211 error = -EFAULT;
2212 break;
2213 }
2214 }
2215
2216 free_page((unsigned long) tmp);
2217 return error;
2218 }
2219
2220 /*
2221 * The mincore(2) system call.
2222 *
2223 * mincore() returns the memory residency status of the pages in the
2224 * current process's address space specified by [addr, addr + len).
2225 * The status is returned in a vector of bytes. The least significant
2226 * bit of each byte is 1 if the referenced page is in memory, otherwise
2227 * it is zero.
2228 *
2229 * Because the status of a page can change after mincore() checks it
2230 * but before it returns to the application, the returned vector may
2231 * contain stale information. Only locked pages are guaranteed to
2232 * remain in memory.
2233 *
2234 * return values:
2235 * zero - success
2236 * -EFAULT - vec points to an illegal address
2237 * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
2238 * or len has a nonpositive value
2239 * -ENOMEM - Addresses in the range [addr, addr + len] are
2240 * invalid for the address space of this process, or
2241 * specify one or more pages which are not currently
2242 * mapped
2243 * -EAGAIN - A kernel resource was temporarily unavailable.
2244 */
2245 asmlinkage long sys_mincore(unsigned long start, size_t len,
2246 unsigned char * vec)
2247 {
2248 int index = 0;
2249 unsigned long end;
2250 struct vm_area_struct * vma;
2251 int unmapped_error = 0;
2252 long error = -EINVAL;
2253
2254 down(¤t->mm->mmap_sem);
2255
2256 if (start & ~PAGE_CACHE_MASK)
2257 goto out;
2258 len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;
2259 end = start + len;
2260 if (end < start)
2261 goto out;
2262
2263 error = 0;
2264 if (end == start)
2265 goto out;
2266
2267 /*
2268 * If the interval [start,end) covers some unmapped address
2269 * ranges, just ignore them, but return -ENOMEM at the end.
2270 */
2271 vma = find_vma(current->mm, start);
2272 for (;;) {
2273 /* Still start < end. */
2274 error = -ENOMEM;
2275 if (!vma)
2276 goto out;
2277
2278 /* Here start < vma->vm_end. */
2279 if (start < vma->vm_start) {
2280 unmapped_error = -ENOMEM;
2281 start = vma->vm_start;
2282 }
2283
2284 /* Here vma->vm_start <= start < vma->vm_end. */
2285 if (end <= vma->vm_end) {
2286 if (start < end) {
2287 error = mincore_vma(vma, start, end,
2288 &vec[index]);
2289 if (error)
2290 goto out;
2291 }
2292 error = unmapped_error;
2293 goto out;
2294 }
2295
2296 /* Here vma->vm_start <= start < vma->vm_end < end. */
2297 error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
2298 if (error)
2299 goto out;
2300 index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
2301 start = vma->vm_end;
2302 vma = vma->vm_next;
2303 }
2304
2305 out:
2306 up(¤t->mm->mmap_sem);
2307 return error;
2308 }
2309
2310 static inline
2311 struct page *__read_cache_page(struct address_space *mapping,
2312 unsigned long index,
2313 int (*filler)(void *,struct page*),
2314 void *data)
2315 {
2316 struct page **hash = page_hash(mapping, index);
2317 struct page *page, *cached_page = NULL;
2318 int err;
2319 repeat:
2320 page = __find_get_page(mapping, index, hash);
2321 if (!page) {
2322 if (!cached_page) {
2323 cached_page = page_cache_alloc();
2324 if (!cached_page)
2325 return ERR_PTR(-ENOMEM);
2326 }
2327 page = cached_page;
2328 if (add_to_page_cache_unique(page, mapping, index, hash))
2329 goto repeat;
2330 cached_page = NULL;
2331 err = filler(data, page);
2332 if (err < 0) {
2333 page_cache_release(page);
2334 page = ERR_PTR(err);
2335 }
2336 }
2337 if (cached_page)
2338 page_cache_free(cached_page);
2339 return page;
2340 }
2341
2342 /*
2343 * Read into the page cache. If a page already exists,
2344 * and Page_Uptodate() is not set, try to fill the page.
2345 */
2346 struct page *read_cache_page(struct address_space *mapping,
2347 unsigned long index,
2348 int (*filler)(void *,struct page*),
2349 void *data)
2350 {
2351 struct page *page;
2352 int err;
2353
2354 retry:
2355 page = __read_cache_page(mapping, index, filler, data);
2356 if (IS_ERR(page) || Page_Uptodate(page))
2357 goto out;
2358
2359 lock_page(page);
2360 if (!page->mapping) {
2361 UnlockPage(page);
2362 page_cache_release(page);
2363 goto retry;
2364 }
2365 if (Page_Uptodate(page)) {
2366 UnlockPage(page);
2367 goto out;
2368 }
2369 err = filler(data, page);
2370 if (err < 0) {
2371 page_cache_release(page);
2372 page = ERR_PTR(err);
2373 }
2374 out:
2375 return page;
2376 }
2377
2378 static inline struct page * __grab_cache_page(struct address_space *mapping,
2379 unsigned long index, struct page **cached_page)
2380 {
2381 struct page *page, **hash = page_hash(mapping, index);
2382 repeat:
2383 page = __find_lock_page(mapping, index, hash);
2384 if (!page) {
2385 if (!*cached_page) {
2386 *cached_page = page_cache_alloc();
2387 if (!*cached_page)
2388 return NULL;
2389 }
2390 page = *cached_page;
2391 if (add_to_page_cache_unique(page, mapping, index, hash))
2392 goto repeat;
2393 *cached_page = NULL;
2394 }
2395 return page;
2396 }
2397
2398 /*
2399 * Returns locked page at given index in given cache, creating it if needed.
2400 */
2401
2402 struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
2403 {
2404 struct page *cached_page = NULL;
2405 struct page *page = __grab_cache_page(mapping,index,&cached_page);
2406 if (cached_page)
2407 page_cache_free(cached_page);
2408 return page;
2409 }
2410
2411 static inline void remove_suid(struct inode *inode)
2412 {
2413 unsigned int mode;
2414
2415 /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
2416 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
2417
2418 /* was any of the uid bits set? */
2419 mode &= inode->i_mode;
2420 if (mode && !capable(CAP_FSETID)) {
2421 inode->i_mode &= ~mode;
2422 mark_inode_dirty(inode);
2423 }
2424 }
2425
2426 /*
2427 * Write to a file through the page cache.
2428 *
2429 * We currently put everything into the page cache prior to writing it.
2430 * This is not a problem when writing full pages. With partial pages,
2431 * however, we first have to read the data into the cache, then
2432 * dirty the page, and finally schedule it for writing. Alternatively, we
2433 * could write-through just the portion of data that would go into that
2434 * page, but that would kill performance for applications that write data
2435 * line by line, and it's prone to race conditions.
2436 *
2437 * Note that this routine doesn't try to keep track of dirty pages. Each
2438 * file system has to do this all by itself, unfortunately.
2439 * okir@monad.swb.de
2440 */
2441 ssize_t
2442 generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
2443 {
2444 struct inode *inode = file->f_dentry->d_inode;
2445 struct address_space *mapping = inode->i_mapping;
2446 unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2447 loff_t pos;
2448 struct page *page, *cached_page;
2449 unsigned long written;
2450 long status;
2451 int err;
2452
2453 cached_page = NULL;
2454
2455 down(&inode->i_sem);
2456
2457 pos = *ppos;
2458 err = -EINVAL;
2459 if (pos < 0)
2460 goto out;
2461
2462 err = file->f_error;
2463 if (err) {
2464 file->f_error = 0;
2465 goto out;
2466 }
2467
2468 written = 0;
2469
2470 if (file->f_flags & O_APPEND)
2471 pos = inode->i_size;
2472
2473 /*
2474 * Check whether we've reached the file size limit.
2475 */
2476 err = -EFBIG;
2477 if (limit != RLIM_INFINITY) {
2478 if (pos >= limit) {
2479 send_sig(SIGXFSZ, current, 0);
2480 goto out;
2481 }
2482 if (count > limit - pos) {
2483 send_sig(SIGXFSZ, current, 0);
2484 count = limit - pos;
2485 }
2486 }
2487
2488 status = 0;
2489 if (count) {
2490 remove_suid(inode);
2491 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
2492 mark_inode_dirty_sync(inode);
2493 }
2494
2495 while (count) {
2496 unsigned long bytes, index, offset;
2497 char *kaddr;
2498 int deactivate = 1;
2499
2500 /*
2501 * Try to find the page in the cache. If it isn't there,
2502 * allocate a free page.
2503 */
2504 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
2505 index = pos >> PAGE_CACHE_SHIFT;
2506 bytes = PAGE_CACHE_SIZE - offset;
2507 if (bytes > count) {
2508 bytes = count;
2509 deactivate = 0;
2510 }
2511
2512 /*
2513 * Bring in the user page that we will copy from _first_.
2514 * Otherwise there's a nasty deadlock on copying from the
2515 * same page as we're writing to, without it being marked
2516 * up-to-date.
2517 */
2518 { volatile unsigned char dummy;
2519 __get_user(dummy, buf);
2520 __get_user(dummy, buf+bytes-1);
2521 }
2522
2523 status = -ENOMEM; /* we'll assign it later anyway */
2524 page = __grab_cache_page(mapping, index, &cached_page);
2525 if (!page)
2526 break;
2527
2528 /* We have exclusive IO access to the page.. */
2529 if (!PageLocked(page)) {
2530 PAGE_BUG(page);
2531 }
2532
2533 status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
2534 if (status)
2535 goto unlock;
2536 kaddr = page_address(page);
2537 status = copy_from_user(kaddr+offset, buf, bytes);
2538 flush_dcache_page(page);
2539 if (status)
2540 goto fail_write;
2541 status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
2542 if (!status)
2543 status = bytes;
2544
2545 if (status >= 0) {
2546 written += status;
2547 count -= status;
2548 pos += status;
2549 buf += status;
2550 }
2551 unlock:
2552 /* Mark it unlocked again and drop the page.. */
2553 UnlockPage(page);
2554 if (deactivate)
2555 deactivate_page(page);
2556 page_cache_release(page);
2557
2558 if (status < 0)
2559 break;
2560 }
2561 *ppos = pos;
2562
2563 if (cached_page)
2564 page_cache_free(cached_page);
2565
2566 /* For now, when the user asks for O_SYNC, we'll actually
2567 * provide O_DSYNC. */
2568 if ((status >= 0) && (file->f_flags & O_SYNC))
2569 status = generic_osync_inode(inode, 1); /* 1 means datasync */
2570
2571 err = written ? written : status;
2572 out:
2573
2574 up(&inode->i_sem);
2575 return err;
2576 fail_write:
2577 status = -EFAULT;
2578 ClearPageUptodate(page);
2579 kunmap(page);
2580 goto unlock;
2581 }
2582
2583 void __init page_cache_init(unsigned long mempages)
2584 {
2585 unsigned long htable_size, order;
2586
2587 htable_size = mempages;
2588 htable_size *= sizeof(struct page *);
2589 for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
2590 ;
2591
2592 do {
2593 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
2594
2595 page_hash_bits = 0;
2596 while((tmp >>= 1UL) != 0UL)
2597 page_hash_bits++;
2598
2599 page_hash_table = (struct page **)
2600 __get_free_pages(GFP_ATOMIC, order);
2601 } while(page_hash_table == NULL && --order > 0);
2602
2603 printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
2604 (1 << page_hash_bits), order, (PAGE_SIZE << order));
2605 if (!page_hash_table)
2606 panic("Failed to allocate page hash table\n");
2607 memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
2608 }
2609
/*
 * This page was automatically generated by the LXR engine.
 * Visit the LXR main site for more information.
 */