1 /*
2 * linux/fs/buffer.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
6
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
11 */
12
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
14
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
17 */
18
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
21 */
22
/* Added 32k buffer block sizes - these are required on older ARM systems.
24 * - RMK
25 */
26
27 /* Thread it... -DaveM */
28
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
30
31 #include <linux/config.h>
32 #include <linux/sched.h>
33 #include <linux/fs.h>
34 #include <linux/malloc.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/swapctl.h>
39 #include <linux/smp_lock.h>
40 #include <linux/vmalloc.h>
41 #include <linux/blkdev.h>
42 #include <linux/sysrq.h>
43 #include <linux/file.h>
44 #include <linux/init.h>
45 #include <linux/quotaops.h>
46 #include <linux/iobuf.h>
47 #include <linux/highmem.h>
48
49 #include <asm/uaccess.h>
50 #include <asm/io.h>
51 #include <asm/bitops.h>
52 #include <asm/mmu_context.h>
53
/* Number of per-size free lists; the table below hands out indices 0..6. */
#define NR_SIZES 7

/*
 * Lookup table from (block size >> 9) to a free-list index.  Only the
 * power-of-two sizes 512, 1k, 2k, 4k, 8k, 16k and 32k have an index;
 * every other slot holds the sentinel -1.
 *
 * The element type is spelled 'signed char' on purpose: whether plain
 * 'char' is signed is implementation-defined, and where it is unsigned
 * the -1 sentinels would read back as 255.
 */
static signed char buffersize_index[65] =
{-1,  0,  1, -1,  2, -1, -1, -1,  3, -1, -1, -1, -1, -1, -1, -1,
  4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  6};

/* Free-list index for a block size of X bytes (X must be <= 32768). */
#define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
/* Buffer heads needed to cover one page with the smallest (512 byte) blocks. */
#define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
/* Two pages' worth of buffer heads at the smallest block size. */
#define NR_RESERVED (2*MAX_BUF_PER_PAGE)
/*
 * Don't ever have more than this number of unused buffer heads.
 * The expansion is parenthesized so that it behaves as a single operand
 * wherever it is used (e.g. "2 * MAX_UNUSED_BUFFERS").
 */
#define MAX_UNUSED_BUFFERS (NR_RESERVED+20)
67
68 /* Anti-deadlock ordering:
69 * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
70 */
71
/* Turn a pointer to a b_inode_buffers list node into its struct buffer_head. */
#define BH_ENTRY(node) list_entry((node), struct buffer_head, b_inode_buffers)
73
74 /*
75 * Hash table gook..
76 */
77 static unsigned int bh_hash_mask;
78 static unsigned int bh_hash_shift;
79 static struct buffer_head **hash_table;
80 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
81
82 static struct buffer_head *lru_list[NR_LIST];
83 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
84 static int nr_buffers_type[NR_LIST];
85 static unsigned long size_buffers_type[NR_LIST];
86
87 static struct buffer_head * unused_list;
88 static int nr_unused_buffer_heads;
89 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
90 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
91
92 struct bh_free_head {
93 struct buffer_head *list;
94 spinlock_t lock;
95 };
96 static struct bh_free_head free_list[NR_SIZES];
97
98 static int grow_buffers(int size);
99 static void __refile_buffer(struct buffer_head *);
100
101 /* This is used by some architectures to estimate available memory. */
102 atomic_t buffermem_pages = ATOMIC_INIT(0);
103
/*
 * Parameter block for the bdflush process.  If you add or remove any of
 * the parameters, make sure to update kernel/sysctl.c.
 */
#define N_PARAM 9

/*
 * The same N_PARAM words can be addressed by name (b_un) or by index
 * (data).  The dummy members are kept for compatibility with old programs
 * that play with the /proc entries.
 */
union bdflush_param {
	struct {
		/* Percentage of buffer cache dirty to activate bdflush */
		int nfract;
		/* Maximum number of dirty blocks to write out per wake-cycle */
		int ndirty;
		/* Number of clean buffers to try to obtain each time we call refill */
		int nrefill;
		/* unused */
		int dummy1;
		/* jiffies delay between kupdate flushes */
		int interval;
		/* Time for normal buffer to age before we flush it */
		int age_buffer;
		/* Percentage of buffer cache dirty to activate bdflush synchronously */
		int nfract_sync;
		/* unused */
		int dummy2;
		/* unused */
		int dummy3;
	} b_un;
	unsigned int data[N_PARAM];
} bdf_prm = {{
	30,		/* nfract */
	64,		/* ndirty */
	64,		/* nrefill */
	256,		/* dummy1 */
	5*HZ,		/* interval */
	30*HZ,		/* age_buffer */
	60,		/* nfract_sync */
	0,		/* dummy2 */
	0		/* dummy3 */
}};

/*
 * Smallest and largest value accepted for each parameter, in the same
 * slot order as the structure above.
 */
int bdflush_min[N_PARAM] = {
	0, 10, 5, 25, 0, 1*HZ, 0, 0, 0
};
int bdflush_max[N_PARAM] = {
	100, 50000, 20000, 20000, 600*HZ, 6000*HZ, 100, 0, 0
};
135
136 /*
137 * Rewrote the wait-routines to use the "new" wait-queue functionality,
138 * and getting rid of the cli-sti pairs. The wait-queue routines still
139 * need cli-sti, but now it's just a couple of 386 instructions or so.
140 *
141 * Note that the real wait_on_buffer() is an inline function that checks
142 * if 'b_wait' is set before calling this, so that the queues aren't set
143 * up unnecessarily.
144 */
145 void __wait_on_buffer(struct buffer_head * bh)
146 {
147 struct task_struct *tsk = current;
148 DECLARE_WAITQUEUE(wait, tsk);
149
150 atomic_inc(&bh->b_count);
151 add_wait_queue(&bh->b_wait, &wait);
152 do {
153 run_task_queue(&tq_disk);
154 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
155 if (!buffer_locked(bh))
156 break;
157 schedule();
158 } while (buffer_locked(bh));
159 tsk->state = TASK_RUNNING;
160 remove_wait_queue(&bh->b_wait, &wait);
161 atomic_dec(&bh->b_count);
162 }
163
164 /* Call sync_buffers with wait!=0 to ensure that the call does not
165 * return until all buffer writes have completed. Sync() may return
166 * before the writes have finished; fsync() may not.
167 */
168
169 /* Godamity-damn. Some buffers (bitmaps for filesystems)
170 * spontaneously dirty themselves without ever brelse being called.
171 * We will ultimately want to put these in a separate list, but for
172 * now we search all of the lists for dirty buffers.
173 */
174 static int sync_buffers(kdev_t dev, int wait)
175 {
176 int i, retry, pass = 0, err = 0;
177 struct buffer_head * bh, *next;
178
179 /* One pass for no-wait, three for wait:
180 * 0) write out all dirty, unlocked buffers;
181 * 1) write out all dirty buffers, waiting if locked;
182 * 2) wait for completion by waiting for all buffers to unlock.
183 */
184 do {
185 retry = 0;
186
187 /* We search all lists as a failsafe mechanism, not because we expect
188 * there to be dirty buffers on any of the other lists.
189 */
190 repeat:
191 spin_lock(&lru_list_lock);
192 bh = lru_list[BUF_DIRTY];
193 if (!bh)
194 goto repeat2;
195
196 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
197 next = bh->b_next_free;
198
199 if (!lru_list[BUF_DIRTY])
200 break;
201 if (dev && bh->b_dev != dev)
202 continue;
203 if (buffer_locked(bh)) {
204 /* Buffer is locked; skip it unless wait is
205 * requested AND pass > 0.
206 */
207 if (!wait || !pass) {
208 retry = 1;
209 continue;
210 }
211 atomic_inc(&bh->b_count);
212 spin_unlock(&lru_list_lock);
213 wait_on_buffer (bh);
214 atomic_dec(&bh->b_count);
215 goto repeat;
216 }
217
218 /* If an unlocked buffer is not uptodate, there has
219 * been an IO error. Skip it.
220 */
221 if (wait && buffer_req(bh) && !buffer_locked(bh) &&
222 !buffer_dirty(bh) && !buffer_uptodate(bh)) {
223 err = -EIO;
224 continue;
225 }
226
227 /* Don't write clean buffers. Don't write ANY buffers
228 * on the third pass.
229 */
230 if (!buffer_dirty(bh) || pass >= 2)
231 continue;
232
233 atomic_inc(&bh->b_count);
234 spin_unlock(&lru_list_lock);
235 ll_rw_block(WRITE, 1, &bh);
236 atomic_dec(&bh->b_count);
237 retry = 1;
238 goto repeat;
239 }
240
241 repeat2:
242 bh = lru_list[BUF_LOCKED];
243 if (!bh) {
244 spin_unlock(&lru_list_lock);
245 break;
246 }
247 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
248 next = bh->b_next_free;
249
250 if (!lru_list[BUF_LOCKED])
251 break;
252 if (dev && bh->b_dev != dev)
253 continue;
254 if (buffer_locked(bh)) {
255 /* Buffer is locked; skip it unless wait is
256 * requested AND pass > 0.
257 */
258 if (!wait || !pass) {
259 retry = 1;
260 continue;
261 }
262 atomic_inc(&bh->b_count);
263 spin_unlock(&lru_list_lock);
264 wait_on_buffer (bh);
265 spin_lock(&lru_list_lock);
266 atomic_dec(&bh->b_count);
267 goto repeat2;
268 }
269 }
270 spin_unlock(&lru_list_lock);
271
272 /* If we are waiting for the sync to succeed, and if any dirty
273 * blocks were written, then repeat; on the second pass, only
274 * wait for buffers being written (do not pass to write any
275 * more buffers on the second pass).
276 */
277 } while (wait && retry && ++pass<=2);
278 return err;
279 }
280
281 void sync_dev(kdev_t dev)
282 {
283 sync_supers(dev);
284 sync_inodes(dev);
285 DQUOT_SYNC(dev);
286 /* sync all the dirty buffers out to disk only _after_ all the
287 high level layers finished generated buffer dirty data
288 (or we'll return with some buffer still dirty on the blockdevice
289 so breaking the semantics of this call) */
290 sync_buffers(dev, 0);
291 /*
292 * FIXME(eric) we need to sync the physical devices here.
293 * This is because some (scsi) controllers have huge amounts of
294 * cache onboard (hundreds of Mb), and we need to instruct
295 * them to commit all of the dirty memory to disk, and we should
296 * not return until this has happened.
297 *
298 * This would need to get implemented by going through the assorted
299 * layers so that each block major number can be synced, and this
300 * would call down into the upper and mid-layer scsi.
301 */
302 }
303
304 int fsync_dev(kdev_t dev)
305 {
306 sync_buffers(dev, 0);
307
308 lock_kernel();
309 sync_supers(dev);
310 sync_inodes(dev);
311 DQUOT_SYNC(dev);
312 unlock_kernel();
313
314 return sync_buffers(dev, 1);
315 }
316
317 asmlinkage long sys_sync(void)
318 {
319 fsync_dev(0);
320 return 0;
321 }
322
323 /*
324 * filp may be NULL if called via the msync of a vma.
325 */
326
327 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
328 {
329 struct inode * inode = dentry->d_inode;
330 struct super_block * sb;
331 kdev_t dev;
332 int ret;
333
334 lock_kernel();
335 /* sync the inode to buffers */
336 write_inode_now(inode, 0);
337
338 /* sync the superblock to buffers */
339 sb = inode->i_sb;
340 lock_super(sb);
341 if (sb->s_op && sb->s_op->write_super)
342 sb->s_op->write_super(sb);
343 unlock_super(sb);
344
345 /* .. finally sync the buffers to disk */
346 dev = inode->i_dev;
347 ret = sync_buffers(dev, 1);
348 unlock_kernel();
349 return ret;
350 }
351
352 asmlinkage long sys_fsync(unsigned int fd)
353 {
354 struct file * file;
355 struct dentry * dentry;
356 struct inode * inode;
357 int err;
358
359 err = -EBADF;
360 file = fget(fd);
361 if (!file)
362 goto out;
363
364 dentry = file->f_dentry;
365 inode = dentry->d_inode;
366
367 err = -EINVAL;
368 if (!file->f_op || !file->f_op->fsync)
369 goto out_putf;
370
371 /* We need to protect against concurrent writers.. */
372 down(&inode->i_sem);
373 filemap_fdatasync(inode->i_mapping);
374 err = file->f_op->fsync(file, dentry, 0);
375 filemap_fdatawait(inode->i_mapping);
376 up(&inode->i_sem);
377
378 out_putf:
379 fput(file);
380 out:
381 return err;
382 }
383
384 asmlinkage long sys_fdatasync(unsigned int fd)
385 {
386 struct file * file;
387 struct dentry * dentry;
388 struct inode * inode;
389 int err;
390
391 err = -EBADF;
392 file = fget(fd);
393 if (!file)
394 goto out;
395
396 dentry = file->f_dentry;
397 inode = dentry->d_inode;
398
399 err = -EINVAL;
400 if (!file->f_op || !file->f_op->fsync)
401 goto out_putf;
402
403 down(&inode->i_sem);
404 filemap_fdatasync(inode->i_mapping);
405 err = file->f_op->fsync(file, dentry, 1);
406 filemap_fdatawait(inode->i_mapping);
407 up(&inode->i_sem);
408
409 out_putf:
410 fput(file);
411 out:
412 return err;
413 }
414
/*
 * After several hours of tedious analysis, the following hash
 * function won.  Do not mess with it... -DaveM
 *
 * _hashfn() mixes shifted copies of the device and block numbers;
 * hash() masks the result with bh_hash_mask and yields the matching
 * hash_table bucket as an lvalue.
 */
#define _hashfn(d,blk)	\
	((((d)<<(bh_hash_shift - 6)) ^ ((d)<<(bh_hash_shift - 9))) ^ \
	 (((blk)<<(bh_hash_shift - 6)) ^ ((blk) >> 13) ^ \
	  ((blk) << (bh_hash_shift - 12))))
#define hash(d,blk) hash_table[(_hashfn(HASHDEV(d),blk) & bh_hash_mask)]
423
424 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
425 {
426 if ((bh->b_next = *head) != NULL)
427 bh->b_next->b_pprev = &bh->b_next;
428 *head = bh;
429 bh->b_pprev = head;
430 }
431
432 static __inline__ void __hash_unlink(struct buffer_head *bh)
433 {
434 if (bh->b_pprev) {
435 if (bh->b_next)
436 bh->b_next->b_pprev = bh->b_pprev;
437 *(bh->b_pprev) = bh->b_next;
438 bh->b_pprev = NULL;
439 }
440 }
441
442 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
443 {
444 struct buffer_head **bhp = &lru_list[blist];
445
446 if(!*bhp) {
447 *bhp = bh;
448 bh->b_prev_free = bh;
449 }
450 bh->b_next_free = *bhp;
451 bh->b_prev_free = (*bhp)->b_prev_free;
452 (*bhp)->b_prev_free->b_next_free = bh;
453 (*bhp)->b_prev_free = bh;
454 nr_buffers_type[blist]++;
455 size_buffers_type[blist] += bh->b_size;
456 }
457
458 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
459 {
460 if (bh->b_prev_free || bh->b_next_free) {
461 bh->b_prev_free->b_next_free = bh->b_next_free;
462 bh->b_next_free->b_prev_free = bh->b_prev_free;
463 if (lru_list[blist] == bh)
464 lru_list[blist] = bh->b_next_free;
465 if (lru_list[blist] == bh)
466 lru_list[blist] = NULL;
467 bh->b_next_free = bh->b_prev_free = NULL;
468 nr_buffers_type[blist]--;
469 size_buffers_type[blist] -= bh->b_size;
470 }
471 }
472
473 static void __remove_from_free_list(struct buffer_head * bh, int index)
474 {
475 if(bh->b_next_free == bh)
476 free_list[index].list = NULL;
477 else {
478 bh->b_prev_free->b_next_free = bh->b_next_free;
479 bh->b_next_free->b_prev_free = bh->b_prev_free;
480 if (free_list[index].list == bh)
481 free_list[index].list = bh->b_next_free;
482 }
483 bh->b_next_free = bh->b_prev_free = NULL;
484 }
485
486 /* must be called with both the hash_table_lock and the lru_list_lock
487 held */
488 static void __remove_from_queues(struct buffer_head *bh)
489 {
490 __hash_unlink(bh);
491 __remove_from_lru_list(bh, bh->b_list);
492 }
493
494 static void __insert_into_queues(struct buffer_head *bh)
495 {
496 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
497
498 __hash_link(bh, head);
499 __insert_into_lru_list(bh, bh->b_list);
500 }
501
502 /* This function must only run if there are no other
503 * references _anywhere_ to this buffer head.
504 */
505 static void put_last_free(struct buffer_head * bh)
506 {
507 struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
508 struct buffer_head **bhp = &head->list;
509
510 bh->b_state = 0;
511
512 spin_lock(&head->lock);
513 bh->b_dev = B_FREE;
514 if(!*bhp) {
515 *bhp = bh;
516 bh->b_prev_free = bh;
517 }
518 bh->b_next_free = *bhp;
519 bh->b_prev_free = (*bhp)->b_prev_free;
520 (*bhp)->b_prev_free->b_next_free = bh;
521 (*bhp)->b_prev_free = bh;
522 spin_unlock(&head->lock);
523 }
524
525 /*
526 * Why like this, I hear you say... The reason is race-conditions.
527 * As we don't lock buffers (unless we are reading them, that is),
528 * something might happen to it while we sleep (ie a read-error
529 * will force it bad). This shouldn't really happen currently, but
530 * the code is ready.
531 */
532 static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size)
533 {
534 struct buffer_head *bh = hash(dev, block);
535
536 for (; bh; bh = bh->b_next)
537 if (bh->b_blocknr == block &&
538 bh->b_size == size &&
539 bh->b_dev == dev)
540 break;
541 if (bh)
542 atomic_inc(&bh->b_count);
543
544 return bh;
545 }
546
547 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
548 {
549 struct buffer_head *bh;
550
551 read_lock(&hash_table_lock);
552 bh = __get_hash_table(dev, block, size);
553 read_unlock(&hash_table_lock);
554
555 return bh;
556 }
557
558 unsigned int get_hardblocksize(kdev_t dev)
559 {
560 /*
561 * Get the hard sector size for the given device. If we don't know
562 * what it is, return 0.
563 */
564 if (hardsect_size[MAJOR(dev)] != NULL) {
565 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
566 if (blksize != 0)
567 return blksize;
568 }
569
570 /*
571 * We don't know what the hardware sector size for this device is.
572 * Return 0 indicating that we don't know.
573 */
574 return 0;
575 }
576
577 void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode)
578 {
579 spin_lock(&lru_list_lock);
580 if (bh->b_inode)
581 list_del(&bh->b_inode_buffers);
582 bh->b_inode = inode;
583 list_add(&bh->b_inode_buffers, &inode->i_dirty_buffers);
584 spin_unlock(&lru_list_lock);
585 }
586
587 /* The caller must have the lru_list lock before calling the
588 remove_inode_queue functions. */
589 static void __remove_inode_queue(struct buffer_head *bh)
590 {
591 bh->b_inode = NULL;
592 list_del(&bh->b_inode_buffers);
593 }
594
595 static inline void remove_inode_queue(struct buffer_head *bh)
596 {
597 if (bh->b_inode)
598 __remove_inode_queue(bh);
599 }
600
601 int inode_has_buffers(struct inode *inode)
602 {
603 int ret;
604
605 spin_lock(&lru_list_lock);
606 ret = !list_empty(&inode->i_dirty_buffers);
607 spin_unlock(&lru_list_lock);
608
609 return ret;
610 }
611
612
613 /* If invalidate_buffers() will trash dirty buffers, it means some kind
614 of fs corruption is going on. Trashing dirty data always imply losing
615 information that was supposed to be just stored on the physical layer
616 by the user.
617
618 Thus invalidate_buffers in general usage is not allwowed to trash dirty
619 buffers. For example ioctl(FLSBLKBUF) expects dirty data to be preserved.
620
621 NOTE: In the case where the user removed a removable-media-disk even if
622 there's still dirty data not synced on disk (due a bug in the device driver
623 or due an error of the user), by not destroying the dirty buffers we could
624 generate corruption also on the next media inserted, thus a parameter is
625 necessary to handle this case in the most safe way possible (trying
626 to not corrupt also the new disk inserted with the data belonging to
627 the old now corrupted disk). Also for the ramdisk the natural thing
628 to do in order to release the ramdisk memory is to destroy dirty buffers.
629
630 These are two special cases. Normal usage imply the device driver
631 to issue a sync on the device (without waiting I/O completation) and
632 then an invalidate_buffers call that doesn't trash dirty buffers. */
633 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
634 {
635 int i, nlist, slept;
636 struct buffer_head * bh, * bh_next;
637
638 retry:
639 slept = 0;
640 spin_lock(&lru_list_lock);
641 for(nlist = 0; nlist < NR_LIST; nlist++) {
642 bh = lru_list[nlist];
643 if (!bh)
644 continue;
645 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
646 bh_next = bh->b_next_free;
647
648 /* Another device? */
649 if (bh->b_dev != dev)
650 continue;
651 /* Part of a mapping? */
652 if (bh->b_page->mapping)
653 continue;
654 if (buffer_locked(bh)) {
655 atomic_inc(&bh->b_count);
656 spin_unlock(&lru_list_lock);
657 wait_on_buffer(bh);
658 slept = 1;
659 spin_lock(&lru_list_lock);
660 atomic_dec(&bh->b_count);
661 }
662
663 write_lock(&hash_table_lock);
664 if (!atomic_read(&bh->b_count) &&
665 (destroy_dirty_buffers || !buffer_dirty(bh))) {
666 remove_inode_queue(bh);
667 __remove_from_queues(bh);
668 put_last_free(bh);
669 }
670 /* else complain loudly? */
671
672 write_unlock(&hash_table_lock);
673 if (slept)
674 goto out;
675 }
676 }
677 out:
678 spin_unlock(&lru_list_lock);
679 if (slept)
680 goto retry;
681 }
682
683 void set_blocksize(kdev_t dev, int size)
684 {
685 extern int *blksize_size[];
686 int i, nlist, slept;
687 struct buffer_head * bh, * bh_next;
688
689 if (!blksize_size[MAJOR(dev)])
690 return;
691
692 /* Size must be a power of two, and between 512 and PAGE_SIZE */
693 if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
694 panic("Invalid blocksize passed to set_blocksize");
695
696 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
697 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
698 return;
699 }
700 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
701 return;
702 sync_buffers(dev, 2);
703 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
704
705 retry:
706 slept = 0;
707 spin_lock(&lru_list_lock);
708 for(nlist = 0; nlist < NR_LIST; nlist++) {
709 bh = lru_list[nlist];
710 if (!bh)
711 continue;
712 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
713 bh_next = bh->b_next_free;
714 if (bh->b_dev != dev || bh->b_size == size)
715 continue;
716 if (buffer_locked(bh)) {
717 atomic_inc(&bh->b_count);
718 spin_unlock(&lru_list_lock);
719 wait_on_buffer(bh);
720 slept = 1;
721 spin_lock(&lru_list_lock);
722 atomic_dec(&bh->b_count);
723 }
724
725 write_lock(&hash_table_lock);
726 if (!atomic_read(&bh->b_count)) {
727 if (buffer_dirty(bh))
728 printk(KERN_WARNING
729 "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
730 kdevname(dev), bh->b_blocknr, bh->b_size);
731 remove_inode_queue(bh);
732 __remove_from_queues(bh);
733 put_last_free(bh);
734 } else {
735 if (atomic_set_buffer_clean(bh))
736 __refile_buffer(bh);
737 clear_bit(BH_Uptodate, &bh->b_state);
738 printk(KERN_WARNING
739 "set_blocksize: "
740 "b_count %d, dev %s, block %lu, from %p\n",
741 atomic_read(&bh->b_count), bdevname(bh->b_dev),
742 bh->b_blocknr, __builtin_return_address(0));
743 }
744 write_unlock(&hash_table_lock);
745 if (slept)
746 goto out;
747 }
748 }
749 out:
750 spin_unlock(&lru_list_lock);
751 if (slept)
752 goto retry;
753 }
754
755 /*
756 * We used to try various strange things. Let's not.
757 * We'll just try to balance dirty buffers, and possibly
758 * launder some pages.
759 */
760 static void refill_freelist(int size)
761 {
762 balance_dirty(NODEV);
763 if (free_shortage())
764 page_launder(GFP_BUFFER, 0);
765 grow_buffers(size);
766 }
767
768 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
769 {
770 bh->b_list = BUF_CLEAN;
771 bh->b_end_io = handler;
772 bh->b_private = private;
773 }
774
775 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
776 {
777 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
778 unsigned long flags;
779 struct buffer_head *tmp;
780 struct page *page;
781
782 mark_buffer_uptodate(bh, uptodate);
783
784 /* This is a temporary buffer used for page I/O. */
785 page = bh->b_page;
786
787 if (!uptodate)
788 SetPageError(page);
789
790 /*
791 * Be _very_ careful from here on. Bad things can happen if
792 * two buffer heads end IO at almost the same time and both
793 * decide that the page is now completely done.
794 *
795 * Async buffer_heads are here only as labels for IO, and get
796 * thrown away once the IO for this page is complete. IO is
797 * deemed complete once all buffers have been visited
798 * (b_count==0) and are now unlocked. We must make sure that
799 * only the _last_ buffer that decrements its count is the one
800 * that unlock the page..
801 */
802 spin_lock_irqsave(&page_uptodate_lock, flags);
803 unlock_buffer(bh);
804 atomic_dec(&bh->b_count);
805 tmp = bh->b_this_page;
806 while (tmp != bh) {
807 if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp))
808 goto still_busy;
809 tmp = tmp->b_this_page;
810 }
811
812 /* OK, the async IO on this page is complete. */
813 spin_unlock_irqrestore(&page_uptodate_lock, flags);
814
815 /*
816 * if none of the buffers had errors then we can set the
817 * page uptodate:
818 */
819 if (!PageError(page))
820 SetPageUptodate(page);
821
822 /*
823 * Run the hooks that have to be done when a page I/O has completed.
824 */
825 if (PageTestandClearDecrAfter(page))
826 atomic_dec(&nr_async_pages);
827
828 UnlockPage(page);
829
830 return;
831
832 still_busy:
833 spin_unlock_irqrestore(&page_uptodate_lock, flags);
834 return;
835 }
836
837 /*
838 * Synchronise all the inode's dirty buffers to the disk.
839 *
840 * We have conflicting pressures: we want to make sure that all
841 * initially dirty buffers get waited on, but that any subsequently
842 * dirtied buffers don't. After all, we don't want fsync to last
843 * forever if somebody is actively writing to the file.
844 *
845 * Do this in two main stages: first we copy dirty buffers to a
846 * temporary inode list, queueing the writes as we go. Then we clean
847 * up, waiting for those writes to complete.
848 *
849 * During this second stage, any subsequent updates to the file may end
850 * up refiling the buffer on the original inode's dirty list again, so
851 * there is a chance we will end up with a buffer queued for write but
852 * not yet completed on that list. So, as a final cleanup we go through
853 * the osync code to catch these locked, dirty buffers without requeuing
854 * any newly dirty buffers for write.
855 */
856
857 int fsync_inode_buffers(struct inode *inode)
858 {
859 struct buffer_head *bh;
860 struct inode tmp;
861 int err = 0, err2;
862
863 INIT_LIST_HEAD(&tmp.i_dirty_buffers);
864
865 spin_lock(&lru_list_lock);
866
867 while (!list_empty(&inode->i_dirty_buffers)) {
868 bh = BH_ENTRY(inode->i_dirty_buffers.next);
869 list_del(&bh->b_inode_buffers);
870 if (!buffer_dirty(bh) && !buffer_locked(bh))
871 bh->b_inode = NULL;
872 else {
873 bh->b_inode = &tmp;
874 list_add(&bh->b_inode_buffers, &tmp.i_dirty_buffers);
875 if (buffer_dirty(bh)) {
876 atomic_inc(&bh->b_count);
877 spin_unlock(&lru_list_lock);
878 ll_rw_block(WRITE, 1, &bh);
879 brelse(bh);
880 spin_lock(&lru_list_lock);
881 }
882 }
883 }
884
885 while (!list_empty(&tmp.i_dirty_buffers)) {
886 bh = BH_ENTRY(tmp.i_dirty_buffers.prev);
887 remove_inode_queue(bh);
888 atomic_inc(&bh->b_count);
889 spin_unlock(&lru_list_lock);
890 wait_on_buffer(bh);
891 if (!buffer_uptodate(bh))
892 err = -EIO;
893 brelse(bh);
894 spin_lock(&lru_list_lock);
895 }
896
897 spin_unlock(&lru_list_lock);
898 err2 = osync_inode_buffers(inode);
899
900 if (err)
901 return err;
902 else
903 return err2;
904 }
905
906
907 /*
908 * osync is designed to support O_SYNC io. It waits synchronously for
909 * all already-submitted IO to complete, but does not queue any new
910 * writes to the disk.
911 *
912 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
913 * you dirty the buffers, and then use osync_inode_buffers to wait for
914 * completion. Any other dirty buffers which are not yet queued for
915 * write will not be flushed to disk by the osync.
916 */
917
918 int osync_inode_buffers(struct inode *inode)
919 {
920 struct buffer_head *bh;
921 struct list_head *list;
922 int err = 0;
923
924 spin_lock(&lru_list_lock);
925
926 repeat:
927
928 for (list = inode->i_dirty_buffers.prev;
929 bh = BH_ENTRY(list), list != &inode->i_dirty_buffers;
930 list = bh->b_inode_buffers.prev) {
931 if (buffer_locked(bh)) {
932 atomic_inc(&bh->b_count);
933 spin_unlock(&lru_list_lock);
934 wait_on_buffer(bh);
935 if (!buffer_uptodate(bh))
936 err = -EIO;
937 brelse(bh);
938 spin_lock(&lru_list_lock);
939 goto repeat;
940 }
941 }
942
943 spin_unlock(&lru_list_lock);
944 return err;
945 }
946
947
948 /*
949 * Invalidate any and all dirty buffers on a given inode. We are
950 * probably unmounting the fs, but that doesn't mean we have already
951 * done a sync(). Just drop the buffers from the inode list.
952 */
953 void invalidate_inode_buffers(struct inode *inode)
954 {
955 struct list_head *list, *next;
956
957 spin_lock(&lru_list_lock);
958 list = inode->i_dirty_buffers.next;
959 while (list != &inode->i_dirty_buffers) {
960 next = list->next;
961 remove_inode_queue(BH_ENTRY(list));
962 list = next;
963 }
964 spin_unlock(&lru_list_lock);
965 }
966
967
968 /*
969 * Ok, this is getblk, and it isn't very clear, again to hinder
970 * race-conditions. Most of the code is seldom used, (ie repeating),
971 * so it should be much more efficient than it looks.
972 *
973 * The algorithm is changed: hopefully better, and an elusive bug removed.
974 *
975 * 14.02.92: changed it to sync dirty buffers a bit: better performance
976 * when the filesystem starts to get full of dirty blocks (I hope).
977 */
978 struct buffer_head * getblk(kdev_t dev, int block, int size)
979 {
980 struct buffer_head * bh;
981 int isize;
982
983 repeat:
984 spin_lock(&lru_list_lock);
985 write_lock(&hash_table_lock);
986 bh = __get_hash_table(dev, block, size);
987 if (bh)
988 goto out;
989
990 isize = BUFSIZE_INDEX(size);
991 spin_lock(&free_list[isize].lock);
992 bh = free_list[isize].list;
993 if (bh) {
994 __remove_from_free_list(bh, isize);
995 atomic_set(&bh->b_count, 1);
996 }
997 spin_unlock(&free_list[isize].lock);
998
999 /*
1000 * OK, FINALLY we know that this buffer is the only one of
1001 * its kind, we hold a reference (b_count>0), it is unlocked,
1002 * and it is clean.
1003 */
1004 if (bh) {
1005 init_buffer(bh, NULL, NULL);
1006 bh->b_dev = dev;
1007 bh->b_blocknr = block;
1008 bh->b_state = 1 << BH_Mapped;
1009
1010 /* Insert the buffer into the regular lists */
1011 __insert_into_queues(bh);
1012 out:
1013 write_unlock(&hash_table_lock);
1014 spin_unlock(&lru_list_lock);
1015 touch_buffer(bh);
1016 return bh;
1017 }
1018
1019 /*
1020 * If we block while refilling the free list, somebody may
1021 * create the buffer first ... search the hashes again.
1022 */
1023 write_unlock(&hash_table_lock);
1024 spin_unlock(&lru_list_lock);
1025 refill_freelist(size);
1026 goto repeat;
1027 }
1028
1029 /* -1 -> no need to flush
1030 0 -> async flush
1031 1 -> sync flush (wait for I/O completation) */
1032 int balance_dirty_state(kdev_t dev)
1033 {
1034 unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
1035 int shortage;
1036
1037 dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
1038 tot = nr_free_buffer_pages();
1039
1040 dirty *= 100;
1041 soft_dirty_limit = tot * bdf_prm.b_un.nfract;
1042 hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync;
1043
1044 /* First, check for the "real" dirty limit. */
1045 if (dirty > soft_dirty_limit) {
1046 if (dirty > hard_dirty_limit)
1047 return 1;
1048 return 0;
1049 }
1050
1051 /*
1052 * If we are about to get low on free pages and
1053 * cleaning the inactive_dirty pages would help
1054 * fix this, wake up bdflush.
1055 */
1056 shortage = free_shortage();
1057 if (shortage && nr_inactive_dirty_pages > shortage &&
1058 nr_inactive_dirty_pages > freepages.high)
1059 return 0;
1060
1061 return -1;
1062 }
1063
1064 /*
1065 * if a new dirty buffer is created we need to balance bdflush.
1066 *
1067 * in the future we might want to make bdflush aware of different
1068 * pressures on different devices - thus the (currently unused)
1069 * 'dev' parameter.
1070 */
1071 void balance_dirty(kdev_t dev)
1072 {
1073 int state = balance_dirty_state(dev);
1074
1075 if (state < 0)
1076 return;
1077 wakeup_bdflush(state);
1078 }
1079
1080 static __inline__ void __mark_dirty(struct buffer_head *bh)
1081 {
1082 bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1083 refile_buffer(bh);
1084 }
1085
/*
 * Atomic version of mark_buffer_dirty(): the user must call
 * balance_dirty() by hand as soon as it becomes possible to block.
 */
void __mark_buffer_dirty(struct buffer_head *bh)
{
	/* A non-zero result means the buffer was dirty already. */
	if (atomic_set_buffer_dirty(bh))
		return;
	__mark_dirty(bh);
}
1093
1094 void mark_buffer_dirty(struct buffer_head *bh)
1095 {
1096 if (!atomic_set_buffer_dirty(bh)) {
1097 __mark_dirty(bh);
1098 balance_dirty(bh->b_dev);
1099 }
1100 }
1101
1102 /*
1103 * A buffer may need to be moved from one buffer list to another
1104 * (e.g. in case it is not shared any more). Handle this.
1105 */
1106 static void __refile_buffer(struct buffer_head *bh)
1107 {
1108 int dispose = BUF_CLEAN;
1109 if (buffer_locked(bh))
1110 dispose = BUF_LOCKED;
1111 if (buffer_dirty(bh))
1112 dispose = BUF_DIRTY;
1113 if (buffer_protected(bh))
1114 dispose = BUF_PROTECTED;
1115 if (dispose != bh->b_list) {
1116 __remove_from_lru_list(bh, bh->b_list);
1117 bh->b_list = dispose;
1118 if (dispose == BUF_CLEAN)
1119 remove_inode_queue(bh);
1120 __insert_into_lru_list(bh, dispose);
1121 }
1122 }
1123
/*
 * Locked wrapper around __refile_buffer(): lru_list_lock is held for
 * the duration of the call.
 */
void refile_buffer(struct buffer_head *bh)
{
	spin_lock(&lru_list_lock);
	__refile_buffer(bh);
	spin_unlock(&lru_list_lock);
}
1130
1131 /*
1132 * Release a buffer head
1133 */
1134 void __brelse(struct buffer_head * buf)
1135 {
1136 if (atomic_read(&buf->b_count)) {
1137 atomic_dec(&buf->b_count);
1138 return;
1139 }
1140 printk("VFS: brelse: Trying to free free buffer\n");
1141 }
1142
1143 /*
1144 * bforget() is like brelse(), except it puts the buffer on the
1145 * free list if it can.. We can NOT free the buffer if:
1146 * - there are other users of it
1147 * - it is locked and thus can have active IO
1148 */
1149 void __bforget(struct buffer_head * buf)
1150 {
1151 /* grab the lru lock here to block bdflush. */
1152 spin_lock(&lru_list_lock);
1153 write_lock(&hash_table_lock);
1154 if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
1155 goto in_use;
1156 __hash_unlink(buf);
1157 remove_inode_queue(buf);
1158 write_unlock(&hash_table_lock);
1159 __remove_from_lru_list(buf, buf->b_list);
1160 spin_unlock(&lru_list_lock);
1161 put_last_free(buf);
1162 return;
1163
1164 in_use:
1165 write_unlock(&hash_table_lock);
1166 spin_unlock(&lru_list_lock);
1167 }
1168
1169 /*
1170 * bread() reads a specified block and returns the buffer that contains
1171 * it. It returns NULL if the block was unreadable.
1172 */
1173 struct buffer_head * bread(kdev_t dev, int block, int size)
1174 {
1175 struct buffer_head * bh;
1176
1177 bh = getblk(dev, block, size);
1178 if (buffer_uptodate(bh))
1179 return bh;
1180 ll_rw_block(READ, 1, &bh);
1181 wait_on_buffer(bh);
1182 if (buffer_uptodate(bh))
1183 return bh;
1184 brelse(bh);
1185 return NULL;
1186 }
1187
1188 /*
1189 * Note: the caller should wake up the buffer_wait list if needed.
1190 */
1191 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1192 {
1193 if (bh->b_inode)
1194 BUG();
1195 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1196 kmem_cache_free(bh_cachep, bh);
1197 } else {
1198 bh->b_blocknr = -1;
1199 init_waitqueue_head(&bh->b_wait);
1200 nr_unused_buffer_heads++;
1201 bh->b_next_free = unused_list;
1202 bh->b_this_page = NULL;
1203 unused_list = bh;
1204 }
1205 }
1206
/*
 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
 * no-buffer-head deadlock. Return NULL on failure; waiting for
 * buffer heads is now handled in create_buffers().
 *
 * Sources are tried in this order:
 *  1. unused_list, as long as it holds more than NR_RESERVED heads;
 *  2. a fresh SLAB_BUFFER allocation from bh_cachep (zeroed here);
 *  3. for async callers only, whatever is left on unused_list, i.e.
 *     the reserve itself.
 *
 * This function never sleeps waiting for a head to be released.
 */
static struct buffer_head * get_unused_buffer_head(int async)
{
	struct buffer_head * bh;

	/* 1. Take from the cache without dipping into the reserve. */
	spin_lock(&unused_list_lock);
	if (nr_unused_buffer_heads > NR_RESERVED) {
		bh = unused_list;
		unused_list = bh->b_next_free;
		nr_unused_buffer_heads--;
		spin_unlock(&unused_list_lock);
		return bh;
	}
	spin_unlock(&unused_list_lock);

	/* This is critical. We can't swap out pages to get
	 * more buffer heads, because the swap-out may need
	 * more buffer-heads itself. Thus SLAB_BUFFER.
	 */
	if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
		memset(bh, 0, sizeof(*bh));
		init_waitqueue_head(&bh->b_wait);
		return bh;
	}

	/*
	 * If we need an async buffer, use the reserved buffer heads.
	 */
	if (async) {
		spin_lock(&unused_list_lock);
		if (unused_list) {
			bh = unused_list;
			unused_list = bh->b_next_free;
			nr_unused_buffer_heads--;
			spin_unlock(&unused_list_lock);
			return bh;
		}
		spin_unlock(&unused_list_lock);
	}
#if 0
	/*
	 * (Pending further analysis ...)
	 * Ordinary (non-async) requests can use a different memory priority
	 * to free up pages. Any swapping thus generated will use async
	 * buffer heads.
	 */
	if(!async &&
	   (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
		memset(bh, 0, sizeof(*bh));
		init_waitqueue_head(&bh->b_wait);
		return bh;
	}
#endif

	/* Nothing available from any source. */
	return NULL;
}
1267
1268 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1269 {
1270 bh->b_page = page;
1271 if (offset >= PAGE_SIZE)
1272 BUG();
1273 if (PageHighMem(page))
1274 /*
1275 * This catches illegal uses and preserves the offset:
1276 */
1277 bh->b_data = (char *)(0 + offset);
1278 else
1279 bh->b_data = page_address(page) + offset;
1280 }
1281
/*
 * Create the appropriate buffers when given a page for data area and
 * the size of each buffer.. Use the bh->b_this_page linked list to
 * follow the buffers created. Return NULL if unable to create more
 * buffers.
 * The async flag is used to differentiate async IO (paging, swapping)
 * from ordinary buffer allocations, and only async requests are allowed
 * to sleep waiting for buffer heads.
 *
 * The returned chain is NULL-terminated (not yet circular) and ordered
 * by ascending offset within the page: heads are created from the end
 * of the page backwards and each new one is pushed on the front.
 * page->buffers is not touched here.
 */
static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
{
	struct buffer_head *bh, *head;
	long offset;

try_again:
	head = NULL;
	offset = PAGE_SIZE;
	/* Walk the page from its last block down to offset 0. */
	while ((offset -= size) >= 0) {
		bh = get_unused_buffer_head(async);
		if (!bh)
			goto no_grow;

		bh->b_dev = B_FREE; /* Flag as unused */
		bh->b_this_page = head;
		head = bh;

		bh->b_state = 0;
		bh->b_next_free = NULL;
		bh->b_pprev = NULL;
		atomic_set(&bh->b_count, 0);
		bh->b_size = size;

		set_bh_page(bh, page, offset);

		bh->b_list = BUF_CLEAN;
		bh->b_end_io = NULL;
	}
	return head;
/*
 * In case anything failed, we just free everything we got.
 */
no_grow:
	if (head) {
		spin_lock(&unused_list_lock);
		do {
			bh = head;
			head = head->b_this_page;
			__put_unused_buffer_head(bh);
		} while (head);
		spin_unlock(&unused_list_lock);

		/* Wake up any waiters ... */
		wake_up(&buffer_wait);
	}

	/*
	 * Return failure for non-async IO requests. Async IO requests
	 * are not allowed to fail, so we have to wait until buffer heads
	 * become available. But we don't want tasks sleeping with
	 * partially complete buffers, so all were released above.
	 */
	if (!async)
		return NULL;

	/* We're _really_ low on memory. Now we just
	 * wait for old buffer heads to become free due to
	 * finishing IO. Since this is an async request and
	 * the reserve list is empty, we're sure there are
	 * async buffer heads in use.
	 */
	run_task_queue(&tq_disk);

	/*
	 * Sleep on buffer_wait until at least MAX_BUF_PER_PAGE unused
	 * heads are available, then start over from scratch.
	 */
	wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE);
	goto try_again;
}
1361
1362 static void unmap_buffer(struct buffer_head * bh)
1363 {
1364 if (buffer_mapped(bh)) {
1365 mark_buffer_clean(bh);
1366 wait_on_buffer(bh);
1367 clear_bit(BH_Uptodate, &bh->b_state);
1368 clear_bit(BH_Mapped, &bh->b_state);
1369 clear_bit(BH_Req, &bh->b_state);
1370 clear_bit(BH_New, &bh->b_state);
1371 }
1372 }
1373
1374 /*
1375 * We don't have to release all buffers here, but
1376 * we have to be sure that no dirty buffer is left
1377 * and no IO is going on (no buffer is locked), because
1378 * we have truncated the file and are going to free the
1379 * blocks on-disk..
1380 */
1381 int block_flushpage(struct page *page, unsigned long offset)
1382 {
1383 struct buffer_head *head, *bh, *next;
1384 unsigned int curr_off = 0;
1385
1386 if (!PageLocked(page))
1387 BUG();
1388 if (!page->buffers)
1389 return 1;
1390
1391 head = page->buffers;
1392 bh = head;
1393 do {
1394 unsigned int next_off = curr_off + bh->b_size;
1395 next = bh->b_this_page;
1396
1397 /*
1398 * is this block fully flushed?
1399 */
1400 if (offset <= curr_off)
1401 unmap_buffer(bh);
1402 curr_off = next_off;
1403 bh = next;
1404 } while (bh != head);
1405
1406 /*
1407 * subtle. We release buffer-heads only if this is
1408 * the 'final' flushpage. We have invalidated the get_block
1409 * cached value unconditionally, so real IO is not
1410 * possible anymore.
1411 *
1412 * If the free doesn't work out, the buffers can be
1413 * left around - they just turn into anonymous buffers
1414 * instead.
1415 */
1416 if (!offset) {
1417 if (!try_to_free_buffers(page, 0)) {
1418 atomic_inc(&buffermem_pages);
1419 return 0;
1420 }
1421 }
1422
1423 return 1;
1424 }
1425
1426 static void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
1427 {
1428 struct buffer_head *bh, *head, *tail;
1429
1430 head = create_buffers(page, blocksize, 1);
1431 if (page->buffers)
1432 BUG();
1433
1434 bh = head;
1435 do {
1436 bh->b_dev = dev;
1437 bh->b_blocknr = 0;
1438 bh->b_end_io = NULL;
1439 tail = bh;
1440 bh = bh->b_this_page;
1441 } while (bh);
1442 tail->b_this_page = head;
1443 page->buffers = head;
1444 page_cache_get(page);
1445 }
1446
1447 /*
1448 * We are taking a block for data and we don't want any output from any
1449 * buffer-cache aliases starting from return from that function and
1450 * until the moment when something will explicitly mark the buffer
1451 * dirty (hopefully that will not happen until we will free that block ;-)
1452 * We don't even need to mark it not-uptodate - nobody can expect
1453 * anything from a newly allocated buffer anyway. We used to used
1454 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1455 * don't want to mark the alias unmapped, for example - it would confuse
1456 * anyone who might pick it with bread() afterwards...
1457 */
1458
1459 static void unmap_underlying_metadata(struct buffer_head * bh)
1460 {
1461 struct buffer_head *old_bh;
1462
1463 old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1464 if (old_bh) {
1465 mark_buffer_clean(old_bh);
1466 wait_on_buffer(old_bh);
1467 clear_bit(BH_Req, &old_bh->b_state);
1468 /* Here we could run brelse or bforget. We use
1469 bforget because it will try to put the buffer
1470 in the freelist. */
1471 __bforget(old_bh);
1472 }
1473 }
1474
1475 /*
1476 * NOTE! All mapped/uptodate combinations are valid:
1477 *
1478 * Mapped Uptodate Meaning
1479 *
1480 * No No "unknown" - must do get_block()
1481 * No Yes "hole" - zero-filled
1482 * Yes No "allocated" - allocated on disk, not read in
1483 * Yes Yes "valid" - allocated and up-to-date in memory.
1484 *
1485 * "Dirty" is valid only with the last case (mapped+uptodate).
1486 */
1487
1488 /*
1489 * block_write_full_page() is SMP-safe - currently it's still
1490 * being called with the kernel lock held, but the code is ready.
1491 */
1492 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1493 {
1494 int err, i;
1495 unsigned long block;
1496 struct buffer_head *bh, *head;
1497
1498 if (!PageLocked(page))
1499 BUG();
1500
1501 if (!page->buffers)
1502 create_empty_buffers(page, inode->i_dev, inode->i_sb->s_blocksize);
1503 head = page->buffers;
1504
1505 block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1506
1507 bh = head;
1508 i = 0;
1509
1510 /* Stage 1: make sure we have all the buffers mapped! */
1511 do {
1512 /*
1513 * If the buffer isn't up-to-date, we can't be sure
1514 * that the buffer has been initialized with the proper
1515 * block number information etc..
1516 *
1517 * Leave it to the low-level FS to make all those
1518 * decisions (block #0 may actually be a valid block)
1519 */
1520 if (!buffer_mapped(bh)) {
1521 err = get_block(inode, block, bh, 1);
1522 if (err)
1523 goto out;
1524 if (buffer_new(bh))
1525 unmap_underlying_metadata(bh);
1526 }
1527 bh = bh->b_this_page;
1528 block++;
1529 } while (bh != head);
1530
1531 /* Stage 2: lock the buffers, mark them clean */
1532 do {
1533 lock_buffer(bh);
1534 bh->b_end_io = end_buffer_io_async;
1535 atomic_inc(&bh->b_count);
1536 set_bit(BH_Uptodate, &bh->b_state);
1537 clear_bit(BH_Dirty, &bh->b_state);
1538 bh = bh->b_this_page;
1539 } while (bh != head);
1540
1541 /* Stage 3: submit the IO */
1542 do {
1543 submit_bh(WRITE, bh);
1544 bh = bh->b_this_page;
1545 } while (bh != head);
1546
1547 /* Done - end_buffer_io_async will unlock */
1548 SetPageUptodate(page);
1549 return 0;
1550
1551 out:
1552 ClearPageUptodate(page);
1553 UnlockPage(page);
1554 return err;
1555 }
1556
/*
 * Get the buffers covering bytes [from, to) of 'page' ready to be
 * overwritten.
 *
 * For each buffer overlapping the range:
 *  - an unmapped buffer is mapped through get_block() with create set;
 *    if that produced a new block, any buffer-cache alias is
 *    invalidated and, unless the page is already up to date, the parts
 *    of the block outside [from, to) are zeroed;
 *  - a buffer that is not up to date and only partly covered by the
 *    range is read in from disk.
 *
 * The page is kmap()ed on entry and NOT unmapped here, on success or on
 * failure; the callers in this file do the matching kunmap().
 *
 * Returns 0 on success, the get_block() error, or -EIO if one of the
 * reads did not leave its buffer up to date.
 */
static int __block_prepare_write(struct inode *inode, struct page *page,
		unsigned from, unsigned to, get_block_t *get_block)
{
	unsigned block_start, block_end;
	unsigned long block;
	int err = 0;
	unsigned blocksize, bbits;
	/*
	 * wait[] has two slots: only a buffer that overlaps the range
	 * and sticks out of it is read, which can be true of at most
	 * the first and the last overlapping buffer.
	 */
	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
	char *kaddr = kmap(page);

	blocksize = inode->i_sb->s_blocksize;
	if (!page->buffers)
		create_empty_buffers(page, inode->i_dev, blocksize);
	head = page->buffers;

	bbits = inode->i_sb->s_blocksize_bits;
	block = page->index << (PAGE_CACHE_SHIFT - bbits);

	/*
	 * One trip round the buffer ring: "!block_start" lets the first
	 * iteration through even though bh == head at that point.
	 */
	for(bh = head, block_start = 0; bh != head || !block_start;
	    block++, block_start=block_end, bh = bh->b_this_page) {
		if (!bh)
			BUG();
		block_end = block_start+blocksize;
		/* Entirely before the range: skip. */
		if (block_end <= from)
			continue;
		/* Entirely after the range: done. */
		if (block_start >= to)
			break;
		if (!buffer_mapped(bh)) {
			err = get_block(inode, block, bh, 1);
			if (err)
				goto out;
			if (buffer_new(bh)) {
				unmap_underlying_metadata(bh);
				if (Page_Uptodate(page)) {
					set_bit(BH_Uptodate, &bh->b_state);
					continue;
				}
				/* Zero the parts of the new block outside [from, to). */
				if (block_end > to)
					memset(kaddr+to, 0, block_end-to);
				if (block_start < from)
					memset(kaddr+block_start, 0, from-block_start);
				if (block_end > to || block_start < from)
					flush_dcache_page(page);
				continue;
			}
		}
		if (Page_Uptodate(page)) {
			set_bit(BH_Uptodate, &bh->b_state);
			continue;
		}
		/* Partly covered and stale: the rest must come from disk. */
		if (!buffer_uptodate(bh) &&
		    (block_start < from || block_end > to)) {
			ll_rw_block(READ, 1, &bh);
			*wait_bh++=bh;
		}
	}
	/*
	 * If we issued read requests - let them complete.
	 */
	while(wait_bh > wait) {
		wait_on_buffer(*--wait_bh);
		err = -EIO;
		if (!buffer_uptodate(*wait_bh))
			goto out;
	}
	return 0;
out:
	return err;
}
1626
/*
 * Finish a write to bytes [from, to) of 'page': every buffer that
 * overlaps the range is marked up to date and dirty. A buffer that was
 * not dirty before is also refiled and put on the inode's buffer queue.
 * balance_dirty() is run once at the end if any buffer was newly
 * dirtied.
 *
 * Always returns 0.
 */
static int __block_commit_write(struct inode *inode, struct page *page,
		unsigned from, unsigned to)
{
	unsigned block_start, block_end;
	int partial = 0, need_balance_dirty = 0;
	unsigned blocksize;
	struct buffer_head *bh, *head;

	blocksize = inode->i_sb->s_blocksize;

	/*
	 * One trip round the buffer ring: "!block_start" lets the first
	 * iteration through even though bh == head at that point.
	 */
	for(bh = head = page->buffers, block_start = 0;
	    bh != head || !block_start;
	    block_start=block_end, bh = bh->b_this_page) {
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			/* Outside the written range: only note whether it is stale. */
			if (!buffer_uptodate(bh))
				partial = 1;
		} else {
			set_bit(BH_Uptodate, &bh->b_state);
			if (!atomic_set_buffer_dirty(bh)) {
				__mark_dirty(bh);
				buffer_insert_inode_queue(bh, inode);
				need_balance_dirty = 1;
			}
		}
	}

	/* The loop has come full circle, so bh is the ring head again here. */
	if (need_balance_dirty)
		balance_dirty(bh->b_dev);
	/*
	 * If this was a partial write that happened to make all buffers
	 * uptodate then we can optimize away a bogus readpage() for
	 * the next read(). Here we 'discover' whether the page went
	 * uptodate as a result of this (potentially partial) write.
	 */
	if (!partial)
		SetPageUptodate(page);
	return 0;
}
1666
1667 /*
1668 * Generic "read page" function for block devices that have the normal
1669 * get_block functionality. This is most of the block device filesystems.
1670 * Reads the page asynchronously --- the unlock_buffer() and
1671 * mark_buffer_uptodate() functions propagate buffer state into the
1672 * page struct once IO has completed.
1673 */
1674 int block_read_full_page(struct page *page, get_block_t *get_block)
1675 {
1676 struct inode *inode = page->mapping->host;
1677 unsigned long iblock, lblock;
1678 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1679 unsigned int blocksize, blocks;
1680 int nr, i;
1681
1682 if (!PageLocked(page))
1683 PAGE_BUG(page);
1684 blocksize = inode->i_sb->s_blocksize;
1685 if (!page->buffers)
1686 create_empty_buffers(page, inode->i_dev, blocksize);
1687 head = page->buffers;
1688
1689 blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
1690 iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1691 lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
1692 bh = head;
1693 nr = 0;
1694 i = 0;
1695
1696 do {
1697 if (buffer_uptodate(bh))
1698 continue;
1699
1700 if (!buffer_mapped(bh)) {
1701 if (iblock < lblock) {
1702 if (get_block(inode, iblock, bh, 0))
1703 continue;
1704 }
1705 if (!buffer_mapped(bh)) {
1706 memset(kmap(page) + i*blocksize, 0, blocksize);
1707 flush_dcache_page(page);
1708 kunmap(page);
1709 set_bit(BH_Uptodate, &bh->b_state);
1710 continue;
1711 }
1712 /* get_block() might have updated the buffer synchronously */
1713 if (buffer_uptodate(bh))
1714 continue;
1715 }
1716
1717 arr[nr] = bh;
1718 nr++;
1719 } while (i++, iblock++, (bh = bh->b_this_page) != head);
1720
1721 if (!nr) {
1722 /*
1723 * all buffers are uptodate - we can set the page
1724 * uptodate as well.
1725 */
1726 SetPageUptodate(page);
1727 UnlockPage(page);
1728 return 0;
1729 }
1730
1731 /* Stage two: lock the buffers */
1732 for (i = 0; i < nr; i++) {
1733 struct buffer_head * bh = arr[i];
1734 lock_buffer(bh);
1735 bh->b_end_io = end_buffer_io_async;
1736 atomic_inc(&bh->b_count);
1737 }
1738
1739 /* Stage 3: start the IO */
1740 for (i = 0; i < nr; i++)
1741 submit_bh(READ, arr[i]);
1742
1743 return 0;
1744 }
1745
/*
 * For moronic filesystems that do not allow holes in file.
 * We may have to extend the file.
 *
 * Prepare [offset, to) of 'page' for writing. Before that, every page
 * between the one containing *bytes and 'page' is grabbed, zero-filled
 * from *bytes to its end, and committed.
 *
 * NOTE(review): *bytes looks like the byte position up to which the
 * file has been filled in so far. Within this function it is only ever
 * rounded up to the next block boundary, so presumably get_block()
 * advances it - confirm against the callers.
 *
 * Returns 0 on success or a negative error. On success 'page' is left
 * kmapped (the kmap is taken inside __block_prepare_write()); on
 * failure of the final prepare it is marked not up to date and
 * kunmapped here.
 */

int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
{
	struct address_space *mapping = page->mapping;
	struct inode *inode = mapping->host;
	struct page *new_page;
	unsigned long pgpos;
	long status;
	unsigned zerofrom;
	unsigned blocksize = inode->i_sb->s_blocksize;
	char *kaddr;

	/* Zero-fill whole pages until *bytes reaches the target page. */
	while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
		status = -ENOMEM;
		new_page = grab_cache_page(mapping, pgpos);
		if (!new_page)
			goto out;
		/* we might sleep */
		if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
			/* *bytes moved to another page meanwhile: retry. */
			UnlockPage(new_page);
			page_cache_release(new_page);
			continue;
		}
		zerofrom = *bytes & ~PAGE_CACHE_MASK;
		/* Not block aligned: round *bytes up to the next block boundary. */
		if (zerofrom & (blocksize-1)) {
			*bytes |= (blocksize-1);
			(*bytes)++;
		}
		status = __block_prepare_write(inode, new_page, zerofrom,
						PAGE_CACHE_SIZE, get_block);
		if (status)
			goto out_unmap;
		kaddr = page_address(new_page);
		memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
		flush_dcache_page(new_page);
		__block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
		kunmap(new_page);
		UnlockPage(new_page);
		page_cache_release(new_page);
	}

	if (page->index < pgpos) {
		/* completely inside the area */
		zerofrom = offset;
	} else {
		/* page covers the boundary, find the boundary offset */
		zerofrom = *bytes & ~PAGE_CACHE_MASK;

		/* if we will expand the thing last block will be filled */
		if (to > zerofrom && (zerofrom & (blocksize-1))) {
			*bytes |= (blocksize-1);
			(*bytes)++;
		}

		/* starting below the boundary? Nothing to zero out */
		if (offset <= zerofrom)
			zerofrom = offset;
	}
	status = __block_prepare_write(inode, page, zerofrom, to, get_block);
	if (status)
		goto out1;
	kaddr = page_address(page);
	/* Zero and commit the gap between the old boundary and the write. */
	if (zerofrom < offset) {
		memset(kaddr+zerofrom, 0, offset-zerofrom);
		flush_dcache_page(page);
		__block_commit_write(inode, page, zerofrom, offset);
	}
	return 0;
out1:
	ClearPageUptodate(page);
	kunmap(page);
	return status;

out_unmap:
	ClearPageUptodate(new_page);
	kunmap(new_page);
	UnlockPage(new_page);
	page_cache_release(new_page);
out:
	return status;
}
1831
1832 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1833 get_block_t *get_block)
1834 {
1835 struct inode *inode = page->mapping->host;
1836 int err = __block_prepare_write(inode, page, from, to, get_block);
1837 if (err) {
1838 ClearPageUptodate(page);
1839 kunmap(page);
1840 }
1841 return err;
1842 }
1843
1844 int generic_commit_write(struct file *file, struct page *page,
1845 unsigned from, unsigned to)
1846 {
1847 struct inode *inode = page->mapping->host;
1848 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1849 __block_commit_write(inode,page,from,to);
1850 kunmap(page);
1851 if (pos > inode->i_size) {
1852 inode->i_size = pos;
1853 mark_inode_dirty(inode);
1854 }
1855 return 0;
1856 }
1857
1858 int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block)
1859 {
1860 unsigned long index = from >> PAGE_CACHE_SHIFT;
1861 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1862 unsigned blocksize, iblock, length, pos;
1863 struct inode *inode = mapping->host;
1864 struct page *page;
1865 struct buffer_head *bh;
1866 int err;
1867
1868 blocksize = inode->i_sb->s_blocksize;
1869 length = offset & (blocksize - 1);
1870
1871 /* Block boundary? Nothing to do */
1872 if (!length)
1873 return 0;
1874
1875 length = blocksize - length;
1876 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1877
1878 page = grab_cache_page(mapping, index);
1879 err = PTR_ERR(page);
1880 if (IS_ERR(page))
1881 goto out;
1882
1883 if (!page->buffers)
1884 create_empty_buffers(page, inode->i_dev, blocksize);
1885
1886 /* Find the buffer that contains "offset" */
1887 bh = page->buffers;
1888 pos = blocksize;
1889 while (offset >= pos) {
1890 bh = bh->b_this_page;
1891 iblock++;
1892 pos += blocksize;
1893 }
1894
1895 err = 0;
1896 if (!buffer_mapped(bh)) {
1897 /* Hole? Nothing to do */
1898 if (buffer_uptodate(bh))
1899 goto unlock;
1900 get_block(inode, iblock, bh, 0);
1901 /* Still unmapped? Nothing to do */
1902 if (!buffer_mapped(bh))
1903 goto unlock;
1904 }
1905
1906 /* Ok, it's mapped. Make sure it's up-to-date */
1907 if (Page_Uptodate(page))
1908 set_bit(BH_Uptodate, &bh->b_state);
1909
1910 if (!buffer_uptodate(bh)) {
1911 err = -EIO;
1912 ll_rw_block(READ, 1, &bh);
1913 wait_on_buffer(bh);
1914 /* Uhhuh. Read error. Complain and punt. */
1915 if (!buffer_uptodate(bh))
1916 goto unlock;
1917 }
1918
1919 memset(kmap(page) + offset, 0, length);
1920 flush_dcache_page(page);
1921 kunmap(page);
1922
1923 __mark_buffer_dirty(bh);
1924 err = 0;
1925
1926 unlock:
1927 UnlockPage(page);
1928 page_cache_release(page);
1929 out:
1930 return err;
1931 }
1932
1933 int block_write_full_page(struct page *page, get_block_t *get_block)
1934 {
1935 struct inode *inode = page->mapping->host;
1936 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1937 unsigned offset;
1938 int err;
1939
1940 /* easy case */
1941 if (page->index < end_index)
1942 return __block_write_full_page(inode, page, get_block);
1943
1944 /* things got complicated... */
1945 offset = inode->i_size & (PAGE_CACHE_SIZE-1);
1946 /* OK, are we completely out? */
1947 if (page->index >= end_index+1 || !offset) {
1948 UnlockPage(page);
1949 return -EIO;
1950 }
1951
1952 /* Sigh... will have to work, then... */
1953 err = __block_prepare_write(inode, page, 0, offset, get_block);
1954 if (!err) {
1955 memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
1956 flush_dcache_page(page);
1957 __block_commit_write(inode,page,0,offset);
1958 done:
1959 kunmap(page);
1960 UnlockPage(page);
1961 return err;
1962 }
1963 ClearPageUptodate(page);
1964 goto done;
1965 }
1966
1967 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
1968 {
1969 struct buffer_head tmp;
1970 struct inode *inode = mapping->host;
1971 tmp.b_state = 0;
1972 tmp.b_blocknr = 0;
1973 get_block(inode, block, &tmp, 0);
1974 return tmp.b_blocknr;
1975 }
1976
/*
 * IO completion routine for a buffer_head being used for kiobuf IO: we
 * can't dispatch the kiobuf callback until io_count reaches 0.
 *
 * Records the IO result on the buffer, unlocks it, and reports the
 * result to the owning kiobuf through end_kio_request().
 */

static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
{
	struct kiobuf *kiobuf;

	mark_buffer_uptodate(bh, uptodate);

	/*
	 * Fetch the kiobuf before unlocking: wait_kio() releases the
	 * buffer head once it sees the buffer unlocked, so bh should not
	 * be touched after unlock_buffer().
	 */
	kiobuf = bh->b_private;
	unlock_buffer(bh);
	end_kio_request(kiobuf, uptodate);
}
1992
1993
1994 /*
1995 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
1996 * for them to complete. Clean up the buffer_heads afterwards.
1997 */
1998
1999 static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
2000 {
2001 int iosize;
2002 int i;
2003 struct buffer_head *tmp;
2004
2005
2006 iosize = 0;
2007 spin_lock(&unused_list_lock);
2008
2009 for (i = nr; --i >= 0; ) {
2010 iosize += size;
2011 tmp = bh[i];
2012 if (buffer_locked(tmp)) {
2013 spin_unlock(&unused_list_lock);
2014 wait_on_buffer(tmp);
2015 spin_lock(&unused_list_lock);
2016 }
2017
2018 if (!buffer_uptodate(tmp)) {
2019 /* We are traversing bh'es in reverse order so
2020 clearing iosize on error calculates the
2021 amount of IO before the first error. */
2022 iosize = 0;
2023 }
2024 __put_unused_buffer_head(tmp);
2025 }
2026
2027 spin_unlock(&unused_list_lock);
2028
2029 return iosize;
2030 }
2031
2032 /*
2033 * Start I/O on a physical range of kernel memory, defined by a vector
2034 * of kiobuf structs (much like a user-space iovec list).
2035 *
2036 * The kiobuf must already be locked for IO. IO is submitted
2037 * asynchronously: you need to check page->locked, page->uptodate, and
2038 * maybe wait on page->wait.
2039 *
2040 * It is up to the caller to make sure that there are enough blocks
2041 * passed in to completely map the iobufs to disk.
2042 */
2043
2044 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
2045 kdev_t dev, unsigned long b[], int size)
2046 {
2047 int err;
2048 int length;
2049 int transferred;
2050 int i;
2051 int bufind;
2052 int pageind;
2053 int bhind;
2054 int offset;
2055 unsigned long blocknr;
2056 struct kiobuf * iobuf = NULL;
2057 struct page * map;
2058 struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];
2059
2060 if (!nr)
2061 return 0;
2062
2063 /*
2064 * First, do some alignment and validity checks
2065 */
2066 for (i = 0; i < nr; i++) {
2067 iobuf = iovec[i];
2068 if ((iobuf->offset & (size-1)) ||
2069 (iobuf->length & (size-1)))
2070 return -EINVAL;
2071 if (!iobuf->nr_pages)
2072 panic("brw_kiovec: iobuf not initialised");
2073 }
2074
2075 /*
2076 * OK to walk down the iovec doing page IO on each page we find.
2077 */
2078 bufind = bhind = transferred = err = 0;
2079 for (i = 0; i < nr; i++) {
2080 iobuf = iovec[i];
2081 offset = iobuf->offset;
2082 length = iobuf->length;
2083 iobuf->errno = 0;
2084
2085 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
2086 map = iobuf->maplist[pageind];
2087 if (!map) {
2088 err = -EFAULT;
2089 goto error;
2090 }
2091
2092 while (length > 0) {
2093 blocknr = b[bufind++];
2094 tmp = get_unused_buffer_head(0);
2095 if (!tmp) {
2096 err = -ENOMEM;
2097 goto error;
2098 }
2099
2100 tmp->b_dev = B_FREE;
2101 tmp->b_size = size;
2102 set_bh_page(tmp, map, offset);
2103 tmp->b_this_page = tmp;
2104
2105 init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
2106 tmp->b_dev = dev;
2107 tmp->b_blocknr = blocknr;
2108 tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
2109
2110 if (rw == WRITE) {
2111 set_bit(BH_Uptodate, &tmp->b_state);
2112 clear_bit(BH_Dirty, &tmp->b_state);
2113 }
2114
2115 bh[bhind++] = tmp;
2116 length -= size;
2117 offset += size;
2118
2119 atomic_inc(&iobuf->io_count);
2120
2121 submit_bh(rw, tmp);
2122 /*
2123 * Wait for IO if we have got too much
2124 */
2125 if (bhind >= KIO_MAX_SECTORS) {
2126 err = wait_kio(rw, bhind, bh, size);
2127 if (err >= 0)
2128 transferred += err;
2129 else
2130 goto finished;
2131 bhind = 0;
2132 }
2133
2134 if (offset >= PAGE_SIZE) {
2135 offset = 0;
2136 break;
2137 }
2138 } /* End of block loop */
2139 } /* End of page loop */
2140 } /* End of iovec loop */
2141
2142 /* Is there any IO still left to submit? */
2143 if (bhind) {
2144 err = wait_kio(rw, bhind, bh, size);
2145 if (err >= 0)
2146 transferred += err;
2147 else
2148 goto finished;
2149 }
2150
2151 finished:
2152 if (transferred)
2153 return transferred;
2154 return err;
2155
2156 error:
2157 /* We got an error allocating the bh'es. Just free the current
2158 buffer_heads and exit. */
2159 spin_lock(&unused_list_lock);
2160 for (i = bhind; --i >= 0; ) {
2161 __put_unused_buffer_head(bh[i]);
2162 }
2163 spin_unlock(&unused_list_lock);
2164 goto finished;
2165 }
2166
/*
 * Start I/O on a page.
 * This function expects the page to be locked and may return
 * before I/O is complete. You then have to check page->locked,
 * page->uptodate, and maybe wait on page->wait.
 *
 * brw_page() is SMP-safe, although it's being called with the
 * kernel lock held - but the code is ready.
 *
 * FIXME: we need a swapper_inode->get_block function to remove
 * some of the bmap kludges and interface ugliness here.
 *
 * b[] supplies one block number per buffer on the page, in ring order
 * starting from page->buffers. Buffers of 'size' bytes are created
 * first if the page has none. Always returns 0.
 */
int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
{
	struct buffer_head *head, *bh;

	if (!PageLocked(page))
		panic("brw_page: page not locked for I/O");

	if (!page->buffers)
		create_empty_buffers(page, dev, size);
	head = bh = page->buffers;

	/* Stage 1: lock all the buffers */
	do {
		lock_buffer(bh);
		/* Map the buffer to the next caller-supplied block. */
		bh->b_blocknr = *(b++);
		set_bit(BH_Mapped, &bh->b_state);
		bh->b_end_io = end_buffer_io_async;
		atomic_inc(&bh->b_count);
		bh = bh->b_this_page;
	} while (bh != head);

	/* Stage 2: start the IO */
	do {
		submit_bh(rw, bh);
		bh = bh->b_this_page;
	} while (bh != head);
	return 0;
}
2207
2208 int block_symlink(struct inode *inode, const char *symname, int len)
2209 {
2210 struct address_space *mapping = inode->i_mapping;
2211 struct page *page = grab_cache_page(mapping, 0);
2212 int err = -ENOMEM;
2213 char *kaddr;
2214
2215 if (!page)
2216 goto fail;
2217 err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2218 if (err)
2219 goto fail_map;
2220 kaddr = page_address(page);
2221 memcpy(kaddr, symname, len-1);
2222 mapping->a_ops->commit_write(NULL, page, 0, len-1);
2223 /*
2224 * Notice that we are _not_ going to block here - end of page is
2225 * unmapped, so this will only try to map the rest of page, see
2226 * that it is unmapped (typically even will not look into inode -
2227 * ->i_size will be enough for everything) and zero it out.
2228 * OTOH it's obviously correct and should make the page up-to-date.
2229 */
2230 err = mapping->a_ops->readpage(NULL, page);
2231 wait_on_page(page);
2232 page_cache_release(page);
2233 if (err < 0)
2234 goto fail;
2235 mark_inode_dirty(inode);
2236 return 0;
2237 fail_map:
2238 UnlockPage(page);
2239 page_cache_release(page);
2240 fail:
2241 return err;
2242 }
2243
/*
 * Try to increase the number of buffers available: the size argument
 * is used to determine what kind of buffers we want.
 *
 * 'size' must be a multiple of 512 and no larger than PAGE_SIZE. One
 * page is allocated, split into buffers of that size, and the buffers
 * are added to the free list for that size.
 *
 * Returns 1 on success and 0 on failure (bad size, no page, or no
 * buffer heads).
 */
static int grow_buffers(int size)
{
	struct page * page;
	struct buffer_head *bh, *tmp;
	struct buffer_head * insert_point;
	int isize;

	if ((size & 511) || (size > PAGE_SIZE)) {
		printk("VFS: grow_buffers: size = %d\n",size);
		return 0;
	}

	page = alloc_page(GFP_BUFFER);
	if (!page)
		goto out;
	LockPage(page);
	/* Non-async request: create_buffers() may return NULL. */
	bh = create_buffers(page, size, 0);
	if (!bh)
		goto no_buffer_head;

	isize = BUFSIZE_INDEX(size);

	spin_lock(&free_list[isize].lock);
	insert_point = free_list[isize].list;
	tmp = bh;
	/*
	 * Link each new buffer into the circular doubly-linked free
	 * list, right after the previously inserted one. With an empty
	 * list the first buffer forms a ring of one.
	 */
	while (1) {
		if (insert_point) {
			tmp->b_next_free = insert_point->b_next_free;
			tmp->b_prev_free = insert_point;
			insert_point->b_next_free->b_prev_free = tmp;
			insert_point->b_next_free = tmp;
		} else {
			tmp->b_prev_free = tmp;
			tmp->b_next_free = tmp;
		}
		insert_point = tmp;
		if (tmp->b_this_page)
			tmp = tmp->b_this_page;
		else
			break;
	}
	/* Close the per-page b_this_page chain into a ring. */
	tmp->b_this_page = bh;
	free_list[isize].list = bh;
	spin_unlock(&free_list[isize].lock);

	page->buffers = bh;
	page->flags &= ~(1 << PG_referenced);
	lru_cache_add(page);
	UnlockPage(page);
	atomic_inc(&buffermem_pages);
	return 1;

no_buffer_head:
	UnlockPage(page);
	page_cache_release(page);
out:
	return 0;
}
2306
2307 /*
2308 * Sync all the buffers on one page..
2309 *
2310 * If we have old buffers that are locked, we'll
2311 * wait on them, but we won't wait on the new ones
2312 * we're writing out now.
2313 *
2314 * This all is required so that we can free up memory
2315 * later.
2316 *
2317 * Wait:
2318 * 0 - no wait (this does not get called - see try_to_free_buffers below)
2319 * 1 - start IO for dirty buffers
2320 * 2 - wait for completion of locked buffers
2321 */
2322 static void sync_page_buffers(struct buffer_head *bh, int wait)
2323 {
2324 struct buffer_head * tmp = bh;
2325
2326 do {
2327 struct buffer_head *p = tmp;
2328 tmp = tmp->b_this_page;
2329 if (buffer_locked(p)) {
2330 if (wait > 1)
2331 __wait_on_buffer(p);
2332 } else if (buffer_dirty(p))
2333 ll_rw_block(WRITE, 1, &p);
2334 } while (tmp != bh);
2335 }
2336
/*
 * Can the buffer be thrown out?
 *
 * buffer_busy(bh) evaluates to non-zero when the buffer's b_count is
 * non-zero or any of the dirty, lock or protected state bits is set.
 * Note that the macro argument is evaluated twice.
 */
#define BUFFER_BUSY_BITS \
	((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
#define buffer_busy(bh) \
	(atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2342
2343 /*
2344 * try_to_free_buffers() checks if all the buffers on this particular page
2345 * are unused, and free's the page if so.
2346 *
2347 * Wake up bdflush() if this fails - if we're running low on memory due
2348 * to dirty buffers, we need to flush them out as quickly as possible.
2349 *
2350 * NOTE: There are quite a number of ways that threads of control can
2351 * obtain a reference to a buffer head within a page. So we must
2352 * lock out all of these paths to cleanly toss the page.
2353 */
2354 int try_to_free_buffers(struct page * page, int wait)
2355 {
2356 struct buffer_head * tmp, * bh = page->buffers;
2357 int index = BUFSIZE_INDEX(bh->b_size);
2358 int loop = 0;
2359
2360 cleaned_buffers_try_again:
2361 spin_lock(&lru_list_lock);
2362 write_lock(&hash_table_lock);
2363 spin_lock(&free_list[index].lock);
2364 tmp = bh;
2365 do {
2366 struct buffer_head *p = tmp;
2367
2368 tmp = tmp->b_this_page;
2369 if (buffer_busy(p))
2370 goto busy_buffer_page;
2371 } while (tmp != bh);
2372
2373 spin_lock(&unused_list_lock);
2374 tmp = bh;
2375 do {
2376 struct buffer_head * p = tmp;
2377 tmp = tmp->b_this_page;
2378
2379 /* The buffer can be either on the regular
2380 * queues or on the free list..
2381 */
2382 if (p->b_dev != B_FREE) {
2383 remove_inode_queue(p);
2384 __remove_from_queues(p);
2385 } else
2386 __remove_from_free_list(p, index);
2387 __put_unused_buffer_head(p);
2388 } while (tmp != bh);
2389 spin_unlock(&unused_list_lock);
2390
2391 /* Wake up anyone waiting for buffer heads */
2392 wake_up(&buffer_wait);
2393
2394 /* And free the page */
2395 page->buffers = NULL;
2396 page_cache_release(page);
2397 spin_unlock(&free_list[index].lock);
2398 write_unlock(&hash_table_lock);
2399 spin_unlock(&lru_list_lock);
2400 return 1;
2401
2402 busy_buffer_page:
2403 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2404 spin_unlock(&free_list[index].lock);
2405 write_unlock(&hash_table_lock);
2406 spin_unlock(&lru_list_lock);
2407 if (wait) {
2408 sync_page_buffers(bh, wait);
2409 /* We waited synchronously, so we can free the buffers. */
2410 if (wait > 1 && !loop) {
2411 loop = 1;
2412 goto cleaned_buffers_try_again;
2413 }
2414 }
2415 return 0;
2416 }
2417
2418 /* ================== Debugging =================== */
2419
2420 void show_buffers(void)
2421 {
2422 #ifdef CONFIG_SMP
2423 struct buffer_head * bh;
2424 int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2425 int protected = 0;
2426 int nlist;
2427 static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
2428 #endif
2429
2430 printk("Buffer memory: %6dkB\n",
2431 atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2432
2433 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2434 if (!spin_trylock(&lru_list_lock))
2435 return;
2436 for(nlist = 0; nlist < NR_LIST; nlist++) {
2437 found = locked = dirty = used = lastused = protected = 0;
2438 bh = lru_list[nlist];
2439 if(!bh) continue;
2440
2441 do {
2442 found++;
2443 if (buffer_locked(bh))
2444 locked++;
2445 if (buffer_protected(bh))
2446 protected++;
2447 if (buffer_dirty(bh))
2448 dirty++;
2449 if (atomic_read(&bh->b_count))
2450 used++, lastused = found;
2451 bh = bh->b_next_free;
2452 } while (bh != lru_list[nlist]);
2453 {
2454 int tmp = nr_buffers_type[nlist];
2455 if (found != tmp)
2456 printk("%9s: BUG -> found %d, reported %d\n",
2457 buf_types[nlist], found, tmp);
2458 }
2459 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2460 "%d locked, %d protected, %d dirty\n",
2461 buf_types[nlist], found, size_buffers_type[nlist]>>10,
2462 used, lastused, locked, protected, dirty);
2463 }
2464 spin_unlock(&lru_list_lock);
2465 #endif
2466 }
2467
2468 /* ===================== Init ======================= */
2469
2470 /*
2471 * allocate the hash table and init the free list
2472 * Use gfp() for the hash table to decrease TLB misses, use
2473 * SLAB cache for buffer heads.
2474 */
2475 void __init buffer_init(unsigned long mempages)
2476 {
2477 int order, i;
2478 unsigned int nr_hash;
2479
2480 /* The buffer cache hash table is less important these days,
2481 * trim it a bit.
2482 */
2483 mempages >>= 14;
2484
2485 mempages *= sizeof(struct buffer_head *);
2486
2487 for (order = 0; (1 << order) < mempages; order++)
2488 ;
2489
2490 /* try to allocate something until we get it or we're asking
2491 for something that is really too small */
2492
2493 do {
2494 unsigned long tmp;
2495
2496 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2497 bh_hash_mask = (nr_hash - 1);
2498
2499 tmp = nr_hash;
2500 bh_hash_shift = 0;
2501 while((tmp >>= 1UL) != 0UL)
2502 bh_hash_shift++;
2503
2504 hash_table = (struct buffer_head **)
2505 __get_free_pages(GFP_ATOMIC, order);
2506 } while (hash_table == NULL && --order > 0);
2507 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2508 nr_hash, order, (PAGE_SIZE << order));
2509
2510 if (!hash_table)
2511 panic("Failed to allocate buffer hash table\n");
2512
2513 /* Setup hash chains. */
2514 for(i = 0; i < nr_hash; i++)
2515 hash_table[i] = NULL;
2516
2517 /* Setup free lists. */
2518 for(i = 0; i < NR_SIZES; i++) {
2519 free_list[i].list = NULL;
2520 free_list[i].lock = SPIN_LOCK_UNLOCKED;
2521 }
2522
2523 /* Setup lru lists. */
2524 for(i = 0; i < NR_LIST; i++)
2525 lru_list[i] = NULL;
2526
2527 }
2528
2529
2530 /* ====================== bdflush support =================== */
2531
2532 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2533 * response to dirty buffers. Once this process is activated, we write back
2534 * a limited number of buffers to the disks and then go back to sleep again.
2535 */
2536
2537 /* This is the _only_ function that deals with flushing async writes
2538 to disk.
2539 NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
2540 as all dirty buffers lives _only_ in the DIRTY lru list.
2541 As we never browse the LOCKED and CLEAN lru lists they are infact
2542 completly useless. */
2543 static int flush_dirty_buffers(int check_flushtime)
2544 {
2545 struct buffer_head * bh, *next;
2546 int flushed = 0, i;
2547
2548 restart:
2549 spin_lock(&lru_list_lock);
2550 bh = lru_list[BUF_DIRTY];
2551 if (!bh)
2552 goto out_unlock;
2553 for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) {
2554 next = bh->b_next_free;
2555
2556 if (!buffer_dirty(bh)) {
2557 __refile_buffer(bh);
2558 continue;
2559 }
2560 if (buffer_locked(bh))
2561 continue;
2562
2563 if (check_flushtime) {
2564 /* The dirty lru list is chronologically ordered so
2565 if the current bh is not yet timed out,
2566 then also all the following bhs
2567 will be too young. */
2568 if (time_before(jiffies, bh->b_flushtime))
2569 goto out_unlock;
2570 } else {
2571 if (++flushed > bdf_prm.b_un.ndirty)
2572 goto out_unlock;
2573 }
2574
2575 /* OK, now we are committed to write it out. */
2576 atomic_inc(&bh->b_count);
2577 spin_unlock(&lru_list_lock);
2578 ll_rw_block(WRITE, 1, &bh);
2579 atomic_dec(&bh->b_count);
2580
2581 if (current->need_resched)
2582 schedule();
2583 goto restart;
2584 }
2585 out_unlock:
2586 spin_unlock(&lru_list_lock);
2587
2588 return flushed;
2589 }
2590
2591 struct task_struct *bdflush_tsk = 0;
2592
2593 void wakeup_bdflush(int block)
2594 {
2595 if (current != bdflush_tsk) {
2596 wake_up_process(bdflush_tsk);
2597
2598 if (block)
2599 flush_dirty_buffers(0);
2600 }
2601 }
2602
2603 /*
2604 * Here we attempt to write back old buffers. We also try to flush inodes
2605 * and supers as well, since this function is essentially "update", and
2606 * otherwise there would be no way of ensuring that these quantities ever
2607 * get written back. Ideally, we would have a timestamp on the inodes
2608 * and superblocks so that we could write back only the old ones as well
2609 */
2610
2611 static int sync_old_buffers(void)
2612 {
2613 lock_kernel();
2614 sync_supers(0);
2615 sync_inodes(0);
2616 unlock_kernel();
2617
2618 flush_dirty_buffers(1);
2619 /* must really sync all the active I/O request to disk here */
2620 run_task_queue(&tq_disk);
2621 return 0;
2622 }
2623
2624 int block_sync_page(struct page *page)
2625 {
2626 run_task_queue(&tq_disk);
2627 return 0;
2628 }
2629
2630 /* This is the interface to bdflush. As we get more sophisticated, we can
2631 * pass tuning parameters to this "process", to adjust how it behaves.
2632 * We would want to verify each parameter, however, to make sure that it
2633 * is reasonable. */
2634
2635 asmlinkage long sys_bdflush(int func, long data)
2636 {
2637 if (!capable(CAP_SYS_ADMIN))
2638 return -EPERM;
2639
2640 if (func == 1) {
2641 /* do_exit directly and let kupdate to do its work alone. */
2642 do_exit(0);
2643 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2644 a syscall that doesn't care about the current mm context. */
2645 int error;
2646 struct mm_struct *user_mm;
2647
2648 /*
2649 * bdflush will spend all of it's time in kernel-space,
2650 * without touching user-space, so we can switch it into
2651 * 'lazy TLB mode' to reduce the cost of context-switches
2652 * to and from bdflush.
2653 */
2654 user_mm = start_lazy_tlb();
2655 error = sync_old_buffers();
2656 end_lazy_tlb(user_mm);
2657 return error;
2658 #endif
2659 }
2660
2661 /* Basically func 1 means read param 1, 2 means write param 1, etc */
2662 if (func >= 2) {
2663 int i = (func-2) >> 1;
2664 if (i >= 0 && i < N_PARAM) {
2665 if ((func & 1) == 0)
2666 return put_user(bdf_prm.data[i], (int*)data);
2667
2668 if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2669 bdf_prm.data[i] = data;
2670 return 0;
2671 }
2672 }
2673 return -EINVAL;
2674 }
2675
2676 /* Having func 0 used to launch the actual bdflush and then never
2677 * return (unless explicitly killed). We return zero here to
2678 * remain semi-compatible with present update(8) programs.
2679 */
2680 return 0;
2681 }
2682
2683 /*
2684 * This is the actual bdflush daemon itself. It used to be started from
2685 * the syscall above, but now we launch it ourselves internally with
2686 * kernel_thread(...) directly after the first thread in init/main.c
2687 */
2688 int bdflush(void *sem)
2689 {
2690 struct task_struct *tsk = current;
2691 int flushed;
2692 /*
2693 * We have a bare-bones task_struct, and really should fill
2694 * in a few more things so "top" and /proc/2/{exe,root,cwd}
2695 * display semi-sane things. Not real crucial though...
2696 */
2697
2698 tsk->session = 1;
2699 tsk->pgrp = 1;
2700 strcpy(tsk->comm, "bdflush");
2701 bdflush_tsk = tsk;
2702
2703 /* avoid getting signals */
2704 spin_lock_irq(&tsk->sigmask_lock);
2705 flush_signals(tsk);
2706 sigfillset(&tsk->blocked);
2707 recalc_sigpending(tsk);
2708 spin_unlock_irq(&tsk->sigmask_lock);
2709
2710 up((struct semaphore *)sem);
2711
2712 for (;;) {
2713 CHECK_EMERGENCY_SYNC
2714
2715 flushed = flush_dirty_buffers(0);
2716 if (free_shortage())
2717 flushed += page_launder(GFP_KERNEL, 0);
2718
2719 /*
2720 * If there are still a lot of dirty buffers around,
2721 * skip the sleep and flush some more. Otherwise, we
2722 * go to sleep waiting a wakeup.
2723 */
2724 set_current_state(TASK_INTERRUPTIBLE);
2725 if (!flushed || balance_dirty_state(NODEV) < 0) {
2726 run_task_queue(&tq_disk);
2727 schedule();
2728 }
2729 /* Remember to mark us as running otherwise
2730 the next schedule will block. */
2731 __set_current_state(TASK_RUNNING);
2732 }
2733 }
2734
2735 /*
2736 * This is the kernel update daemon. It was used to live in userspace
2737 * but since it's need to run safely we want it unkillable by mistake.
2738 * You don't need to change your userspace configuration since
2739 * the userspace `update` will do_exit(0) at the first sys_bdflush().
2740 */
2741 int kupdate(void *sem)
2742 {
2743 struct task_struct * tsk = current;
2744 int interval;
2745
2746 tsk->session = 1;
2747 tsk->pgrp = 1;
2748 strcpy(tsk->comm, "kupdate");
2749
2750 /* sigstop and sigcont will stop and wakeup kupdate */
2751 spin_lock_irq(&tsk->sigmask_lock);
2752 sigfillset(&tsk->blocked);
2753 siginitsetinv(¤t->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2754 recalc_sigpending(tsk);
2755 spin_unlock_irq(&tsk->sigmask_lock);
2756
2757 up((struct semaphore *)sem);
2758
2759 for (;;) {
2760 /* update interval */
2761 interval = bdf_prm.b_un.interval;
2762 if (interval) {
2763 tsk->state = TASK_INTERRUPTIBLE;
2764 schedule_timeout(interval);
2765 } else {
2766 stop_kupdate:
2767 tsk->state = TASK_STOPPED;
2768 schedule(); /* wait for SIGCONT */
2769 }
2770 /* check for sigstop */
2771 if (signal_pending(tsk)) {
2772 int stopped = 0;
2773 spin_lock_irq(&tsk->sigmask_lock);
2774 if (sigismember(&tsk->pending.signal, SIGSTOP)) {
2775 sigdelset(&tsk->pending.signal, SIGSTOP);
2776 stopped = 1;
2777 }
2778 recalc_sigpending(tsk);
2779 spin_unlock_irq(&tsk->sigmask_lock);
2780 if (stopped)
2781 goto stop_kupdate;
2782 }
2783 #ifdef DEBUG
2784 printk("kupdate() activated...\n");
2785 #endif
2786 sync_old_buffers();
2787 }
2788 }
2789
2790 static int __init bdflush_init(void)
2791 {
2792 DECLARE_MUTEX_LOCKED(sem);
2793 kernel_thread(bdflush, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2794 down(&sem);
2795 kernel_thread(kupdate, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2796 down(&sem);
2797 return 0;
2798 }
2799
2800 module_init(bdflush_init)
2801
2802
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more
information.