1 /*
2 md.c : Multiple Devices driver for Linux
3 Copyright (C) 1998, 1999, 2000 Ingo Molnar
4
5 completely rewritten, based on the MD driver code from Marc Zyngier
6
7 Changes:
8
9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
11 - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
12 - kmod support by: Cyrus Durgin
13 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
14 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
15
16 - lots of fixes and improvements to the RAID1/RAID5 and generic
17 RAID code (such as request based resynchronization):
18
19 Neil Brown <neilb@cse.unsw.edu.au>.
20
21 This program is free software; you can redistribute it and/or modify
22 it under the terms of the GNU General Public License as published by
23 the Free Software Foundation; either version 2, or (at your option)
24 any later version.
25
26 You should have received a copy of the GNU General Public License
27 (for example /usr/src/linux/COPYING); if not, write to the Free
28 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
29 */
30
31 #include <linux/module.h>
32 #include <linux/config.h>
33 #include <linux/raid/md.h>
34 #include <linux/sysctl.h>
35 #include <linux/raid/xor.h>
36 #include <linux/devfs_fs_kernel.h>
37
38 #include <linux/init.h>
39
40 #ifdef CONFIG_KMOD
41 #include <linux/kmod.h>
42 #endif
43
44 #define __KERNEL_SYSCALLS__
45 #include <linux/unistd.h>
46
47 #include <asm/unaligned.h>
48
49 extern asmlinkage int sys_sched_yield(void);
50 extern asmlinkage long sys_setsid(void);
51
52 #define MAJOR_NR MD_MAJOR
53 #define MD_DRIVER
54
55 #include <linux/blk.h>
56
57 #define DEBUG 0
58 #if DEBUG
59 # define dprintk(x...) printk(x)
60 #else
61 # define dprintk(x...) do { } while(0)
62 #endif
63
64 static mdk_personality_t *pers[MAX_PERSONALITY];
65
66 /*
67 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
68 * is 100 KB/sec, so the extra system load does not show up that much.
69 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
71 * subsystem is idle. There is also an 'absolute maximum' reconstruction
72 * speed limit - in case reconstruction slows down your system despite
73 * idle IO detection.
74 *
75 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
76 */
77
78 static int sysctl_speed_limit_min = 100;
79 static int sysctl_speed_limit_max = 100000;
80
81 static struct ctl_table_header *raid_table_header;
82
83 static ctl_table raid_table[] = {
84 {DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min",
85 &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
86 {DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max",
87 &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
88 {0}
89 };
90
91 static ctl_table raid_dir_table[] = {
92 {DEV_RAID, "raid", NULL, 0, 0555, raid_table},
93 {0}
94 };
95
96 static ctl_table raid_root_table[] = {
97 {CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table},
98 {0}
99 };
100
101 /*
102 * these have to be allocated separately because external
103 * subsystems want to have a pre-defined structure
104 */
105 struct hd_struct md_hd_struct[MAX_MD_DEVS];
106 static int md_blocksizes[MAX_MD_DEVS];
107 static int md_hardsect_sizes[MAX_MD_DEVS];
108 static int md_maxreadahead[MAX_MD_DEVS];
109 static mdk_thread_t *md_recovery_thread;
110
111 int md_size[MAX_MD_DEVS];
112
113 extern struct block_device_operations md_fops;
114 static devfs_handle_t devfs_handle;
115
116 static struct gendisk md_gendisk=
117 {
118 major: MD_MAJOR,
119 major_name: "md",
120 minor_shift: 0,
121 max_p: 1,
122 part: md_hd_struct,
123 sizes: md_size,
124 nr_real: MAX_MD_DEVS,
125 real_devices: NULL,
126 next: NULL,
127 fops: &md_fops,
128 };
129
130 /*
131 * Enables to iterate over all existing md arrays
132 */
133 static MD_LIST_HEAD(all_mddevs);
134
135 /*
136 * The mapping between kdev and mddev is not necessary a simple
137 * one! Eg. HSM uses several sub-devices to implement Logical
138 * Volumes. All these sub-devices map to the same mddev.
139 */
140 dev_mapping_t mddev_map[MAX_MD_DEVS];
141
142 void add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data)
143 {
144 unsigned int minor = MINOR(dev);
145
146 if (MAJOR(dev) != MD_MAJOR) {
147 MD_BUG();
148 return;
149 }
150 if (mddev_map[minor].mddev != NULL) {
151 MD_BUG();
152 return;
153 }
154 mddev_map[minor].mddev = mddev;
155 mddev_map[minor].data = data;
156 }
157
158 void del_mddev_mapping (mddev_t * mddev, kdev_t dev)
159 {
160 unsigned int minor = MINOR(dev);
161
162 if (MAJOR(dev) != MD_MAJOR) {
163 MD_BUG();
164 return;
165 }
166 if (mddev_map[minor].mddev != mddev) {
167 MD_BUG();
168 return;
169 }
170 mddev_map[minor].mddev = NULL;
171 mddev_map[minor].data = NULL;
172 }
173
174 static int md_make_request (request_queue_t *q, int rw, struct buffer_head * bh)
175 {
176 mddev_t *mddev = kdev_to_mddev(bh->b_rdev);
177
178 if (mddev && mddev->pers)
179 return mddev->pers->make_request(mddev, rw, bh);
180 else {
181 buffer_IO_error(bh);
182 return 0;
183 }
184 }
185
186 static mddev_t * alloc_mddev (kdev_t dev)
187 {
188 mddev_t *mddev;
189
190 if (MAJOR(dev) != MD_MAJOR) {
191 MD_BUG();
192 return 0;
193 }
194 mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
195 if (!mddev)
196 return NULL;
197
198 memset(mddev, 0, sizeof(*mddev));
199
200 mddev->__minor = MINOR(dev);
201 init_MUTEX(&mddev->reconfig_sem);
202 init_MUTEX(&mddev->recovery_sem);
203 init_MUTEX(&mddev->resync_sem);
204 MD_INIT_LIST_HEAD(&mddev->disks);
205 MD_INIT_LIST_HEAD(&mddev->all_mddevs);
206 atomic_set(&mddev->active, 0);
207
208 /*
209 * The 'base' mddev is the one with data NULL.
210 * personalities can create additional mddevs
211 * if necessary.
212 */
213 add_mddev_mapping(mddev, dev, 0);
214 md_list_add(&mddev->all_mddevs, &all_mddevs);
215
216 MOD_INC_USE_COUNT;
217
218 return mddev;
219 }
220
221 struct gendisk * find_gendisk (kdev_t dev)
222 {
223 struct gendisk *tmp = gendisk_head;
224
225 while (tmp != NULL) {
226 if (tmp->major == MAJOR(dev))
227 return (tmp);
228 tmp = tmp->next;
229 }
230 return (NULL);
231 }
232
233 mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
234 {
235 mdk_rdev_t * rdev;
236 struct md_list_head *tmp;
237
238 ITERATE_RDEV(mddev,rdev,tmp) {
239 if (rdev->desc_nr == nr)
240 return rdev;
241 }
242 return NULL;
243 }
244
245 mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
246 {
247 struct md_list_head *tmp;
248 mdk_rdev_t *rdev;
249
250 ITERATE_RDEV(mddev,rdev,tmp) {
251 if (rdev->dev == dev)
252 return rdev;
253 }
254 return NULL;
255 }
256
257 static MD_LIST_HEAD(device_names);
258
259 char * partition_name (kdev_t dev)
260 {
261 struct gendisk *hd;
262 static char nomem [] = "<nomem>";
263 dev_name_t *dname;
264 struct md_list_head *tmp = device_names.next;
265
266 while (tmp != &device_names) {
267 dname = md_list_entry(tmp, dev_name_t, list);
268 if (dname->dev == dev)
269 return dname->name;
270 tmp = tmp->next;
271 }
272
273 dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
274
275 if (!dname)
276 return nomem;
277 /*
278 * ok, add this new device name to the list
279 */
280 hd = find_gendisk (dev);
281 dname->name = NULL;
282 if (hd)
283 dname->name = disk_name (hd, MINOR(dev), dname->namebuf);
284 if (!dname->name) {
285 sprintf (dname->namebuf, "[dev %s]", kdevname(dev));
286 dname->name = dname->namebuf;
287 }
288
289 dname->dev = dev;
290 MD_INIT_LIST_HEAD(&dname->list);
291 md_list_add(&dname->list, &device_names);
292
293 return dname->name;
294 }
295
296 static unsigned int calc_dev_sboffset (kdev_t dev, mddev_t *mddev,
297 int persistent)
298 {
299 unsigned int size = 0;
300
301 if (blk_size[MAJOR(dev)])
302 size = blk_size[MAJOR(dev)][MINOR(dev)];
303 if (persistent)
304 size = MD_NEW_SIZE_BLOCKS(size);
305 return size;
306 }
307
308 static unsigned int calc_dev_size (kdev_t dev, mddev_t *mddev, int persistent)
309 {
310 unsigned int size;
311
312 size = calc_dev_sboffset(dev, mddev, persistent);
313 if (!mddev->sb) {
314 MD_BUG();
315 return size;
316 }
317 if (mddev->sb->chunk_size)
318 size &= ~(mddev->sb->chunk_size/1024 - 1);
319 return size;
320 }
321
322 static unsigned int zoned_raid_size (mddev_t *mddev)
323 {
324 unsigned int mask;
325 mdk_rdev_t * rdev;
326 struct md_list_head *tmp;
327
328 if (!mddev->sb) {
329 MD_BUG();
330 return -EINVAL;
331 }
332 /*
333 * do size and offset calculations.
334 */
335 mask = ~(mddev->sb->chunk_size/1024 - 1);
336
337 ITERATE_RDEV(mddev,rdev,tmp) {
338 rdev->size &= mask;
339 md_size[mdidx(mddev)] += rdev->size;
340 }
341 return 0;
342 }
343
344 /*
345 * We check wether all devices are numbered from 0 to nb_dev-1. The
346 * order is guaranteed even after device name changes.
347 *
348 * Some personalities (raid0, linear) use this. Personalities that
349 * provide data have to be able to deal with loss of individual
350 * disks, so they do their checking themselves.
351 */
352 int md_check_ordering (mddev_t *mddev)
353 {
354 int i, c;
355 mdk_rdev_t *rdev;
356 struct md_list_head *tmp;
357
358 /*
359 * First, all devices must be fully functional
360 */
361 ITERATE_RDEV(mddev,rdev,tmp) {
362 if (rdev->faulty) {
363 printk("md: md%d's device %s faulty, aborting.\n",
364 mdidx(mddev), partition_name(rdev->dev));
365 goto abort;
366 }
367 }
368
369 c = 0;
370 ITERATE_RDEV(mddev,rdev,tmp) {
371 c++;
372 }
373 if (c != mddev->nb_dev) {
374 MD_BUG();
375 goto abort;
376 }
377 if (mddev->nb_dev != mddev->sb->raid_disks) {
378 printk("md: md%d, array needs %d disks, has %d, aborting.\n",
379 mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
380 goto abort;
381 }
382 /*
383 * Now the numbering check
384 */
385 for (i = 0; i < mddev->nb_dev; i++) {
386 c = 0;
387 ITERATE_RDEV(mddev,rdev,tmp) {
388 if (rdev->desc_nr == i)
389 c++;
390 }
391 if (!c) {
392 printk("md: md%d, missing disk #%d, aborting.\n",
393 mdidx(mddev), i);
394 goto abort;
395 }
396 if (c > 1) {
397 printk("md: md%d, too many disks #%d, aborting.\n",
398 mdidx(mddev), i);
399 goto abort;
400 }
401 }
402 return 0;
403 abort:
404 return 1;
405 }
406
407 static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb)
408 {
409 if (disk_active(disk)) {
410 sb->working_disks--;
411 } else {
412 if (disk_spare(disk)) {
413 sb->spare_disks--;
414 sb->working_disks--;
415 } else {
416 sb->failed_disks--;
417 }
418 }
419 sb->nr_disks--;
420 disk->major = 0;
421 disk->minor = 0;
422 mark_disk_removed(disk);
423 }
424
/* printk format strings used by the superblock helpers below. */
#define BAD_MAGIC	KERN_ERR "md: invalid raid superblock magic on %s\n"
#define BAD_MINOR	KERN_ERR "md: %s: invalid raid minor (%x)\n"
#define OUT_OF_MEM	KERN_ALERT "md: out of memory.\n"
#define NO_SB		KERN_ERR "md: disabled device %s, could not read superblock.\n"
#define BAD_CSUM	KERN_WARNING "md: invalid superblock checksum on %s\n"
439
440 static int alloc_array_sb (mddev_t * mddev)
441 {
442 if (mddev->sb) {
443 MD_BUG();
444 return 0;
445 }
446
447 mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
448 if (!mddev->sb)
449 return -ENOMEM;
450 md_clear_page(mddev->sb);
451 return 0;
452 }
453
454 static int alloc_disk_sb (mdk_rdev_t * rdev)
455 {
456 if (rdev->sb)
457 MD_BUG();
458
459 rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL);
460 if (!rdev->sb) {
461 printk (OUT_OF_MEM);
462 return -EINVAL;
463 }
464 md_clear_page(rdev->sb);
465
466 return 0;
467 }
468
469 static void free_disk_sb (mdk_rdev_t * rdev)
470 {
471 if (rdev->sb) {
472 free_page((unsigned long) rdev->sb);
473 rdev->sb = NULL;
474 rdev->sb_offset = 0;
475 rdev->size = 0;
476 } else {
477 if (!rdev->faulty)
478 MD_BUG();
479 }
480 }
481
482 static void mark_rdev_faulty (mdk_rdev_t * rdev)
483 {
484 if (!rdev) {
485 MD_BUG();
486 return;
487 }
488 free_disk_sb(rdev);
489 rdev->faulty = 1;
490 }
491
492 static int read_disk_sb (mdk_rdev_t * rdev)
493 {
494 int ret = -EINVAL;
495 struct buffer_head *bh = NULL;
496 kdev_t dev = rdev->dev;
497 mdp_super_t *sb;
498 unsigned long sb_offset;
499
500 if (!rdev->sb) {
501 MD_BUG();
502 goto abort;
503 }
504
505 /*
506 * Calculate the position of the superblock,
507 * it's at the end of the disk
508 */
509 sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
510 rdev->sb_offset = sb_offset;
511 printk("(read) %s's sb offset: %ld", partition_name(dev), sb_offset);
512 fsync_dev(dev);
513 set_blocksize (dev, MD_SB_BYTES);
514 bh = bread (dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
515
516 if (bh) {
517 sb = (mdp_super_t *) bh->b_data;
518 memcpy (rdev->sb, sb, MD_SB_BYTES);
519 } else {
520 printk (NO_SB,partition_name(rdev->dev));
521 goto abort;
522 }
523 printk(" [events: %08lx]\n", (unsigned long)rdev->sb->events_lo);
524 ret = 0;
525 abort:
526 if (bh)
527 brelse (bh);
528 return ret;
529 }
530
531 static unsigned int calc_sb_csum (mdp_super_t * sb)
532 {
533 unsigned int disk_csum, csum;
534
535 disk_csum = sb->sb_csum;
536 sb->sb_csum = 0;
537 csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
538 sb->sb_csum = disk_csum;
539 return csum;
540 }
541
542 /*
543 * Check one RAID superblock for generic plausibility
544 */
545
546 static int check_disk_sb (mdk_rdev_t * rdev)
547 {
548 mdp_super_t *sb;
549 int ret = -EINVAL;
550
551 sb = rdev->sb;
552 if (!sb) {
553 MD_BUG();
554 goto abort;
555 }
556
557 if (sb->md_magic != MD_SB_MAGIC) {
558 printk (BAD_MAGIC, partition_name(rdev->dev));
559 goto abort;
560 }
561
562 if (sb->md_minor >= MAX_MD_DEVS) {
563 printk (BAD_MINOR, partition_name(rdev->dev),
564 sb->md_minor);
565 goto abort;
566 }
567
568 if (calc_sb_csum(sb) != sb->sb_csum)
569 printk(BAD_CSUM, partition_name(rdev->dev));
570 ret = 0;
571 abort:
572 return ret;
573 }
574
575 static kdev_t dev_unit(kdev_t dev)
576 {
577 unsigned int mask;
578 struct gendisk *hd = find_gendisk(dev);
579
580 if (!hd)
581 return 0;
582 mask = ~((1 << hd->minor_shift) - 1);
583
584 return MKDEV(MAJOR(dev), MINOR(dev) & mask);
585 }
586
587 static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
588 {
589 struct md_list_head *tmp;
590 mdk_rdev_t *rdev;
591
592 ITERATE_RDEV(mddev,rdev,tmp)
593 if (dev_unit(rdev->dev) == dev_unit(dev))
594 return rdev;
595
596 return NULL;
597 }
598
599 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
600 {
601 struct md_list_head *tmp;
602 mdk_rdev_t *rdev;
603
604 ITERATE_RDEV(mddev1,rdev,tmp)
605 if (match_dev_unit(mddev2, rdev->dev))
606 return 1;
607
608 return 0;
609 }
610
611 static MD_LIST_HEAD(all_raid_disks);
612 static MD_LIST_HEAD(pending_raid_disks);
613
614 static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev)
615 {
616 mdk_rdev_t *same_pdev;
617
618 if (rdev->mddev) {
619 MD_BUG();
620 return;
621 }
622 same_pdev = match_dev_unit(mddev, rdev->dev);
623 if (same_pdev)
624 printk( KERN_WARNING
625 "md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
626 " protection against single-disk failure might be compromised.\n",
627 mdidx(mddev), partition_name(rdev->dev),
628 partition_name(same_pdev->dev));
629
630 md_list_add(&rdev->same_set, &mddev->disks);
631 rdev->mddev = mddev;
632 mddev->nb_dev++;
633 printk("bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev);
634 }
635
636 static void unbind_rdev_from_array (mdk_rdev_t * rdev)
637 {
638 if (!rdev->mddev) {
639 MD_BUG();
640 return;
641 }
642 md_list_del(&rdev->same_set);
643 MD_INIT_LIST_HEAD(&rdev->same_set);
644 rdev->mddev->nb_dev--;
645 printk("unbind<%s,%d>\n", partition_name(rdev->dev),
646 rdev->mddev->nb_dev);
647 rdev->mddev = NULL;
648 }
649
650 /*
651 * prevent the device from being mounted, repartitioned or
652 * otherwise reused by a RAID array (or any other kernel
653 * subsystem), by opening the device. [simply getting an
654 * inode is not enough, the SCSI module usage code needs
655 * an explicit open() on the device]
656 */
657 static int lock_rdev (mdk_rdev_t *rdev)
658 {
659 int err = 0;
660 struct block_device *bdev;
661
662 bdev = bdget(rdev->dev);
663 if (bdev == NULL)
664 return -ENOMEM;
665 err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_FILE);
666 if (!err) {
667 rdev->bdev = bdev;
668 }
669 return err;
670 }
671
672 static void unlock_rdev (mdk_rdev_t *rdev)
673 {
674 if (!rdev->bdev)
675 MD_BUG();
676 blkdev_put(rdev->bdev, BDEV_FILE);
677 bdput(rdev->bdev);
678 rdev->bdev = NULL;
679 }
680
681 static void export_rdev (mdk_rdev_t * rdev)
682 {
683 printk("export_rdev(%s)\n",partition_name(rdev->dev));
684 if (rdev->mddev)
685 MD_BUG();
686 unlock_rdev(rdev);
687 free_disk_sb(rdev);
688 md_list_del(&rdev->all);
689 MD_INIT_LIST_HEAD(&rdev->all);
690 if (rdev->pending.next != &rdev->pending) {
691 printk("(%s was pending)\n",partition_name(rdev->dev));
692 md_list_del(&rdev->pending);
693 MD_INIT_LIST_HEAD(&rdev->pending);
694 }
695 rdev->dev = 0;
696 rdev->faulty = 0;
697 kfree(rdev);
698 }
699
700 static void kick_rdev_from_array (mdk_rdev_t * rdev)
701 {
702 unbind_rdev_from_array(rdev);
703 export_rdev(rdev);
704 }
705
706 static void export_array (mddev_t *mddev)
707 {
708 struct md_list_head *tmp;
709 mdk_rdev_t *rdev;
710 mdp_super_t *sb = mddev->sb;
711
712 if (mddev->sb) {
713 mddev->sb = NULL;
714 free_page((unsigned long) sb);
715 }
716
717 ITERATE_RDEV(mddev,rdev,tmp) {
718 if (!rdev->mddev) {
719 MD_BUG();
720 continue;
721 }
722 kick_rdev_from_array(rdev);
723 }
724 if (mddev->nb_dev)
725 MD_BUG();
726 }
727
728 static void free_mddev (mddev_t *mddev)
729 {
730 if (!mddev) {
731 MD_BUG();
732 return;
733 }
734
735 export_array(mddev);
736 md_size[mdidx(mddev)] = 0;
737 md_hd_struct[mdidx(mddev)].nr_sects = 0;
738
739 /*
740 * Make sure nobody else is using this mddev
741 * (careful, we rely on the global kernel lock here)
742 */
743 while (md_atomic_read(&mddev->resync_sem.count) != 1)
744 schedule();
745 while (md_atomic_read(&mddev->recovery_sem.count) != 1)
746 schedule();
747
748 del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
749 md_list_del(&mddev->all_mddevs);
750 MD_INIT_LIST_HEAD(&mddev->all_mddevs);
751 kfree(mddev);
752 MOD_DEC_USE_COUNT;
753 }
754
/* The superblock helper messages are not needed past this point. */
#undef BAD_MAGIC
#undef OUT_OF_MEM
#undef NO_SB
#undef BAD_CSUM
759
760 static void print_desc(mdp_disk_t *desc)
761 {
762 printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
763 partition_name(MKDEV(desc->major,desc->minor)),
764 desc->major,desc->minor,desc->raid_disk,desc->state);
765 }
766
767 static void print_sb(mdp_super_t *sb)
768 {
769 int i;
770
771 printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
772 sb->major_version, sb->minor_version, sb->patch_version,
773 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
774 sb->ctime);
775 printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
776 sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
777 sb->layout, sb->chunk_size);
778 printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
779 sb->utime, sb->state, sb->active_disks, sb->working_disks,
780 sb->failed_disks, sb->spare_disks,
781 sb->sb_csum, (unsigned long)sb->events_lo);
782
783 for (i = 0; i < MD_SB_DISKS; i++) {
784 mdp_disk_t *desc;
785
786 desc = sb->disks + i;
787 printk(" D %2d: ", i);
788 print_desc(desc);
789 }
790 printk(" THIS: ");
791 print_desc(&sb->this_disk);
792
793 }
794
795 static void print_rdev(mdk_rdev_t *rdev)
796 {
797 printk(" rdev %s: O:%s, SZ:%08ld F:%d DN:%d ",
798 partition_name(rdev->dev), partition_name(rdev->old_dev),
799 rdev->size, rdev->faulty, rdev->desc_nr);
800 if (rdev->sb) {
801 printk("rdev superblock:\n");
802 print_sb(rdev->sb);
803 } else
804 printk("no rdev superblock!\n");
805 }
806
807 void md_print_devices (void)
808 {
809 struct md_list_head *tmp, *tmp2;
810 mdk_rdev_t *rdev;
811 mddev_t *mddev;
812
813 printk("\n");
814 printk(" **********************************\n");
815 printk(" * <COMPLETE RAID STATE PRINTOUT> *\n");
816 printk(" **********************************\n");
817 ITERATE_MDDEV(mddev,tmp) {
818 printk("md%d: ", mdidx(mddev));
819
820 ITERATE_RDEV(mddev,rdev,tmp2)
821 printk("<%s>", partition_name(rdev->dev));
822
823 if (mddev->sb) {
824 printk(" array superblock:\n");
825 print_sb(mddev->sb);
826 } else
827 printk(" no array superblock.\n");
828
829 ITERATE_RDEV(mddev,rdev,tmp2)
830 print_rdev(rdev);
831 }
832 printk(" **********************************\n");
833 printk("\n");
834 }
835
836 static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2)
837 {
838 int ret;
839 mdp_super_t *tmp1, *tmp2;
840
841 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
842 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
843
844 if (!tmp1 || !tmp2) {
845 ret = 0;
846 goto abort;
847 }
848
849 *tmp1 = *sb1;
850 *tmp2 = *sb2;
851
852 /*
853 * nr_disks is not constant
854 */
855 tmp1->nr_disks = 0;
856 tmp2->nr_disks = 0;
857
858 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
859 ret = 0;
860 else
861 ret = 1;
862
863 abort:
864 if (tmp1)
865 kfree(tmp1);
866 if (tmp2)
867 kfree(tmp2);
868
869 return ret;
870 }
871
872 static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
873 {
874 if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
875 (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
876 (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
877 (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
878
879 return 1;
880
881 return 0;
882 }
883
884 static mdk_rdev_t * find_rdev_all (kdev_t dev)
885 {
886 struct md_list_head *tmp;
887 mdk_rdev_t *rdev;
888
889 tmp = all_raid_disks.next;
890 while (tmp != &all_raid_disks) {
891 rdev = md_list_entry(tmp, mdk_rdev_t, all);
892 if (rdev->dev == dev)
893 return rdev;
894 tmp = tmp->next;
895 }
896 return NULL;
897 }
898
899 #define GETBLK_FAILED KERN_ERR \
900 "md: getblk failed for device %s\n"
901
902 static int write_disk_sb(mdk_rdev_t * rdev)
903 {
904 struct buffer_head *bh;
905 kdev_t dev;
906 unsigned long sb_offset, size;
907 mdp_super_t *sb;
908
909 if (!rdev->sb) {
910 MD_BUG();
911 return -1;
912 }
913 if (rdev->faulty) {
914 MD_BUG();
915 return -1;
916 }
917 if (rdev->sb->md_magic != MD_SB_MAGIC) {
918 MD_BUG();
919 return -1;
920 }
921
922 dev = rdev->dev;
923 sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
924 if (rdev->sb_offset != sb_offset) {
925 printk("%s's sb offset has changed from %ld to %ld, skipping\n", partition_name(dev), rdev->sb_offset, sb_offset);
926 goto skip;
927 }
928 /*
929 * If the disk went offline meanwhile and it's just a spare, then
930 * it's size has changed to zero silently, and the MD code does
931 * not yet know that it's faulty.
932 */
933 size = calc_dev_size(dev, rdev->mddev, 1);
934 if (size != rdev->size) {
935 printk("%s's size has changed from %ld to %ld since import, skipping\n", partition_name(dev), rdev->size, size);
936 goto skip;
937 }
938
939 printk("(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset);
940 fsync_dev(dev);
941 set_blocksize(dev, MD_SB_BYTES);
942 bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
943 if (!bh) {
944 printk(GETBLK_FAILED, partition_name(dev));
945 return 1;
946 }
947 memset(bh->b_data,0,bh->b_size);
948 sb = (mdp_super_t *) bh->b_data;
949 memcpy(sb, rdev->sb, MD_SB_BYTES);
950
951 mark_buffer_uptodate(bh, 1);
952 mark_buffer_dirty(bh);
953 ll_rw_block(WRITE, 1, &bh);
954 wait_on_buffer(bh);
955 brelse(bh);
956 fsync_dev(dev);
957 skip:
958 return 0;
959 }
960 #undef GETBLK_FAILED
961
962 static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
963 {
964 int i, ok = 0;
965 mdp_disk_t *desc;
966
967 for (i = 0; i < MD_SB_DISKS; i++) {
968 desc = mddev->sb->disks + i;
969 #if 0
970 if (disk_faulty(desc)) {
971 if (MKDEV(desc->major,desc->minor) == rdev->dev)
972 ok = 1;
973 continue;
974 }
975 #endif
976 if (MKDEV(desc->major,desc->minor) == rdev->dev) {
977 rdev->sb->this_disk = *desc;
978 rdev->desc_nr = desc->number;
979 ok = 1;
980 break;
981 }
982 }
983
984 if (!ok) {
985 MD_BUG();
986 }
987 }
988
989 static int sync_sbs(mddev_t * mddev)
990 {
991 mdk_rdev_t *rdev;
992 mdp_super_t *sb;
993 struct md_list_head *tmp;
994
995 ITERATE_RDEV(mddev,rdev,tmp) {
996 if (rdev->faulty)
997 continue;
998 sb = rdev->sb;
999 *sb = *mddev->sb;
1000 set_this_disk(mddev, rdev);
1001 sb->sb_csum = calc_sb_csum(sb);
1002 }
1003 return 0;
1004 }
1005
1006 int md_update_sb(mddev_t * mddev)
1007 {
1008 int first, err, count = 100;
1009 struct md_list_head *tmp;
1010 mdk_rdev_t *rdev;
1011
1012 repeat:
1013 mddev->sb->utime = CURRENT_TIME;
1014 if ((++mddev->sb->events_lo)==0)
1015 ++mddev->sb->events_hi;
1016
1017 if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) {
1018 /*
1019 * oops, this 64-bit counter should never wrap.
1020 * Either we are in around ~1 trillion A.C., assuming
1021 * 1 reboot per second, or we have a bug:
1022 */
1023 MD_BUG();
1024 mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff;
1025 }
1026 sync_sbs(mddev);
1027
1028 /*
1029 * do not write anything to disk if using
1030 * nonpersistent superblocks
1031 */
1032 if (mddev->sb->not_persistent)
1033 return 0;
1034
1035 printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
1036 mdidx(mddev));
1037
1038 first = 1;
1039 err = 0;
1040 ITERATE_RDEV(mddev,rdev,tmp) {
1041 if (!first) {
1042 first = 0;
1043 printk(", ");
1044 }
1045 if (rdev->faulty)
1046 printk("(skipping faulty ");
1047 printk("%s ", partition_name(rdev->dev));
1048 if (!rdev->faulty) {
1049 printk("[events: %08lx]",
1050 (unsigned long)rdev->sb->events_lo);
1051 err += write_disk_sb(rdev);
1052 } else
1053 printk(")\n");
1054 }
1055 printk(".\n");
1056 if (err) {
1057 printk("errors occured during superblock update, repeating\n");
1058 if (--count)
1059 goto repeat;
1060 printk("excessive errors occured during superblock update, exiting\n");
1061 }
1062 return 0;
1063 }
1064
1065 /*
1066 * Import a device. If 'on_disk', then sanity check the superblock
1067 *
1068 * mark the device faulty if:
1069 *
1070 * - the device is nonexistent (zero size)
1071 * - the device has no valid superblock
1072 *
1073 * a faulty rdev _never_ has rdev->sb set.
1074 */
1075 static int md_import_device (kdev_t newdev, int on_disk)
1076 {
1077 int err;
1078 mdk_rdev_t *rdev;
1079 unsigned int size;
1080
1081 if (find_rdev_all(newdev))
1082 return -EEXIST;
1083
1084 rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
1085 if (!rdev) {
1086 printk("could not alloc mem for %s!\n", partition_name(newdev));
1087 return -ENOMEM;
1088 }
1089 memset(rdev, 0, sizeof(*rdev));
1090
1091 if (get_super(newdev)) {
1092 printk("md: can not import %s, has active inodes!\n",
1093 partition_name(newdev));
1094 err = -EBUSY;
1095 goto abort_free;
1096 }
1097
1098 if ((err = alloc_disk_sb(rdev)))
1099 goto abort_free;
1100
1101 rdev->dev = newdev;
1102 if (lock_rdev(rdev)) {
1103 printk("md: could not lock %s, zero-size? Marking faulty.\n",
1104 partition_name(newdev));
1105 err = -EINVAL;
1106 goto abort_free;
1107 }
1108 rdev->desc_nr = -1;
1109 rdev->faulty = 0;
1110
1111 size = 0;
1112 if (blk_size[MAJOR(newdev)])
1113 size = blk_size[MAJOR(newdev)][MINOR(newdev)];
1114 if (!size) {
1115 printk("md: %s has zero size, marking faulty!\n",
1116 partition_name(newdev));
1117 err = -EINVAL;
1118 goto abort_free;
1119 }
1120
1121 if (on_disk) {
1122 if ((err = read_disk_sb(rdev))) {
1123 printk("md: could not read %s's sb, not importing!\n",
1124 partition_name(newdev));
1125 goto abort_free;
1126 }
1127 if ((err = check_disk_sb(rdev))) {
1128 printk("md: %s has invalid sb, not importing!\n",
1129 partition_name(newdev));
1130 goto abort_free;
1131 }
1132
1133 rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
1134 rdev->sb->this_disk.minor);
1135 rdev->desc_nr = rdev->sb->this_disk.number;
1136 }
1137 md_list_add(&rdev->all, &all_raid_disks);
1138 MD_INIT_LIST_HEAD(&rdev->pending);
1139
1140 if (rdev->faulty && rdev->sb)
1141 free_disk_sb(rdev);
1142 return 0;
1143
1144 abort_free:
1145 if (rdev->sb) {
1146 if (rdev->bdev)
1147 unlock_rdev(rdev);
1148 free_disk_sb(rdev);
1149 }
1150 kfree(rdev);
1151 return err;
1152 }
1153
1154 /*
1155 * Check a full RAID array for plausibility
1156 */
1157
/* printk format strings for the array-level superblock analysis. */
#define INCONSISTENT	KERN_ERR "md: fatal superblock inconsistency in %s -- removing from array\n"
#define OUT_OF_DATE	KERN_ERR "md: superblock update time inconsistency -- using the most recent one\n"
#define OLD_VERSION	KERN_ALERT "md: md%d: unsupported raid array version %d.%d.%d\n"
#define NOT_CLEAN_IGNORE KERN_ERR "md: md%d: raid array is not clean -- starting background reconstruction\n"
#define UNKNOWN_LEVEL	KERN_ERR "md: md%d: unsupported raid level %d\n"
1172
1173 static int analyze_sbs (mddev_t * mddev)
1174 {
1175 int out_of_date = 0, i;
1176 struct md_list_head *tmp, *tmp2;
1177 mdk_rdev_t *rdev, *rdev2, *freshest;
1178 mdp_super_t *sb;
1179
1180 /*
1181 * Verify the RAID superblock on each real device
1182 */
1183 ITERATE_RDEV(mddev,rdev,tmp) {
1184 if (rdev->faulty) {
1185 MD_BUG();
1186 goto abort;
1187 }
1188 if (!rdev->sb) {
1189 MD_BUG();
1190 goto abort;
1191 }
1192 if (check_disk_sb(rdev))
1193 goto abort;
1194 }
1195
1196 /*
1197 * The superblock constant part has to be the same
1198 * for all disks in the array.
1199 */
1200 sb = NULL;
1201
1202 ITERATE_RDEV(mddev,rdev,tmp) {
1203 if (!sb) {
1204 sb = rdev->sb;
1205 continue;
1206 }
1207 if (!sb_equal(sb, rdev->sb)) {
1208 printk (INCONSISTENT, partition_name(rdev->dev));
1209 kick_rdev_from_array(rdev);
1210 continue;
1211 }
1212 }
1213
1214 /*
1215 * OK, we have all disks and the array is ready to run. Let's
1216 * find the freshest superblock, that one will be the superblock
1217 * that represents the whole array.
1218 */
1219 if (!mddev->sb)
1220 if (alloc_array_sb(mddev))
1221 goto abort;
1222 sb = mddev->sb;
1223 freshest = NULL;
1224
1225 ITERATE_RDEV(mddev,rdev,tmp) {
1226 __u64 ev1, ev2;
1227 /*
1228 * if the checksum is invalid, use the superblock
1229 * only as a last resort. (decrease it's age by
1230 * one event)
1231 */
1232 if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
1233 if (rdev->sb->events_lo || rdev->sb->events_hi)
1234 if ((rdev->sb->events_lo--)==0)
1235 rdev->sb->events_hi--;
1236 }
1237
1238 printk("%s's event counter: %08lx\n", partition_name(rdev->dev),
1239 (unsigned long)rdev->sb->events_lo);
1240 if (!freshest) {
1241 freshest = rdev;
1242 continue;
1243 }
1244 /*
1245 * Find the newest superblock version
1246 */
1247 ev1 = md_event(rdev->sb);
1248 ev2 = md_event(freshest->sb);
1249 if (ev1 != ev2) {
1250 out_of_date = 1;
1251 if (ev1 > ev2)
1252 freshest = rdev;
1253 }
1254 }
1255 if (out_of_date) {
1256 printk(OUT_OF_DATE);
1257 printk("freshest: %s\n", partition_name(freshest->dev));
1258 }
1259 memcpy (sb, freshest->sb, sizeof(*sb));
1260
1261 /*
1262 * at this point we have picked the 'best' superblock
1263 * from all available superblocks.
1264 * now we validate this superblock and kick out possibly
1265 * failed disks.
1266 */
1267 ITERATE_RDEV(mddev,rdev,tmp) {
1268 /*
1269 * Kick all non-fresh devices faulty
1270 */
1271 __u64 ev1, ev2;
1272 ev1 = md_event(rdev->sb);
1273 ev2 = md_event(sb);
1274 ++ev1;
1275 if (ev1 < ev2) {
1276 printk("md: kicking non-fresh %s from array!\n",
1277 partition_name(rdev->dev));
1278 kick_rdev_from_array(rdev);
1279 continue;
1280 }
1281 }
1282
1283 /*
1284 * Fix up changed device names ... but only if this disk has a
1285 * recent update time. Use faulty checksum ones too.
1286 */
1287 ITERATE_RDEV(mddev,rdev,tmp) {
1288 __u64 ev1, ev2, ev3;
1289 if (rdev->faulty) { /* REMOVEME */
1290 MD_BUG();
1291 goto abort;
1292 }
1293 ev1 = md_event(rdev->sb);
1294 ev2 = md_event(sb);
1295 ev3 = ev2;
1296 --ev3;
1297 if ((rdev->dev != rdev->old_dev) &&
1298 ((ev1 == ev2) || (ev1 == ev3))) {
1299 mdp_disk_t *desc;
1300
1301 printk("md: device name has changed from %s to %s since last import!\n", partition_name(rdev->old_dev), partition_name(rdev->dev));
1302 if (rdev->desc_nr == -1) {
1303 MD_BUG();
1304 goto abort;
1305 }
1306 desc = &sb->disks[rdev->desc_nr];
1307 if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
1308 MD_BUG();
1309 goto abort;
1310 }
1311 desc->major = MAJOR(rdev->dev);
1312 desc->minor = MINOR(rdev->dev);
1313 desc = &rdev->sb->this_disk;
1314 desc->major = MAJOR(rdev->dev);
1315 desc->minor = MINOR(rdev->dev);
1316 }
1317 }
1318
1319 /*
1320 * Remove unavailable and faulty devices ...
1321 *
1322 * note that if an array becomes completely unrunnable due to
1323 * missing devices, we do not write the superblock back, so the
1324 * administrator has a chance to fix things up. The removal thus
1325 * only happens if it's nonfatal to the contents of the array.
1326 */
1327 for (i = 0; i < MD_SB_DISKS; i++) {
1328 int found;
1329 mdp_disk_t *desc;
1330 kdev_t dev;
1331
1332 desc = sb->disks + i;
1333 dev = MKDEV(desc->major, desc->minor);
1334
1335 /*
1336 * We kick faulty devices/descriptors immediately.
1337 */
1338 if (disk_faulty(desc)) {
1339 found = 0;
1340 ITERATE_RDEV(mddev,rdev,tmp) {
1341 if (rdev->desc_nr != desc->number)
1342 continue;
1343 printk("md%d: kicking faulty %s!\n",
1344 mdidx(mddev),partition_name(rdev->dev));
1345 kick_rdev_from_array(rdev);
1346 found = 1;
1347 break;
1348 }
1349 if (!found) {
1350 if (dev == MKDEV(0,0))
1351 continue;
1352 printk("md%d: removing former faulty %s!\n",
1353 mdidx(mddev), partition_name(dev));
1354 }
1355 remove_descriptor(desc, sb);
1356 continue;
1357 }
1358
1359 if (dev == MKDEV(0,0))
1360 continue;
1361 /*
1362 * Is this device present in the rdev ring?
1363 */
1364 found = 0;
1365 ITERATE_RDEV(mddev,rdev,tmp) {
1366 if (rdev->desc_nr == desc->number) {
1367 found = 1;
1368 break;
1369 }
1370 }
1371 if (found)
1372 continue;
1373
1374 printk("md%d: former device %s is unavailable, removing from array!\n", mdidx(mddev), partition_name(dev));
1375 remove_descriptor(desc, sb);
1376 }
1377
1378 /*
	 * Double check whether all devices mentioned in the
1380 * superblock are in the rdev ring.
1381 */
1382 for (i = 0; i < MD_SB_DISKS; i++) {
1383 mdp_disk_t *desc;
1384 kdev_t dev;
1385
1386 desc = sb->disks + i;
1387 dev = MKDEV(desc->major, desc->minor);
1388
1389 if (dev == MKDEV(0,0))
1390 continue;
1391
1392 if (disk_faulty(desc)) {
1393 MD_BUG();
1394 goto abort;
1395 }
1396
1397 rdev = find_rdev(mddev, dev);
1398 if (!rdev) {
1399 MD_BUG();
1400 goto abort;
1401 }
1402 }
1403
1404 /*
1405 * Do a final reality check.
1406 */
1407 ITERATE_RDEV(mddev,rdev,tmp) {
1408 if (rdev->desc_nr == -1) {
1409 MD_BUG();
1410 goto abort;
1411 }
1412 /*
1413 * is the desc_nr unique?
1414 */
1415 ITERATE_RDEV(mddev,rdev2,tmp2) {
1416 if ((rdev2 != rdev) &&
1417 (rdev2->desc_nr == rdev->desc_nr)) {
1418 MD_BUG();
1419 goto abort;
1420 }
1421 }
1422 /*
1423 * is the device unique?
1424 */
1425 ITERATE_RDEV(mddev,rdev2,tmp2) {
1426 if ((rdev2 != rdev) &&
1427 (rdev2->dev == rdev->dev)) {
1428 MD_BUG();
1429 goto abort;
1430 }
1431 }
1432 }
1433
1434 /*
1435 * Check if we can support this RAID array
1436 */
1437 if (sb->major_version != MD_MAJOR_VERSION ||
1438 sb->minor_version > MD_MINOR_VERSION) {
1439
1440 printk (OLD_VERSION, mdidx(mddev), sb->major_version,
1441 sb->minor_version, sb->patch_version);
1442 goto abort;
1443 }
1444
1445 if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
1446 (sb->level == 4) || (sb->level == 5)))
1447 printk (NOT_CLEAN_IGNORE, mdidx(mddev));
1448
1449 return 0;
1450 abort:
1451 return 1;
1452 }
1453
1454 #undef INCONSISTENT
1455 #undef OUT_OF_DATE
1456 #undef OLD_VERSION
1457 #undef OLD_LEVEL
1458
1459 static int device_size_calculation (mddev_t * mddev)
1460 {
1461 int data_disks = 0, persistent;
1462 unsigned int readahead;
1463 mdp_super_t *sb = mddev->sb;
1464 struct md_list_head *tmp;
1465 mdk_rdev_t *rdev;
1466
1467 /*
1468 * Do device size calculation. Bail out if too small.
1469 * (we have to do this after having validated chunk_size,
1470 * because device size has to be modulo chunk_size)
1471 */
1472 persistent = !mddev->sb->not_persistent;
1473 ITERATE_RDEV(mddev,rdev,tmp) {
1474 if (rdev->faulty)
1475 continue;
1476 if (rdev->size) {
1477 MD_BUG();
1478 continue;
1479 }
1480 rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
1481 if (rdev->size < sb->chunk_size / 1024) {
1482 printk (KERN_WARNING
1483 "Dev %s smaller than chunk_size: %ldk < %dk\n",
1484 partition_name(rdev->dev),
1485 rdev->size, sb->chunk_size / 1024);
1486 return -EINVAL;
1487 }
1488 }
1489
1490 switch (sb->level) {
1491 case -3:
1492 data_disks = 1;
1493 break;
1494 case -2:
1495 data_disks = 1;
1496 break;
1497 case -1:
1498 zoned_raid_size(mddev);
1499 data_disks = 1;
1500 break;
1501 case 0:
1502 zoned_raid_size(mddev);
1503 data_disks = sb->raid_disks;
1504 break;
1505 case 1:
1506 data_disks = 1;
1507 break;
1508 case 4:
1509 case 5:
1510 data_disks = sb->raid_disks-1;
1511 break;
1512 default:
1513 printk (UNKNOWN_LEVEL, mdidx(mddev), sb->level);
1514 goto abort;
1515 }
1516 if (!md_size[mdidx(mddev)])
1517 md_size[mdidx(mddev)] = sb->size * data_disks;
1518
1519 readahead = MD_READAHEAD;
1520 if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) {
1521 readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
1522 if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
1523 readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
1524 } else {
1525 if (sb->level == -3)
1526 readahead = 0;
1527 }
1528 md_maxreadahead[mdidx(mddev)] = readahead;
1529
1530 printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
1531 mdidx(mddev), readahead*(PAGE_SIZE/1024));
1532
1533 printk(KERN_INFO
1534 "md%d: %d data-disks, max readahead per data-disk: %ldk\n",
1535 mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024));
1536 return 0;
1537 abort:
1538 return 1;
1539 }
1540
1541
1542 #define TOO_BIG_CHUNKSIZE KERN_ERR \
1543 "too big chunk_size: %d > %d\n"
1544
1545 #define TOO_SMALL_CHUNKSIZE KERN_ERR \
1546 "too small chunk_size: %d < %ld\n"
1547
1548 #define BAD_CHUNKSIZE KERN_ERR \
1549 "no chunksize specified, see 'man raidtab'\n"
1550
1551 static int do_md_run (mddev_t * mddev)
1552 {
1553 int pnum, err;
1554 int chunk_size;
1555 struct md_list_head *tmp;
1556 mdk_rdev_t *rdev;
1557
1558
1559 if (!mddev->nb_dev) {
1560 MD_BUG();
1561 return -EINVAL;
1562 }
1563
1564 if (mddev->pers)
1565 return -EBUSY;
1566
1567 /*
1568 * Resize disks to align partitions size on a given
1569 * chunk size.
1570 */
1571 md_size[mdidx(mddev)] = 0;
1572
1573 /*
1574 * Analyze all RAID superblock(s)
1575 */
1576 if (analyze_sbs(mddev)) {
1577 MD_BUG();
1578 return -EINVAL;
1579 }
1580
1581 chunk_size = mddev->sb->chunk_size;
1582 pnum = level_to_pers(mddev->sb->level);
1583
1584 mddev->param.chunk_size = chunk_size;
1585 mddev->param.personality = pnum;
1586
1587 if (chunk_size > MAX_CHUNK_SIZE) {
1588 printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
1589 return -EINVAL;
1590 }
1591 /*
1592 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
1593 */
1594 if ( (1 << ffz(~chunk_size)) != chunk_size) {
1595 MD_BUG();
1596 return -EINVAL;
1597 }
1598 if (chunk_size < PAGE_SIZE) {
1599 printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
1600 return -EINVAL;
1601 }
1602
1603 if (pnum >= MAX_PERSONALITY) {
1604 MD_BUG();
1605 return -EINVAL;
1606 }
1607
1608 if ((pnum != RAID1) && (pnum != LINEAR) && !chunk_size) {
1609 /*
1610 * 'default chunksize' in the old md code used to
1611 * be PAGE_SIZE, baaad.
1612 * we abort here to be on the safe side. We dont
1613 * want to continue the bad practice.
1614 */
1615 printk(BAD_CHUNKSIZE);
1616 return -EINVAL;
1617 }
1618
1619 if (!pers[pnum])
1620 {
1621 #ifdef CONFIG_KMOD
1622 char module_name[80];
1623 sprintf (module_name, "md-personality-%d", pnum);
1624 request_module (module_name);
1625 if (!pers[pnum])
1626 #endif
1627 return -EINVAL;
1628 }
1629
1630 if (device_size_calculation(mddev))
1631 return -EINVAL;
1632
1633 /*
1634 * Drop all container device buffers, from now on
1635 * the only valid external interface is through the md
1636 * device.
1637 * Also find largest hardsector size
1638 */
1639 md_hardsect_sizes[mdidx(mddev)] = 512;
1640 ITERATE_RDEV(mddev,rdev,tmp) {
1641 if (rdev->faulty)
1642 continue;
1643 fsync_dev(rdev->dev);
1644 invalidate_buffers(rdev->dev);
1645 if (get_hardsect_size(rdev->dev)
1646 > md_hardsect_sizes[mdidx(mddev)])
1647 md_hardsect_sizes[mdidx(mddev)] =
1648 get_hardsect_size(rdev->dev);
1649 }
1650 md_blocksizes[mdidx(mddev)] = 1024;
1651 if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
1652 md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
1653 mddev->pers = pers[pnum];
1654
1655 err = mddev->pers->run(mddev);
1656 if (err) {
1657 printk("pers->run() failed ...\n");
1658 mddev->pers = NULL;
1659 return -EINVAL;
1660 }
1661
1662 mddev->sb->state &= ~(1 << MD_SB_CLEAN);
1663 md_update_sb(mddev);
1664
1665 /*
1666 * md_size has units of 1K blocks, which are
1667 * twice as large as sectors.
1668 */
1669 md_hd_struct[mdidx(mddev)].start_sect = 0;
1670 md_hd_struct[mdidx(mddev)].nr_sects = md_size[mdidx(mddev)] << 1;
1671
1672 read_ahead[MD_MAJOR] = 1024;
1673 return (0);
1674 }
1675
1676 #undef TOO_BIG_CHUNKSIZE
1677 #undef BAD_CHUNKSIZE
1678
1679 #define OUT(x) do { err = (x); goto out; } while (0)
1680
1681 static int restart_array (mddev_t *mddev)
1682 {
1683 int err = 0;
1684
1685 /*
1686 * Complain if it has no devices
1687 */
1688 if (!mddev->nb_dev)
1689 OUT(-ENXIO);
1690
1691 if (mddev->pers) {
1692 if (!mddev->ro)
1693 OUT(-EBUSY);
1694
1695 mddev->ro = 0;
1696 set_device_ro(mddev_to_kdev(mddev), 0);
1697
1698 printk (KERN_INFO
1699 "md%d switched to read-write mode.\n", mdidx(mddev));
1700 /*
1701 * Kick recovery or resync if necessary
1702 */
1703 md_recover_arrays();
1704 if (mddev->pers->restart_resync)
1705 mddev->pers->restart_resync(mddev);
1706 } else
1707 err = -EINVAL;
1708
1709 out:
1710 return err;
1711 }
1712
1713 #define STILL_MOUNTED KERN_WARNING \
1714 "md: md%d still mounted.\n"
1715 #define STILL_IN_USE \
1716 "md: md%d still in use.\n"
1717
1718 static int do_md_stop (mddev_t * mddev, int ro)
1719 {
1720 int err = 0, resync_interrupted = 0;
1721 kdev_t dev = mddev_to_kdev(mddev);
1722
1723 if (atomic_read(&mddev->active)>1) {
1724 printk(STILL_IN_USE, mdidx(mddev));
1725 OUT(-EBUSY);
1726 }
1727
1728 /* this shouldn't be needed as above would have fired */
1729 if (!ro && get_super(dev)) {
1730 printk (STILL_MOUNTED, mdidx(mddev));
1731 OUT(-EBUSY);
1732 }
1733
1734 if (mddev->pers) {
1735 /*
1736 * It is safe to call stop here, it only frees private
1737 * data. Also, it tells us if a device is unstoppable
1738 * (eg. resyncing is in progress)
1739 */
1740 if (mddev->pers->stop_resync)
1741 if (mddev->pers->stop_resync(mddev))
1742 resync_interrupted = 1;
1743
1744 if (mddev->recovery_running)
1745 md_interrupt_thread(md_recovery_thread);
1746
1747 /*
1748 * This synchronizes with signal delivery to the
1749 * resync or reconstruction thread. It also nicely
1750 * hangs the process if some reconstruction has not
1751 * finished.
1752 */
1753 down(&mddev->recovery_sem);
1754 up(&mddev->recovery_sem);
1755
1756 /*
1757 * sync and invalidate buffers because we cannot kill the
1758 * main thread with valid IO transfers still around.
1759 * the kernel lock protects us from new requests being
1760 * added after invalidate_buffers().
1761 */
1762 fsync_dev (mddev_to_kdev(mddev));
1763 fsync_dev (dev);
1764 invalidate_buffers (dev);
1765
1766 if (ro) {
1767 if (mddev->ro)
1768 OUT(-ENXIO);
1769 mddev->ro = 1;
1770 } else {
1771 if (mddev->ro)
1772 set_device_ro(dev, 0);
1773 if (mddev->pers->stop(mddev)) {
1774 if (mddev->ro)
1775 set_device_ro(dev, 1);
1776 OUT(-EBUSY);
1777 }
1778 if (mddev->ro)
1779 mddev->ro = 0;
1780 }
1781 if (mddev->sb) {
1782 /*
1783 * mark it clean only if there was no resync
1784 * interrupted.
1785 */
1786 if (!mddev->recovery_running && !resync_interrupted) {
1787 printk("marking sb clean...\n");
1788 mddev->sb->state |= 1 << MD_SB_CLEAN;
1789 }
1790 md_update_sb(mddev);
1791 }
1792 if (ro)
1793 set_device_ro(dev, 1);
1794 }
1795
1796 /*
1797 * Free resources if final stop
1798 */
1799 if (!ro) {
1800 printk (KERN_INFO "md%d stopped.\n", mdidx(mddev));
1801 free_mddev(mddev);
1802
1803 } else
1804 printk (KERN_INFO
1805 "md%d switched to read-only mode.\n", mdidx(mddev));
1806 out:
1807 return err;
1808 }
1809
1810 #undef OUT
1811
1812 /*
1813 * We have to safely support old arrays too.
1814 */
1815 int detect_old_array (mdp_super_t *sb)
1816 {
1817 if (sb->major_version > 0)
1818 return 0;
1819 if (sb->minor_version >= 90)
1820 return 0;
1821
1822 return -EINVAL;
1823 }
1824
1825
1826 static void autorun_array (mddev_t *mddev)
1827 {
1828 mdk_rdev_t *rdev;
1829 struct md_list_head *tmp;
1830 int err;
1831
1832 if (mddev->disks.prev == &mddev->disks) {
1833 MD_BUG();
1834 return;
1835 }
1836
1837 printk("running: ");
1838
1839 ITERATE_RDEV(mddev,rdev,tmp) {
1840 printk("<%s>", partition_name(rdev->dev));
1841 }
1842 printk("\nnow!\n");
1843
1844 err = do_md_run (mddev);
1845 if (err) {
1846 printk("do_md_run() returned %d\n", err);
1847 /*
1848 * prevent the writeback of an unrunnable array
1849 */
1850 mddev->sb_dirty = 0;
1851 do_md_stop (mddev, 0);
1852 }
1853 }
1854
1855 /*
1856 * lets try to run arrays based on all disks that have arrived
1857 * until now. (those are in the ->pending list)
1858 *
1859 * the method: pick the first pending disk, collect all disks with
1860 * the same UUID, remove all from the pending list and put them into
1861 * the 'same_array' list. Then order this list based on superblock
1862 * update time (freshest comes first), kick out 'old' disks and
1863 * compare superblocks. If everything's fine then run it.
1864 *
1865 * If "unit" is allocated, then bump its reference count
1866 */
1867 static void autorun_devices (kdev_t countdev)
1868 {
1869 struct md_list_head candidates;
1870 struct md_list_head *tmp;
1871 mdk_rdev_t *rdev0, *rdev;
1872 mddev_t *mddev;
1873 kdev_t md_kdev;
1874
1875
1876 printk("autorun ...\n");
1877 while (pending_raid_disks.next != &pending_raid_disks) {
1878 rdev0 = md_list_entry(pending_raid_disks.next,
1879 mdk_rdev_t, pending);
1880
1881 printk("considering %s ...\n", partition_name(rdev0->dev));
1882 MD_INIT_LIST_HEAD(&candidates);
1883 ITERATE_RDEV_PENDING(rdev,tmp) {
1884 if (uuid_equal(rdev0, rdev)) {
1885 if (!sb_equal(rdev0->sb, rdev->sb)) {
1886 printk("%s has same UUID as %s, but superblocks differ ...\n", partition_name(rdev->dev), partition_name(rdev0->dev));
1887 continue;
1888 }
1889 printk(" adding %s ...\n", partition_name(rdev->dev));
1890 md_list_del(&rdev->pending);
1891 md_list_add(&rdev->pending, &candidates);
1892 }
1893 }
1894 /*
1895 * now we have a set of devices, with all of them having
1896 * mostly sane superblocks. It's time to allocate the
1897 * mddev.
1898 */
1899 md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
1900 mddev = kdev_to_mddev(md_kdev);
1901 if (mddev) {
1902 printk("md%d already running, cannot run %s\n",
1903 mdidx(mddev), partition_name(rdev0->dev));
1904 ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
1905 export_rdev(rdev);
1906 continue;
1907 }
1908 mddev = alloc_mddev(md_kdev);
1909 if (mddev == NULL) {
1910 printk("md: cannot allocate memory for md drive.\n");
1911 break;
1912 }
1913 if (md_kdev == countdev)
1914 atomic_inc(&mddev->active);
1915 printk("created md%d\n", mdidx(mddev));
1916 ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
1917 bind_rdev_to_array(rdev, mddev);
1918 md_list_del(&rdev->pending);
1919 MD_INIT_LIST_HEAD(&rdev->pending);
1920 }
1921 autorun_array(mddev);
1922 }
1923 printk("... autorun DONE.\n");
1924 }
1925
1926 /*
1927 * import RAID devices based on one partition
1928 * if possible, the array gets run as well.
1929 */
1930
/*
 * printk format strings for the auto-start path.
 *
 * NOTE(review): autostart_array() below uses literal strings and does not
 * reference any of these macros; MDDEV_BUSY is also absent from the
 * #undef list that follows that function -- confirm whether they are
 * still needed.
 */
#define BAD_VERSION KERN_ERR \
"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"

#define OUT_OF_MEM KERN_ALERT \
"md: out of memory.\n"

#define NO_DEVICE KERN_ERR \
"md: disabled device %s\n"

#define AUTOADD_FAILED KERN_ERR \
"md: auto-adding devices to md%d FAILED (error %d).\n"

#define AUTOADD_FAILED_USED KERN_ERR \
"md: cannot auto-add device %s to md%d, already used.\n"

#define AUTORUN_FAILED KERN_ERR \
"md: auto-running md%d FAILED (error %d).\n"

#define MDDEV_BUSY KERN_ERR \
"md: cannot auto-add to md%d, already running.\n"

#define AUTOADDING KERN_INFO \
"md: auto-adding devices to md%d, based on %s's superblock.\n"

#define AUTORUNNING KERN_INFO \
"md: auto-running md%d.\n"
1957
1958 static int autostart_array (kdev_t startdev, kdev_t countdev)
1959 {
1960 int err = -EINVAL, i;
1961 mdp_super_t *sb = NULL;
1962 mdk_rdev_t *start_rdev = NULL, *rdev;
1963
1964 if (md_import_device(startdev, 1)) {
1965 printk("could not import %s!\n", partition_name(startdev));
1966 goto abort;
1967 }
1968
1969 start_rdev = find_rdev_all(startdev);
1970 if (!start_rdev) {
1971 MD_BUG();
1972 goto abort;
1973 }
1974 if (start_rdev->faulty) {
1975 printk("can not autostart based on faulty %s!\n",
1976 partition_name(startdev));
1977 goto abort;
1978 }
1979 md_list_add(&start_rdev->pending, &pending_raid_disks);
1980
1981 sb = start_rdev->sb;
1982
1983 err = detect_old_array(sb);
1984 if (err) {
1985 printk("array version is too old to be autostarted, use raidtools 0.90 mkraid --upgrade\nto upgrade the array without data loss!\n");
1986 goto abort;
1987 }
1988
1989 for (i = 0; i < MD_SB_DISKS; i++) {
1990 mdp_disk_t *desc;
1991 kdev_t dev;
1992
1993 desc = sb->disks + i;
1994 dev = MKDEV(desc->major, desc->minor);
1995
1996 if (dev == MKDEV(0,0))
1997 continue;
1998 if (dev == startdev)
1999 continue;
2000 if (md_import_device(dev, 1)) {
2001 printk("could not import %s, trying to run array nevertheless.\n", partition_name(dev));
2002 continue;
2003 }
2004 rdev = find_rdev_all(dev);
2005 if (!rdev) {
2006 MD_BUG();
2007 goto abort;
2008 }
2009 md_list_add(&rdev->pending, &pending_raid_disks);
2010 }
2011
2012 /*
2013 * possibly return codes
2014 */
2015 autorun_devices(countdev);
2016 return 0;
2017
2018 abort:
2019 if (start_rdev)
2020 export_rdev(start_rdev);
2021 return err;
2022 }
2023
2024 #undef BAD_VERSION
2025 #undef OUT_OF_MEM
2026 #undef NO_DEVICE
2027 #undef AUTOADD_FAILED_USED
2028 #undef AUTOADD_FAILED
2029 #undef AUTORUN_FAILED
2030 #undef AUTOADDING
2031 #undef AUTORUNNING
2032
/*
 * Boot-time RAID options, placed in init data.
 *
 * md_run_setup() skips array autodetection when 'noautodetect' is
 * non-zero.  'set' is not referenced in this part of the file --
 * NOTE(review): presumably a "was an option given" flag; confirm against
 * the option parser.
 */
struct {
	int set;
	int noautodetect;

} raid_setup_args md__initdata = { 0, 0 };

/* called from md_run_setup() when CONFIG_MD_BOOT is enabled */
void md_setup_drive(void) md__init;
2040
2041 /*
2042 * Searches all registered partitions for autorun RAID arrays
2043 * at boot time.
2044 */
2045 #ifdef CONFIG_AUTODETECT_RAID
2046 static int detected_devices[128] md__initdata = { 0, };
2047 static int dev_cnt=0;
2048 void md_autodetect_dev(kdev_t dev)
2049 {
2050 if (dev_cnt >= 0 && dev_cnt < 127)
2051 detected_devices[dev_cnt++] = dev;
2052 }
2053 #endif
2054
2055 int md__init md_run_setup(void)
2056 {
2057 #ifdef CONFIG_AUTODETECT_RAID
2058 mdk_rdev_t *rdev;
2059 int i;
2060
2061 if (raid_setup_args.noautodetect)
2062 printk(KERN_INFO "skipping autodetection of RAID arrays\n");
2063 else {
2064
2065 printk(KERN_INFO "autodetecting RAID arrays\n");
2066
2067 for (i=0; i<dev_cnt; i++) {
2068 kdev_t dev = detected_devices[i];
2069
2070 if (md_import_device(dev,1)) {
2071 printk(KERN_ALERT "could not import %s!\n",
2072 partition_name(dev));
2073 continue;
2074 }
2075 /*
2076 * Sanity checks:
2077 */
2078 rdev = find_rdev_all(dev);
2079 if (!rdev) {
2080 MD_BUG();
2081 continue;
2082 }
2083 if (rdev->faulty) {
2084 MD_BUG();
2085 continue;
2086 }
2087 md_list_add(&rdev->pending, &pending_raid_disks);
2088 }
2089
2090 autorun_devices(-1);
2091 }
2092
2093 dev_cnt = -1; /* make sure further calls to md_autodetect_dev are ignored */
2094 #endif
2095 #ifdef CONFIG_MD_BOOT
2096 md_setup_drive();
2097 #endif
2098 return 0;
2099 }
2100
2101 static int get_version (void * arg)
2102 {
2103 mdu_version_t ver;
2104
2105 ver.major = MD_MAJOR_VERSION;
2106 ver.minor = MD_MINOR_VERSION;
2107 ver.patchlevel = MD_PATCHLEVEL_VERSION;
2108
2109 if (md_copy_to_user(arg, &ver, sizeof(ver)))
2110 return -EFAULT;
2111
2112 return 0;
2113 }
2114
2115 #define SET_FROM_SB(x) info.x = mddev->sb->x
2116 static int get_array_info (mddev_t * mddev, void * arg)
2117 {
2118 mdu_array_info_t info;
2119
2120 if (!mddev->sb)
2121 return -EINVAL;
2122
2123 SET_FROM_SB(major_version);
2124 SET_FROM_SB(minor_version);
2125 SET_FROM_SB(patch_version);
2126 SET_FROM_SB(ctime);
2127 SET_FROM_SB(level);
2128 SET_FROM_SB(size);
2129 SET_FROM_SB(nr_disks);
2130 SET_FROM_SB(raid_disks);
2131 SET_FROM_SB(md_minor);
2132 SET_FROM_SB(not_persistent);
2133
2134 SET_FROM_SB(utime);
2135 SET_FROM_SB(state);
2136 SET_FROM_SB(active_disks);
2137 SET_FROM_SB(working_disks);
2138 SET_FROM_SB(failed_disks);
2139 SET_FROM_SB(spare_disks);
2140
2141 SET_FROM_SB(layout);
2142 SET_FROM_SB(chunk_size);
2143
2144 if (md_copy_to_user(arg, &info, sizeof(info)))
2145 return -EFAULT;
2146
2147 return 0;
2148 }
2149 #undef SET_FROM_SB
2150
2151 #define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
2152 static int get_disk_info (mddev_t * mddev, void * arg)
2153 {
2154 mdu_disk_info_t info;
2155 unsigned int nr;
2156
2157 if (!mddev->sb)
2158 return -EINVAL;
2159
2160 if (md_copy_from_user(&info, arg, sizeof(info)))
2161 return -EFAULT;
2162
2163 nr = info.number;
2164 if (nr >= mddev->sb->nr_disks)
2165 return -EINVAL;
2166
2167 SET_FROM_SB(major);
2168 SET_FROM_SB(minor);
2169 SET_FROM_SB(raid_disk);
2170 SET_FROM_SB(state);
2171
2172 if (md_copy_to_user(arg, &info, sizeof(info)))
2173 return -EFAULT;
2174
2175 return 0;
2176 }
2177 #undef SET_FROM_SB
2178
2179 #define SET_SB(x) mddev->sb->disks[nr].x = info->x
2180
2181 static int add_new_disk (mddev_t * mddev, mdu_disk_info_t *info)
2182 {
2183 int err, size, persistent;
2184 mdk_rdev_t *rdev;
2185 unsigned int nr;
2186 kdev_t dev;
2187 dev = MKDEV(info->major,info->minor);
2188
2189 if (find_rdev_all(dev)) {
2190 printk("device %s already used in a RAID array!\n",
2191 partition_name(dev));
2192 return -EBUSY;
2193 }
2194 if (!mddev->sb) {
2195 /* expecting a device which has a superblock */
2196 err = md_import_device(dev, 1);
2197 if (err) {
2198 printk("md error, md_import_device returned %d\n", err);
2199 return -EINVAL;
2200 }
2201 rdev = find_rdev_all(dev);
2202 if (!rdev) {
2203 MD_BUG();
2204 return -EINVAL;
2205 }
2206 if (mddev->nb_dev) {
2207 mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next,
2208 mdk_rdev_t, same_set);
2209 if (!uuid_equal(rdev0, rdev)) {
2210 printk("md: %s has different UUID to %s\n", partition_name(rdev->dev), partition_name(rdev0->dev));
2211 export_rdev(rdev);
2212 return -EINVAL;
2213 }
2214 if (!sb_equal(rdev0->sb, rdev->sb)) {
2215 printk("md: %s has same UUID but different superblock to %s\n", partition_name(rdev->dev), partition_name(rdev0->dev));
2216 export_rdev(rdev);
2217 return -EINVAL;
2218 }
2219 }
2220 bind_rdev_to_array(rdev, mddev);
2221 return 0;
2222 }
2223
2224 nr = info->number;
2225 if (nr >= mddev->sb->nr_disks)
2226 return -EINVAL;
2227
2228 SET_SB(number);
2229 SET_SB(major);
2230 SET_SB(minor);
2231 SET_SB(raid_disk);
2232 SET_SB(state);
2233
2234 if ((info->state & (1<<MD_DISK_FAULTY))==0) {
2235 err = md_import_device (dev, 0);
2236 if (err) {
2237 printk("md: error, md_import_device() returned %d\n", err);
2238 return -EINVAL;
2239 }
2240 rdev = find_rdev_all(dev);
2241 if (!rdev) {
2242 MD_BUG();
2243 return -EINVAL;
2244 }
2245
2246 rdev->old_dev = dev;
2247 rdev->desc_nr = info->number;
2248
2249 bind_rdev_to_array(rdev, mddev);
2250
2251 persistent = !mddev->sb->not_persistent;
2252 if (!persistent)
2253 printk("nonpersistent superblock ...\n");
2254 if (!mddev->sb->chunk_size)
2255 printk("no chunksize?\n");
2256
2257 size = calc_dev_size(dev, mddev, persistent);
2258 rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
2259
2260 if (!mddev->sb->size || (mddev->sb->size > size))
2261 mddev->sb->size = size;
2262 }
2263
2264 /*
2265 * sync all other superblocks with the main superblock
2266 */
2267 sync_sbs(mddev);
2268
2269 return 0;
2270 }
2271 #undef SET_SB
2272
2273 static int hot_remove_disk (mddev_t * mddev, kdev_t dev)
2274 {
2275 int err;
2276 mdk_rdev_t *rdev;
2277 mdp_disk_t *disk;
2278
2279 if (!mddev->pers)
2280 return -ENODEV;
2281
2282 printk("trying to remove %s from md%d ... \n",
2283 partition_name(dev), mdidx(mddev));
2284
2285 if (!mddev->pers->diskop) {
2286 printk("md%d: personality does not support diskops!\n",
2287 mdidx(mddev));
2288 return -EINVAL;
2289 }
2290
2291 rdev = find_rdev(mddev, dev);
2292 if (!rdev)
2293 return -ENXIO;
2294
2295 if (rdev->desc_nr == -1) {
2296 MD_BUG();
2297 return -EINVAL;
2298 }
2299 disk = &mddev->sb->disks[rdev->desc_nr];
2300 if (disk_active(disk))
2301 goto busy;
2302 if (disk_removed(disk)) {
2303 MD_BUG();
2304 return -EINVAL;
2305 }
2306
2307 err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
2308 if (err == -EBUSY)
2309 goto busy;
2310 if (err) {
2311 MD_BUG();
2312 return -EINVAL;
2313 }
2314
2315 remove_descriptor(disk, mddev->sb);
2316 kick_rdev_from_array(rdev);
2317 mddev->sb_dirty = 1;
2318 md_update_sb(mddev);
2319
2320 return 0;
2321 busy:
2322 printk("cannot remove active disk %s from md%d ... \n",
2323 partition_name(dev), mdidx(mddev));
2324 return -EBUSY;
2325 }
2326
2327 static int hot_add_disk (mddev_t * mddev, kdev_t dev)
2328 {
2329 int i, err, persistent;
2330 unsigned int size;
2331 mdk_rdev_t *rdev;
2332 mdp_disk_t *disk;
2333
2334 if (!mddev->pers)
2335 return -ENODEV;
2336
2337 printk("trying to hot-add %s to md%d ... \n",
2338 partition_name(dev), mdidx(mddev));
2339
2340 if (!mddev->pers->diskop) {
2341 printk("md%d: personality does not support diskops!\n",
2342 mdidx(mddev));
2343 return -EINVAL;
2344 }
2345
2346 persistent = !mddev->sb->not_persistent;
2347 size = calc_dev_size(dev, mddev, persistent);
2348
2349 if (size < mddev->sb->size) {
2350 printk("md%d: disk size %d blocks < array size %d\n",
2351 mdidx(mddev), size, mddev->sb->size);
2352 return -ENOSPC;
2353 }
2354
2355 rdev = find_rdev(mddev, dev);
2356 if (rdev)
2357 return -EBUSY;
2358
2359 err = md_import_device (dev, 0);
2360 if (err) {
2361 printk("md: error, md_import_device() returned %d\n", err);
2362 return -EINVAL;
2363 }
2364 rdev = find_rdev_all(dev);
2365 if (!rdev) {
2366 MD_BUG();
2367 return -EINVAL;
2368 }
2369 if (rdev->faulty) {
2370 printk("md: can not hot-add faulty %s disk to md%d!\n",
2371 partition_name(dev), mdidx(mddev));
2372 err = -EINVAL;
2373 goto abort_export;
2374 }
2375 bind_rdev_to_array(rdev, mddev);
2376
2377 /*
2378 * The rest should better be atomic, we can have disk failures
2379 * noticed in interrupt contexts ...
2380 */
2381 rdev->old_dev = dev;
2382 rdev->size = size;
2383 rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
2384
2385 disk = mddev->sb->disks + mddev->sb->raid_disks;
2386 for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
2387 disk = mddev->sb->disks + i;
2388
2389 if (!disk->major && !disk->minor)
2390 break;
2391 if (disk_removed(disk))
2392 break;
2393 }
2394 if (i == MD_SB_DISKS) {
2395 printk("md%d: can not hot-add to full array!\n", mdidx(mddev));
2396 err = -EBUSY;
2397 goto abort_unbind_export;
2398 }
2399
2400 if (disk_removed(disk)) {
2401 /*
2402 * reuse slot
2403 */
2404 if (disk->number != i) {
2405 MD_BUG();
2406 err = -EINVAL;
2407 goto abort_unbind_export;
2408 }
2409 } else {
2410 disk->number = i;
2411 }
2412
2413 disk->raid_disk = disk->number;
2414 disk->major = MAJOR(dev);
2415 disk->minor = MINOR(dev);
2416
2417 if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) {
2418 MD_BUG();
2419 err = -EINVAL;
2420 goto abort_unbind_export;
2421 }
2422
2423 mark_disk_spare(disk);
2424 mddev->sb->nr_disks++;
2425 mddev->sb->spare_disks++;
2426 mddev->sb->working_disks++;
2427
2428 mddev->sb_dirty = 1;
2429
2430 md_update_sb(mddev);
2431
2432 /*
2433 * Kick recovery, maybe this spare has to be added to the
2434 * array immediately.
2435 */
2436 md_recover_arrays();
2437
2438 return 0;
2439
2440 abort_unbind_export:
2441 unbind_rdev_from_array(rdev);
2442
2443 abort_export:
2444 export_rdev(rdev);
2445 return err;
2446 }
2447
2448 #define SET_SB(x) mddev->sb->x = info->x
2449 static int set_array_info (mddev_t * mddev, mdu_array_info_t *info)
2450 {
2451
2452 if (alloc_array_sb(mddev))
2453 return -ENOMEM;
2454
2455 mddev->sb->major_version = MD_MAJOR_VERSION;
2456 mddev->sb->minor_version = MD_MINOR_VERSION;
2457 mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
2458 mddev->sb->ctime = CURRENT_TIME;
2459
2460 SET_SB(level);
2461 SET_SB(size);
2462 SET_SB(nr_disks);
2463 SET_SB(raid_disks);
2464 SET_SB(md_minor);
2465 SET_SB(not_persistent);
2466
2467 SET_SB(state);
2468 SET_SB(active_disks);
2469 SET_SB(working_disks);
2470 SET_SB(failed_disks);
2471 SET_SB(spare_disks);
2472
2473 SET_SB(layout);
2474 SET_SB(chunk_size);
2475
2476 mddev->sb->md_magic = MD_SB_MAGIC;
2477
2478 /*
2479 * Generate a 128 bit UUID
2480 */
2481 get_random_bytes(&mddev->sb->set_uuid0, 4);
2482 get_random_bytes(&mddev->sb->set_uuid1, 4);
2483 get_random_bytes(&mddev->sb->set_uuid2, 4);
2484 get_random_bytes(&mddev->sb->set_uuid3, 4);
2485
2486 return 0;
2487 }
2488 #undef SET_SB
2489
2490 static int set_disk_info (mddev_t * mddev, void * arg)
2491 {
2492 printk("not yet");
2493 return -EINVAL;
2494 }
2495
2496 static int clear_array (mddev_t * mddev)
2497 {
2498 printk("not yet");
2499 return -EINVAL;
2500 }
2501
2502 static int write_raid_info (mddev_t * mddev)
2503 {
2504 printk("not yet");
2505 return -EINVAL;
2506 }
2507
2508 static int protect_array (mddev_t * mddev)
2509 {
2510 printk("not yet");
2511 return -EINVAL;
2512 }
2513
2514 static int unprotect_array (mddev_t * mddev)
2515 {
2516 printk("not yet");
2517 return -EINVAL;
2518 }
2519
2520 static int set_disk_faulty (mddev_t *mddev, kdev_t dev)
2521 {
2522 int ret;
2523
2524 fsync_dev(mddev_to_kdev(mddev));
2525 ret = md_error(mddev_to_kdev(mddev), dev);
2526 return ret;
2527 }
2528
2529 static int md_ioctl (struct inode *inode, struct file *file,
2530 unsigned int cmd, unsigned long arg)
2531 {
2532 unsigned int minor;
2533 int err = 0;
2534 struct hd_geometry *loc = (struct hd_geometry *) arg;
2535 mddev_t *mddev = NULL;
2536 kdev_t dev;
2537
2538 if (!md_capable_admin())
2539 return -EACCES;
2540
2541 dev = inode->i_rdev;
2542 minor = MINOR(dev);
2543 if (minor >= MAX_MD_DEVS)
2544 return -EINVAL;
2545
2546 /*
2547 * Commands dealing with the RAID driver but not any
2548 * particular array:
2549 */
2550 switch (cmd)
2551 {
2552 case RAID_VERSION:
2553 err = get_version((void *)arg);
2554 goto done;
2555
2556 case PRINT_RAID_DEBUG:
2557 err = 0;
2558 md_print_devices();
2559 goto done_unlock;
2560
2561 case BLKGETSIZE: /* Return device size */
2562 if (!arg) {
2563 err = -EINVAL;
2564 goto abort;
2565 }
2566 err = md_put_user(md_hd_struct[minor].nr_sects,
2567 (long *) arg);
2568 goto done;
2569
2570 case BLKFLSBUF:
2571 fsync_dev(dev);
2572 invalidate_buffers(dev);
2573 goto done;
2574
2575 case BLKRASET:
2576 if (arg > 0xff) {
2577 err = -EINVAL;
2578 goto abort;
2579 }
2580 read_ahead[MAJOR(dev)] = arg;
2581 goto done;
2582
2583 case BLKRAGET:
2584 if (!arg) {
2585 err = -EINVAL;
2586 goto abort;
2587 }
2588 err = md_put_user (read_ahead[
2589 MAJOR(dev)], (long *) arg);
2590 goto done;
2591 default:
2592 }
2593
2594 /*
2595 * Commands creating/starting a new array:
2596 */
2597
2598 mddev = kdev_to_mddev(dev);
2599
2600 switch (cmd)
2601 {
2602 case SET_ARRAY_INFO:
2603 case START_ARRAY:
2604 if (mddev) {
2605 printk("array md%d already exists!\n",
2606 mdidx(mddev));
2607 err = -EEXIST;
2608 goto abort;
2609 }
2610 default:
2611 }
2612 switch (cmd)
2613 {
2614 case SET_ARRAY_INFO:
2615 mddev = alloc_mddev(dev);
2616 if (!mddev) {
2617 err = -ENOMEM;
2618 goto abort;
2619 }
2620 atomic_inc(&mddev->active);
2621
2622 /*
2623 * alloc_mddev() should possibly self-lock.
2624 */
2625 err = lock_mddev(mddev);
2626 if (err) {
2627 printk("ioctl, reason %d, cmd %d\n", err, cmd);
2628 goto abort;
2629 }
2630
2631 if (mddev->sb) {
2632 printk("array md%d already has a superblock!\n",
2633 mdidx(mddev));
2634 err = -EBUSY;
2635 goto abort_unlock;
2636 }
2637 if (arg) {
2638 mdu_array_info_t info;
2639 if (md_copy_from_user(&info, (void*)arg, sizeof(info))) {
2640 err = -EFAULT;
2641 goto abort_unlock;
2642 }
2643 err = set_array_info(mddev, &info);
2644 if (err) {
2645 printk("couldnt set array info. %d\n", err);
2646 goto abort_unlock;
2647 }
2648 }
2649 goto done_unlock;
2650
2651 case START_ARRAY:
2652 /*
2653 * possibly make it lock the array ...
2654 */
2655 err = autostart_array((kdev_t)arg, dev);
2656 if (err) {
2657 printk("autostart %s failed!\n",
2658 partition_name((kdev_t)arg));
2659 goto abort;
2660 }
2661 goto done;
2662
2663 default:
2664 }
2665
2666 /*
2667 * Commands querying/configuring an existing array:
2668 */
2669
2670 if (!mddev) {
2671 err = -ENODEV;
2672 goto abort;
2673 }
2674 err = lock_mddev(mddev);
2675 if (err) {
2676 printk("ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
2677 goto abort;
2678 }
2679 /* if we don't have a superblock yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */
2680 if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
2681 err = -ENODEV;
2682 goto abort_unlock;
2683 }
2684
2685 /*
2686 * Commands even a read-only array can execute:
2687 */
2688 switch (cmd)
2689 {
2690 case GET_ARRAY_INFO:
2691 err = get_array_info(mddev, (void *)arg);
2692 goto done_unlock;
2693
2694 case GET_DISK_INFO:
2695 err = get_disk_info(mddev, (void *)arg);
2696 goto done_unlock;
2697
2698 case RESTART_ARRAY_RW:
2699 err = restart_array(mddev);
2700 goto done_unlock;
2701
2702 case STOP_ARRAY:
2703 if (!(err = do_md_stop (mddev, 0)))
2704 mddev = NULL;
2705 goto done_unlock;
2706
2707 case STOP_ARRAY_RO:
2708 err = do_md_stop (mddev, 1);
2709 goto done_unlock;
2710
2711 /*
2712 * We have a problem here : there is no easy way to give a CHS
2713 * virtual geometry. We currently pretend that we have a 2 heads
2714 * 4 sectors (with a BIG number of cylinders...). This drives
2715 * dosfs just mad... ;-)
2716 */
2717 case HDIO_GETGEO:
2718 if (!loc) {
2719 err = -EINVAL;
2720 goto abort_unlock;
2721 }
2722 err = md_put_user (2, (char *) &loc->heads);
2723 if (err)
2724 goto abort_unlock;
2725 err = md_put_user (4, (char *) &loc->sectors);
2726 if (err)
2727 goto abort_unlock;
2728 err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
2729 (short *) &loc->cylinders);
2730 if (err)
2731 goto abort_unlock;
2732 err = md_put_user (md_hd_struct[minor].start_sect,
2733 (long *) &loc->start);
2734 goto done_unlock;
2735 }
2736
2737 /*
2738 * The remaining ioctls are changing the state of the
2739 * superblock, so we do not allow read-only arrays
2740 * here:
2741 */
2742 if (mddev->ro) {
2743 err = -EROFS;
2744 goto abort_unlock;
2745 }
2746
2747 switch (cmd)
2748 {
2749 case CLEAR_ARRAY:
2750 err = clear_array(mddev);
2751 goto done_unlock;
2752
2753 case ADD_NEW_DISK:
2754 {
2755 mdu_disk_info_t info;
2756 if (md_copy_from_user(&info, (void*)arg, sizeof(info)))
2757 err = -EFAULT;
2758 else
2759 err = add_new_disk(mddev, &info);
2760 goto done_unlock;
2761 }
2762 case HOT_REMOVE_DISK:
2763 err = hot_remove_disk(mddev, (kdev_t)arg);
2764 goto done_unlock;
2765
2766 case HOT_ADD_DISK:
2767 err = hot_add_disk(mddev, (kdev_t)arg);
2768 goto done_unlock;
2769
2770 case SET_DISK_INFO:
2771 err = set_disk_info(mddev, (void *)arg);
2772 goto done_unlock;
2773
2774 case WRITE_RAID_INFO:
2775 err = write_raid_info(mddev);
2776 goto done_unlock;
2777
2778 case UNPROTECT_ARRAY:
2779 err = unprotect_array(mddev);
2780 goto done_unlock;
2781
2782 case PROTECT_ARRAY:
2783 err = protect_array(mddev);
2784 goto done_unlock;
2785
2786 case SET_DISK_FAULTY:
2787 err = set_disk_faulty(mddev, (kdev_t)arg);
2788 goto done_unlock;
2789
2790 case RUN_ARRAY:
2791 {
2792 /* The data is never used....
2793 mdu_param_t param;
2794 err = md_copy_from_user(¶m, (mdu_param_t *)arg,
2795 sizeof(param));
2796 if (err)
2797 goto abort_unlock;
2798 */
2799 err = do_md_run (mddev);
2800 /*
2801 * we have to clean up the mess if
2802 * the array cannot be run for some
2803 * reason ...
2804 */
2805 if (err) {
2806 mddev->sb_dirty = 0;
2807 if (!do_md_stop (mddev, 0))
2808 mddev = NULL;
2809 }
2810 goto done_unlock;
2811 }
2812
2813 default:
2814 printk(KERN_WARNING "%s(pid %d) used obsolete MD ioctl, upgrade your software to use new ictls.\n", current->comm, current->pid);
2815 err = -EINVAL;
2816 goto abort_unlock;
2817 }
2818
2819 done_unlock:
2820 abort_unlock:
2821 if (mddev)
2822 unlock_mddev(mddev);
2823
2824 return err;
2825 done:
2826 if (err)
2827 printk("huh12?\n");
2828 abort:
2829 return err;
2830 }
2831
2832 static int md_open (struct inode *inode, struct file *file)
2833 {
2834 /*
2835 * Always succeed, but increment the usage count
2836 */
2837 mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
2838 if (mddev)
2839 atomic_inc(&mddev->active);
2840 return (0);
2841 }
2842
2843 static int md_release (struct inode *inode, struct file * file)
2844 {
2845 mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
2846 if (mddev)
2847 atomic_dec(&mddev->active);
2848 return 0;
2849 }
2850
2851 static struct block_device_operations md_fops=
2852 {
2853 open: md_open,
2854 release: md_release,
2855 ioctl: md_ioctl,
2856 };
2857
2858
2859 int md_thread(void * arg)
2860 {
2861 mdk_thread_t *thread = arg;
2862
2863 md_lock_kernel();
2864
2865 /*
2866 * Detach thread
2867 */
2868
2869 daemonize();
2870
2871 sprintf(current->comm, thread->name);
2872 md_init_signals();
2873 md_flush_signals();
2874 thread->tsk = current;
2875
2876 /*
2877 * md_thread is a 'system-thread', it's priority should be very
2878 * high. We avoid resource deadlocks individually in each
2879 * raid personality. (RAID5 does preallocation) We also use RR and
2880 * the very same RT priority as kswapd, thus we will never get
2881 * into a priority inversion deadlock.
2882 *
2883 * we definitely have to have equal or higher priority than
2884 * bdflush, otherwise bdflush will deadlock if there are too
2885 * many dirty RAID5 blocks.
2886 */
2887 current->policy = SCHED_OTHER;
2888 current->nice = -20;
2889 // md_unlock_kernel();
2890
2891 up(thread->sem);
2892
2893 for (;;) {
2894 DECLARE_WAITQUEUE(wait, current);
2895
2896 add_wait_queue(&thread->wqueue, &wait);
2897 set_task_state(current, TASK_INTERRUPTIBLE);
2898 if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
2899 dprintk("thread %p went to sleep.\n", thread);
2900 schedule();
2901 dprintk("thread %p woke up.\n", thread);
2902 }
2903 current->state = TASK_RUNNING;
2904 remove_wait_queue(&thread->wqueue, &wait);
2905 clear_bit(THREAD_WAKEUP, &thread->flags);
2906
2907 if (thread->run) {
2908 thread->run(thread->data);
2909 run_task_queue(&tq_disk);
2910 } else
2911 break;
2912 if (md_signal_pending(current)) {
2913 printk("%8s(%d) flushing signals.\n", current->comm,
2914 current->pid);
2915 md_flush_signals();
2916 }
2917 }
2918 up(thread->sem);
2919 return 0;
2920 }
2921
2922 void md_wakeup_thread(mdk_thread_t *thread)
2923 {
2924 dprintk("waking up MD thread %p.\n", thread);
2925 set_bit(THREAD_WAKEUP, &thread->flags);
2926 wake_up(&thread->wqueue);
2927 }
2928
2929 mdk_thread_t *md_register_thread (void (*run) (void *),
2930 void *data, const char *name)
2931 {
2932 mdk_thread_t *thread;
2933 int ret;
2934 DECLARE_MUTEX_LOCKED(sem);
2935
2936 thread = (mdk_thread_t *) kmalloc
2937 (sizeof(mdk_thread_t), GFP_KERNEL);
2938 if (!thread)
2939 return NULL;
2940
2941 memset(thread, 0, sizeof(mdk_thread_t));
2942 md_init_waitqueue_head(&thread->wqueue);
2943
2944 thread->sem = &sem;
2945 thread->run = run;
2946 thread->data = data;
2947 thread->name = name;
2948 ret = kernel_thread(md_thread, thread, 0);
2949 if (ret < 0) {
2950 kfree(thread);
2951 return NULL;
2952 }
2953 down(&sem);
2954 return thread;
2955 }
2956
2957 void md_interrupt_thread (mdk_thread_t *thread)
2958 {
2959 if (!thread->tsk) {
2960 MD_BUG();
2961 return;
2962 }
2963 printk("interrupting MD-thread pid %d\n", thread->tsk->pid);
2964 send_sig(SIGKILL, thread->tsk, 1);
2965 }
2966
2967 void md_unregister_thread (mdk_thread_t *thread)
2968 {
2969 DECLARE_MUTEX_LOCKED(sem);
2970
2971 thread->sem = &sem;
2972 thread->run = NULL;
2973 thread->name = NULL;
2974 if (!thread->tsk) {
2975 MD_BUG();
2976 return;
2977 }
2978 md_interrupt_thread(thread);
2979 down(&sem);
2980 }
2981
2982 void md_recover_arrays (void)
2983 {
2984 if (!md_recovery_thread) {
2985 MD_BUG();
2986 return;
2987 }
2988 md_wakeup_thread(md_recovery_thread);
2989 }
2990
2991
2992 int md_error (kdev_t dev, kdev_t rdev)
2993 {
2994 mddev_t *mddev;
2995 mdk_rdev_t * rrdev;
2996 int rc;
2997
2998 mddev = kdev_to_mddev(dev);
2999 /* printk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",MAJOR(dev),MINOR(dev),MAJOR(rdev),MINOR(rdev), __builtin_return_address(0),__builtin_return_address(1),__builtin_return_address(2),__builtin_return_address(3));
3000 */
3001 if (!mddev) {
3002 MD_BUG();
3003 return 0;
3004 }
3005 rrdev = find_rdev(mddev, rdev);
3006 mark_rdev_faulty(rrdev);
3007 /*
3008 * if recovery was running, stop it now.
3009 */
3010 if (mddev->pers->stop_resync)
3011 mddev->pers->stop_resync(mddev);
3012 if (mddev->recovery_running)
3013 md_interrupt_thread(md_recovery_thread);
3014 if (mddev->pers->error_handler) {
3015 rc = mddev->pers->error_handler(mddev, rdev);
3016 md_recover_arrays();
3017 return rc;
3018 }
3019 return 0;
3020 }
3021
3022 static int status_unused (char * page)
3023 {
3024 int sz = 0, i = 0;
3025 mdk_rdev_t *rdev;
3026 struct md_list_head *tmp;
3027
3028 sz += sprintf(page + sz, "unused devices: ");
3029
3030 ITERATE_RDEV_ALL(rdev,tmp) {
3031 if (!rdev->same_set.next && !rdev->same_set.prev) {
3032 /*
3033 * The device is not yet used by any array.
3034 */
3035 i++;
3036 sz += sprintf(page + sz, "%s ",
3037 partition_name(rdev->dev));
3038 }
3039 }
3040 if (!i)
3041 sz += sprintf(page + sz, "<none>");
3042
3043 sz += sprintf(page + sz, "\n");
3044 return sz;
3045 }
3046
3047
3048 static int status_resync (char * page, mddev_t * mddev)
3049 {
3050 int sz = 0;
3051 unsigned long max_blocks, resync, res, dt, db, rt;
3052
3053 resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
3054 max_blocks = mddev->sb->size;
3055
3056 /*
3057 * Should not happen.
3058 */
3059 if (!max_blocks) {
3060 MD_BUG();
3061 return 0;
3062 }
3063 res = (resync/1024)*1000/(max_blocks/1024 + 1);
3064 {
3065 int i, x = res/50, y = 20-x;
3066 sz += sprintf(page + sz, "[");
3067 for (i = 0; i < x; i++)
3068 sz += sprintf(page + sz, "=");
3069 sz += sprintf(page + sz, ">");
3070 for (i = 0; i < y; i++)
3071 sz += sprintf(page + sz, ".");
3072 sz += sprintf(page + sz, "] ");
3073 }
3074 if (!mddev->recovery_running)
3075 /*
3076 * true resync
3077 */
3078 sz += sprintf(page + sz, " resync =%3lu.%lu%% (%lu/%lu)",
3079 res/10, res % 10, resync, max_blocks);
3080 else
3081 /*
3082 * recovery ...
3083 */
3084 sz += sprintf(page + sz, " recovery =%3lu.%lu%% (%lu/%lu)",
3085 res/10, res % 10, resync, max_blocks);
3086
3087 /*
3088 * We do not want to overflow, so the order of operands and
3089 * the * 100 / 100 trick are important. We do a +1 to be
3090 * safe against division by zero. We only estimate anyway.
3091 *
3092 * dt: time from mark until now
3093 * db: blocks written from mark until now
3094 * rt: remaining time
3095 */
3096 dt = ((jiffies - mddev->resync_mark) / HZ);
3097 if (!dt) dt++;
3098 db = resync - mddev->resync_mark_cnt;
3099 rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
3100
3101 sz += sprintf(page + sz, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
3102
3103 sz += sprintf(page + sz, " speed=%ldK/sec", db/dt);
3104
3105 return sz;
3106 }
3107
3108 static int md_status_read_proc(char *page, char **start, off_t off,
3109 int count, int *eof, void *data)
3110 {
3111 int sz = 0, j, size;
3112 struct md_list_head *tmp, *tmp2;
3113 mdk_rdev_t *rdev;
3114 mddev_t *mddev;
3115
3116 sz += sprintf(page + sz, "Personalities : ");
3117 for (j = 0; j < MAX_PERSONALITY; j++)
3118 if (pers[j])
3119 sz += sprintf(page+sz, "[%s] ", pers[j]->name);
3120
3121 sz += sprintf(page+sz, "\n");
3122
3123
3124 sz += sprintf(page+sz, "read_ahead ");
3125 if (read_ahead[MD_MAJOR] == INT_MAX)
3126 sz += sprintf(page+sz, "not set\n");
3127 else
3128 sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
3129
3130 ITERATE_MDDEV(mddev,tmp) {
3131 sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev),
3132 mddev->pers ? "" : "in");
3133 if (mddev->pers) {
3134 if (mddev->ro)
3135 sz += sprintf(page + sz, " (read-only)");
3136 sz += sprintf(page + sz, " %s", mddev->pers->name);
3137 }
3138
3139 size = 0;
3140 ITERATE_RDEV(mddev,rdev,tmp2) {
3141 sz += sprintf(page + sz, " %s[%d]",
3142 partition_name(rdev->dev), rdev->desc_nr);
3143 if (rdev->faulty) {
3144 sz += sprintf(page + sz, "(F)");
3145 continue;
3146 }
3147 size += rdev->size;
3148 }
3149
3150 if (mddev->nb_dev) {
3151 if (mddev->pers)
3152 sz += sprintf(page + sz, "\n %d blocks",
3153 md_size[mdidx(mddev)]);
3154 else
3155 sz += sprintf(page + sz, "\n %d blocks", size);
3156 }
3157
3158 if (!mddev->pers) {
3159 sz += sprintf(page+sz, "\n");
3160 continue;
3161 }
3162
3163 sz += mddev->pers->status (page+sz, mddev);
3164
3165 sz += sprintf(page+sz, "\n ");
3166 if (mddev->curr_resync) {
3167 sz += status_resync (page+sz, mddev);
3168 } else {
3169 if (md_atomic_read(&mddev->resync_sem.count) != 1)
3170 sz += sprintf(page + sz, " resync=DELAYED");
3171 }
3172 sz += sprintf(page + sz, "\n");
3173 }
3174 sz += status_unused (page + sz);
3175
3176 return sz;
3177 }
3178
3179 int register_md_personality (int pnum, mdk_personality_t *p)
3180 {
3181 if (pnum >= MAX_PERSONALITY)
3182 return -EINVAL;
3183
3184 if (pers[pnum])
3185 return -EBUSY;
3186
3187 pers[pnum] = p;
3188 printk(KERN_INFO "%s personality registered\n", p->name);
3189 return 0;
3190 }
3191
3192 int unregister_md_personality (int pnum)
3193 {
3194 if (pnum >= MAX_PERSONALITY)
3195 return -EINVAL;
3196
3197 printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name);
3198 pers[pnum] = NULL;
3199 return 0;
3200 }
3201
3202 static mdp_disk_t *get_spare(mddev_t *mddev)
3203 {
3204 mdp_super_t *sb = mddev->sb;
3205 mdp_disk_t *disk;
3206 mdk_rdev_t *rdev;
3207 struct md_list_head *tmp;
3208
3209 ITERATE_RDEV(mddev,rdev,tmp) {
3210 if (rdev->faulty)
3211 continue;
3212 if (!rdev->sb) {
3213 MD_BUG();
3214 continue;
3215 }
3216 disk = &sb->disks[rdev->desc_nr];
3217 if (disk_faulty(disk)) {
3218 MD_BUG();
3219 continue;
3220 }
3221 if (disk_active(disk))
3222 continue;
3223 return disk;
3224 }
3225 return NULL;
3226 }
3227
3228 static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK];
3229 void md_sync_acct(kdev_t dev, unsigned long nr_sectors)
3230 {
3231 unsigned int major = MAJOR(dev);
3232 unsigned int index;
3233
3234 index = disk_index(dev);
3235 if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
3236 return;
3237
3238 sync_io[major][index] += nr_sectors;
3239 }
3240
3241 static int is_mddev_idle (mddev_t *mddev)
3242 {
3243 mdk_rdev_t * rdev;
3244 struct md_list_head *tmp;
3245 int idle;
3246 unsigned long curr_events;
3247
3248 idle = 1;
3249 ITERATE_RDEV(mddev,rdev,tmp) {
3250 int major = MAJOR(rdev->dev);
3251 int idx = disk_index(rdev->dev);
3252
3253 if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
3254 continue;
3255
3256 curr_events = kstat.dk_drive_rblk[major][idx] +
3257 kstat.dk_drive_wblk[major][idx] ;
3258 curr_events -= sync_io[major][idx];
3259 // printk("events(major: %d, idx: %d): %ld\n", major, idx, curr_events);
3260 if (curr_events != rdev->last_events) {
3261 // printk("!I(%ld)", curr_events - rdev->last_events);
3262 rdev->last_events = curr_events;
3263 idle = 0;
3264 }
3265 }
3266 return idle;
3267 }
3268
3269 MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait);
3270
3271 void md_done_sync(mddev_t *mddev, int blocks, int ok)
3272 {
3273 /* another "blocks" (1K) blocks have been synced */
3274 atomic_sub(blocks, &mddev->recovery_active);
3275 wake_up(&mddev->recovery_wait);
3276 if (!ok) {
3277 // stop recovery, signal do_sync ....
3278 }
3279 }
3280
3281 #define SYNC_MARKS 10
3282 #define SYNC_MARK_STEP (3*HZ)
3283 int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
3284 {
3285 mddev_t *mddev2;
3286 unsigned int max_blocks, currspeed,
3287 j, window, err, serialize;
3288 kdev_t read_disk = mddev_to_kdev(mddev);
3289 unsigned long mark[SYNC_MARKS];
3290 unsigned long mark_cnt[SYNC_MARKS];
3291 int last_mark,m;
3292 struct md_list_head *tmp;
3293 unsigned long last_check;
3294
3295
3296 err = down_interruptible(&mddev->resync_sem);
3297 if (err)
3298 goto out_nolock;
3299
3300 recheck:
3301 serialize = 0;
3302 ITERATE_MDDEV(mddev2,tmp) {
3303 if (mddev2 == mddev)
3304 continue;
3305 if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
3306 printk(KERN_INFO "md: serializing resync, md%d shares one or more physical units with md%d!\n", mdidx(mddev), mdidx(mddev2));
3307 serialize = 1;
3308 break;
3309 }
3310 }
3311 if (serialize) {
3312 interruptible_sleep_on(&resync_wait);
3313 if (md_signal_pending(current)) {
3314 md_flush_signals();
3315 err = -EINTR;
3316 goto out;
3317 }
3318 goto recheck;
3319 }
3320
3321 mddev->curr_resync = 1;
3322
3323 max_blocks = mddev->sb->size;
3324
3325 printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
3326 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
3327 sysctl_speed_limit_min);
3328 printk(KERN_INFO "md: using maximum available idle IO bandwith (but not more than %d KB/sec) for reconstruction.\n", sysctl_speed_limit_max);
3329
3330 /*
3331 * Resync has low priority.
3332 */
3333 current->nice = 19;
3334
3335 is_mddev_idle(mddev); /* this also initializes IO event counters */
3336 for (m = 0; m < SYNC_MARKS; m++) {
3337 mark[m] = jiffies;
3338 mark_cnt[m] = 0;
3339 }
3340 last_mark = 0;
3341 mddev->resync_mark = mark[last_mark];
3342 mddev->resync_mark_cnt = mark_cnt[last_mark];
3343
3344 /*
3345 * Tune reconstruction:
3346 */
3347 window = MAX_READAHEAD*(PAGE_SIZE/1024);
3348 printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",window,max_blocks);
3349
3350 atomic_set(&mddev->recovery_active, 0);
3351 init_waitqueue_head(&mddev->recovery_wait);
3352 last_check = 0;
3353 for (j = 0; j < max_blocks;) {
3354 int blocks;
3355
3356 blocks = mddev->pers->sync_request(mddev, j);
3357
3358 if (blocks < 0) {
3359 err = blocks;
3360 goto out;
3361 }
3362 atomic_add(blocks, &mddev->recovery_active);
3363 j += blocks;
3364 mddev->curr_resync = j;
3365
3366 if (last_check + window > j)
3367 continue;
3368
3369 run_task_queue(&tq_disk); //??
3370
3371 if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
3372 /* step marks */
3373 int next = (last_mark+1) % SYNC_MARKS;
3374
3375 mddev->resync_mark = mark[next];
3376 mddev->resync_mark_cnt = mark_cnt[next];
3377 mark[next] = jiffies;
3378 mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
3379 last_mark = next;
3380 }
3381
3382
3383 if (md_signal_pending(current)) {
3384 /*
3385 * got a signal, exit.
3386 */
3387 mddev->curr_resync = 0;
3388 printk("md_do_sync() got signal ... exiting\n");
3389 md_flush_signals();
3390 err = -EINTR;
3391 goto out;
3392 }
3393
3394 /*
3395 * this loop exits only if either when we are slower than
3396 * the 'hard' speed limit, or the system was IO-idle for
3397 * a jiffy.
3398 * the system might be non-idle CPU-wise, but we only care
3399 * about not overloading the IO subsystem. (things like an
3400 * e2fsck being done on the RAID array should execute fast)
3401 */
3402 repeat:
3403 if (md_need_resched(current))
3404 schedule();
3405
3406 currspeed = (j-mddev->resync_mark_cnt)/((jiffies-mddev->resync_mark)/HZ +1) +1;
3407
3408 if (currspeed > sysctl_speed_limit_min) {
3409 current->nice = 19;
3410
3411 if ((currspeed > sysctl_speed_limit_max) ||
3412 !is_mddev_idle(mddev)) {
3413 current->state = TASK_INTERRUPTIBLE;
3414 md_schedule_timeout(HZ/4);
3415 if (!md_signal_pending(current))
3416 goto repeat;
3417 }
3418 } else
3419 current->nice = -20;
3420 }
3421 fsync_dev(read_disk);
3422 printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
3423 err = 0;
3424 /*
3425 * this also signals 'finished resyncing' to md_stop
3426 */
3427 out:
3428 wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
3429 up(&mddev->resync_sem);
3430 out_nolock:
3431 mddev->curr_resync = 0;
3432 wake_up(&resync_wait);
3433 return err;
3434 }
3435
3436
3437 /*
3438 * This is a kernel thread which syncs a spare disk with the active array
3439 *
3440 * the amount of foolproofing might seem to be a tad excessive, but an
3441 * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
3442 * of my root partition with the first 0.5 gigs of my /home partition ... so
3443 * i'm a bit nervous ;)
3444 */
/*
 * md_do_recovery - handler run by the recovery thread.
 *
 * Scans all arrays for one that has a superblock, is not already being
 * recovered, has fewer active than raid disks and owns a spare.  The
 * spare is handed to the personality (DISKOP_SPARE_WRITE) and synced
 * with md_do_sync().  Depending on the result the spare is marked
 * faulty, handed back, or activated; after a completed pass the
 * superblock is updated and the scan restarts from the beginning.
 *
 * 'data' is not used.
 */
void md_do_recovery (void *data)
{
	int err;
	mddev_t *mddev;
	mdp_super_t *sb;
	mdp_disk_t *spare;
	struct md_list_head *tmp;

	printk(KERN_INFO "md: recovery thread got woken up ...\n");
restart:
	ITERATE_MDDEV(mddev,tmp) {
		sb = mddev->sb;
		if (!sb)
			continue;
		if (mddev->recovery_running)
			continue;
		/* all raid disks active: nothing to reconstruct */
		if (sb->active_disks == sb->raid_disks)
			continue;
		if (!sb->spare_disks) {
			printk(KERN_ERR "md%d: no spare disk to reconstruct array! -- continuing in degraded mode\n", mdidx(mddev));
			continue;
		}
		/*
		 * now here we get the spare and resync it.
		 */
		if ((spare = get_spare(mddev)) == NULL)
			continue;
		printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
		/* the personality must be able to handle disk operations */
		if (!mddev->pers->diskop)
			continue;
		/* a non-zero result means the personality refused the spare */
		if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
			continue;
		down(&mddev->recovery_sem);
		mddev->recovery_running = 1;
		err = md_do_sync(mddev, spare);
		if (err == -EIO) {
			printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
			if (!disk_faulty(spare)) {
				/* retire the spare and fix up the disk counters */
				mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
				mark_disk_faulty(spare);
				mark_disk_nonsync(spare);
				mark_disk_inactive(spare);
				sb->spare_disks--;
				sb->working_disks--;
				sb->failed_disks++;
			}
		} else
			if (disk_faulty(spare))
				mddev->pers->diskop(mddev, &spare,
						DISKOP_SPARE_INACTIVE);
		if (err == -EINTR || err == -ENOMEM) {
			/*
			 * Recovery got interrupted, or ran out of mem ...
			 * signal back that we have finished using the array.
			 */
			mddev->pers->diskop(mddev, &spare,
							 DISKOP_SPARE_INACTIVE);
			up(&mddev->recovery_sem);
			mddev->recovery_running = 0;
			continue;
		} else {
			mddev->recovery_running = 0;
			up(&mddev->recovery_sem);
		}
		if (!disk_faulty(spare)) {
			/*
			 * the SPARE_ACTIVE diskop possibly changes the
			 * pointer too
			 */
			mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
			mark_disk_sync(spare);
			mark_disk_active(spare);
			sb->active_disks++;
			sb->spare_disks--;
		}
		/* write the new disk states out, then rescan all arrays */
		mddev->sb_dirty = 1;
		md_update_sb(mddev);
		goto restart;
	}
	printk(KERN_INFO "md: recovery thread finished ...\n");

}
3527
3528 int md_notify_reboot(struct notifier_block *this,
3529 unsigned long code, void *x)
3530 {
3531 struct md_list_head *tmp;
3532 mddev_t *mddev;
3533
3534 if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
3535 || (code == MD_SYS_POWER_OFF)) {
3536
3537 printk(KERN_INFO "stopping all md devices.\n");
3538
3539 ITERATE_MDDEV(mddev,tmp)
3540 do_md_stop (mddev, 1);
3541 /*
3542 * certain more exotic SCSI devices are known to be
3543 * volatile wrt too early system reboots. While the
3544 * right place to handle this issue is the given
3545 * driver, we do want to have a safe RAID driver ...
3546 */
3547 md_mdelay(1000*1);
3548 }
3549 return NOTIFY_DONE;
3550 }
3551
/*
 * Reboot notifier of the md driver; registered in md_init() and removed
 * again in cleanup_module().  Initialised positionally - presumably
 * callback, next pointer and priority (confirm against the
 * struct notifier_block declaration).
 */
struct notifier_block md_notifier = {
	md_notify_reboot,
	NULL,
	0
};
3557 #ifndef MODULE
3558 static int md__init raid_setup(char *str)
3559 {
3560 int len, pos;
3561
3562 len = strlen(str) + 1;
3563 pos = 0;
3564
3565 while (pos < len) {
3566 char *comma = strchr(str+pos, ',');
3567 int wlen;
3568 if (comma)
3569 wlen = (comma-str)-pos;
3570 else wlen = (len-1)-pos;
3571
3572 if (strncmp(str, "noautodetect", wlen) == 0)
3573 raid_setup_args.noautodetect = 1;
3574 pos += wlen+1;
3575 }
3576 raid_setup_args.set = 1;
3577 return 1;
3578 }
3579 __setup("raid=", raid_setup);
3580 #endif
3581 static void md_geninit (void)
3582 {
3583 int i;
3584
3585 for(i = 0; i < MAX_MD_DEVS; i++) {
3586 md_blocksizes[i] = 1024;
3587 md_size[i] = 0;
3588 md_hardsect_sizes[i] = 512;
3589 md_maxreadahead[i] = MD_READAHEAD;
3590 register_disk(&md_gendisk, MKDEV(MAJOR_NR,i), 1, &md_fops, 0);
3591 }
3592 blksize_size[MAJOR_NR] = md_blocksizes;
3593 blk_size[MAJOR_NR] = md_size;
3594 max_readahead[MAJOR_NR] = md_maxreadahead;
3595 hardsect_size[MAJOR_NR] = md_hardsect_sizes;
3596
3597 printk("md.c: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
3598
3599 #ifdef CONFIG_PROC_FS
3600 create_proc_read_entry("mdstat", 0, NULL, md_status_read_proc, NULL);
3601 #endif
3602 }
3603
3604 int md__init md_init (void)
3605 {
3606 static char * name = "mdrecoveryd";
3607
3608 printk (KERN_INFO "md driver %d.%d.%d MAX_MD_DEVS=%d, MD_SB_DISKS=%d\n",
3609 MD_MAJOR_VERSION, MD_MINOR_VERSION,
3610 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
3611
3612 if (devfs_register_blkdev (MAJOR_NR, "md", &md_fops))
3613 {
3614 printk (KERN_ALERT "Unable to get major %d for md\n", MAJOR_NR);
3615 return (-1);
3616 }
3617 devfs_handle = devfs_mk_dir (NULL, "md", NULL);
3618 devfs_register_series (devfs_handle, "%u",MAX_MD_DEVS,DEVFS_FL_DEFAULT,
3619 MAJOR_NR, 0, S_IFBLK | S_IRUSR | S_IWUSR,
3620 &md_fops, NULL);
3621
3622 /* forward all md request to md_make_request */
3623 blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_make_request);
3624
3625
3626 read_ahead[MAJOR_NR] = INT_MAX;
3627 md_gendisk.next = gendisk_head;
3628
3629 gendisk_head = &md_gendisk;
3630
3631 md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
3632 if (!md_recovery_thread)
3633 printk(KERN_ALERT "bug: couldn't allocate md_recovery_thread\n");
3634
3635 md_register_reboot_notifier(&md_notifier);
3636 raid_table_header = register_sysctl_table(raid_root_table, 1);
3637
3638 md_geninit();
3639 return (0);
3640 }
3641
#ifdef CONFIG_MD_BOOT
/* Number of md minors that can be configured with "md=" at boot. */
#define MAX_MD_BOOT_DEVS	8
/*
 * Arrays requested on the kernel command line; filled in by md_setup()
 * and consumed by md_setup_drive().
 */
struct {
	unsigned long set;			/* bit 'minor' set: md<minor> was given */
	int pers[MAX_MD_BOOT_DEVS];		/* personality, 0 = use the superblock */
	int chunk[MAX_MD_BOOT_DEVS];		/* chunk size, 1 << (factor+12) */
	kdev_t devices[MAX_MD_BOOT_DEVS][MD_SB_DISKS];	/* member devices, 0 terminated */
} md_setup_args md__initdata = { 0, };
3650
3651 /*
3652 * Parse the command-line parameters given our kernel, but do not
3653 * actually try to invoke the MD device now; that is handled by
3654 * md_setup_drive after the low-level disk drivers have initialised.
3655 *
3656 * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
3657 * assigns the task of parsing integer arguments to the
3658 * invoked program now). Added ability to initialise all
3659 * the MD devices (by specifying multiple "md=" lines)
3660 * instead of just one. -- KTK
3661 * 18May2000: Added support for persistant-superblock arrays:
3662 * md=n,0,factor,fault,device-list uses RAID0 for device n
3663 * md=n,-1,factor,fault,device-list uses LINEAR for device n
3664 * md=n,device-list reads a RAID superblock from the devices
3665 * elements in device-list are read by name_to_kdev_t so can be
3666 * a hex number or something like /dev/hda1 /dev/sdb
3667 */
3668 extern kdev_t name_to_kdev_t(char *line) md__init;
3669 static int md__init md_setup(char *str)
3670 {
3671 int minor, level, factor, fault, i=0;
3672 kdev_t device;
3673 char *devnames, *pername = "";
3674
3675 if(get_option(&str, &minor) != 2) { /* MD Number */
3676 printk("md: Too few arguments supplied to md=.\n");
3677 return 0;
3678 }
3679 if (minor >= MAX_MD_BOOT_DEVS) {
3680 printk ("md: Minor device number too high.\n");
3681 return 0;
3682 } else if (md_setup_args.set & (1 << minor)) {
3683 printk ("md: Warning - md=%d,... has been specified twice;\n"
3684 " will discard the first definition.\n", minor);
3685 }
3686 switch(get_option(&str, &level)) { /* RAID Personality */
3687 case 2: /* could be 0 or -1.. */
3688 if (level == 0 || level == -1) {
3689 if (get_option(&str, &factor) != 2 || /* Chunk Size */
3690 get_option(&str, &fault) != 2) {
3691 printk("md: Too few arguments supplied to md=.\n");
3692 return 0;
3693 }
3694 md_setup_args.pers[minor] = level;
3695 md_setup_args.chunk[minor] = 1 << (factor+12);
3696 switch(level) {
3697 case -1:
3698 level = LINEAR;
3699 pername = "linear";
3700 break;
3701 case 0:
3702 level = RAID0;
3703 pername = "raid0";
3704 break;
3705 default:
3706 printk ("md: The kernel has not been configured for raid%d"
3707 " support!\n", level);
3708 return 0;
3709 }
3710 md_setup_args.pers[minor] = level;
3711 break;
3712 }
3713 /* FALL THROUGH */
3714 case 1: /* the first device is numeric */
3715 md_setup_args.devices[minor][i++] = level;
3716 /* FALL THROUGH */
3717 case 0:
3718 md_setup_args.pers[minor] = 0;
3719 pername="super-block";
3720 }
3721 devnames = str;
3722 for (; i<MD_SB_DISKS && str; i++) {
3723 if ((device = name_to_kdev_t(str))) {
3724 md_setup_args.devices[minor][i] = device;
3725 } else {
3726 printk ("md: Unknown device name, %s.\n", str);
3727 return 0;
3728 }
3729 if ((str = strchr(str, ',')) != NULL)
3730 str++;
3731 }
3732 if (!i) {
3733 printk ("md: No devices specified for md%d?\n", minor);
3734 return 0;
3735 }
3736
3737 printk ("md: Will configure md%d (%s) from %s, below.\n",
3738 minor, pername, devnames);
3739 md_setup_args.devices[minor][i] = (kdev_t) 0;
3740 md_setup_args.set |= (1 << minor);
3741 return 1;
3742 }
3743
3744 void md__init md_setup_drive(void)
3745 {
3746 int minor, i;
3747 kdev_t dev;
3748 mddev_t*mddev;
3749
3750 for (minor = 0; minor < MAX_MD_BOOT_DEVS; minor++) {
3751 mdu_disk_info_t dinfo;
3752 int err=0;
3753 if (!(md_setup_args.set & (1 << minor)))
3754 continue;
3755 printk("md: Loading md%d.\n", minor);
3756 if (mddev_map[minor].mddev) {
3757 printk(".. md%d already autodetected - use raid=noautodetect\n", minor);
3758 continue;
3759 }
3760 mddev = alloc_mddev(MKDEV(MD_MAJOR,minor));
3761 if (md_setup_args.pers[minor]) {
3762 /* non-persistent */
3763 mdu_array_info_t ainfo;
3764 ainfo.level = pers_to_level(md_setup_args.pers[minor]);
3765 ainfo.size = 0;
3766 ainfo.nr_disks =0;
3767 ainfo.raid_disks =0;
3768 ainfo.md_minor =minor;
3769 ainfo.not_persistent = 1;
3770
3771 ainfo.state = MD_SB_CLEAN;
3772 ainfo.active_disks = 0;
3773 ainfo.working_disks = 0;
3774 ainfo.failed_disks = 0;
3775 ainfo.spare_disks = 0;
3776 ainfo.layout = 0;
3777 ainfo.chunk_size = md_setup_args.chunk[minor];
3778 err = set_array_info(mddev, &ainfo);
3779 for (i=0; !err && (dev = md_setup_args.devices[minor][i]); i++) {
3780 dinfo.number = i;
3781 dinfo.raid_disk = i;
3782 dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC);
3783 dinfo.major = MAJOR(dev);
3784 dinfo.minor = MINOR(dev);
3785 mddev->sb->nr_disks++;
3786 mddev->sb->raid_disks++;
3787 mddev->sb->active_disks++;
3788 mddev->sb->working_disks++;
3789 err = add_new_disk (mddev, &dinfo);
3790 }
3791 } else {
3792 /* persistent */
3793 for (i = 0; (dev = md_setup_args.devices[minor][i]); i++) {
3794 dinfo.major = MAJOR(dev);
3795 dinfo.minor = MINOR(dev);
3796 add_new_disk (mddev, &dinfo);
3797 }
3798 }
3799 if (!err)
3800 err = do_md_run(mddev);
3801 if (err) {
3802 mddev->sb_dirty = 0;
3803 do_md_stop(mddev, 0);
3804 printk("md: starting md%d failed\n", minor);
3805 }
3806 }
3807 }
3808
/* Register md_setup as the handler for the "md=" kernel boot option. */
3809 __setup("md=", md_setup);
3810 #endif
3811
3812 #ifdef MODULE
/*
 * Module load entry point (only built when MODULE is defined).
 * Runs md_init() and passes its result back to the caller unchanged.
 */
int init_module (void)
{
	int status = md_init();

	return status;
}
3817
3818 static void free_device_names(void)
3819 {
3820 while (device_names.next != &device_names) {
3821 struct list_head *tmp = device_names.next;
3822 list_del(tmp);
3823 kfree(tmp);
3824 }
3825 }
3826
3827
3828 void cleanup_module (void)
3829 {
3830 struct gendisk **gendisk_ptr;
3831
3832 md_unregister_thread(md_recovery_thread);
3833 devfs_unregister(devfs_handle);
3834
3835 devfs_unregister_blkdev(MAJOR_NR,"md");
3836 unregister_reboot_notifier(&md_notifier);
3837 unregister_sysctl_table(raid_table_header);
3838 #ifdef CONFIG_PROC_FS
3839 remove_proc_entry("mdstat", NULL);
3840 #endif
3841
3842 gendisk_ptr = &gendisk_head;
3843 while (*gendisk_ptr) {
3844 if (*gendisk_ptr == &md_gendisk) {
3845 *gendisk_ptr = md_gendisk.next;
3846 break;
3847 }
3848 gendisk_ptr = & (*gendisk_ptr)->next;
3849 }
3850 blk_dev[MAJOR_NR].queue = NULL;
3851 blksize_size[MAJOR_NR] = NULL;
3852 blk_size[MAJOR_NR] = NULL;
3853 max_readahead[MAJOR_NR] = NULL;
3854 hardsect_size[MAJOR_NR] = NULL;
3855
3856 free_device_names();
3857
3858 }
3859 #endif
3860
/*
 * Initialisation hooks.  md_init is always registered; md_run_setup is
 * registered only when RAID autodetection or md boot support is
 * configured.
 */
3861 __initcall(md_init);
3862 #if defined(CONFIG_AUTODETECT_RAID) || defined(CONFIG_MD_BOOT)
3863 __initcall(md_run_setup);
3864 #endif
3865
/*
 * Symbols exported from this file.
 * NOTE(review): MD_EXPORT_SYMBOL is defined outside this chunk; presumably
 * it wraps the kernel's EXPORT_SYMBOL so the RAID personality modules can
 * link against these - confirm in the md headers.
 */
3866 MD_EXPORT_SYMBOL(md_size);
3867 MD_EXPORT_SYMBOL(register_md_personality);
3868 MD_EXPORT_SYMBOL(unregister_md_personality);
3869 MD_EXPORT_SYMBOL(partition_name);
3870 MD_EXPORT_SYMBOL(md_error);
3871 MD_EXPORT_SYMBOL(md_do_sync);
3872 MD_EXPORT_SYMBOL(md_sync_acct);
3873 MD_EXPORT_SYMBOL(md_done_sync);
3874 MD_EXPORT_SYMBOL(md_recover_arrays);
3875 MD_EXPORT_SYMBOL(md_register_thread);
3876 MD_EXPORT_SYMBOL(md_unregister_thread);
3877 MD_EXPORT_SYMBOL(md_update_sb);
3878 MD_EXPORT_SYMBOL(md_wakeup_thread);
3879 MD_EXPORT_SYMBOL(md_print_devices);
3880 MD_EXPORT_SYMBOL(find_rdev_nr);
3881 MD_EXPORT_SYMBOL(md_interrupt_thread);
3882 MD_EXPORT_SYMBOL(mddev_map);
3883 MD_EXPORT_SYMBOL(md_check_ordering);
3884
3885
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more
information.