1 /*
2 * raid5.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar
5 *
6 * RAID-5 management functions.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2, or (at your option)
11 * any later version.
12 *
13 * You should have received a copy of the GNU General Public License
14 * (for example /usr/src/linux/COPYING); if not, write to the Free
15 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
16 */
17
18
19 #include <linux/config.h>
20 #include <linux/module.h>
21 #include <linux/locks.h>
22 #include <linux/malloc.h>
23 #include <linux/raid/raid5.h>
24 #include <asm/bitops.h>
25 #include <asm/atomic.h>
26
27 static mdk_personality_t raid5_personality;
28
29 /*
30 * Stripe cache
31 */
32
/* Number of stripe heads kept in the cache. */
#define NR_STRIPES		256

/* Size of the stripe hash table in pages (HASH_PAGES_ORDER is presumably
 * the matching page-allocation order). */
#define HASH_PAGES		1
#define HASH_PAGES_ORDER	0

/* Bucket count: as many stripe_head pointers as fit in the hash pages. */
#define NR_HASH			(HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
#define HASH_MASK		(NR_HASH - 1)

/* Hash bucket (an lvalue) for sector 'sect': the sector is scaled down by
 * the current buffer size in sectors and masked into the table. */
#define stripe_hash(conf, sect)	\
	((conf)->stripe_hashtbl[((sect) / ((conf)->buffer_size >> 9)) & HASH_MASK])
39
40 /*
41 * The following can be used to debug the driver
42 */
#define RAID5_DEBUG	0
#define RAID5_PARANOIA	1

/*
 * CHECK_DEVLOCK() asserts that conf->device_lock is held; it relies on a
 * variable named 'conf' being in scope at the call site.  Both variants
 * are wrapped in do { } while (0) so the macro behaves as one statement
 * and cannot capture a following 'else'.
 */
#if RAID5_PARANOIA && CONFIG_SMP
# define CHECK_DEVLOCK() \
	do { if (!spin_is_locked(&conf->device_lock)) BUG(); } while (0)
#else
# define CHECK_DEVLOCK() do { } while (0)
#endif

/*
 * PRINTK() is a debug-only printk.  With RAID5_DEBUG set, inlining is
 * also switched off by defining the inline keywords away.
 */
#if RAID5_DEBUG
#define PRINTK(x...) printk(x)
#define inline
#define __inline__
#else
#define PRINTK(x...) do { } while (0)
#endif
58
59 static void print_raid5_conf (raid5_conf_t *conf);
60
61 static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
62 {
63 if (atomic_dec_and_test(&sh->count)) {
64 if (!list_empty(&sh->lru))
65 BUG();
66 if (atomic_read(&conf->active_stripes)==0)
67 BUG();
68 if (test_bit(STRIPE_HANDLE, &sh->state)) {
69 list_add_tail(&sh->lru, &conf->handle_list);
70 md_wakeup_thread(conf->thread);
71 }
72 else {
73 list_add_tail(&sh->lru, &conf->inactive_list);
74 atomic_dec(&conf->active_stripes);
75 wake_up(&conf->wait_for_stripe);
76 }
77 }
78 }
79 static void release_stripe(struct stripe_head *sh)
80 {
81 raid5_conf_t *conf = sh->raid_conf;
82
83 spin_lock_irq(&conf->device_lock);
84 __release_stripe(conf, sh);
85 spin_unlock_irq(&conf->device_lock);
86 }
87
88 static void remove_hash(struct stripe_head *sh)
89 {
90 PRINTK("remove_hash(), stripe %lu\n", sh->sector);
91
92 if (sh->hash_pprev) {
93 if (sh->hash_next)
94 sh->hash_next->hash_pprev = sh->hash_pprev;
95 *sh->hash_pprev = sh->hash_next;
96 sh->hash_pprev = NULL;
97 }
98 }
99
100 static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
101 {
102 struct stripe_head **shp = &stripe_hash(conf, sh->sector);
103
104 PRINTK("insert_hash(), stripe %lu\n",sh->sector);
105
106 CHECK_DEVLOCK();
107 if ((sh->hash_next = *shp) != NULL)
108 (*shp)->hash_pprev = &sh->hash_next;
109 *shp = sh;
110 sh->hash_pprev = shp;
111 }
112
113
114 /* find an idle stripe, make sure it is unhashed, and return it. */
115 static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
116 {
117 struct stripe_head *sh = NULL;
118 struct list_head *first;
119
120 CHECK_DEVLOCK();
121 if (list_empty(&conf->inactive_list))
122 goto out;
123 first = conf->inactive_list.next;
124 sh = list_entry(first, struct stripe_head, lru);
125 list_del_init(first);
126 remove_hash(sh);
127 atomic_inc(&conf->active_stripes);
128 out:
129 return sh;
130 }
131
132 static void shrink_buffers(struct stripe_head *sh, int num)
133 {
134 struct buffer_head *bh;
135 int i;
136
137 for (i=0; i<num ; i++) {
138 bh = sh->bh_cache[i];
139 if (!bh)
140 return;
141 sh->bh_cache[i] = NULL;
142 free_page((unsigned long) bh->b_data);
143 kfree(bh);
144 }
145 }
146
147 static int grow_buffers(struct stripe_head *sh, int num, int b_size, int priority)
148 {
149 struct buffer_head *bh;
150 int i;
151
152 for (i=0; i<num; i++) {
153 struct page *page;
154 bh = kmalloc(sizeof(struct buffer_head), priority);
155 if (!bh)
156 return 1;
157 memset(bh, 0, sizeof (struct buffer_head));
158 init_waitqueue_head(&bh->b_wait);
159 page = alloc_page(priority);
160 bh->b_data = page_address(page);
161 if (!bh->b_data) {
162 kfree(bh);
163 return 1;
164 }
165 atomic_set(&bh->b_count, 0);
166 bh->b_page = page;
167 sh->bh_cache[i] = bh;
168
169 }
170 return 0;
171 }
172
173 static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i);
174
175 static inline void init_stripe(struct stripe_head *sh, unsigned long sector)
176 {
177 raid5_conf_t *conf = sh->raid_conf;
178 int disks = conf->raid_disks, i;
179
180 if (atomic_read(&sh->count) != 0)
181 BUG();
182 if (test_bit(STRIPE_HANDLE, &sh->state))
183 BUG();
184
185 CHECK_DEVLOCK();
186 PRINTK("init_stripe called, stripe %lu\n", sh->sector);
187
188 remove_hash(sh);
189
190 sh->sector = sector;
191 sh->size = conf->buffer_size;
192 sh->state = 0;
193
194 for (i=disks; i--; ) {
195 if (sh->bh_read[i] || sh->bh_write[i] || sh->bh_written[i] ||
196 buffer_locked(sh->bh_cache[i])) {
197 printk("sector=%lx i=%d %p %p %p %d\n",
198 sh->sector, i, sh->bh_read[i],
199 sh->bh_write[i], sh->bh_written[i],
200 buffer_locked(sh->bh_cache[i]));
201 BUG();
202 }
203 clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state);
204 raid5_build_block(sh, i);
205 }
206 insert_hash(conf, sh);
207 }
208
209 /* the buffer size has changed, so unhash all stripes
210 * as active stripes complete, they will go onto inactive list
211 */
212 static void shrink_stripe_cache(raid5_conf_t *conf)
213 {
214 int i;
215 CHECK_DEVLOCK();
216 if (atomic_read(&conf->active_stripes))
217 BUG();
218 for (i=0; i < NR_HASH; i++) {
219 struct stripe_head *sh;
220 while ((sh = conf->stripe_hashtbl[i]))
221 remove_hash(sh);
222 }
223 }
224
225 static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector)
226 {
227 struct stripe_head *sh;
228
229 CHECK_DEVLOCK();
230 PRINTK("__find_stripe, sector %lu\n", sector);
231 for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
232 if (sh->sector == sector)
233 return sh;
234 PRINTK("__stripe %lu not in cache\n", sector);
235 return NULL;
236 }
237
238 static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long sector, int size, int noblock)
239 {
240 struct stripe_head *sh;
241
242 PRINTK("get_stripe, sector %lu\n", sector);
243
244 md_spin_lock_irq(&conf->device_lock);
245
246 do {
247 if (conf->buffer_size == 0 ||
248 (size && size != conf->buffer_size)) {
249 /* either the size is being changed (buffer_size==0) or
250 * we need to change it.
251 * If size==0, we can proceed as soon as buffer_size gets set.
252 * If size>0, we can proceed when active_stripes reaches 0, or
253 * when someone else sets the buffer_size to size.
254 * If someone sets the buffer size to something else, we will need to
255 * assert that we want to change it again
256 */
257 int oldsize = conf->buffer_size;
258 PRINTK("get_stripe %ld/%d buffer_size is %d, %d active\n", sector, size, conf->buffer_size, atomic_read(&conf->active_stripes));
259 if (size==0)
260 wait_event_lock_irq(conf->wait_for_stripe,
261 conf->buffer_size,
262 conf->device_lock);
263 else {
264 while (conf->buffer_size != size && atomic_read(&conf->active_stripes)) {
265 conf->buffer_size = 0;
266 wait_event_lock_irq(conf->wait_for_stripe,
267 atomic_read(&conf->active_stripes)==0 || conf->buffer_size,
268 conf->device_lock);
269 PRINTK("waited and now %ld/%d buffer_size is %d - %d active\n", sector, size,
270 conf->buffer_size, atomic_read(&conf->active_stripes));
271 }
272
273 if (conf->buffer_size != size) {
274 printk("raid5: switching cache buffer size, %d --> %d\n", oldsize, size);
275 shrink_stripe_cache(conf);
276 if (size==0) BUG();
277 conf->buffer_size = size;
278 PRINTK("size now %d\n", conf->buffer_size);
279 }
280 }
281 }
282 if (size == 0)
283 sector -= sector & ((conf->buffer_size>>9)-1);
284
285 sh = __find_stripe(conf, sector);
286 if (!sh) {
287 sh = get_free_stripe(conf);
288 if (noblock && sh == NULL)
289 break;
290 if (!sh) {
291 wait_event_lock_irq(conf->wait_for_stripe,
292 !list_empty(&conf->inactive_list),
293 conf->device_lock);
294 } else
295 init_stripe(sh, sector);
296 } else {
297 if (atomic_read(&sh->count)) {
298 if (!list_empty(&sh->lru))
299 BUG();
300 } else {
301 if (!test_bit(STRIPE_HANDLE, &sh->state))
302 atomic_inc(&conf->active_stripes);
303 if (list_empty(&sh->lru))
304 BUG();
305 list_del_init(&sh->lru);
306 }
307 }
308 } while (sh == NULL);
309
310 if (sh)
311 atomic_inc(&sh->count);
312
313 md_spin_unlock_irq(&conf->device_lock);
314 return sh;
315 }
316
317 static int grow_stripes(raid5_conf_t *conf, int num, int priority)
318 {
319 struct stripe_head *sh;
320
321 while (num--) {
322 sh = kmalloc(sizeof(struct stripe_head), priority);
323 if (!sh)
324 return 1;
325 memset(sh, 0, sizeof(*sh));
326 sh->raid_conf = conf;
327 sh->lock = SPIN_LOCK_UNLOCKED;
328
329 if (grow_buffers(sh, conf->raid_disks, PAGE_SIZE, priority)) {
330 shrink_buffers(sh, conf->raid_disks);
331 kfree(sh);
332 return 1;
333 }
334 /* we just created an active stripe so... */
335 atomic_set(&sh->count, 1);
336 atomic_inc(&conf->active_stripes);
337 INIT_LIST_HEAD(&sh->lru);
338 release_stripe(sh);
339 }
340 return 0;
341 }
342
343 static void shrink_stripes(raid5_conf_t *conf, int num)
344 {
345 struct stripe_head *sh;
346
347 while (num--) {
348 spin_lock_irq(&conf->device_lock);
349 sh = get_free_stripe(conf);
350 spin_unlock_irq(&conf->device_lock);
351 if (!sh)
352 break;
353 if (atomic_read(&sh->count))
354 BUG();
355 shrink_buffers(sh, conf->raid_disks);
356 kfree(sh);
357 atomic_dec(&conf->active_stripes);
358 }
359 }
360
361
362 static inline void raid5_end_buffer_read(struct buffer_head *blist, struct buffer_head *bh)
363 {
364 while (blist) {
365 struct buffer_head *new = blist;
366 blist = new->b_reqnext;
367 memcpy(new->b_data, bh->b_data, bh->b_size);
368 new->b_end_io(new, 1);
369 }
370 }
371
372 static void raid5_end_read_request (struct buffer_head * bh, int uptodate)
373 {
374 struct stripe_head *sh = bh->b_private;
375 raid5_conf_t *conf = sh->raid_conf;
376 int disks = conf->raid_disks, i;
377 unsigned long flags;
378 struct buffer_head *buffers = NULL;
379
380 for (i=0 ; i<disks; i++)
381 if (bh == sh->bh_cache[i])
382 break;
383
384 PRINTK("end_read_request %lu/%d, %d, count: %d, uptodate %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
385 if (i == disks) {
386 BUG();
387 return;
388 }
389
390 md_spin_lock_irqsave(&conf->device_lock, flags);
391 if (uptodate) {
392 #ifdef CONFIG_HIGHMEM
393 /* cannot map highmem bufferheads from irq,
394 * so leave it for stripe_handle if there might
395 * be a problem
396 */
397 if (sh->bh_read[i] &&
398 sh->bh_read[i]->b_reqnext == NULL &&
399 !PageHighMem(sh->bh_read[i]->b_page)) {
400 /* it's safe */
401 buffers = sh->bh_read[i];
402 sh->bh_read[i] = NULL;
403 }
404 #else
405 buffers = sh->bh_read[i];
406 sh->bh_read[i] = NULL;
407 #endif
408 set_bit(BH_Uptodate, &bh->b_state);
409 if (buffers) {
410 spin_unlock_irqrestore(&conf->device_lock, flags);
411 raid5_end_buffer_read(buffers, bh);
412 spin_lock_irqsave(&conf->device_lock, flags);
413 }
414 } else {
415 md_error(mddev_to_kdev(conf->mddev), bh->b_dev);
416 clear_bit(BH_Uptodate, &bh->b_state);
417 }
418 clear_bit(BH_Lock, &bh->b_state);
419 set_bit(STRIPE_HANDLE, &sh->state);
420 __release_stripe(conf, sh);
421 md_spin_unlock_irqrestore(&conf->device_lock, flags);
422 }
423
424 static void raid5_end_write_request (struct buffer_head *bh, int uptodate)
425 {
426 struct stripe_head *sh = bh->b_private;
427 raid5_conf_t *conf = sh->raid_conf;
428 int disks = conf->raid_disks, i;
429 unsigned long flags;
430
431 for (i=0 ; i<disks; i++)
432 if (bh == sh->bh_cache[i])
433 break;
434
435 PRINTK("end_write_request %lu/%d, count %d, uptodate: %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
436 if (i == disks) {
437 BUG();
438 return;
439 }
440
441 md_spin_lock_irqsave(&conf->device_lock, flags);
442 if (!uptodate)
443 md_error(mddev_to_kdev(conf->mddev), bh->b_dev);
444 clear_bit(BH_Lock, &bh->b_state);
445 set_bit(STRIPE_HANDLE, &sh->state);
446 __release_stripe(conf, sh);
447 md_spin_unlock_irqrestore(&conf->device_lock, flags);
448 }
449
450
451
452 static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i)
453 {
454 raid5_conf_t *conf = sh->raid_conf;
455 struct buffer_head *bh = sh->bh_cache[i];
456 unsigned long block = sh->sector / (sh->size >> 9);
457
458 init_buffer(bh, raid5_end_read_request, sh);
459 bh->b_dev = conf->disks[i].dev;
460 bh->b_blocknr = block;
461
462 bh->b_state = (1 << BH_Req) | (1 << BH_Mapped);
463 bh->b_size = sh->size;
464 bh->b_list = BUF_LOCKED;
465 return bh;
466 }
467
468 static int raid5_error (mddev_t *mddev, kdev_t dev)
469 {
470 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
471 mdp_super_t *sb = mddev->sb;
472 struct disk_info *disk;
473 int i;
474
475 PRINTK("raid5_error called\n");
476 conf->resync_parity = 0;
477 for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
478 if (disk->dev == dev && disk->operational) {
479 disk->operational = 0;
480 mark_disk_faulty(sb->disks+disk->number);
481 mark_disk_nonsync(sb->disks+disk->number);
482 mark_disk_inactive(sb->disks+disk->number);
483 sb->active_disks--;
484 sb->working_disks--;
485 sb->failed_disks++;
486 mddev->sb_dirty = 1;
487 conf->working_disks--;
488 conf->failed_disks++;
489 md_wakeup_thread(conf->thread);
490 printk (KERN_ALERT
491 "raid5: Disk failure on %s, disabling device."
492 " Operation continuing on %d devices\n",
493 partition_name (dev), conf->working_disks);
494 return 0;
495 }
496 }
497 /*
498 * handle errors in spares (during reconstruction)
499 */
500 if (conf->spare) {
501 disk = conf->spare;
502 if (disk->dev == dev) {
503 printk (KERN_ALERT
504 "raid5: Disk failure on spare %s\n",
505 partition_name (dev));
506 if (!conf->spare->operational) {
507 MD_BUG();
508 return -EIO;
509 }
510 disk->operational = 0;
511 disk->write_only = 0;
512 conf->spare = NULL;
513 mark_disk_faulty(sb->disks+disk->number);
514 mark_disk_nonsync(sb->disks+disk->number);
515 mark_disk_inactive(sb->disks+disk->number);
516 sb->spare_disks--;
517 sb->working_disks--;
518 sb->failed_disks++;
519
520 return 0;
521 }
522 }
523 MD_BUG();
524 return -EIO;
525 }
526
527 /*
528 * Input: a 'big' sector number,
529 * Output: index of the data and parity disk, and the sector # in them.
530 */
531 static unsigned long raid5_compute_sector(unsigned long r_sector, unsigned int raid_disks,
532 unsigned int data_disks, unsigned int * dd_idx,
533 unsigned int * pd_idx, raid5_conf_t *conf)
534 {
535 unsigned long stripe;
536 unsigned long chunk_number;
537 unsigned int chunk_offset;
538 unsigned long new_sector;
539 int sectors_per_chunk = conf->chunk_size >> 9;
540
541 /* First compute the information on this sector */
542
543 /*
544 * Compute the chunk number and the sector offset inside the chunk
545 */
546 chunk_number = r_sector / sectors_per_chunk;
547 chunk_offset = r_sector % sectors_per_chunk;
548
549 /*
550 * Compute the stripe number
551 */
552 stripe = chunk_number / data_disks;
553
554 /*
555 * Compute the data disk and parity disk indexes inside the stripe
556 */
557 *dd_idx = chunk_number % data_disks;
558
559 /*
560 * Select the parity disk based on the user selected algorithm.
561 */
562 if (conf->level == 4)
563 *pd_idx = data_disks;
564 else switch (conf->algorithm) {
565 case ALGORITHM_LEFT_ASYMMETRIC:
566 *pd_idx = data_disks - stripe % raid_disks;
567 if (*dd_idx >= *pd_idx)
568 (*dd_idx)++;
569 break;
570 case ALGORITHM_RIGHT_ASYMMETRIC:
571 *pd_idx = stripe % raid_disks;
572 if (*dd_idx >= *pd_idx)
573 (*dd_idx)++;
574 break;
575 case ALGORITHM_LEFT_SYMMETRIC:
576 *pd_idx = data_disks - stripe % raid_disks;
577 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
578 break;
579 case ALGORITHM_RIGHT_SYMMETRIC:
580 *pd_idx = stripe % raid_disks;
581 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
582 break;
583 default:
584 printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
585 }
586
587 /*
588 * Finally, compute the new sector number
589 */
590 new_sector = stripe * sectors_per_chunk + chunk_offset;
591 return new_sector;
592 }
593
#if 0
/*
 * Disabled code: inverse of raid5_compute_sector().  Given a stripe and a
 * device index it reconstructs the array-level block number and
 * cross-checks the result by mapping it forward again; returns 0 when the
 * check fails.
 */
static unsigned long compute_blocknr(struct stripe_head *sh, int i)
{
	raid5_conf_t *conf = sh->raid_conf;
	int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
	unsigned long new_sector = sh->sector, check;
	int sectors_per_chunk = conf->chunk_size >> 9;
	unsigned long stripe = new_sector / sectors_per_chunk;
	int chunk_offset = new_sector % sectors_per_chunk;
	int chunk_number, dummy1, dummy2, dd_idx = i;
	unsigned long r_sector, blocknr;

	/* undo the data-disk placement of the layout algorithm */
	switch (conf->algorithm) {
	case ALGORITHM_LEFT_ASYMMETRIC:
	case ALGORITHM_RIGHT_ASYMMETRIC:
		if (i > sh->pd_idx)
			i--;
		break;
	case ALGORITHM_LEFT_SYMMETRIC:
	case ALGORITHM_RIGHT_SYMMETRIC:
		if (i < sh->pd_idx)
			i += raid_disks;
		i -= (sh->pd_idx + 1);
		break;
	default:
		printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
	}

	chunk_number = stripe * data_disks + i;
	r_sector = chunk_number * sectors_per_chunk + chunk_offset;
	blocknr = r_sector / (sh->size >> 9);

	/* sanity check: the forward mapping must lead back to this stripe */
	check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
	if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
		printk("compute_blocknr: map not correct\n");
		return 0;
	}
	return blocknr;
}
#endif
634
/*
 * check_xor() - flush the pending XOR batch once it is full.
 *
 * Relies on two locals of the calling function: 'count', the number of
 * entries used in 'bh_ptr[]'.  When count has reached MAX_XOR_BLOCKS the
 * batch is passed to xor_block() and count is reset to 1, which keeps
 * bh_ptr[0] as the first entry of the next batch.
 */
#define check_xor()					\
	do {						\
		if (count == MAX_XOR_BLOCKS) {		\
			xor_block(count, bh_ptr);	\
			count = 1;			\
		}					\
	} while(0)
641
642
643 static void compute_block(struct stripe_head *sh, int dd_idx)
644 {
645 raid5_conf_t *conf = sh->raid_conf;
646 int i, count, disks = conf->raid_disks;
647 struct buffer_head *bh_ptr[MAX_XOR_BLOCKS], *bh;
648
649 PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx);
650
651
652 memset(sh->bh_cache[dd_idx]->b_data, 0, sh->size);
653 bh_ptr[0] = sh->bh_cache[dd_idx];
654 count = 1;
655 for (i = disks ; i--; ) {
656 if (i == dd_idx)
657 continue;
658 bh = sh->bh_cache[i];
659 if (buffer_uptodate(bh))
660 bh_ptr[count++] = bh;
661 else
662 printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
663
664 check_xor();
665 }
666 if (count != 1)
667 xor_block(count, bh_ptr);
668 set_bit(BH_Uptodate, &sh->bh_cache[dd_idx]->b_state);
669 }
670
671 static void compute_parity(struct stripe_head *sh, int method)
672 {
673 raid5_conf_t *conf = sh->raid_conf;
674 int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
675 struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
676 struct buffer_head *chosen[MD_SB_DISKS];
677
678 PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method);
679 memset(chosen, 0, sizeof(chosen));
680
681 count = 1;
682 bh_ptr[0] = sh->bh_cache[pd_idx];
683 spin_lock_irq(&conf->device_lock);
684 switch(method) {
685 case READ_MODIFY_WRITE:
686 if (!buffer_uptodate(sh->bh_cache[pd_idx]))
687 BUG();
688 for (i=disks ; i-- ;) {
689 if (i==pd_idx)
690 continue;
691 if (sh->bh_write[i] &&
692 buffer_uptodate(sh->bh_cache[i])) {
693 bh_ptr[count++] = sh->bh_cache[i];
694 chosen[i] = sh->bh_write[i];
695 sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
696 chosen[i]->b_reqnext = sh->bh_written[i];
697 sh->bh_written[i] = chosen[i];
698 check_xor();
699 }
700 }
701 break;
702 case RECONSTRUCT_WRITE:
703 memset(sh->bh_cache[pd_idx]->b_data, 0, sh->size);
704 for (i= disks; i-- ;)
705 if (i!=pd_idx && sh->bh_write[i]) {
706 chosen[i] = sh->bh_write[i];
707 sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
708 chosen[i]->b_reqnext = sh->bh_written[i];
709 sh->bh_written[i] = chosen[i];
710 check_xor();
711 }
712 break;
713 case CHECK_PARITY:
714 break;
715 }
716 spin_unlock_irq(&conf->device_lock);
717 for (i = disks; i--;)
718 if (chosen[i]) {
719 struct buffer_head *bh = sh->bh_cache[i];
720 char *bdata;
721 mark_buffer_clean(chosen[i]); /* NO FIXME */
722 bdata = bh_kmap(chosen[i]);
723 memcpy(bh->b_data,
724 bdata,sh->size);
725 bh_kunmap(chosen[i]);
726 set_bit(BH_Lock, &bh->b_state);
727 mark_buffer_uptodate(bh, 1);
728 }
729
730 switch(method) {
731 case RECONSTRUCT_WRITE:
732 case CHECK_PARITY:
733 for (i=disks; i--;)
734 if (i != pd_idx) {
735 bh_ptr[count++] = sh->bh_cache[i];
736 check_xor();
737 }
738 break;
739 case READ_MODIFY_WRITE:
740 for (i = disks; i--;)
741 if (chosen[i]) {
742 bh_ptr[count++] = sh->bh_cache[i];
743 check_xor();
744 }
745 }
746 if (count != 1)
747 xor_block(count, bh_ptr);
748
749 if (method != CHECK_PARITY) {
750 mark_buffer_uptodate(sh->bh_cache[pd_idx], 1);
751 set_bit(BH_Lock, &sh->bh_cache[pd_idx]->b_state);
752 } else
753 mark_buffer_uptodate(sh->bh_cache[pd_idx], 0);
754 }
755
756 static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
757 {
758 struct buffer_head **bhp;
759 raid5_conf_t *conf = sh->raid_conf;
760
761 PRINTK("adding bh b#%lu to stripe s#%lu\n", bh->b_blocknr, sh->sector);
762
763
764 spin_lock_irq(&conf->device_lock);
765 bh->b_reqnext = NULL;
766 if (rw == READ)
767 bhp = &sh->bh_read[dd_idx];
768 else
769 bhp = &sh->bh_write[dd_idx];
770 while (*bhp) {
771 printk(KERN_NOTICE "raid5: multiple %d requests for sector %ld\n", rw, sh->sector);
772 bhp = & (*bhp)->b_reqnext;
773 }
774 *bhp = bh;
775 spin_unlock_irq(&conf->device_lock);
776
777 PRINTK("added bh b#%lu to stripe s#%lu, disk %d.\n", bh->b_blocknr, sh->sector, dd_idx);
778 }
779
780
781
782
783
784 /*
785 * handle_stripe - do things to a stripe.
786 *
787 * We lock the stripe and then examine the state of various bits
788 * to see what needs to be done.
789 * Possible results:
790 * return some read request which now have data
791 * return some write requests which are safely on disc
792 * schedule a read on some buffers
793 * schedule a write of some buffers
794 * return confirmation of parity correctness
795 *
796 * Parity calculations are done inside the stripe lock
797 * buffers are taken off read_list or write_list, and bh_cache buffers
798 * get BH_Lock set before the stripe lock is released.
799 *
800 */
801
802 static void handle_stripe(struct stripe_head *sh)
803 {
804 raid5_conf_t *conf = sh->raid_conf;
805 int disks = conf->raid_disks;
806 struct buffer_head *return_ok= NULL, *return_fail = NULL;
807 int action[MD_SB_DISKS];
808 int i;
809 int syncing;
810 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
811 int failed_num=0;
812 struct buffer_head *bh;
813
814 PRINTK("handling stripe %ld, cnt=%d, pd_idx=%d\n", sh->sector, atomic_read(&sh->count), sh->pd_idx);
815 memset(action, 0, sizeof(action));
816
817 spin_lock(&sh->lock);
818 clear_bit(STRIPE_HANDLE, &sh->state);
819
820 syncing = test_bit(STRIPE_SYNCING, &sh->state);
821 /* Now to look around and see what can be done */
822
823 for (i=disks; i--; ) {
824 bh = sh->bh_cache[i];
825 PRINTK("check %d: state %lx read %p write %p written %p\n", i, bh->b_state, sh->bh_read[i], sh->bh_write[i], sh->bh_written[i]);
826 /* maybe we can reply to a read */
827 if (buffer_uptodate(bh) && sh->bh_read[i]) {
828 struct buffer_head *rbh, *rbh2;
829 PRINTK("Return read for disc %d\n", i);
830 spin_lock_irq(&conf->device_lock);
831 rbh = sh->bh_read[i];
832 sh->bh_read[i] = NULL;
833 spin_unlock_irq(&conf->device_lock);
834 while (rbh) {
835 char *bdata;
836 bdata = bh_kmap(rbh);
837 memcpy(bdata, bh->b_data, bh->b_size);
838 bh_kunmap(rbh);
839 rbh2 = rbh->b_reqnext;
840 rbh->b_reqnext = return_ok;
841 return_ok = rbh;
842 rbh = rbh2;
843 }
844 }
845
846 /* now count some things */
847 if (buffer_locked(bh)) locked++;
848 if (buffer_uptodate(bh)) uptodate++;
849
850
851 if (sh->bh_read[i]) to_read++;
852 if (sh->bh_write[i]) to_write++;
853 if (sh->bh_written[i]) written++;
854 if (!conf->disks[i].operational) {
855 failed++;
856 failed_num = i;
857 }
858 }
859 PRINTK("locked=%d uptodate=%d to_read=%d to_write=%d failed=%d failed_num=%d\n",
860 locked, uptodate, to_read, to_write, failed, failed_num);
861 /* check if the array has lost two devices and, if so, some requests might
862 * need to be failed
863 */
864 if (failed > 1 && to_read+to_write) {
865 spin_lock_irq(&conf->device_lock);
866 for (i=disks; i--; ) {
867 /* fail all writes first */
868 if (sh->bh_write[i]) to_write--;
869 while ((bh = sh->bh_write[i])) {
870 sh->bh_write[i] = bh->b_reqnext;
871 bh->b_reqnext = return_fail;
872 return_fail = bh;
873 }
874 /* fail any reads if this device is non-operational */
875 if (!conf->disks[i].operational) {
876 if (sh->bh_read[i]) to_read--;
877 while ((bh = sh->bh_read[i])) {
878 sh->bh_read[i] = bh->b_reqnext;
879 bh->b_reqnext = return_fail;
880 return_fail = bh;
881 }
882 }
883 }
884 spin_unlock_irq(&conf->device_lock);
885 if (syncing) {
886 md_done_sync(conf->mddev, (sh->size>>10) - sh->sync_redone,0);
887 clear_bit(STRIPE_SYNCING, &sh->state);
888 syncing = 0;
889 }
890 }
891
892 /* might be able to return some write requests if the parity block
893 * is safe, or on a failed drive
894 */
895 bh = sh->bh_cache[sh->pd_idx];
896 if ( written &&
897 ( (conf->disks[sh->pd_idx].operational && !buffer_locked(bh) && buffer_uptodate(bh))
898 || (failed == 1 && failed_num == sh->pd_idx))
899 ) {
900 /* any written block on a uptodate or failed drive can be returned */
901 for (i=disks; i--; )
902 if (sh->bh_written[i]) {
903 bh = sh->bh_cache[i];
904 if (!conf->disks[sh->pd_idx].operational ||
905 (!buffer_locked(bh) && buffer_uptodate(bh)) ) {
906 /* maybe we can return some write requests */
907 struct buffer_head *wbh, *wbh2;
908 PRINTK("Return write for disc %d\n", i);
909 spin_lock_irq(&conf->device_lock);
910 wbh = sh->bh_written[i];
911 sh->bh_written[i] = NULL;
912 spin_unlock_irq(&conf->device_lock);
913 while (wbh) {
914 wbh2 = wbh->b_reqnext;
915 wbh->b_reqnext = return_ok;
916 return_ok = wbh;
917 wbh = wbh2;
918 }
919 }
920 }
921 }
922
923 /* Now we might consider reading some blocks, either to check/generate
924 * parity, or to satisfy requests
925 */
926 if (to_read || (syncing && (uptodate+failed < disks))) {
927 for (i=disks; i--;) {
928 bh = sh->bh_cache[i];
929 if (!buffer_locked(bh) && !buffer_uptodate(bh) &&
930 (sh->bh_read[i] || syncing || (failed && sh->bh_read[failed_num]))) {
931 /* we would like to get this block, possibly
932 * by computing it, but we might not be able to
933 */
934 if (uptodate == disks-1) {
935 PRINTK("Computing block %d\n", i);
936 compute_block(sh, i);
937 uptodate++;
938 } else if (conf->disks[i].operational) {
939 set_bit(BH_Lock, &bh->b_state);
940 action[i] = READ+1;
941 locked++;
942 PRINTK("Reading block %d (sync=%d)\n", i, syncing);
943 if (syncing)
944 md_sync_acct(conf->disks[i].dev, bh->b_size>>9);
945 }
946 }
947 }
948 set_bit(STRIPE_HANDLE, &sh->state);
949 }
950
951 /* now to consider writing and what else, if anything should be read */
952 if (to_write) {
953 int rmw=0, rcw=0;
954 for (i=disks ; i--;) {
955 /* would I have to read this buffer for read_modify_write */
956 bh = sh->bh_cache[i];
957 if ((sh->bh_write[i] || i == sh->pd_idx) &&
958 !buffer_locked(bh) && !buffer_uptodate(bh)) {
959 if (conf->disks[i].operational
960 /* && !(conf->resync_parity && i == sh->pd_idx) */
961 )
962 rmw++;
963 else rmw += 2*disks; /* cannot read it */
964 }
965 /* Would I have to read this buffer for reconstruct_write */
966 if (!sh->bh_write[i] && i != sh->pd_idx &&
967 !buffer_locked(bh) && !buffer_uptodate(bh)) {
968 if (conf->disks[i].operational) rcw++;
969 else rcw += 2*disks;
970 }
971 }
972 PRINTK("for sector %ld, rmw=%d rcw=%d\n", sh->sector, rmw, rcw);
973 set_bit(STRIPE_HANDLE, &sh->state);
974 if (rmw < rcw && rmw > 0)
975 /* prefer read-modify-write, but need to get some data */
976 for (i=disks; i--;) {
977 bh = sh->bh_cache[i];
978 if ((sh->bh_write[i] || i == sh->pd_idx) &&
979 !buffer_locked(bh) && !buffer_uptodate(bh) &&
980 conf->disks[i].operational) {
981 PRINTK("Read_old block %d for r-m-w\n", i);
982 set_bit(BH_Lock, &bh->b_state);
983 action[i] = READ+1;
984 locked++;
985 }
986 }
987 if (rcw <= rmw && rcw > 0)
988 /* want reconstruct write, but need to get some data */
989 for (i=disks; i--;) {
990 bh = sh->bh_cache[i];
991 if (!sh->bh_write[i] && i != sh->pd_idx &&
992 !buffer_locked(bh) && !buffer_uptodate(bh) &&
993 conf->disks[i].operational) {
994 PRINTK("Read_old block %d for Reconstruct\n", i);
995 set_bit(BH_Lock, &bh->b_state);
996 action[i] = READ+1;
997 locked++;
998 }
999 }
1000 /* now if nothing is locked, and if we have enough data, we can start a write request */
1001 if (locked == 0 && (rcw == 0 ||rmw == 0)) {
1002 PRINTK("Computing parity...\n");
1003 compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
1004 /* now every locked buffer is ready to be written */
1005 for (i=disks; i--;)
1006 if (buffer_locked(sh->bh_cache[i])) {
1007 PRINTK("Writing block %d\n", i);
1008 locked++;
1009 action[i] = WRITE+1;
1010 if (!conf->disks[i].operational
1011 || (i==sh->pd_idx && failed == 0))
1012 set_bit(STRIPE_INSYNC, &sh->state);
1013 }
1014 }
1015 }
1016
1017 /* maybe we need to check and possibly fix the parity for this stripe
1018 * Any reads will already have been scheduled, so we just see if enough data
1019 * is available
1020 */
1021 if (syncing && locked == 0 &&
1022 !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
1023 set_bit(STRIPE_HANDLE, &sh->state);
1024 if (failed == 0) {
1025 if (uptodate != disks)
1026 BUG();
1027 compute_parity(sh, CHECK_PARITY);
1028 uptodate--;
1029 bh = sh->bh_cache[sh->pd_idx];
1030 if ((*(u32*)bh->b_data) == 0 &&
1031 !memcmp(bh->b_data, bh->b_data+4, bh->b_size-4)) {
1032 /* parity is correct (on disc, not in buffer any more) */
1033 set_bit(STRIPE_INSYNC, &sh->state);
1034 }
1035 }
1036 if (!test_bit(STRIPE_INSYNC, &sh->state)) {
1037 if (failed==0)
1038 failed_num = sh->pd_idx;
1039 /* should be able to compute the missing block and write it to spare */
1040 if (!buffer_uptodate(sh->bh_cache[failed_num])) {
1041 if (uptodate+1 != disks)
1042 BUG();
1043 compute_block(sh, failed_num);
1044 uptodate++;
1045 }
1046 if (uptodate != disks)
1047 BUG();
1048 bh = sh->bh_cache[failed_num];
1049 set_bit(BH_Lock, &bh->b_state);
1050 action[failed_num] = WRITE+1;
1051 locked++;
1052 set_bit(STRIPE_INSYNC, &sh->state);
1053 if (conf->disks[i].operational)
1054 md_sync_acct(conf->disks[i].dev, bh->b_size>>9);
1055 else if (conf->spare)
1056 md_sync_acct(conf->spare->dev, bh->b_size>>9);
1057
1058 }
1059 }
1060 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
1061 md_done_sync(conf->mddev, (sh->size>>10) - sh->sync_redone,1);
1062 clear_bit(STRIPE_SYNCING, &sh->state);
1063 }
1064
1065
1066 spin_unlock(&sh->lock);
1067
1068 while ((bh=return_ok)) {
1069 return_ok = bh->b_reqnext;
1070 bh->b_reqnext = NULL;
1071 bh->b_end_io(bh, 1);
1072 }
1073 while ((bh=return_fail)) {
1074 return_ok = bh->b_reqnext;
1075 bh->b_reqnext = NULL;
1076 bh->b_end_io(bh, 0);
1077 }
1078 for (i=disks; i-- ;)
1079 if (action[i]) {
1080 struct buffer_head *bh = sh->bh_cache[i];
1081 int skip = 0;
1082 if (action[i] == READ+1)
1083 bh->b_end_io = raid5_end_read_request;
1084 else
1085 bh->b_end_io = raid5_end_write_request;
1086 if (conf->disks[i].operational)
1087 bh->b_dev = conf->disks[i].dev;
1088 else if (conf->spare && action[i] == WRITE+1)
1089 bh->b_dev = conf->spare->dev;
1090 else if (action[i] == READ+1)
1091 BUG();
1092 else skip=1;
1093 if (!skip) {
1094 PRINTK("for %ld schedule op %d on disc %d\n", sh->sector, action[i]-1, i);
1095 atomic_inc(&sh->count);
1096 bh->b_rdev = bh->b_dev;
1097 bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
1098 generic_make_request(action[i]-1, bh);
1099 } else {
1100 PRINTK("skip op %d on disc %d for sector %ld\n", action[i]-1, i, sh->sector);
1101 clear_bit(BH_Lock, &bh->b_state);
1102 set_bit(STRIPE_HANDLE, &sh->state);
1103 }
1104 }
1105 }
1106
1107
1108 static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh)
1109 {
1110 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1111 const unsigned int raid_disks = conf->raid_disks;
1112 const unsigned int data_disks = raid_disks - 1;
1113 unsigned int dd_idx, pd_idx;
1114 unsigned long new_sector;
1115 int read_ahead = 0;
1116
1117 struct stripe_head *sh;
1118
1119 if (rw == READA) {
1120 rw = READ;
1121 read_ahead=1;
1122 }
1123
1124 new_sector = raid5_compute_sector(bh->b_rsector,
1125 raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1126
1127 PRINTK("raid5_make_request, sector %lu\n", new_sector);
1128 sh = get_active_stripe(conf, new_sector, bh->b_size, read_ahead);
1129 if (sh) {
1130 sh->pd_idx = pd_idx;
1131
1132 add_stripe_bh(sh, bh, dd_idx, rw);
1133 handle_stripe(sh);
1134 release_stripe(sh);
1135 } else
1136 bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
1137 return 0;
1138 }
1139
1140 /*
1141 * Determine correct block size for this device.
1142 */
1143 unsigned int device_bsize (kdev_t dev)
1144 {
1145 unsigned int i, correct_size;
1146
1147 correct_size = BLOCK_SIZE;
1148 if (blksize_size[MAJOR(dev)]) {
1149 i = blksize_size[MAJOR(dev)][MINOR(dev)];
1150 if (i)
1151 correct_size = i;
1152 }
1153
1154 return correct_size;
1155 }
1156
1157 static int raid5_sync_request (mddev_t *mddev, unsigned long block_nr)
1158 {
1159 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1160 struct stripe_head *sh;
1161 int sectors_per_chunk = conf->chunk_size >> 9;
1162 unsigned long stripe = (block_nr<<1)/sectors_per_chunk;
1163 int chunk_offset = (block_nr<<1) % sectors_per_chunk;
1164 int dd_idx, pd_idx;
1165 unsigned long first_sector;
1166 int raid_disks = conf->raid_disks;
1167 int data_disks = raid_disks-1;
1168 int redone = 0;
1169 int bufsize;
1170
1171 sh = get_active_stripe(conf, block_nr<<1, 0, 0);
1172 bufsize = sh->size;
1173 redone = block_nr-(sh->sector>>1);
1174 first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
1175 + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1176 sh->pd_idx = pd_idx;
1177 spin_lock(&sh->lock);
1178 set_bit(STRIPE_SYNCING, &sh->state);
1179 clear_bit(STRIPE_INSYNC, &sh->state);
1180 sh->sync_redone = redone;
1181 spin_unlock(&sh->lock);
1182
1183 handle_stripe(sh);
1184 release_stripe(sh);
1185
1186 return (bufsize>>10)-redone;
1187 }
1188
1189 /*
1190 * This is our raid5 kernel thread.
1191 *
1192 * We scan the hash table for stripes which can be handled now.
1193 * During the scan, completed stripes are saved for us by the interrupt
1194 * handler, so that they will not have to wait for our next wakeup.
1195 */
1196 static void raid5d (void *data)
1197 {
1198 struct stripe_head *sh;
1199 raid5_conf_t *conf = data;
1200 mddev_t *mddev = conf->mddev;
1201 int handled;
1202
1203 PRINTK("+++ raid5d active\n");
1204
1205 handled = 0;
1206
1207 if (mddev->sb_dirty) {
1208 mddev->sb_dirty = 0;
1209 md_update_sb(mddev);
1210 }
1211 md_spin_lock_irq(&conf->device_lock);
1212 while (!list_empty(&conf->handle_list)) {
1213 struct list_head *first = conf->handle_list.next;
1214 sh = list_entry(first, struct stripe_head, lru);
1215
1216 list_del_init(first);
1217 atomic_inc(&sh->count);
1218 if (atomic_read(&sh->count)!= 1)
1219 BUG();
1220 md_spin_unlock_irq(&conf->device_lock);
1221
1222 handled++;
1223 handle_stripe(sh);
1224 release_stripe(sh);
1225
1226 md_spin_lock_irq(&conf->device_lock);
1227 }
1228 PRINTK("%d stripes handled\n", handled);
1229
1230 md_spin_unlock_irq(&conf->device_lock);
1231
1232 PRINTK("--- raid5d inactive\n");
1233 }
1234
1235 /*
1236 * Private kernel thread for parity reconstruction after an unclean
1237 * shutdown. Reconstruction on spare drives in case of a failed drive
1238 * is done by the generic mdsyncd.
1239 */
1240 static void raid5syncd (void *data)
1241 {
1242 raid5_conf_t *conf = data;
1243 mddev_t *mddev = conf->mddev;
1244
1245 if (!conf->resync_parity)
1246 return;
1247 if (conf->resync_parity == 2)
1248 return;
1249 down(&mddev->recovery_sem);
1250 if (md_do_sync(mddev,NULL)) {
1251 up(&mddev->recovery_sem);
1252 printk("raid5: resync aborted!\n");
1253 return;
1254 }
1255 conf->resync_parity = 0;
1256 up(&mddev->recovery_sem);
1257 printk("raid5: resync finished.\n");
1258 }
1259
1260 static int __check_consistency (mddev_t *mddev, int row)
1261 {
1262 raid5_conf_t *conf = mddev->private;
1263 kdev_t dev;
1264 struct buffer_head *bh[MD_SB_DISKS], *tmp = NULL;
1265 int i, ret = 0, nr = 0, count;
1266 struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
1267
1268 if (conf->working_disks != conf->raid_disks)
1269 goto out;
1270 tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
1271 tmp->b_size = 4096;
1272 tmp->b_page = alloc_page(GFP_KERNEL);
1273 tmp->b_data = page_address(tmp->b_page);
1274 if (!tmp->b_data)
1275 goto out;
1276 md_clear_page(tmp->b_data);
1277 memset(bh, 0, MD_SB_DISKS * sizeof(struct buffer_head *));
1278 for (i = 0; i < conf->raid_disks; i++) {
1279 dev = conf->disks[i].dev;
1280 set_blocksize(dev, 4096);
1281 bh[i] = bread(dev, row / 4, 4096);
1282 if (!bh[i])
1283 break;
1284 nr++;
1285 }
1286 if (nr == conf->raid_disks) {
1287 bh_ptr[0] = tmp;
1288 count = 1;
1289 for (i = 1; i < nr; i++) {
1290 bh_ptr[count++] = bh[i];
1291 if (count == MAX_XOR_BLOCKS) {
1292 xor_block(count, &bh_ptr[0]);
1293 count = 1;
1294 }
1295 }
1296 if (count != 1) {
1297 xor_block(count, &bh_ptr[0]);
1298 }
1299 if (memcmp(tmp->b_data, bh[0]->b_data, 4096))
1300 ret = 1;
1301 }
1302 for (i = 0; i < conf->raid_disks; i++) {
1303 dev = conf->disks[i].dev;
1304 if (bh[i]) {
1305 bforget(bh[i]);
1306 bh[i] = NULL;
1307 }
1308 fsync_dev(dev);
1309 invalidate_buffers(dev);
1310 }
1311 free_page((unsigned long) tmp->b_data);
1312 out:
1313 if (tmp)
1314 kfree(tmp);
1315 return ret;
1316 }
1317
1318 static int check_consistency (mddev_t *mddev)
1319 {
1320 if (__check_consistency(mddev, 0))
1321 /*
1322 * We are not checking this currently, as it's legitimate to have
1323 * an inconsistent array, at creation time.
1324 */
1325 return 0;
1326
1327 return 0;
1328 }
1329
1330 static int raid5_run (mddev_t *mddev)
1331 {
1332 raid5_conf_t *conf;
1333 int i, j, raid_disk, memory;
1334 mdp_super_t *sb = mddev->sb;
1335 mdp_disk_t *desc;
1336 mdk_rdev_t *rdev;
1337 struct disk_info *disk;
1338 struct md_list_head *tmp;
1339 int start_recovery = 0;
1340
1341 MOD_INC_USE_COUNT;
1342
1343 if (sb->level != 5 && sb->level != 4) {
1344 printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level);
1345 MOD_DEC_USE_COUNT;
1346 return -EIO;
1347 }
1348
1349 mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
1350 if ((conf = mddev->private) == NULL)
1351 goto abort;
1352 memset (conf, 0, sizeof (*conf));
1353 conf->mddev = mddev;
1354
1355 if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
1356 goto abort;
1357 memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
1358
1359 conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
1360 md_init_waitqueue_head(&conf->wait_for_stripe);
1361 INIT_LIST_HEAD(&conf->handle_list);
1362 INIT_LIST_HEAD(&conf->inactive_list);
1363 atomic_set(&conf->active_stripes, 0);
1364 conf->buffer_size = PAGE_SIZE; /* good default for rebuild */
1365
1366 PRINTK("raid5_run(md%d) called.\n", mdidx(mddev));
1367
1368 ITERATE_RDEV(mddev,rdev,tmp) {
1369 /*
1370 * This is important -- we are using the descriptor on
1371 * the disk only to get a pointer to the descriptor on
1372 * the main superblock, which might be more recent.
1373 */
1374 desc = sb->disks + rdev->desc_nr;
1375 raid_disk = desc->raid_disk;
1376 disk = conf->disks + raid_disk;
1377
1378 if (disk_faulty(desc)) {
1379 printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev));
1380 if (!rdev->faulty) {
1381 MD_BUG();
1382 goto abort;
1383 }
1384 disk->number = desc->number;
1385 disk->raid_disk = raid_disk;
1386 disk->dev = rdev->dev;
1387
1388 disk->operational = 0;
1389 disk->write_only = 0;
1390 disk->spare = 0;
1391 disk->used_slot = 1;
1392 continue;
1393 }
1394 if (disk_active(desc)) {
1395 if (!disk_sync(desc)) {
1396 printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev));
1397 MD_BUG();
1398 goto abort;
1399 }
1400 if (raid_disk > sb->raid_disks) {
1401 printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev));
1402 continue;
1403 }
1404 if (disk->operational) {
1405 printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk);
1406 continue;
1407 }
1408 printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk);
1409
1410 disk->number = desc->number;
1411 disk->raid_disk = raid_disk;
1412 disk->dev = rdev->dev;
1413 disk->operational = 1;
1414 disk->used_slot = 1;
1415
1416 conf->working_disks++;
1417 } else {
1418 /*
1419 * Must be a spare disk ..
1420 */
1421 printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev));
1422 disk->number = desc->number;
1423 disk->raid_disk = raid_disk;
1424 disk->dev = rdev->dev;
1425
1426 disk->operational = 0;
1427 disk->write_only = 0;
1428 disk->spare = 1;
1429 disk->used_slot = 1;
1430 }
1431 }
1432
1433 for (i = 0; i < MD_SB_DISKS; i++) {
1434 desc = sb->disks + i;
1435 raid_disk = desc->raid_disk;
1436 disk = conf->disks + raid_disk;
1437
1438 if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
1439 !conf->disks[raid_disk].used_slot) {
1440
1441 disk->number = desc->number;
1442 disk->raid_disk = raid_disk;
1443 disk->dev = MKDEV(0,0);
1444
1445 disk->operational = 0;
1446 disk->write_only = 0;
1447 disk->spare = 0;
1448 disk->used_slot = 1;
1449 }
1450 }
1451
1452 conf->raid_disks = sb->raid_disks;
1453 /*
1454 * 0 for a fully functional array, 1 for a degraded array.
1455 */
1456 conf->failed_disks = conf->raid_disks - conf->working_disks;
1457 conf->mddev = mddev;
1458 conf->chunk_size = sb->chunk_size;
1459 conf->level = sb->level;
1460 conf->algorithm = sb->layout;
1461 conf->max_nr_stripes = NR_STRIPES;
1462
1463 #if 0
1464 for (i = 0; i < conf->raid_disks; i++) {
1465 if (!conf->disks[i].used_slot) {
1466 MD_BUG();
1467 goto abort;
1468 }
1469 }
1470 #endif
1471 if (!conf->chunk_size || conf->chunk_size % 4) {
1472 printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev));
1473 goto abort;
1474 }
1475 if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
1476 printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
1477 goto abort;
1478 }
1479 if (conf->failed_disks > 1) {
1480 printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks);
1481 goto abort;
1482 }
1483
1484 if (conf->working_disks != sb->raid_disks) {
1485 printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
1486 start_recovery = 1;
1487 }
1488
1489 if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN)) &&
1490 check_consistency(mddev)) {
1491 printk(KERN_ERR "raid5: detected raid-5 superblock xor inconsistency -- running resync\n");
1492 sb->state &= ~(1 << MD_SB_CLEAN);
1493 }
1494
1495 {
1496 const char * name = "raid5d";
1497
1498 conf->thread = md_register_thread(raid5d, conf, name);
1499 if (!conf->thread) {
1500 printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
1501 goto abort;
1502 }
1503 }
1504
1505 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
1506 conf->raid_disks * ((sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
1507 if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
1508 printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
1509 shrink_stripes(conf, conf->max_nr_stripes);
1510 goto abort;
1511 } else
1512 printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
1513
1514 /*
1515 * Regenerate the "device is in sync with the raid set" bit for
1516 * each device.
1517 */
1518 for (i = 0; i < MD_SB_DISKS ; i++) {
1519 mark_disk_nonsync(sb->disks + i);
1520 for (j = 0; j < sb->raid_disks; j++) {
1521 if (!conf->disks[j].operational)
1522 continue;
1523 if (sb->disks[i].number == conf->disks[j].number)
1524 mark_disk_sync(sb->disks + i);
1525 }
1526 }
1527 sb->active_disks = conf->working_disks;
1528
1529 if (sb->active_disks == sb->raid_disks)
1530 printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
1531 else
1532 printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
1533
1534 if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
1535 const char * name = "raid5syncd";
1536
1537 conf->resync_thread = md_register_thread(raid5syncd, conf,name);
1538 if (!conf->resync_thread) {
1539 printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
1540 goto abort;
1541 }
1542
1543 printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev));
1544 conf->resync_parity = 1;
1545 md_wakeup_thread(conf->resync_thread);
1546 }
1547
1548 print_raid5_conf(conf);
1549 if (start_recovery)
1550 md_recover_arrays();
1551 print_raid5_conf(conf);
1552
1553 /* Ok, everything is just fine now */
1554 return (0);
1555 abort:
1556 if (conf) {
1557 print_raid5_conf(conf);
1558 if (conf->stripe_hashtbl)
1559 free_pages((unsigned long) conf->stripe_hashtbl,
1560 HASH_PAGES_ORDER);
1561 kfree(conf);
1562 }
1563 mddev->private = NULL;
1564 printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev));
1565 MOD_DEC_USE_COUNT;
1566 return -EIO;
1567 }
1568
1569 static int raid5_stop_resync (mddev_t *mddev)
1570 {
1571 raid5_conf_t *conf = mddev_to_conf(mddev);
1572 mdk_thread_t *thread = conf->resync_thread;
1573
1574 if (thread) {
1575 if (conf->resync_parity) {
1576 conf->resync_parity = 2;
1577 md_interrupt_thread(thread);
1578 printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n");
1579 return 1;
1580 }
1581 return 0;
1582 }
1583 return 0;
1584 }
1585
1586 static int raid5_restart_resync (mddev_t *mddev)
1587 {
1588 raid5_conf_t *conf = mddev_to_conf(mddev);
1589
1590 if (conf->resync_parity) {
1591 if (!conf->resync_thread) {
1592 MD_BUG();
1593 return 0;
1594 }
1595 printk("raid5: waking up raid5resync.\n");
1596 conf->resync_parity = 1;
1597 md_wakeup_thread(conf->resync_thread);
1598 return 1;
1599 } else
1600 printk("raid5: no restart-resync needed.\n");
1601 return 0;
1602 }
1603
1604
1605 static int raid5_stop (mddev_t *mddev)
1606 {
1607 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1608
1609 if (conf->resync_thread)
1610 md_unregister_thread(conf->resync_thread);
1611 md_unregister_thread(conf->thread);
1612 shrink_stripes(conf, conf->max_nr_stripes);
1613 free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
1614 kfree(conf);
1615 mddev->private = NULL;
1616 MOD_DEC_USE_COUNT;
1617 return 0;
1618 }
1619
#if RAID5_DEBUG
/*
 * Debug helper: dump one stripe head (sector, size, parity index, state,
 * reference count) and every cached buffer head it owns.
 */
static void print_sh (struct stripe_head *sh)
{
	int slot;

	printk("sh %lu, size %d, pd_idx %d, state %ld.\n", sh->sector, sh->size, sh->pd_idx, sh->state);
	printk("sh %lu, count %d.\n", sh->sector, atomic_read(&sh->count));
	printk("sh %lu, ", sh->sector);
	for (slot = 0; slot < MD_SB_DISKS; slot++) {
		struct buffer_head *cached = sh->bh_cache[slot];

		if (!cached)
			continue;
		printk("(cache%d: %p %ld) ", slot, cached, cached->b_state);
	}
	printk("\n");
}

/*
 * Debug helper: walk every hash chain under conf->device_lock and dump
 * each stripe that belongs to this configuration.
 */
static void printall (raid5_conf_t *conf)
{
	struct stripe_head *sh;
	int bucket = 0;

	md_spin_lock_irq(&conf->device_lock);
	while (bucket < NR_HASH) {
		sh = conf->stripe_hashtbl[bucket];
		while (sh) {
			if (sh->raid_conf == conf)
				print_sh(sh);
			sh = sh->hash_next;
		}
		bucket++;
	}
	md_spin_unlock_irq(&conf->device_lock);

	/* NOTE(review): this message looks copied from raid5d() -- confirm. */
	PRINTK("--- raid5d inactive\n");
}
#endif
1654
1655 static int raid5_status (char *page, mddev_t *mddev)
1656 {
1657 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1658 mdp_super_t *sb = mddev->sb;
1659 int sz = 0, i;
1660
1661 sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout);
1662 sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks);
1663 for (i = 0; i < conf->raid_disks; i++)
1664 sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_");
1665 sz += sprintf (page+sz, "]");
1666 #if RAID5_DEBUG
1667 #define D(x) \
1668 sz += sprintf (page+sz, "<"#x":%d>", atomic_read(&conf->x))
1669 printall(conf);
1670 #endif
1671 return sz;
1672 }
1673
1674 static void print_raid5_conf (raid5_conf_t *conf)
1675 {
1676 int i;
1677 struct disk_info *tmp;
1678
1679 printk("RAID5 conf printout:\n");
1680 if (!conf) {
1681 printk("(conf==NULL)\n");
1682 return;
1683 }
1684 printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
1685 conf->working_disks, conf->failed_disks);
1686
1687 #if RAID5_DEBUG
1688 for (i = 0; i < MD_SB_DISKS; i++) {
1689 #else
1690 for (i = 0; i < conf->working_disks+conf->failed_disks; i++) {
1691 #endif
1692 tmp = conf->disks + i;
1693 printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
1694 i, tmp->spare,tmp->operational,
1695 tmp->number,tmp->raid_disk,tmp->used_slot,
1696 partition_name(tmp->dev));
1697 }
1698 }
1699
/*
 * Apply a disk operation (`state` is one of the DISKOP_* values) to the
 * RAID-5 configuration.  `*d` is the superblock descriptor of the disk
 * being operated on; for DISKOP_SPARE_ACTIVE it is updated to point at
 * the descriptor that ends up describing the failed disk.
 *
 * The function works in two passes, both under conf->device_lock:
 * the first switch locates the slot(s) involved, the second switch
 * performs the state change.
 *
 * Returns 0 on success, 1 when an internal consistency check fails
 * (MD_BUG() is raised first), or -EBUSY when asked to hot-remove a disk
 * that is still operational.
 */
static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
{
	int err = 0;
	int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
	raid5_conf_t *conf = mddev->private;
	struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
	mdp_super_t *sb = mddev->sb;
	mdp_disk_t *failed_desc, *spare_desc, *added_desc;

	print_raid5_conf(conf);
	md_spin_lock_irq(&conf->device_lock);
	/*
	 * find the disk ...
	 */
	switch (state) {

	case DISKOP_SPARE_ACTIVE:

		/*
		 * Find the failed disk within the RAID5 configuration ...
		 * (this can only be in the first conf->raid_disks part)
		 */
		for (i = 0; i < conf->raid_disks; i++) {
			tmp = conf->disks + i;
			if ((!tmp->operational && !tmp->spare) ||
					!tmp->used_slot) {
				failed_disk = i;
				break;
			}
		}
		/*
		 * When we activate a spare disk we _must_ have a disk in
		 * the lower (active) part of the array to replace.
		 */
		if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		/* fall through */

	case DISKOP_SPARE_WRITE:
	case DISKOP_SPARE_INACTIVE:

		/*
		 * Find the spare disk ... (can only be in the 'high'
		 * area of the array)
		 */
		for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
			tmp = conf->disks + i;
			if (tmp->spare && tmp->number == (*d)->number) {
				spare_disk = i;
				break;
			}
		}
		if (spare_disk == -1) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		break;

	case DISKOP_HOT_REMOVE_DISK:

		/* Any used slot may be removed, but not while operational. */
		for (i = 0; i < MD_SB_DISKS; i++) {
			tmp = conf->disks + i;
			if (tmp->used_slot && (tmp->number == (*d)->number)) {
				if (tmp->operational) {
					err = -EBUSY;
					goto abort;
				}
				removed_disk = i;
				break;
			}
		}
		if (removed_disk == -1) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		break;

	case DISKOP_HOT_ADD_DISK:

		/* New disks go into the first unused slot of the 'high' area. */
		for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
			tmp = conf->disks + i;
			if (!tmp->used_slot) {
				added_disk = i;
				break;
			}
		}
		if (added_disk == -1) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		break;
	}

	/*
	 * Second pass: perform the operation on the slot(s) found above.
	 * A `state` not handled by the first switch ends up in `default`.
	 */
	switch (state) {
	/*
	 * Switch the spare disk to write-only mode:
	 */
	case DISKOP_SPARE_WRITE:
		/* Only one spare may be under reconstruction at a time. */
		if (conf->spare) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		sdisk = conf->disks + spare_disk;
		sdisk->operational = 1;
		sdisk->write_only = 1;
		conf->spare = sdisk;
		break;
	/*
	 * Deactivate a spare disk:
	 */
	case DISKOP_SPARE_INACTIVE:
		sdisk = conf->disks + spare_disk;
		sdisk->operational = 0;
		sdisk->write_only = 0;
		/*
		 * Was the spare being resynced?
		 */
		if (conf->spare == sdisk)
			conf->spare = NULL;
		break;
	/*
	 * Activate (mark read-write) the (now sync) spare disk,
	 * which means we switch its 'raid position' (->raid_disk)
	 * with the failed disk. (only the first 'conf->raid_disks'
	 * slots are used for 'real' disks and we must preserve this
	 * property)
	 */
	case DISKOP_SPARE_ACTIVE:
		if (!conf->spare) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		sdisk = conf->disks + spare_disk;
		fdisk = conf->disks + failed_disk;

		spare_desc = &sb->disks[sdisk->number];
		failed_desc = &sb->disks[fdisk->number];

		/*
		 * Cross-check the in-memory slots against the superblock
		 * descriptors before touching anything.
		 */
		if (spare_desc != *d) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		if (spare_desc->raid_disk != sdisk->raid_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		if (sdisk->raid_disk != spare_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		if (failed_desc->raid_disk != fdisk->raid_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		if (fdisk->raid_disk != failed_disk) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		/*
		 * do the switch finally
		 */
		xchg_values(*spare_desc, *failed_desc);
		xchg_values(*fdisk, *sdisk);

		/*
		 * (careful, 'failed' and 'spare' are switched from now on)
		 *
		 * we want to preserve linear numbering and we want to
		 * give the proper raid_disk number to the now activated
		 * disk. (this means we switch back these values)
		 */

		xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
		xchg_values(sdisk->raid_disk, fdisk->raid_disk);
		xchg_values(spare_desc->number, failed_desc->number);
		xchg_values(sdisk->number, fdisk->number);

		*d = failed_desc;

		/* The old failed disk may have had no device attached at all. */
		if (sdisk->dev == MKDEV(0,0))
			sdisk->used_slot = 0;

		/*
		 * this really activates the spare.
		 */
		fdisk->spare = 0;
		fdisk->write_only = 0;

		/*
		 * if we activate a spare, we definitely replace a
		 * non-operational disk slot in the 'low' area of
		 * the disk array.
		 */
		conf->failed_disks--;
		conf->working_disks++;
		conf->spare = NULL;

		break;

	case DISKOP_HOT_REMOVE_DISK:
		rdisk = conf->disks + removed_disk;

		/* A spare must never sit in the 'low' (active) area. */
		if (rdisk->spare && (removed_disk < conf->raid_disks)) {
			MD_BUG();
			err = 1;
			goto abort;
		}
		rdisk->dev = MKDEV(0,0);
		rdisk->used_slot = 0;

		break;

	case DISKOP_HOT_ADD_DISK:
		adisk = conf->disks + added_disk;
		added_desc = *d;

		/* The slot index and the descriptor number must agree. */
		if (added_disk != added_desc->number) {
			MD_BUG();
			err = 1;
			goto abort;
		}

		adisk->number = added_desc->number;
		adisk->raid_disk = added_desc->raid_disk;
		adisk->dev = MKDEV(added_desc->major,added_desc->minor);

		/* A hot-added disk starts life as an idle spare. */
		adisk->operational = 0;
		adisk->write_only = 0;
		adisk->spare = 1;
		adisk->used_slot = 1;


		break;

	default:
		MD_BUG();
		err = 1;
		goto abort;
	}
abort:
	/* Common exit for success and failure: drop the lock, log the result. */
	md_spin_unlock_irq(&conf->device_lock);
	print_raid5_conf(conf);
	return err;
}
1962
1963 static mdk_personality_t raid5_personality=
1964 {
1965 name: "raid5",
1966 make_request: raid5_make_request,
1967 run: raid5_run,
1968 stop: raid5_stop,
1969 status: raid5_status,
1970 error_handler: raid5_error,
1971 diskop: raid5_diskop,
1972 stop_resync: raid5_stop_resync,
1973 restart_resync: raid5_restart_resync,
1974 sync_request: raid5_sync_request
1975 };
1976
1977 static int md__init raid5_init (void)
1978 {
1979 return register_md_personality (RAID5, &raid5_personality);
1980 }
1981
1982 static void raid5_exit (void)
1983 {
1984 unregister_md_personality (RAID5);
1985 }
1986
/* Hook raid5_init()/raid5_exit() up as the module's init and exit routines. */
module_init(raid5_init);
module_exit(raid5_exit);
1989
1990
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more
information.