~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

Linux Cross Reference
Linux/drivers/md/raid5.c

Version: ~ [ 2.4.0 ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /*
  2  * raid5.c : Multiple Devices driver for Linux
  3  *         Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
  4  *         Copyright (C) 1999, 2000 Ingo Molnar
  5  *
  6  * RAID-5 management functions.
  7  *
  8  * This program is free software; you can redistribute it and/or modify
  9  * it under the terms of the GNU General Public License as published by
 10  * the Free Software Foundation; either version 2, or (at your option)
 11  * any later version.
 12  *
 13  * You should have received a copy of the GNU General Public License
 14  * (for example /usr/src/linux/COPYING); if not, write to the Free
 15  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 16  */
 17 
 18 
 19 #include <linux/config.h>
 20 #include <linux/module.h>
 21 #include <linux/locks.h>
 22 #include <linux/malloc.h>
 23 #include <linux/raid/raid5.h>
 24 #include <asm/bitops.h>
 25 #include <asm/atomic.h>
 26 
 27 static mdk_personality_t raid5_personality;
 28 
 29 /*
 30  * Stripe cache
     *
     * NR_HASH is the number of struct stripe_head pointers that fit into
     * HASH_PAGES pages.  HASH_MASK (NR_HASH - 1) is applied as a bit mask, so
     * every bucket is only reachable when NR_HASH is a power of two.
     *
     * stripe_hash() selects the hash bucket for a sector: the sector number is
     * divided by the number of 512-byte sectors per cache buffer
     * (conf->buffer_size >> 9) and the quotient is masked with HASH_MASK.
     *
     * NOTE(review): NR_STRIPES and HASH_PAGES_ORDER are not referenced in this
     * part of the file - presumably the number of stripe heads to allocate and
     * the page-allocation order of the hash table; confirm in the setup code.
 31  */
 32 
 33 #define NR_STRIPES              256
 34 #define HASH_PAGES              1
 35 #define HASH_PAGES_ORDER        0
 36 #define NR_HASH                 (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
 37 #define HASH_MASK               (NR_HASH - 1)
 38 #define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) / ((conf)->buffer_size >> 9)) & HASH_MASK])
 39 
 40 /*
 41  * The following can be used to debug the driver
     *
     * RAID5_PARANOIA (together with CONFIG_SMP) turns CHECK_DEVLOCK() into a
     * check that calls BUG() when conf->device_lock is not locked; the macro
     * relies on a variable named 'conf' being in scope at the point of use.
     * Otherwise CHECK_DEVLOCK() expands to nothing.
     *
     * RAID5_DEBUG maps PRINTK() onto printk() and defines 'inline' and
     * '__inline__' away; with RAID5_DEBUG off, PRINTK() is an empty statement
     * and its arguments are never evaluated.
 42  */
 43 #define RAID5_DEBUG     0
 44 #define RAID5_PARANOIA  1
 45 #if RAID5_PARANOIA && CONFIG_SMP
 46 # define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG()
 47 #else
 48 # define CHECK_DEVLOCK()
 49 #endif
 50 
 51 #if RAID5_DEBUG
 52 #define PRINTK(x...) printk(x)
 53 #define inline
 54 #define __inline__
 55 #else
 56 #define PRINTK(x...) do { } while (0)
 57 #endif
 58 
 59 static void print_raid5_conf (raid5_conf_t *conf);
 60 
 61 static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
 62 {
 63         if (atomic_dec_and_test(&sh->count)) {
 64                 if (!list_empty(&sh->lru))
 65                         BUG();
 66                 if (atomic_read(&conf->active_stripes)==0)
 67                         BUG();
 68                 if (test_bit(STRIPE_HANDLE, &sh->state)) {
 69                         list_add_tail(&sh->lru, &conf->handle_list);
 70                         md_wakeup_thread(conf->thread);
 71                 }
 72                 else {
 73                         list_add_tail(&sh->lru, &conf->inactive_list);
 74                         atomic_dec(&conf->active_stripes);
 75                         wake_up(&conf->wait_for_stripe);
 76                 }
 77         }
 78 }
 79 static void release_stripe(struct stripe_head *sh)
 80 {
 81         raid5_conf_t *conf = sh->raid_conf;
 82 
 83         spin_lock_irq(&conf->device_lock);
 84         __release_stripe(conf, sh);
 85         spin_unlock_irq(&conf->device_lock);
 86 }
 87 
 88 static void remove_hash(struct stripe_head *sh)
 89 {
 90         PRINTK("remove_hash(), stripe %lu\n", sh->sector);
 91 
 92         if (sh->hash_pprev) {
 93                 if (sh->hash_next)
 94                         sh->hash_next->hash_pprev = sh->hash_pprev;
 95                 *sh->hash_pprev = sh->hash_next;
 96                 sh->hash_pprev = NULL;
 97         }
 98 }
 99 
100 static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
101 {
102         struct stripe_head **shp = &stripe_hash(conf, sh->sector);
103 
104         PRINTK("insert_hash(), stripe %lu\n",sh->sector);
105 
106         CHECK_DEVLOCK();
107         if ((sh->hash_next = *shp) != NULL)
108                 (*shp)->hash_pprev = &sh->hash_next;
109         *shp = sh;
110         sh->hash_pprev = shp;
111 }
112 
113 
114 /* find an idle stripe, make sure it is unhashed, and return it. */
115 static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
116 {
117         struct stripe_head *sh = NULL;
118         struct list_head *first;
119 
120         CHECK_DEVLOCK();
121         if (list_empty(&conf->inactive_list))
122                 goto out;
123         first = conf->inactive_list.next;
124         sh = list_entry(first, struct stripe_head, lru);
125         list_del_init(first);
126         remove_hash(sh);
127         atomic_inc(&conf->active_stripes);
128 out:
129         return sh;
130 }
131 
132 static void shrink_buffers(struct stripe_head *sh, int num)
133 {
134         struct buffer_head *bh;
135         int i;
136 
137         for (i=0; i<num ; i++) {
138                 bh = sh->bh_cache[i];
139                 if (!bh)
140                         return;
141                 sh->bh_cache[i] = NULL;
142                 free_page((unsigned long) bh->b_data);
143                 kfree(bh);
144         }
145 }
146 
147 static int grow_buffers(struct stripe_head *sh, int num, int b_size, int priority)
148 {
149         struct buffer_head *bh;
150         int i;
151 
152         for (i=0; i<num; i++) {
153                 struct page *page;
154                 bh = kmalloc(sizeof(struct buffer_head), priority);
155                 if (!bh)
156                         return 1;
157                 memset(bh, 0, sizeof (struct buffer_head));
158                 init_waitqueue_head(&bh->b_wait);
159                 page = alloc_page(priority);
160                 bh->b_data = page_address(page);
161                 if (!bh->b_data) {
162                         kfree(bh);
163                         return 1;
164                 }
165                 atomic_set(&bh->b_count, 0);
166                 bh->b_page = page;
167                 sh->bh_cache[i] = bh;
168 
169         }
170         return 0;
171 }
172 
173 static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i);
174 
175 static inline void init_stripe(struct stripe_head *sh, unsigned long sector)
176 {
177         raid5_conf_t *conf = sh->raid_conf;
178         int disks = conf->raid_disks, i;
179 
180         if (atomic_read(&sh->count) != 0)
181                 BUG();
182         if (test_bit(STRIPE_HANDLE, &sh->state))
183                 BUG();
184         
185         CHECK_DEVLOCK();
186         PRINTK("init_stripe called, stripe %lu\n", sh->sector);
187 
188         remove_hash(sh);
189         
190         sh->sector = sector;
191         sh->size = conf->buffer_size;
192         sh->state = 0;
193 
194         for (i=disks; i--; ) {
195                 if (sh->bh_read[i] || sh->bh_write[i] || sh->bh_written[i] ||
196                     buffer_locked(sh->bh_cache[i])) {
197                         printk("sector=%lx i=%d %p %p %p %d\n",
198                                sh->sector, i, sh->bh_read[i],
199                                sh->bh_write[i], sh->bh_written[i],
200                                buffer_locked(sh->bh_cache[i]));
201                         BUG();
202                 }
203                 clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state);
204                 raid5_build_block(sh, i);
205         }
206         insert_hash(conf, sh);
207 }
208 
209 /* the buffer size has changed, so unhash all stripes
210  * as active stripes complete, they will go onto inactive list
211  */
212 static void shrink_stripe_cache(raid5_conf_t *conf)
213 {
214         int i;
215         CHECK_DEVLOCK();
216         if (atomic_read(&conf->active_stripes))
217                 BUG();
218         for (i=0; i < NR_HASH; i++) {
219                 struct stripe_head *sh;
220                 while ((sh = conf->stripe_hashtbl[i])) 
221                         remove_hash(sh);
222         }
223 }
224 
225 static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector)
226 {
227         struct stripe_head *sh;
228 
229         CHECK_DEVLOCK();
230         PRINTK("__find_stripe, sector %lu\n", sector);
231         for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
232                 if (sh->sector == sector)
233                         return sh;
234         PRINTK("__stripe %lu not in cache\n", sector);
235         return NULL;
236 }
237 
/*
 * Find or set up the stripe for 'sector' and return it with one additional
 * reference (sh->count incremented).
 *
 * size    - requested cache buffer size in bytes.  0 means "use whatever the
 *           cache currently uses"; 'sector' is then rounded down using the
 *           current buffer size.  A non-zero size that differs from
 *           conf->buffer_size makes this function switch the cache over once
 *           no stripe is active any more.
 * noblock - when non-zero, return NULL instead of sleeping if the stripe is
 *           not cached and no inactive stripe is available.
 *
 * Takes conf->device_lock itself; the wait_event_lock_irq() calls are passed
 * that lock.
 */
238 static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long sector, int size, int noblock) 
239 {
240         struct stripe_head *sh;
241 
242         PRINTK("get_stripe, sector %lu\n", sector);
243 
244         md_spin_lock_irq(&conf->device_lock);
245 
246         do {
247                 if (conf->buffer_size == 0 ||
248                     (size && size != conf->buffer_size)) {
249                         /* either the size is being changed (buffer_size==0) or
250                          * we need to change it.
251                          * If size==0, we can proceed as soon as buffer_size gets set.
252                          * If size>0, we can proceed when active_stripes reaches 0, or
253                          * when someone else sets the buffer_size to size.
254                          * If someone sets the buffer size to something else, we will need to
255                          * assert that we want to change it again
256                          */
257                         int oldsize = conf->buffer_size;
258                         PRINTK("get_stripe %ld/%d buffer_size is %d, %d active\n", sector, size, conf->buffer_size, atomic_read(&conf->active_stripes));
259                         if (size==0)
260                                 wait_event_lock_irq(conf->wait_for_stripe,
261                                                     conf->buffer_size,
262                                                     conf->device_lock);
263                         else {
264                                 while (conf->buffer_size != size && atomic_read(&conf->active_stripes)) {
                                            /* buffer_size == 0 announces that a change is pending */
265                                         conf->buffer_size = 0;
266                                         wait_event_lock_irq(conf->wait_for_stripe,
267                                                             atomic_read(&conf->active_stripes)==0 || conf->buffer_size,
268                                                             conf->device_lock);
269                                         PRINTK("waited and now  %ld/%d buffer_size is %d - %d active\n", sector, size,
270                                                conf->buffer_size, atomic_read(&conf->active_stripes));
271                                 }
272 
273                                 if (conf->buffer_size != size) {
274                                         printk("raid5: switching cache buffer size, %d --> %d\n", oldsize, size);
275                                         shrink_stripe_cache(conf);
                                            /* cannot trigger: this branch is only reached with size != 0 */
276                                         if (size==0) BUG();
277                                         conf->buffer_size = size;
278                                         PRINTK("size now %d\n", conf->buffer_size);
279                                 }
280                         }
281                 }
                    /* size 0: align the sector down to the current buffer size
                     * (the mask arithmetic assumes buffer_size>>9 is a power of two)
                     */
282                 if (size == 0)
283                         sector -= sector & ((conf->buffer_size>>9)-1);
284 
285                 sh = __find_stripe(conf, sector);
286                 if (!sh) {
287                         sh = get_free_stripe(conf);
288                         if (noblock && sh == NULL)
289                                 break;
290                         if (!sh) {
                                    /* sleep until a stripe becomes inactive, then retry the lookup */
291                                 wait_event_lock_irq(conf->wait_for_stripe,
292                                                     !list_empty(&conf->inactive_list),
293                                                     conf->device_lock);
294                         } else
295                                 init_stripe(sh, sector);
296                 } else {
297                         if (atomic_read(&sh->count)) {
298                                 if (!list_empty(&sh->lru))
299                                         BUG();
300                         } else {
                                    /* an unreferenced stripe sits on a list; only one on the
                                     * inactive list (no STRIPE_HANDLE) had been dropped from
                                     * the active_stripes count by __release_stripe()
                                     */
301                                 if (!test_bit(STRIPE_HANDLE, &sh->state))
302                                         atomic_inc(&conf->active_stripes);
303                                 if (list_empty(&sh->lru))
304                                         BUG();
305                                 list_del_init(&sh->lru);
306                         }
307                 }
308         } while (sh == NULL);
309 
310         if (sh)
311                 atomic_inc(&sh->count);
312 
313         md_spin_unlock_irq(&conf->device_lock);
314         return sh;
315 }
316 
317 static int grow_stripes(raid5_conf_t *conf, int num, int priority)
318 {
319         struct stripe_head *sh;
320 
321         while (num--) {
322                 sh = kmalloc(sizeof(struct stripe_head), priority);
323                 if (!sh)
324                         return 1;
325                 memset(sh, 0, sizeof(*sh));
326                 sh->raid_conf = conf;
327                 sh->lock = SPIN_LOCK_UNLOCKED;
328 
329                 if (grow_buffers(sh, conf->raid_disks, PAGE_SIZE, priority)) {
330                         shrink_buffers(sh, conf->raid_disks);
331                         kfree(sh);
332                         return 1;
333                 }
334                 /* we just created an active stripe so... */
335                 atomic_set(&sh->count, 1);
336                 atomic_inc(&conf->active_stripes);
337                 INIT_LIST_HEAD(&sh->lru);
338                 release_stripe(sh);
339         }
340         return 0;
341 }
342 
343 static void shrink_stripes(raid5_conf_t *conf, int num)
344 {
345         struct stripe_head *sh;
346 
347         while (num--) {
348                 spin_lock_irq(&conf->device_lock);
349                 sh = get_free_stripe(conf);
350                 spin_unlock_irq(&conf->device_lock);
351                 if (!sh)
352                         break;
353                 if (atomic_read(&sh->count))
354                         BUG();
355                 shrink_buffers(sh, conf->raid_disks);
356                 kfree(sh);
357                 atomic_dec(&conf->active_stripes);
358         }
359 }
360 
361 
362 static inline void raid5_end_buffer_read(struct buffer_head *blist, struct buffer_head *bh)
363 {
364         while (blist) {
365                 struct buffer_head *new = blist;
366                 blist = new->b_reqnext;
367                 memcpy(new->b_data, bh->b_data, bh->b_size);
368                 new->b_end_io(new, 1);
369         }
370 }
371 
372 static void raid5_end_read_request (struct buffer_head * bh, int uptodate)
373 {
374         struct stripe_head *sh = bh->b_private;
375         raid5_conf_t *conf = sh->raid_conf;
376         int disks = conf->raid_disks, i;
377         unsigned long flags;
378         struct buffer_head *buffers = NULL;
379 
380         for (i=0 ; i<disks; i++)
381                 if (bh == sh->bh_cache[i])
382                         break;
383 
384         PRINTK("end_read_request %lu/%d,  %d, count: %d, uptodate %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
385         if (i == disks) {
386                 BUG();
387                 return;
388         }
389 
390         md_spin_lock_irqsave(&conf->device_lock, flags);
391         if (uptodate) {
392 #ifdef CONFIG_HIGHMEM
393                 /* cannot map highmem bufferheads from irq,
394                  * so leave it for stripe_handle if there might
395                  * be a problem
396                  */
397                 if (sh->bh_read[i] &&
398                     sh->bh_read[i]->b_reqnext == NULL &&
399                     !PageHighMem(sh->bh_read[i]->b_page)) {
400                         /* it's safe */
401                         buffers = sh->bh_read[i];
402                         sh->bh_read[i] = NULL;
403                 }
404 #else
405                 buffers = sh->bh_read[i];
406                 sh->bh_read[i] = NULL;
407 #endif
408                 set_bit(BH_Uptodate, &bh->b_state);
409                 if (buffers) {
410                         spin_unlock_irqrestore(&conf->device_lock, flags);
411                         raid5_end_buffer_read(buffers, bh);
412                         spin_lock_irqsave(&conf->device_lock, flags);
413                 }
414         } else {
415                 md_error(mddev_to_kdev(conf->mddev), bh->b_dev);
416                 clear_bit(BH_Uptodate, &bh->b_state);
417         }
418         clear_bit(BH_Lock, &bh->b_state);
419         set_bit(STRIPE_HANDLE, &sh->state);
420         __release_stripe(conf, sh);
421         md_spin_unlock_irqrestore(&conf->device_lock, flags);
422 }
423 
424 static void raid5_end_write_request (struct buffer_head *bh, int uptodate)
425 {
426         struct stripe_head *sh = bh->b_private;
427         raid5_conf_t *conf = sh->raid_conf;
428         int disks = conf->raid_disks, i;
429         unsigned long flags;
430 
431         for (i=0 ; i<disks; i++)
432                 if (bh == sh->bh_cache[i])
433                         break;
434 
435         PRINTK("end_write_request %lu/%d, count %d, uptodate: %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
436         if (i == disks) {
437                 BUG();
438                 return;
439         }
440 
441         md_spin_lock_irqsave(&conf->device_lock, flags);
442         if (!uptodate)
443                 md_error(mddev_to_kdev(conf->mddev), bh->b_dev);
444         clear_bit(BH_Lock, &bh->b_state);
445         set_bit(STRIPE_HANDLE, &sh->state);
446         __release_stripe(conf, sh);
447         md_spin_unlock_irqrestore(&conf->device_lock, flags);
448 }
449         
450 
451 
452 static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i)
453 {
454         raid5_conf_t *conf = sh->raid_conf;
455         struct buffer_head *bh = sh->bh_cache[i];
456         unsigned long block = sh->sector / (sh->size >> 9);
457 
458         init_buffer(bh, raid5_end_read_request, sh);
459         bh->b_dev       = conf->disks[i].dev;
460         bh->b_blocknr   = block;
461 
462         bh->b_state     = (1 << BH_Req) | (1 << BH_Mapped);
463         bh->b_size      = sh->size;
464         bh->b_list      = BUF_LOCKED;
465         return bh;
466 }
467 
468 static int raid5_error (mddev_t *mddev, kdev_t dev)
469 {
470         raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
471         mdp_super_t *sb = mddev->sb;
472         struct disk_info *disk;
473         int i;
474 
475         PRINTK("raid5_error called\n");
476         conf->resync_parity = 0;
477         for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
478                 if (disk->dev == dev && disk->operational) {
479                         disk->operational = 0;
480                         mark_disk_faulty(sb->disks+disk->number);
481                         mark_disk_nonsync(sb->disks+disk->number);
482                         mark_disk_inactive(sb->disks+disk->number);
483                         sb->active_disks--;
484                         sb->working_disks--;
485                         sb->failed_disks++;
486                         mddev->sb_dirty = 1;
487                         conf->working_disks--;
488                         conf->failed_disks++;
489                         md_wakeup_thread(conf->thread);
490                         printk (KERN_ALERT
491                                 "raid5: Disk failure on %s, disabling device."
492                                 " Operation continuing on %d devices\n",
493                                 partition_name (dev), conf->working_disks);
494                         return 0;
495                 }
496         }
497         /*
498          * handle errors in spares (during reconstruction)
499          */
500         if (conf->spare) {
501                 disk = conf->spare;
502                 if (disk->dev == dev) {
503                         printk (KERN_ALERT
504                                 "raid5: Disk failure on spare %s\n",
505                                 partition_name (dev));
506                         if (!conf->spare->operational) {
507                                 MD_BUG();
508                                 return -EIO;
509                         }
510                         disk->operational = 0;
511                         disk->write_only = 0;
512                         conf->spare = NULL;
513                         mark_disk_faulty(sb->disks+disk->number);
514                         mark_disk_nonsync(sb->disks+disk->number);
515                         mark_disk_inactive(sb->disks+disk->number);
516                         sb->spare_disks--;
517                         sb->working_disks--;
518                         sb->failed_disks++;
519 
520                         return 0;
521                 }
522         }
523         MD_BUG();
524         return -EIO;
525 }       
526 
527 /*
528  * Input: a 'big' sector number,
529  * Output: index of the data and parity disk, and the sector # in them.
530  */
531 static unsigned long raid5_compute_sector(unsigned long r_sector, unsigned int raid_disks,
532                         unsigned int data_disks, unsigned int * dd_idx,
533                         unsigned int * pd_idx, raid5_conf_t *conf)
534 {
535         unsigned long stripe;
536         unsigned long chunk_number;
537         unsigned int chunk_offset;
538         unsigned long new_sector;
539         int sectors_per_chunk = conf->chunk_size >> 9;
540 
541         /* First compute the information on this sector */
542 
543         /*
544          * Compute the chunk number and the sector offset inside the chunk
545          */
546         chunk_number = r_sector / sectors_per_chunk;
547         chunk_offset = r_sector % sectors_per_chunk;
548 
549         /*
550          * Compute the stripe number
551          */
552         stripe = chunk_number / data_disks;
553 
554         /*
555          * Compute the data disk and parity disk indexes inside the stripe
556          */
557         *dd_idx = chunk_number % data_disks;
558 
559         /*
560          * Select the parity disk based on the user selected algorithm.
561          */
562         if (conf->level == 4)
563                 *pd_idx = data_disks;
564         else switch (conf->algorithm) {
565                 case ALGORITHM_LEFT_ASYMMETRIC:
566                         *pd_idx = data_disks - stripe % raid_disks;
567                         if (*dd_idx >= *pd_idx)
568                                 (*dd_idx)++;
569                         break;
570                 case ALGORITHM_RIGHT_ASYMMETRIC:
571                         *pd_idx = stripe % raid_disks;
572                         if (*dd_idx >= *pd_idx)
573                                 (*dd_idx)++;
574                         break;
575                 case ALGORITHM_LEFT_SYMMETRIC:
576                         *pd_idx = data_disks - stripe % raid_disks;
577                         *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
578                         break;
579                 case ALGORITHM_RIGHT_SYMMETRIC:
580                         *pd_idx = stripe % raid_disks;
581                         *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
582                         break;
583                 default:
584                         printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
585         }
586 
587         /*
588          * Finally, compute the new sector number
589          */
590         new_sector = stripe * sectors_per_chunk + chunk_offset;
591         return new_sector;
592 }
593 
/*
 * Compiled out (#if 0).
 *
 * compute_blocknr() goes the opposite way of raid5_compute_sector(): from a
 * stripe and a disk index 'i' it derives the array sector (r_sector) and
 * returns the block number in units of the stripe's buffer size.  The result
 * is cross-checked by feeding r_sector back into raid5_compute_sector(); on a
 * mismatch a message is printed and 0 is returned.
 *
 * NOTE(review): dummy1/dummy2 are declared int but passed where
 * raid5_compute_sector() takes unsigned int pointers - fix before re-enabling.
 */
594 #if 0
595 static unsigned long compute_blocknr(struct stripe_head *sh, int i)
596 {
597         raid5_conf_t *conf = sh->raid_conf;
598         int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
599         unsigned long new_sector = sh->sector, check;
600         int sectors_per_chunk = conf->chunk_size >> 9;
601         unsigned long stripe = new_sector / sectors_per_chunk;
602         int chunk_offset = new_sector % sectors_per_chunk;
603         int chunk_number, dummy1, dummy2, dd_idx = i;
604         unsigned long r_sector, blocknr;
605 
            /* turn the disk index back into the data index within the stripe */
606         switch (conf->algorithm) {
607                 case ALGORITHM_LEFT_ASYMMETRIC:
608                 case ALGORITHM_RIGHT_ASYMMETRIC:
609                         if (i > sh->pd_idx)
610                                 i--;
611                         break;
612                 case ALGORITHM_LEFT_SYMMETRIC:
613                 case ALGORITHM_RIGHT_SYMMETRIC:
614                         if (i < sh->pd_idx)
615                                 i += raid_disks;
616                         i -= (sh->pd_idx + 1);
617                         break;
618                 default:
619                         printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
620         }
621 
622         chunk_number = stripe * data_disks + i;
623         r_sector = chunk_number * sectors_per_chunk + chunk_offset;
624         blocknr = r_sector / (sh->size >> 9);
625 
            /* sanity check: the forward mapping must lead back to this stripe */
626         check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
627         if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
628                 printk("compute_blocknr: map not correct\n");
629                 return 0;
630         }
631         return blocknr;
632 }
633 #endif
634 
/*
 * check_xor() - flush the pending XOR operands once the operand array is full.
 *
 * Relies on two locals of the calling function: 'bh_ptr' (array of
 * MAX_XOR_BLOCKS buffer heads) and 'count' (number of slots in use).  When
 * count reaches MAX_XOR_BLOCKS, xor_block() is run over the array and count
 * is reset to 1, which keeps bh_ptr[0] - the callers put the buffer being
 * computed there - and frees the remaining slots.
 */
635 #define check_xor()     do {                                    \
636                            if (count == MAX_XOR_BLOCKS) {       \
637                                 xor_block(count, bh_ptr);       \
638                                 count = 1;                      \
639                            }                                    \
640                         } while(0)
641 
642 
643 static void compute_block(struct stripe_head *sh, int dd_idx)
644 {
645         raid5_conf_t *conf = sh->raid_conf;
646         int i, count, disks = conf->raid_disks;
647         struct buffer_head *bh_ptr[MAX_XOR_BLOCKS], *bh;
648 
649         PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx);
650 
651 
652         memset(sh->bh_cache[dd_idx]->b_data, 0, sh->size);
653         bh_ptr[0] = sh->bh_cache[dd_idx];
654         count = 1;
655         for (i = disks ; i--; ) {
656                 if (i == dd_idx)
657                         continue;
658                 bh = sh->bh_cache[i];
659                 if (buffer_uptodate(bh))
660                         bh_ptr[count++] = bh;
661                 else
662                         printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
663 
664                 check_xor();
665         }
666         if (count != 1)
667                 xor_block(count, bh_ptr);
668         set_bit(BH_Uptodate, &sh->bh_cache[dd_idx]->b_state);
669 }
670 
/*
 * Compute the parity buffer of a stripe (sh->bh_cache[sh->pd_idx]).
 *
 * method selects how:
 *   READ_MODIFY_WRITE - the parity buffer must already be up to date (BUG()
 *       otherwise).  For every data disk that has a pending write and an
 *       up-to-date cache buffer, the cached (old) data is added to the xor
 *       operands, the first pending write is moved from bh_write to
 *       bh_written, and after the new data has been copied into the cache
 *       buffer that buffer is added to the xor operands a second time.
 *   RECONSTRUCT_WRITE - the parity buffer is zeroed, the first pending write
 *       of every data disk is moved from bh_write to bh_written and copied
 *       into the cache, then all data cache buffers are xor operands.
 *   CHECK_PARITY - no writes are taken; all data cache buffers are xor
 *       operands on top of the current parity contents.
 *
 * The request lists are manipulated under conf->device_lock; the copying and
 * xor-ing happens after the lock has been dropped.  For the two write
 * methods the parity buffer ends up marked up to date with BH_Lock set; for
 * CHECK_PARITY it ends up marked not up to date.
 *
 * 'count' and 'bh_ptr' are the names check_xor() works on.
 */
671 static void compute_parity(struct stripe_head *sh, int method)
672 {
673         raid5_conf_t *conf = sh->raid_conf;
674         int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
675         struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
676         struct buffer_head *chosen[MD_SB_DISKS];
677 
678         PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method);
679         memset(chosen, 0, sizeof(chosen));
680 
681         count = 1;
682         bh_ptr[0] = sh->bh_cache[pd_idx];
683         spin_lock_irq(&conf->device_lock);
684         switch(method) {
685         case READ_MODIFY_WRITE:
686                 if (!buffer_uptodate(sh->bh_cache[pd_idx]))
687                         BUG();
688                 for (i=disks ; i-- ;) {
689                         if (i==pd_idx)
690                                 continue;
691                         if (sh->bh_write[i] &&
692                             buffer_uptodate(sh->bh_cache[i])) {
693                                 bh_ptr[count++] = sh->bh_cache[i];
                                    /* move the first pending write to the head of bh_written[i] */
694                                 chosen[i] = sh->bh_write[i];
695                                 sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
696                                 chosen[i]->b_reqnext = sh->bh_written[i];
697                                 sh->bh_written[i] = chosen[i];
698                                 check_xor();
699                         }
700                 }
701                 break;
702         case RECONSTRUCT_WRITE:
703                 memset(sh->bh_cache[pd_idx]->b_data, 0, sh->size);
704                 for (i= disks; i-- ;)
705                         if (i!=pd_idx && sh->bh_write[i]) {
706                                 chosen[i] = sh->bh_write[i];
707                                 sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
708                                 chosen[i]->b_reqnext = sh->bh_written[i];
709                                 sh->bh_written[i] = chosen[i];
                                    /* count is not advanced in this loop, so this cannot fire here */
710                                 check_xor();
711                         }
712                 break;
713         case CHECK_PARITY:
714                 break;
715         }
716         spin_unlock_irq(&conf->device_lock);
            /* copy the data of each chosen write into the stripe's cache buffer */
717         for (i = disks; i--;)
718                 if (chosen[i]) {
719                         struct buffer_head *bh = sh->bh_cache[i];
720                         char *bdata;
721                         mark_buffer_clean(chosen[i]); /* NO FIXME */
722                         bdata = bh_kmap(chosen[i]);
723                         memcpy(bh->b_data,
724                                bdata,sh->size);
725                         bh_kunmap(chosen[i]);
726                         set_bit(BH_Lock, &bh->b_state);
727                         mark_buffer_uptodate(bh, 1);
728                 }
729 
            /* collect the buffers that go into the final xor */
730         switch(method) {
731         case RECONSTRUCT_WRITE:
732         case CHECK_PARITY:
733                 for (i=disks; i--;)
734                         if (i != pd_idx) {
735                                 bh_ptr[count++] = sh->bh_cache[i];
736                                 check_xor();
737                         }
738                 break;
739         case READ_MODIFY_WRITE:
740                 for (i = disks; i--;)
741                         if (chosen[i]) {
742                                 bh_ptr[count++] = sh->bh_cache[i];
743                                 check_xor();
744                         }
745         }
746         if (count != 1)
747                 xor_block(count, bh_ptr);
748         
749         if (method != CHECK_PARITY) {
750                 mark_buffer_uptodate(sh->bh_cache[pd_idx], 1);
751                 set_bit(BH_Lock, &sh->bh_cache[pd_idx]->b_state);
752         } else
753                 mark_buffer_uptodate(sh->bh_cache[pd_idx], 0);
754 }
755 
756 static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
757 {
758         struct buffer_head **bhp;
759         raid5_conf_t *conf = sh->raid_conf;
760 
761         PRINTK("adding bh b#%lu to stripe s#%lu\n", bh->b_blocknr, sh->sector);
762 
763 
764         spin_lock_irq(&conf->device_lock);
765         bh->b_reqnext = NULL;
766         if (rw == READ)
767                 bhp = &sh->bh_read[dd_idx];
768         else
769                 bhp = &sh->bh_write[dd_idx];
770         while (*bhp) {
771                 printk(KERN_NOTICE "raid5: multiple %d requests for sector %ld\n", rw, sh->sector);
772                 bhp = & (*bhp)->b_reqnext;
773         }
774         *bhp = bh;
775         spin_unlock_irq(&conf->device_lock);
776 
777         PRINTK("added bh b#%lu to stripe s#%lu, disk %d.\n", bh->b_blocknr, sh->sector, dd_idx);
778 }
779 
780 
781 
782 
783 
784 /*
785  * handle_stripe - do things to a stripe.
786  *
787  * We lock the stripe and then examine the state of various bits
788  * to see what needs to be done.
789  * Possible results:
790  *    return some read request which now have data
791  *    return some write requests which are safely on disc
792  *    schedule a read on some buffers
793  *    schedule a write of some buffers
794  *    return confirmation of parity correctness
795  *
796  * Parity calculations are done inside the stripe lock
797  * buffers are taken off read_list or write_list, and bh_cache buffers
798  * get BH_Lock set before the stripe lock is released.
799  *
800  */
801  
802 static void handle_stripe(struct stripe_head *sh)
803 {
804         raid5_conf_t *conf = sh->raid_conf;
805         int disks = conf->raid_disks;
806         struct buffer_head *return_ok= NULL, *return_fail = NULL;
807         int action[MD_SB_DISKS];
808         int i;
809         int syncing;
810         int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
811         int failed_num=0;
812         struct buffer_head *bh;
813 
814         PRINTK("handling stripe %ld, cnt=%d, pd_idx=%d\n", sh->sector, atomic_read(&sh->count), sh->pd_idx);
815         memset(action, 0, sizeof(action));
816 
817         spin_lock(&sh->lock);
818         clear_bit(STRIPE_HANDLE, &sh->state);
819 
820         syncing = test_bit(STRIPE_SYNCING, &sh->state);
821         /* Now to look around and see what can be done */
822 
823         for (i=disks; i--; ) {
824                 bh = sh->bh_cache[i];
825                 PRINTK("check %d: state %lx read %p write %p written %p\n", i, bh->b_state, sh->bh_read[i], sh->bh_write[i], sh->bh_written[i]);
826                 /* maybe we can reply to a read */
827                 if (buffer_uptodate(bh) && sh->bh_read[i]) {
828                         struct buffer_head *rbh, *rbh2;
829                         PRINTK("Return read for disc %d\n", i);
830                         spin_lock_irq(&conf->device_lock);
831                         rbh = sh->bh_read[i];
832                         sh->bh_read[i] = NULL;
833                         spin_unlock_irq(&conf->device_lock);
834                         while (rbh) {
835                                 char *bdata;
836                                 bdata = bh_kmap(rbh);
837                                 memcpy(bdata, bh->b_data, bh->b_size);
838                                 bh_kunmap(rbh);
839                                 rbh2 = rbh->b_reqnext;
840                                 rbh->b_reqnext = return_ok;
841                                 return_ok = rbh;
842                                 rbh = rbh2;
843                         }
844                 }
845 
846                 /* now count some things */
847                 if (buffer_locked(bh)) locked++;
848                 if (buffer_uptodate(bh)) uptodate++;
849 
850                 
851                 if (sh->bh_read[i]) to_read++;
852                 if (sh->bh_write[i]) to_write++;
853                 if (sh->bh_written[i]) written++;
854                 if (!conf->disks[i].operational) {
855                         failed++;
856                         failed_num = i;
857                 }
858         }
859         PRINTK("locked=%d uptodate=%d to_read=%d to_write=%d failed=%d failed_num=%d\n",
860                locked, uptodate, to_read, to_write, failed, failed_num);
861         /* check if the array has lost two devices and, if so, some requests might
862          * need to be failed
863          */
864         if (failed > 1 && to_read+to_write) {
865                 spin_lock_irq(&conf->device_lock);
866                 for (i=disks; i--; ) {
867                         /* fail all writes first */
868                         if (sh->bh_write[i]) to_write--;
869                         while ((bh = sh->bh_write[i])) {
870                                 sh->bh_write[i] = bh->b_reqnext;
871                                 bh->b_reqnext = return_fail;
872                                 return_fail = bh;
873                         }
874                         /* fail any reads if this device is non-operational */
875                         if (!conf->disks[i].operational) {
876                                 if (sh->bh_read[i]) to_read--;
877                                 while ((bh = sh->bh_read[i])) {
878                                         sh->bh_read[i] = bh->b_reqnext;
879                                         bh->b_reqnext = return_fail;
880                                         return_fail = bh;
881                                 }
882                         }
883                 }
884                 spin_unlock_irq(&conf->device_lock);
885                 if (syncing) {
886                         md_done_sync(conf->mddev, (sh->size>>10) - sh->sync_redone,0);
887                         clear_bit(STRIPE_SYNCING, &sh->state);
888                         syncing = 0;
889                 }                       
890         }
891 
892         /* might be able to return some write requests if the parity block
893          * is safe, or on a failed drive
894          */
895         bh = sh->bh_cache[sh->pd_idx];
896         if ( written &&
897              ( (conf->disks[sh->pd_idx].operational && !buffer_locked(bh) && buffer_uptodate(bh))
898                || (failed == 1 && failed_num == sh->pd_idx))
899             ) {
900             /* any written block on a uptodate or failed drive can be returned */
901             for (i=disks; i--; )
902                 if (sh->bh_written[i]) {
903                     bh = sh->bh_cache[i];
904                     if (!conf->disks[sh->pd_idx].operational ||
905                         (!buffer_locked(bh) && buffer_uptodate(bh)) ) {
906                         /* maybe we can return some write requests */
907                         struct buffer_head *wbh, *wbh2;
908                         PRINTK("Return write for disc %d\n", i);
909                         spin_lock_irq(&conf->device_lock);
910                         wbh = sh->bh_written[i];
911                         sh->bh_written[i] = NULL;
912                         spin_unlock_irq(&conf->device_lock);
913                         while (wbh) {
914                             wbh2 = wbh->b_reqnext;
915                             wbh->b_reqnext = return_ok;
916                             return_ok = wbh;
917                             wbh = wbh2;
918                         }
919                     }
920                 }
921         }
922                 
923         /* Now we might consider reading some blocks, either to check/generate
924          * parity, or to satisfy requests
925          */
926         if (to_read || (syncing && (uptodate+failed < disks))) {
927                 for (i=disks; i--;) {
928                         bh = sh->bh_cache[i];
929                         if (!buffer_locked(bh) && !buffer_uptodate(bh) &&
930                             (sh->bh_read[i] || syncing || (failed && sh->bh_read[failed_num]))) {
931                                 /* we would like to get this block, possibly
932                                  * by computing it, but we might not be able to
933                                  */
934                                 if (uptodate == disks-1) {
935                                         PRINTK("Computing block %d\n", i);
936                                         compute_block(sh, i);
937                                         uptodate++;
938                                 } else if (conf->disks[i].operational) {
939                                         set_bit(BH_Lock, &bh->b_state);
940                                         action[i] = READ+1;
941                                         locked++;
942                                         PRINTK("Reading block %d (sync=%d)\n", i, syncing);
943                                         if (syncing)
944                                                 md_sync_acct(conf->disks[i].dev, bh->b_size>>9);
945                                 }
946                         }
947                 }
948                 set_bit(STRIPE_HANDLE, &sh->state);
949         }
950 
951         /* now to consider writing and what else, if anything should be read */
952         if (to_write) {
953                 int rmw=0, rcw=0;
954                 for (i=disks ; i--;) {
955                         /* would I have to read this buffer for read_modify_write */
956                         bh = sh->bh_cache[i];
957                         if ((sh->bh_write[i] || i == sh->pd_idx) &&
958                             !buffer_locked(bh) && !buffer_uptodate(bh)) {
959                                 if (conf->disks[i].operational 
960 /*                                  && !(conf->resync_parity && i == sh->pd_idx) */
961                                         )
962                                         rmw++;
963                                 else rmw += 2*disks;  /* cannot read it */
964                         }
965                         /* Would I have to read this buffer for reconstruct_write */
966                         if (!sh->bh_write[i] && i != sh->pd_idx &&
967                             !buffer_locked(bh) && !buffer_uptodate(bh)) {
968                                 if (conf->disks[i].operational) rcw++;
969                                 else rcw += 2*disks;
970                         }
971                 }
972                 PRINTK("for sector %ld, rmw=%d rcw=%d\n", sh->sector, rmw, rcw);
973                 set_bit(STRIPE_HANDLE, &sh->state);
974                 if (rmw < rcw && rmw > 0)
975                         /* prefer read-modify-write, but need to get some data */
976                         for (i=disks; i--;) {
977                                 bh = sh->bh_cache[i];
978                                 if ((sh->bh_write[i] || i == sh->pd_idx) &&
979                                     !buffer_locked(bh) && !buffer_uptodate(bh) &&
980                                     conf->disks[i].operational) {
981                                         PRINTK("Read_old block %d for r-m-w\n", i);
982                                         set_bit(BH_Lock, &bh->b_state);
983                                         action[i] = READ+1;
984                                         locked++;
985                                 }
986                         }
987                 if (rcw <= rmw && rcw > 0)
988                         /* want reconstruct write, but need to get some data */
989                         for (i=disks; i--;) {
990                                 bh = sh->bh_cache[i];
991                                 if (!sh->bh_write[i]  && i != sh->pd_idx &&
992                                     !buffer_locked(bh) && !buffer_uptodate(bh) &&
993                                     conf->disks[i].operational) {
994                                         PRINTK("Read_old block %d for Reconstruct\n", i);
995                                         set_bit(BH_Lock, &bh->b_state);
996                                         action[i] = READ+1;
997                                         locked++;
998                                 }
999                         }
1000                 /* now if nothing is locked, and if we have enough data, we can start a write request */
1001                 if (locked == 0 && (rcw == 0 ||rmw == 0)) {
1002                         PRINTK("Computing parity...\n");
1003                         compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
1004                         /* now every locked buffer is ready to be written */
1005                         for (i=disks; i--;)
1006                                 if (buffer_locked(sh->bh_cache[i])) {
1007                                         PRINTK("Writing block %d\n", i);
1008                                         locked++;
1009                                         action[i] = WRITE+1;
1010                                         if (!conf->disks[i].operational
1011                                             || (i==sh->pd_idx && failed == 0))
1012                                                 set_bit(STRIPE_INSYNC, &sh->state);
1013                                 }
1014                 }
1015         }
1016 
1017         /* maybe we need to check and possibly fix the parity for this stripe
1018          * Any reads will already have been scheduled, so we just see if enough data
1019          * is available
1020          */
1021         if (syncing && locked == 0 &&
1022             !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
1023                 set_bit(STRIPE_HANDLE, &sh->state);
1024                 if (failed == 0) {
1025                         if (uptodate != disks)
1026                                 BUG();
1027                         compute_parity(sh, CHECK_PARITY);
1028                         uptodate--;
1029                         bh = sh->bh_cache[sh->pd_idx];
1030                         if ((*(u32*)bh->b_data) == 0 &&
1031                             !memcmp(bh->b_data, bh->b_data+4, bh->b_size-4)) {
1032                                 /* parity is correct (on disc, not in buffer any more) */
1033                                 set_bit(STRIPE_INSYNC, &sh->state);
1034                         }
1035                 }
1036                 if (!test_bit(STRIPE_INSYNC, &sh->state)) {
1037                         if (failed==0)
1038                                 failed_num = sh->pd_idx;
1039                         /* should be able to compute the missing block and write it to spare */
1040                         if (!buffer_uptodate(sh->bh_cache[failed_num])) {
1041                                 if (uptodate+1 != disks)
1042                                         BUG();
1043                                 compute_block(sh, failed_num);
1044                                 uptodate++;
1045                         }
1046                         if (uptodate != disks)
1047                                 BUG();
1048                         bh = sh->bh_cache[failed_num];
1049                         set_bit(BH_Lock, &bh->b_state);
1050                         action[failed_num] = WRITE+1;
1051                         locked++;
1052                         set_bit(STRIPE_INSYNC, &sh->state);
1053                         if (conf->disks[i].operational)
1054                                 md_sync_acct(conf->disks[i].dev, bh->b_size>>9);
1055                         else if (conf->spare)
1056                                 md_sync_acct(conf->spare->dev, bh->b_size>>9);
1057 
1058                 }
1059         }
1060         if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
1061                 md_done_sync(conf->mddev, (sh->size>>10) - sh->sync_redone,1);
1062                 clear_bit(STRIPE_SYNCING, &sh->state);
1063         }
1064         
1065         
1066         spin_unlock(&sh->lock);
1067 
1068         while ((bh=return_ok)) {
1069                 return_ok = bh->b_reqnext;
1070                 bh->b_reqnext = NULL;
1071                 bh->b_end_io(bh, 1);
1072         }
1073         while ((bh=return_fail)) {
1074                 return_ok = bh->b_reqnext;
1075                 bh->b_reqnext = NULL;
1076                 bh->b_end_io(bh, 0);
1077         }
1078         for (i=disks; i-- ;) 
1079                 if (action[i]) {
1080                         struct buffer_head *bh = sh->bh_cache[i];
1081                         int skip = 0;
1082                         if (action[i] == READ+1)
1083                                 bh->b_end_io = raid5_end_read_request;
1084                         else
1085                                 bh->b_end_io = raid5_end_write_request;
1086                         if (conf->disks[i].operational)
1087                                 bh->b_dev = conf->disks[i].dev;
1088                         else if (conf->spare && action[i] == WRITE+1)
1089                                 bh->b_dev = conf->spare->dev;
1090                         else if (action[i] == READ+1)
1091                                 BUG();
1092                         else skip=1;
1093                         if (!skip) {
1094                                 PRINTK("for %ld schedule op %d on disc %d\n", sh->sector, action[i]-1, i);
1095                                 atomic_inc(&sh->count);
1096                                 bh->b_rdev = bh->b_dev;
1097                                 bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
1098                                 generic_make_request(action[i]-1, bh);
1099                         } else {
1100                                 PRINTK("skip op %d on disc %d for sector %ld\n", action[i]-1, i, sh->sector);
1101                                 clear_bit(BH_Lock, &bh->b_state);
1102                                 set_bit(STRIPE_HANDLE, &sh->state);
1103                         }
1104                 }
1105 }
1106 
1107 
1108 static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh)
1109 {
1110         raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1111         const unsigned int raid_disks = conf->raid_disks;
1112         const unsigned int data_disks = raid_disks - 1;
1113         unsigned int dd_idx, pd_idx;
1114         unsigned long new_sector;
1115         int read_ahead = 0;
1116 
1117         struct stripe_head *sh;
1118 
1119         if (rw == READA) {
1120                 rw = READ;
1121                 read_ahead=1;
1122         }
1123 
1124         new_sector = raid5_compute_sector(bh->b_rsector,
1125                         raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1126 
1127         PRINTK("raid5_make_request, sector %lu\n", new_sector);
1128         sh = get_active_stripe(conf, new_sector, bh->b_size, read_ahead);
1129         if (sh) {
1130                 sh->pd_idx = pd_idx;
1131 
1132                 add_stripe_bh(sh, bh, dd_idx, rw);
1133                 handle_stripe(sh);
1134                 release_stripe(sh);
1135         } else
1136                 bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
1137         return 0;
1138 }
1139 
1140 /*
1141  * Determine correct block size for this device.
1142  */
1143 unsigned int device_bsize (kdev_t dev)
1144 {
1145         unsigned int i, correct_size;
1146 
1147         correct_size = BLOCK_SIZE;
1148         if (blksize_size[MAJOR(dev)]) {
1149                 i = blksize_size[MAJOR(dev)][MINOR(dev)];
1150                 if (i)
1151                         correct_size = i;
1152         }
1153 
1154         return correct_size;
1155 }
1156 
1157 static int raid5_sync_request (mddev_t *mddev, unsigned long block_nr)
1158 {
1159         raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1160         struct stripe_head *sh;
1161         int sectors_per_chunk = conf->chunk_size >> 9;
1162         unsigned long stripe = (block_nr<<1)/sectors_per_chunk;
1163         int chunk_offset = (block_nr<<1) % sectors_per_chunk;
1164         int dd_idx, pd_idx;
1165         unsigned long first_sector;
1166         int raid_disks = conf->raid_disks;
1167         int data_disks = raid_disks-1;
1168         int redone = 0;
1169         int bufsize;
1170 
1171         sh = get_active_stripe(conf, block_nr<<1, 0, 0);
1172         bufsize = sh->size;
1173         redone = block_nr-(sh->sector>>1);
1174         first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
1175                 + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1176         sh->pd_idx = pd_idx;
1177         spin_lock(&sh->lock);   
1178         set_bit(STRIPE_SYNCING, &sh->state);
1179         clear_bit(STRIPE_INSYNC, &sh->state);
1180         sh->sync_redone = redone;
1181         spin_unlock(&sh->lock);
1182 
1183         handle_stripe(sh);
1184         release_stripe(sh);
1185 
1186         return (bufsize>>10)-redone;
1187 }
1188 
1189 /*
1190  * This is our raid5 kernel thread.
1191  *
1192  * We scan the hash table for stripes which can be handled now.
1193  * During the scan, completed stripes are saved for us by the interrupt
1194  * handler, so that they will not have to wait for our next wakeup.
1195  */
1196 static void raid5d (void *data)
1197 {
1198         struct stripe_head *sh;
1199         raid5_conf_t *conf = data;
1200         mddev_t *mddev = conf->mddev;
1201         int handled;
1202 
1203         PRINTK("+++ raid5d active\n");
1204 
1205         handled = 0;
1206 
1207         if (mddev->sb_dirty) {
1208                 mddev->sb_dirty = 0;
1209                 md_update_sb(mddev);
1210         }
1211         md_spin_lock_irq(&conf->device_lock);
1212         while (!list_empty(&conf->handle_list)) {
1213                 struct list_head *first = conf->handle_list.next;
1214                 sh = list_entry(first, struct stripe_head, lru);
1215 
1216                 list_del_init(first);
1217                 atomic_inc(&sh->count);
1218                 if (atomic_read(&sh->count)!= 1)
1219                         BUG();
1220                 md_spin_unlock_irq(&conf->device_lock);
1221                 
1222                 handled++;
1223                 handle_stripe(sh);
1224                 release_stripe(sh);
1225 
1226                 md_spin_lock_irq(&conf->device_lock);
1227         }
1228         PRINTK("%d stripes handled\n", handled);
1229 
1230         md_spin_unlock_irq(&conf->device_lock);
1231 
1232         PRINTK("--- raid5d inactive\n");
1233 }
1234 
1235 /*
1236  * Private kernel thread for parity reconstruction after an unclean
1237  * shutdown. Reconstruction on spare drives in case of a failed drive
1238  * is done by the generic mdsyncd.
1239  */
1240 static void raid5syncd (void *data)
1241 {
1242         raid5_conf_t *conf = data;
1243         mddev_t *mddev = conf->mddev;
1244 
1245         if (!conf->resync_parity)
1246                 return;
1247         if (conf->resync_parity == 2)
1248                 return;
1249         down(&mddev->recovery_sem);
1250         if (md_do_sync(mddev,NULL)) {
1251                 up(&mddev->recovery_sem);
1252                 printk("raid5: resync aborted!\n");
1253                 return;
1254         }
1255         conf->resync_parity = 0;
1256         up(&mddev->recovery_sem);
1257         printk("raid5: resync finished.\n");
1258 }
1259 
1260 static int __check_consistency (mddev_t *mddev, int row)
1261 {
1262         raid5_conf_t *conf = mddev->private;
1263         kdev_t dev;
1264         struct buffer_head *bh[MD_SB_DISKS], *tmp = NULL;
1265         int i, ret = 0, nr = 0, count;
1266         struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
1267 
1268         if (conf->working_disks != conf->raid_disks)
1269                 goto out;
1270         tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
1271         tmp->b_size = 4096;
1272         tmp->b_page = alloc_page(GFP_KERNEL);
1273         tmp->b_data = page_address(tmp->b_page);
1274         if (!tmp->b_data)
1275                 goto out;
1276         md_clear_page(tmp->b_data);
1277         memset(bh, 0, MD_SB_DISKS * sizeof(struct buffer_head *));
1278         for (i = 0; i < conf->raid_disks; i++) {
1279                 dev = conf->disks[i].dev;
1280                 set_blocksize(dev, 4096);
1281                 bh[i] = bread(dev, row / 4, 4096);
1282                 if (!bh[i])
1283                         break;
1284                 nr++;
1285         }
1286         if (nr == conf->raid_disks) {
1287                 bh_ptr[0] = tmp;
1288                 count = 1;
1289                 for (i = 1; i < nr; i++) {
1290                         bh_ptr[count++] = bh[i];
1291                         if (count == MAX_XOR_BLOCKS) {
1292                                 xor_block(count, &bh_ptr[0]);
1293                                 count = 1;
1294                         }
1295                 }
1296                 if (count != 1) {
1297                         xor_block(count, &bh_ptr[0]);
1298                 }
1299                 if (memcmp(tmp->b_data, bh[0]->b_data, 4096))
1300                         ret = 1;
1301         }
1302         for (i = 0; i < conf->raid_disks; i++) {
1303                 dev = conf->disks[i].dev;
1304                 if (bh[i]) {
1305                         bforget(bh[i]);
1306                         bh[i] = NULL;
1307                 }
1308                 fsync_dev(dev);
1309                 invalidate_buffers(dev);
1310         }
1311         free_page((unsigned long) tmp->b_data);
1312 out:
1313         if (tmp)
1314                 kfree(tmp);
1315         return ret;
1316 }
1317 
1318 static int check_consistency (mddev_t *mddev)
1319 {
1320         if (__check_consistency(mddev, 0))
1321 /*
1322  * We are not checking this currently, as it's legitimate to have
1323  * an inconsistent array, at creation time.
1324  */
1325                 return 0;
1326 
1327         return 0;
1328 }
1329 
1330 static int raid5_run (mddev_t *mddev)
1331 {
1332         raid5_conf_t *conf;
1333         int i, j, raid_disk, memory;
1334         mdp_super_t *sb = mddev->sb;
1335         mdp_disk_t *desc;
1336         mdk_rdev_t *rdev;
1337         struct disk_info *disk;
1338         struct md_list_head *tmp;
1339         int start_recovery = 0;
1340 
1341         MOD_INC_USE_COUNT;
1342 
1343         if (sb->level != 5 && sb->level != 4) {
1344                 printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level);
1345                 MOD_DEC_USE_COUNT;
1346                 return -EIO;
1347         }
1348 
1349         mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
1350         if ((conf = mddev->private) == NULL)
1351                 goto abort;
1352         memset (conf, 0, sizeof (*conf));
1353         conf->mddev = mddev;
1354 
1355         if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
1356                 goto abort;
1357         memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
1358 
1359         conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
1360         md_init_waitqueue_head(&conf->wait_for_stripe);
1361         INIT_LIST_HEAD(&conf->handle_list);
1362         INIT_LIST_HEAD(&conf->inactive_list);
1363         atomic_set(&conf->active_stripes, 0);
1364         conf->buffer_size = PAGE_SIZE; /* good default for rebuild */
1365 
1366         PRINTK("raid5_run(md%d) called.\n", mdidx(mddev));
1367 
1368         ITERATE_RDEV(mddev,rdev,tmp) {
1369                 /*
1370                  * This is important -- we are using the descriptor on
1371                  * the disk only to get a pointer to the descriptor on
1372                  * the main superblock, which might be more recent.
1373                  */
1374                 desc = sb->disks + rdev->desc_nr;
1375                 raid_disk = desc->raid_disk;
1376                 disk = conf->disks + raid_disk;
1377 
1378                 if (disk_faulty(desc)) {
1379                         printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev));
1380                         if (!rdev->faulty) {
1381                                 MD_BUG();
1382                                 goto abort;
1383                         }
1384                         disk->number = desc->number;
1385                         disk->raid_disk = raid_disk;
1386                         disk->dev = rdev->dev;
1387 
1388                         disk->operational = 0;
1389                         disk->write_only = 0;
1390                         disk->spare = 0;
1391                         disk->used_slot = 1;
1392                         continue;
1393                 }
1394                 if (disk_active(desc)) {
1395                         if (!disk_sync(desc)) {
1396                                 printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev));
1397                                 MD_BUG();
1398                                 goto abort;
1399                         }
1400                         if (raid_disk > sb->raid_disks) {
1401                                 printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev));
1402                                 continue;
1403                         }
1404                         if (disk->operational) {
1405                                 printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk);
1406                                 continue;
1407                         }
1408                         printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk);
1409         
1410                         disk->number = desc->number;
1411                         disk->raid_disk = raid_disk;
1412                         disk->dev = rdev->dev;
1413                         disk->operational = 1;
1414                         disk->used_slot = 1;
1415 
1416                         conf->working_disks++;
1417                 } else {
1418                         /*
1419                          * Must be a spare disk ..
1420                          */
1421                         printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev));
1422                         disk->number = desc->number;
1423                         disk->raid_disk = raid_disk;
1424                         disk->dev = rdev->dev;
1425 
1426                         disk->operational = 0;
1427                         disk->write_only = 0;
1428                         disk->spare = 1;
1429                         disk->used_slot = 1;
1430                 }
1431         }
1432 
1433         for (i = 0; i < MD_SB_DISKS; i++) {
1434                 desc = sb->disks + i;
1435                 raid_disk = desc->raid_disk;
1436                 disk = conf->disks + raid_disk;
1437 
1438                 if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
1439                         !conf->disks[raid_disk].used_slot) {
1440 
1441                         disk->number = desc->number;
1442                         disk->raid_disk = raid_disk;
1443                         disk->dev = MKDEV(0,0);
1444 
1445                         disk->operational = 0;
1446                         disk->write_only = 0;
1447                         disk->spare = 0;
1448                         disk->used_slot = 1;
1449                 }
1450         }
1451 
1452         conf->raid_disks = sb->raid_disks;
1453         /*
1454          * 0 for a fully functional array, 1 for a degraded array.
1455          */
1456         conf->failed_disks = conf->raid_disks - conf->working_disks;
1457         conf->mddev = mddev;
1458         conf->chunk_size = sb->chunk_size;
1459         conf->level = sb->level;
1460         conf->algorithm = sb->layout;
1461         conf->max_nr_stripes = NR_STRIPES;
1462 
1463 #if 0
1464         for (i = 0; i < conf->raid_disks; i++) {
1465                 if (!conf->disks[i].used_slot) {
1466                         MD_BUG();
1467                         goto abort;
1468                 }
1469         }
1470 #endif
1471         if (!conf->chunk_size || conf->chunk_size % 4) {
1472                 printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev));
1473                 goto abort;
1474         }
1475         if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
1476                 printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
1477                 goto abort;
1478         }
1479         if (conf->failed_disks > 1) {
1480                 printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks);
1481                 goto abort;
1482         }
1483 
1484         if (conf->working_disks != sb->raid_disks) {
1485                 printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
1486                 start_recovery = 1;
1487         }
1488 
1489         if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN)) &&
1490                         check_consistency(mddev)) {
1491                 printk(KERN_ERR "raid5: detected raid-5 superblock xor inconsistency -- running resync\n");
1492                 sb->state &= ~(1 << MD_SB_CLEAN);
1493         }
1494 
1495         {
1496                 const char * name = "raid5d";
1497 
1498                 conf->thread = md_register_thread(raid5d, conf, name);
1499                 if (!conf->thread) {
1500                         printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
1501                         goto abort;
1502                 }
1503         }
1504 
1505         memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
1506                  conf->raid_disks * ((sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
1507         if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
1508                 printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
1509                 shrink_stripes(conf, conf->max_nr_stripes);
1510                 goto abort;
1511         } else
1512                 printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
1513 
1514         /*
1515          * Regenerate the "device is in sync with the raid set" bit for
1516          * each device.
1517          */
1518         for (i = 0; i < MD_SB_DISKS ; i++) {
1519                 mark_disk_nonsync(sb->disks + i);
1520                 for (j = 0; j < sb->raid_disks; j++) {
1521                         if (!conf->disks[j].operational)
1522                                 continue;
1523                         if (sb->disks[i].number == conf->disks[j].number)
1524                                 mark_disk_sync(sb->disks + i);
1525                 }
1526         }
1527         sb->active_disks = conf->working_disks;
1528 
1529         if (sb->active_disks == sb->raid_disks)
1530                 printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
1531         else
1532                 printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
1533 
1534         if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
1535                 const char * name = "raid5syncd";
1536 
1537                 conf->resync_thread = md_register_thread(raid5syncd, conf,name);
1538                 if (!conf->resync_thread) {
1539                         printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
1540                         goto abort;
1541                 }
1542 
1543                 printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev));
1544                 conf->resync_parity = 1;
1545                 md_wakeup_thread(conf->resync_thread);
1546         }
1547 
1548         print_raid5_conf(conf);
1549         if (start_recovery)
1550                 md_recover_arrays();
1551         print_raid5_conf(conf);
1552 
1553         /* Ok, everything is just fine now */
1554         return (0);
1555 abort:
1556         if (conf) {
1557                 print_raid5_conf(conf);
1558                 if (conf->stripe_hashtbl)
1559                         free_pages((unsigned long) conf->stripe_hashtbl,
1560                                                         HASH_PAGES_ORDER);
1561                 kfree(conf);
1562         }
1563         mddev->private = NULL;
1564         printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev));
1565         MOD_DEC_USE_COUNT;
1566         return -EIO;
1567 }
1568 
1569 static int raid5_stop_resync (mddev_t *mddev)
1570 {
1571         raid5_conf_t *conf = mddev_to_conf(mddev);
1572         mdk_thread_t *thread = conf->resync_thread;
1573 
1574         if (thread) {
1575                 if (conf->resync_parity) {
1576                         conf->resync_parity = 2;
1577                         md_interrupt_thread(thread);
1578                         printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n");
1579                         return 1;
1580                 }
1581                 return 0;
1582         }
1583         return 0;
1584 }
1585 
1586 static int raid5_restart_resync (mddev_t *mddev)
1587 {
1588         raid5_conf_t *conf = mddev_to_conf(mddev);
1589 
1590         if (conf->resync_parity) {
1591                 if (!conf->resync_thread) {
1592                         MD_BUG();
1593                         return 0;
1594                 }
1595                 printk("raid5: waking up raid5resync.\n");
1596                 conf->resync_parity = 1;
1597                 md_wakeup_thread(conf->resync_thread);
1598                 return 1;
1599         } else
1600                 printk("raid5: no restart-resync needed.\n");
1601         return 0;
1602 }
1603 
1604 
1605 static int raid5_stop (mddev_t *mddev)
1606 {
1607         raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1608 
1609         if (conf->resync_thread)
1610                 md_unregister_thread(conf->resync_thread);
1611         md_unregister_thread(conf->thread);
1612         shrink_stripes(conf, conf->max_nr_stripes);
1613         free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
1614         kfree(conf);
1615         mddev->private = NULL;
1616         MOD_DEC_USE_COUNT;
1617         return 0;
1618 }
1619 
1620 #if RAID5_DEBUG
1621 static void print_sh (struct stripe_head *sh)
1622 {
1623         int i;
1624 
1625         printk("sh %lu, size %d, pd_idx %d, state %ld.\n", sh->sector, sh->size, sh->pd_idx, sh->state);
1626         printk("sh %lu,  count %d.\n", sh->sector, atomic_read(&sh->count));
1627         printk("sh %lu, ", sh->sector);
1628         for (i = 0; i < MD_SB_DISKS; i++) {
1629                 if (sh->bh_cache[i])
1630                         printk("(cache%d: %p %ld) ", i, sh->bh_cache[i], sh->bh_cache[i]->b_state);
1631         }
1632         printk("\n");
1633 }
1634 
1635 static void printall (raid5_conf_t *conf)
1636 {
1637         struct stripe_head *sh;
1638         int i;
1639 
1640         md_spin_lock_irq(&conf->device_lock);
1641         for (i = 0; i < NR_HASH; i++) {
1642                 sh = conf->stripe_hashtbl[i];
1643                 for (; sh; sh = sh->hash_next) {
1644                         if (sh->raid_conf != conf)
1645                                 continue;
1646                         print_sh(sh);
1647                 }
1648         }
1649         md_spin_unlock_irq(&conf->device_lock);
1650 
1651         PRINTK("--- raid5d inactive\n");
1652 }
1653 #endif
1654 
1655 static int raid5_status (char *page, mddev_t *mddev)
1656 {
1657         raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1658         mdp_super_t *sb = mddev->sb;
1659         int sz = 0, i;
1660 
1661         sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout);
1662         sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks);
1663         for (i = 0; i < conf->raid_disks; i++)
1664                 sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_");
1665         sz += sprintf (page+sz, "]");
1666 #if RAID5_DEBUG
1667 #define D(x) \
1668         sz += sprintf (page+sz, "<"#x":%d>", atomic_read(&conf->x))
1669         printall(conf);
1670 #endif
1671         return sz;
1672 }
1673 
1674 static void print_raid5_conf (raid5_conf_t *conf)
1675 {
1676         int i;
1677         struct disk_info *tmp;
1678 
1679         printk("RAID5 conf printout:\n");
1680         if (!conf) {
1681                 printk("(conf==NULL)\n");
1682                 return;
1683         }
1684         printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
1685                  conf->working_disks, conf->failed_disks);
1686 
1687 #if RAID5_DEBUG
1688         for (i = 0; i < MD_SB_DISKS; i++) {
1689 #else
1690         for (i = 0; i < conf->working_disks+conf->failed_disks; i++) {
1691 #endif
1692                 tmp = conf->disks + i;
1693                 printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
1694                         i, tmp->spare,tmp->operational,
1695                         tmp->number,tmp->raid_disk,tmp->used_slot,
1696                         partition_name(tmp->dev));
1697         }
1698 }
1699 
/*
 * raid5_diskop - carry out one disk operation on the in-core raid5
 * configuration.
 *
 * @mddev: array to operate on; ->private is its raid5_conf_t, ->sb its
 *         superblock (sb->disks[] is only modified by DISKOP_SPARE_ACTIVE).
 * @d:     in/out pointer to the superblock descriptor of the disk the
 *         operation refers to.  DISKOP_SPARE_ACTIVE redirects *d to the
 *         descriptor at the formerly failed position.
 * @state: DISKOP_SPARE_ACTIVE, DISKOP_SPARE_WRITE, DISKOP_SPARE_INACTIVE,
 *         DISKOP_HOT_REMOVE_DISK or DISKOP_HOT_ADD_DISK.
 *
 * The work is done in two switch statements: the first one only looks up
 * the disk slot(s) involved, the second one applies the change.  Both run
 * with conf->device_lock held (md_spin_lock_irq).  The configuration is
 * printed before the lock is taken and again after it is released.
 *
 * Returns 0 on success, -EBUSY when DISKOP_HOT_REMOVE_DISK names a disk
 * that is still operational, and 1 when an inconsistency was detected
 * (each such case also calls MD_BUG()) or @state is not a known operation.
 */
1700 static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
1701 {
1702         int err = 0;
1703         int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
1704         raid5_conf_t *conf = mddev->private;
1705         struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
1706         mdp_super_t *sb = mddev->sb;
1707         mdp_disk_t *failed_desc, *spare_desc, *added_desc;
1708 
1709         print_raid5_conf(conf);
1710         md_spin_lock_irq(&conf->device_lock);
1711         /*
1712          * find the disk ...
1713          */
             /* Phase 1: lookup only, no state is modified in this switch. */
1714         switch (state) {
1715 
1716         case DISKOP_SPARE_ACTIVE:
1717 
1718                 /*
1719                  * Find the failed disk within the RAID5 configuration ...
1720                  * (this can only be in the first conf->raid_disks part)
1721                  */
1722                 for (i = 0; i < conf->raid_disks; i++) {
1723                         tmp = conf->disks + i;
1724                         if ((!tmp->operational && !tmp->spare) ||
1725                                         !tmp->used_slot) {
1726                                 failed_disk = i;
1727                                 break;
1728                         }
1729                 }
1730                 /*
1731                  * When we activate a spare disk we _must_ have a disk in
1732                  * the lower (active) part of the array to replace.
1733                  */
                     /*
                      * The second test is purely defensive: the loop above
                      * never produces an index >= conf->raid_disks.
                      */
1734                 if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
1735                         MD_BUG();
1736                         err = 1;
1737                         goto abort;
1738                 }
1739                 /* fall through */
1740 
1741         case DISKOP_SPARE_WRITE:
1742         case DISKOP_SPARE_INACTIVE:
1743 
1744                 /*
1745                  * Find the spare disk ... (can only be in the 'high'
1746                  * area of the array)
1747                  */
1748                 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
1749                         tmp = conf->disks + i;
1750                         if (tmp->spare && tmp->number == (*d)->number) {
1751                                 spare_disk = i;
1752                                 break;
1753                         }
1754                 }
1755                 if (spare_disk == -1) {
1756                         MD_BUG();
1757                         err = 1;
1758                         goto abort;
1759                 }
1760                 break;
1761 
1762         case DISKOP_HOT_REMOVE_DISK:
1763 
                     /*
                      * Look through all slots for the used one whose number
                      * matches *d; an operational disk is refused with -EBUSY.
                      */
1764                 for (i = 0; i < MD_SB_DISKS; i++) {
1765                         tmp = conf->disks + i;
1766                         if (tmp->used_slot && (tmp->number == (*d)->number)) {
1767                                 if (tmp->operational) {
1768                                         err = -EBUSY;
1769                                         goto abort;
1770                                 }
1771                                 removed_disk = i;
1772                                 break;
1773                         }
1774                 }
1775                 if (removed_disk == -1) {
1776                         MD_BUG();
1777                         err = 1;
1778                         goto abort;
1779                 }
1780                 break;
1781 
1782         case DISKOP_HOT_ADD_DISK:
1783 
                     /* Take the first unused slot at or above conf->raid_disks. */
1784                 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
1785                         tmp = conf->disks + i;
1786                         if (!tmp->used_slot) {
1787                                 added_disk = i;
1788                                 break;
1789                         }
1790                 }
1791                 if (added_disk == -1) {
1792                         MD_BUG();
1793                         err = 1;
1794                         goto abort;
1795                 }
1796                 break;
1797         }
1798 
             /*
              * Phase 2: apply the operation to the slot(s) found above.  An
              * unknown state falls into the default case at the end.
              */
1799         switch (state) {
1800         /*
1801          * Switch the spare disk to write-only mode:
1802          */
1803         case DISKOP_SPARE_WRITE:
                     /* conf->spare already being set is treated as a bug. */
1804                 if (conf->spare) {
1805                         MD_BUG();
1806                         err = 1;
1807                         goto abort;
1808                 }
1809                 sdisk = conf->disks + spare_disk;
1810                 sdisk->operational = 1;
1811                 sdisk->write_only = 1;
1812                 conf->spare = sdisk;
1813                 break;
1814         /*
1815          * Deactivate a spare disk:
1816          */
1817         case DISKOP_SPARE_INACTIVE:
1818                 sdisk = conf->disks + spare_disk;
1819                 sdisk->operational = 0;
1820                 sdisk->write_only = 0;
1821                 /*
1822                  * Was the spare being resynced?
1823                  */
1824                 if (conf->spare == sdisk)
1825                         conf->spare = NULL;
1826                 break;
1827         /*
1828          * Activate (mark read-write) the (now sync) spare disk,
1829          * which means we switch it's 'raid position' (->raid_disk)
1830          * with the failed disk. (only the first 'conf->raid_disks'
1831          * slots are used for 'real' disks and we must preserve this
1832          * property)
1833          */
1834         case DISKOP_SPARE_ACTIVE:
1835                 if (!conf->spare) {
1836                         MD_BUG();
1837                         err = 1;
1838                         goto abort;
1839                 }
1840                 sdisk = conf->disks + spare_disk;
1841                 fdisk = conf->disks + failed_disk;
1842 
                     /* Superblock descriptors are looked up by disk number. */
1843                 spare_desc = &sb->disks[sdisk->number];
1844                 failed_desc = &sb->disks[fdisk->number];
1845 
                     /*
                      * Consistency checks before anything is swapped: the
                      * caller's descriptor must be the spare's, and for both
                      * disks descriptor, in-core slot and array index must
                      * agree on raid_disk.
                      */
1846                 if (spare_desc != *d) {
1847                         MD_BUG();
1848                         err = 1;
1849                         goto abort;
1850                 }
1851 
1852                 if (spare_desc->raid_disk != sdisk->raid_disk) {
1853                         MD_BUG();
1854                         err = 1;
1855                         goto abort;
1856                 }
1857                         
1858                 if (sdisk->raid_disk != spare_disk) {
1859                         MD_BUG();
1860                         err = 1;
1861                         goto abort;
1862                 }
1863 
1864                 if (failed_desc->raid_disk != fdisk->raid_disk) {
1865                         MD_BUG();
1866                         err = 1;
1867                         goto abort;
1868                 }
1869 
1870                 if (fdisk->raid_disk != failed_disk) {
1871                         MD_BUG();
1872                         err = 1;
1873                         goto abort;
1874                 }
1875 
1876                 /*
1877                  * do the switch finally
1878                  */
1879                 xchg_values(*spare_desc, *failed_desc);
1880                 xchg_values(*fdisk, *sdisk);
1881 
1882                 /*
1883                  * (careful, 'failed' and 'spare' are switched from now on)
1884                  *
1885                  * we want to preserve linear numbering and we want to
1886                  * give the proper raid_disk number to the now activated
1887                  * disk. (this means we switch back these values)
1888                  */
1889         
1890                 xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
1891                 xchg_values(sdisk->raid_disk, fdisk->raid_disk);
1892                 xchg_values(spare_desc->number, failed_desc->number);
1893                 xchg_values(sdisk->number, fdisk->number);
1894 
                     /*
                      * Hand back the descriptor at the formerly failed
                      * position; after the swaps above it carries the former
                      * spare's contents with that position's number/raid_disk.
                      */
1895                 *d = failed_desc;
1896 
                     /*
                      * sdisk now holds what used to be the failed disk's
                      * entry; with no device attached the slot is freed.
                      */
1897                 if (sdisk->dev == MKDEV(0,0))
1898                         sdisk->used_slot = 0;
1899 
1900                 /*
1901                  * this really activates the spare.
1902                  */
1903                 fdisk->spare = 0;
1904                 fdisk->write_only = 0;
1905 
1906                 /*
1907                  * if we activate a spare, we definitely replace a
1908                  * non-operational disk slot in the 'low' area of
1909                  * the disk array.
1910                  */
1911                 conf->failed_disks--;
1912                 conf->working_disks++;
1913                 conf->spare = NULL;
1914 
1915                 break;
1916 
1917         case DISKOP_HOT_REMOVE_DISK:
1918                 rdisk = conf->disks + removed_disk;
1919 
                     /* A spare flagged inside the active area is inconsistent. */
1920                 if (rdisk->spare && (removed_disk < conf->raid_disks)) {
1921                         MD_BUG();       
1922                         err = 1;
1923                         goto abort;
1924                 }
1925                 rdisk->dev = MKDEV(0,0);
1926                 rdisk->used_slot = 0;
1927 
1928                 break;
1929 
1930         case DISKOP_HOT_ADD_DISK:
1931                 adisk = conf->disks + added_disk;
1932                 added_desc = *d;
1933 
                     /* The free slot found must coincide with the descriptor's number. */
1934                 if (added_disk != added_desc->number) {
1935                         MD_BUG();       
1936                         err = 1;
1937                         goto abort;
1938                 }
1939 
1940                 adisk->number = added_desc->number;
1941                 adisk->raid_disk = added_desc->raid_disk;
1942                 adisk->dev = MKDEV(added_desc->major,added_desc->minor);
1943 
                     /* A hot-added disk starts out as a non-operational spare. */
1944                 adisk->operational = 0;
1945                 adisk->write_only = 0;
1946                 adisk->spare = 1;
1947                 adisk->used_slot = 1;
1948 
1949 
1950                 break;
1951 
1952         default:
1953                 MD_BUG();       
1954                 err = 1;
1955                 goto abort;
1956         }
             /* Common exit for success and failure: unlock, then dump the result. */
1957 abort:
1958         md_spin_unlock_irq(&conf->device_lock);
1959         print_raid5_conf(conf);
1960         return err;
1961 }
1962 
/*
 * Method table for the raid5 personality.  This is the definition of the
 * object forward-declared near the top of the file; raid5_init() passes it
 * to register_md_personality().  The initializer uses the GNU
 * "field: value" designator syntax.
 */
1963 static mdk_personality_t raid5_personality=
1964 {
1965         name:           "raid5",
1966         make_request:   raid5_make_request,
1967         run:            raid5_run,
1968         stop:           raid5_stop,
1969         status:         raid5_status,
1970         error_handler:  raid5_error,
1971         diskop:         raid5_diskop,
1972         stop_resync:    raid5_stop_resync,
1973         restart_resync: raid5_restart_resync,
1974         sync_request:   raid5_sync_request
1975 };
1976 
1977 static int md__init raid5_init (void)
1978 {
1979         return register_md_personality (RAID5, &raid5_personality);
1980 }
1981 
/*
 * Counterpart of raid5_init(): remove the raid5 personality registration.
 */
1982 static void raid5_exit (void)
1983 {
1984         unregister_md_personality (RAID5);
1985 }
1986 
/* Hook raid5_init()/raid5_exit() up as this driver's init and exit routines. */
1987 module_init(raid5_init);
1988 module_exit(raid5_exit);
1989 
1990 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~ [ freetext search ] ~ [ file search ] ~

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.