1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * raid10.c : Multiple Devices driver for Linux
4 *
5 * Copyright (C) 2000-2004 Neil Brown
6 *
7 * RAID-10 support for md.
8 *
 * Based on code in raid1.c.  See raid1.c for further copyright information.
10 */
11
12 #include <linux/slab.h>
13 #include <linux/delay.h>
14 #include <linux/blkdev.h>
15 #include <linux/module.h>
16 #include <linux/seq_file.h>
17 #include <linux/ratelimit.h>
18 #include <linux/kthread.h>
19 #include <linux/raid/md_p.h>
20 #include <trace/events/block.h>
21 #include "md.h"
22
23 #define RAID_1_10_NAME "raid10"
24 #include "raid10.h"
25 #include "raid0.h"
26 #include "md-bitmap.h"
27
28 /*
29 * RAID10 provides a combination of RAID0 and RAID1 functionality.
30 * The layout of data is defined by
31 * chunk_size
32 * raid_disks
33 * near_copies (stored in low byte of layout)
34 * far_copies (stored in second byte of layout)
35 * far_offset (stored in bit 16 of layout )
36 * use_far_sets (stored in bit 17 of layout )
37 * use_far_sets_bugfixed (stored in bit 18 of layout )
38 *
39 * The data to be stored is divided into chunks using chunksize. Each device
40 * is divided into far_copies sections. In each section, chunks are laid out
41 * in a style similar to raid0, but near_copies copies of each chunk is stored
42 * (each on a different drive). The starting device for each section is offset
43 * near_copies from the starting device of the previous section. Thus there
44 * are (near_copies * far_copies) of each chunk, and each is on a different
45 * drive. near_copies and far_copies must be at least one, and their product
46 * is at most raid_disks.
47 *
 * If far_offset is true, then the far_copies are handled a bit differently.
 * The copies are still in different stripes, but instead of being very far
 * apart on disk, they are in adjacent stripes.
51 *
52 * The far and offset algorithms are handled slightly differently if
53 * 'use_far_sets' is true. In this case, the array's devices are grouped into
54 * sets that are (near_copies * far_copies) in size. The far copied stripes
55 * are still shifted by 'near_copies' devices, but this shifting stays confined
56 * to the set rather than the entire array. This is done to improve the number
57 * of device combinations that can fail without causing the array to fail.
58 * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
59 * on a device):
60 * A B C D A B C D E
61 * ... ...
62 * D A B C E A B C D
63 * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
64 * [A B] [C D] [A B] [C D E]
65 * |...| |...| |...| | ... |
66 * [B A] [D C] [B A] [E C D]
67 */
68
/*
 * Forward declarations of static helpers that are used before the point
 * at which they are defined.
 */
static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int _enough(struct r10conf *conf, int previous, int ignore);
static int enough(struct r10conf *conf, int ignore);
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
				int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio);
static void end_reshape(struct r10conf *conf);
78
79 #include "raid1-10.c"
80
/* Empty "command" used when nothing extra has to run while waiting. */
#define NULL_CMD
/*
 * First command handed to wait_event_cmd(): drop the write side of
 * conf->resync_lock (re-enabling interrupts), then run @cmd.
 */
#define cmd_before(conf, cmd) \
	do { \
		write_sequnlock_irq(&(conf)->resync_lock); \
		cmd; \
	} while (0)
/*
 * Second command handed to wait_event_cmd(): take the write side of
 * conf->resync_lock again.
 */
#define cmd_after(conf) write_seqlock_irq(&(conf)->resync_lock)

/*
 * Wait on conf->wait_barrier until @cond is true.  Every caller in this
 * file holds conf->resync_lock (write side) when invoking it; the lock is
 * released by cmd_before() and re-taken by cmd_after() around the wait.
 * NOTE(review): wait_event_cmd() is understood to run the first command
 * before sleeping and the second after waking - confirm against wait.h.
 */
#define wait_event_barrier_cmd(conf, cond, cmd) \
	wait_event_cmd((conf)->wait_barrier, cond, cmd_before(conf, cmd), \
		       cmd_after(conf))

/* Same as wait_event_barrier_cmd() with no extra command. */
#define wait_event_barrier(conf, cond) \
	wait_event_barrier_cmd(conf, cond, NULL_CMD)
95
96 /*
97 * for resync bio, r10bio pointer can be retrieved from the per-bio
98 * 'struct resync_pages'.
99 */
get_resync_r10bio(struct bio * bio)100 static inline struct r10bio *get_resync_r10bio(struct bio *bio)
101 {
102 return get_resync_pages(bio)->raid_bio;
103 }
104
/*
 * Allocate a zeroed r10bio whose trailing devs[] array has one entry per
 * raid disk of the current geometry.  @data is the owning struct r10conf.
 * Returns NULL when the allocation fails.
 */
static void *r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct r10conf *conf = data;
	int nr_devs = conf->geo.raid_disks;
	/* bytes up to (not including) devs[nr_devs] */
	int bytes = offsetof(struct r10bio, devs[nr_devs]);

	return kzalloc(bytes, gfp_flags);
}
114
/* RESYNC_BLOCK_SIZE expressed in 512-byte sectors */
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
/* 32 times RESYNC_WINDOW, in bytes and in 512-byte sectors respectively */
#define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
122
/*
 * When performing a resync, we need to read and compare, so
 * we need as many pages as there are copies.
 * When performing a recovery, we need 2 bios, one for read,
 * one for write (we recover only one drive per r10buf)
 *
 * @data is the struct r10conf the buffer belongs to.  Returns an r10bio
 * with its bios and resync pages attached, or NULL if any allocation
 * fails, in which case everything allocated so far is released.
 */
static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct r10conf *conf = data;
	struct r10bio *r10_bio;
	struct bio *bio;
	int j;
	int nalloc, nalloc_rp;
	struct resync_pages *rps;

	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
	if (!r10_bio)
		return NULL;

	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
	    test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
		nalloc = conf->copies; /* resync */
	else
		nalloc = 2; /* recovery */

	/*
	 * allocate once for all bios: one resync_pages per bio, doubled
	 * when a replacement bio is allocated for every slot as well
	 */
	if (!conf->have_replacement)
		nalloc_rp = nalloc;
	else
		nalloc_rp = nalloc * 2;
	rps = kmalloc_array(nalloc_rp, sizeof(struct resync_pages), gfp_flags);
	if (!rps)
		goto out_free_r10bio;

	/*
	 * Allocate bios.  The loop runs from the highest slot down, so on
	 * failure slots j..nalloc-1 are the ones that may hold a bio; that
	 * is exactly the range out_free_bio walks.
	 */
	for (j = nalloc ; j-- ; ) {
		bio = bio_kmalloc(RESYNC_PAGES, gfp_flags);
		if (!bio)
			goto out_free_bio;
		bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0);
		r10_bio->devs[j].bio = bio;
		if (!conf->have_replacement)
			continue;
		bio = bio_kmalloc(RESYNC_PAGES, gfp_flags);
		if (!bio)
			goto out_free_bio;
		bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0);
		r10_bio->devs[j].repl_bio = bio;
	}
	/*
	 * Allocate RESYNC_PAGES data pages and attach them
	 * where needed.
	 */
	for (j = 0; j < nalloc; j++) {
		struct bio *rbio = r10_bio->devs[j].repl_bio;
		struct resync_pages *rp, *rp_repl;

		rp = &rps[j];
		/* replacement entries live in the second half of rps[] */
		if (rbio)
			rp_repl = &rps[nalloc + j];

		bio = r10_bio->devs[j].bio;

		/*
		 * Slot 0 always gets its own pages; the other slots get
		 * their own pages only when MD_RECOVERY_SYNC is set.
		 * Otherwise they copy slot 0's resync_pages.
		 * NOTE(review): resync_get_all_pages() presumably takes an
		 * extra reference on each shared page - confirm.
		 */
		if (!j || test_bit(MD_RECOVERY_SYNC,
				   &conf->mddev->recovery)) {
			if (resync_alloc_pages(rp, gfp_flags))
				goto out_free_pages;
		} else {
			memcpy(rp, &rps[0], sizeof(*rp));
			resync_get_all_pages(rp);
		}

		rp->raid_bio = r10_bio;
		bio->bi_private = rp;
		/* the replacement bio uses a copy of the same resync_pages */
		if (rbio) {
			memcpy(rp_repl, rp, sizeof(*rp));
			rbio->bi_private = rp_repl;
		}
	}

	return r10_bio;

out_free_pages:
	/* release the pages of every slot that was fully set up (< j) */
	while (--j >= 0)
		resync_free_pages(&rps[j]);

	/* all bios were allocated by now, so free every one of them */
	j = 0;
out_free_bio:
	for ( ; j < nalloc; j++) {
		if (r10_bio->devs[j].bio)
			bio_uninit(r10_bio->devs[j].bio);
		kfree(r10_bio->devs[j].bio);
		if (r10_bio->devs[j].repl_bio)
			bio_uninit(r10_bio->devs[j].repl_bio);
		kfree(r10_bio->devs[j].repl_bio);
	}
	kfree(rps);
out_free_r10bio:
	rbio_pool_free(r10_bio, conf);
	return NULL;
}
227
r10buf_pool_free(void * __r10_bio,void * data)228 static void r10buf_pool_free(void *__r10_bio, void *data)
229 {
230 struct r10conf *conf = data;
231 struct r10bio *r10bio = __r10_bio;
232 int j;
233 struct resync_pages *rp = NULL;
234
235 for (j = conf->copies; j--; ) {
236 struct bio *bio = r10bio->devs[j].bio;
237
238 if (bio) {
239 rp = get_resync_pages(bio);
240 resync_free_pages(rp);
241 bio_uninit(bio);
242 kfree(bio);
243 }
244
245 bio = r10bio->devs[j].repl_bio;
246 if (bio) {
247 bio_uninit(bio);
248 kfree(bio);
249 }
250 }
251
252 /* resync pages array stored in the 1st bio's .bi_private */
253 kfree(rp);
254
255 rbio_pool_free(r10bio, conf);
256 }
257
put_all_bios(struct r10conf * conf,struct r10bio * r10_bio)258 static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
259 {
260 int i;
261
262 for (i = 0; i < conf->geo.raid_disks; i++) {
263 struct bio **bio = & r10_bio->devs[i].bio;
264 if (!BIO_SPECIAL(*bio))
265 bio_put(*bio);
266 *bio = NULL;
267 bio = &r10_bio->devs[i].repl_bio;
268 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
269 bio_put(*bio);
270 *bio = NULL;
271 }
272 }
273
free_r10bio(struct r10bio * r10_bio)274 static void free_r10bio(struct r10bio *r10_bio)
275 {
276 struct r10conf *conf = r10_bio->mddev->private;
277
278 put_all_bios(conf, r10_bio);
279 mempool_free(r10_bio, &conf->r10bio_pool);
280 }
281
put_buf(struct r10bio * r10_bio)282 static void put_buf(struct r10bio *r10_bio)
283 {
284 struct r10conf *conf = r10_bio->mddev->private;
285
286 mempool_free(r10_bio, &conf->r10buf_pool);
287
288 lower_barrier(conf);
289 }
290
wake_up_barrier(struct r10conf * conf)291 static void wake_up_barrier(struct r10conf *conf)
292 {
293 if (wq_has_sleeper(&conf->wait_barrier))
294 wake_up(&conf->wait_barrier);
295 }
296
reschedule_retry(struct r10bio * r10_bio)297 static void reschedule_retry(struct r10bio *r10_bio)
298 {
299 unsigned long flags;
300 struct mddev *mddev = r10_bio->mddev;
301 struct r10conf *conf = mddev->private;
302
303 spin_lock_irqsave(&conf->device_lock, flags);
304 list_add(&r10_bio->retry_list, &conf->retry_list);
305 conf->nr_queued ++;
306 spin_unlock_irqrestore(&conf->device_lock, flags);
307
308 /* wake up frozen array... */
309 wake_up(&conf->wait_barrier);
310
311 md_wakeup_thread(mddev->thread);
312 }
313
314 /*
315 * raid_end_bio_io() is called when we have finished servicing a mirrored
316 * operation and are ready to return a success/failure code to the buffer
317 * cache layer.
318 */
raid_end_bio_io(struct r10bio * r10_bio)319 static void raid_end_bio_io(struct r10bio *r10_bio)
320 {
321 struct bio *bio = r10_bio->master_bio;
322 struct r10conf *conf = r10_bio->mddev->private;
323
324 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
325 bio->bi_status = BLK_STS_IOERR;
326
327 bio_endio(bio);
328 /*
329 * Wake up any possible resync thread that waits for the device
330 * to go idle.
331 */
332 allow_barrier(conf);
333
334 free_r10bio(r10_bio);
335 }
336
337 /*
338 * Update disk head position estimator based on IRQ completion info.
339 */
update_head_pos(int slot,struct r10bio * r10_bio)340 static inline void update_head_pos(int slot, struct r10bio *r10_bio)
341 {
342 struct r10conf *conf = r10_bio->mddev->private;
343
344 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
345 r10_bio->devs[slot].addr + (r10_bio->sectors);
346 }
347
348 /*
349 * Find the disk number which triggered given bio
350 */
find_bio_disk(struct r10conf * conf,struct r10bio * r10_bio,struct bio * bio,int * slotp,int * replp)351 static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
352 struct bio *bio, int *slotp, int *replp)
353 {
354 int slot;
355 int repl = 0;
356
357 for (slot = 0; slot < conf->geo.raid_disks; slot++) {
358 if (r10_bio->devs[slot].bio == bio)
359 break;
360 if (r10_bio->devs[slot].repl_bio == bio) {
361 repl = 1;
362 break;
363 }
364 }
365
366 update_head_pos(slot, r10_bio);
367
368 if (slotp)
369 *slotp = slot;
370 if (replp)
371 *replp = repl;
372 return r10_bio->devs[slot].devnum;
373 }
374
raid10_end_read_request(struct bio * bio)375 static void raid10_end_read_request(struct bio *bio)
376 {
377 int uptodate = !bio->bi_status;
378 struct r10bio *r10_bio = bio->bi_private;
379 int slot;
380 struct md_rdev *rdev;
381 struct r10conf *conf = r10_bio->mddev->private;
382
383 slot = r10_bio->read_slot;
384 rdev = r10_bio->devs[slot].rdev;
385 /*
386 * this branch is our 'one mirror IO has finished' event handler:
387 */
388 update_head_pos(slot, r10_bio);
389
390 if (uptodate) {
391 /*
392 * Set R10BIO_Uptodate in our master bio, so that
393 * we will return a good error code to the higher
394 * levels even if IO on some other mirrored buffer fails.
395 *
396 * The 'master' represents the composite IO operation to
397 * user-side. So if something waits for IO, then it will
398 * wait for the 'master' bio.
399 */
400 set_bit(R10BIO_Uptodate, &r10_bio->state);
401 } else if (!raid1_should_handle_error(bio)) {
402 uptodate = 1;
403 } else {
404 /* If all other devices that store this block have
405 * failed, we want to return the error upwards rather
406 * than fail the last device. Here we redefine
407 * "uptodate" to mean "Don't want to retry"
408 */
409 if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
410 rdev->raid_disk))
411 uptodate = 1;
412 }
413 if (uptodate) {
414 raid_end_bio_io(r10_bio);
415 rdev_dec_pending(rdev, conf->mddev);
416 } else {
417 /*
418 * oops, read error - keep the refcount on the rdev
419 */
420 pr_err_ratelimited("md/raid10:%s: %pg: rescheduling sector %llu\n",
421 mdname(conf->mddev),
422 rdev->bdev,
423 (unsigned long long)r10_bio->sector);
424 set_bit(R10BIO_ReadError, &r10_bio->state);
425 reschedule_retry(r10_bio);
426 }
427 }
428
close_write(struct r10bio * r10_bio)429 static void close_write(struct r10bio *r10_bio)
430 {
431 struct mddev *mddev = r10_bio->mddev;
432
433 md_write_end(mddev);
434 }
435
one_write_done(struct r10bio * r10_bio)436 static void one_write_done(struct r10bio *r10_bio)
437 {
438 if (atomic_dec_and_test(&r10_bio->remaining)) {
439 if (test_bit(R10BIO_WriteError, &r10_bio->state))
440 reschedule_retry(r10_bio);
441 else {
442 close_write(r10_bio);
443 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
444 reschedule_retry(r10_bio);
445 else
446 raid_end_bio_io(r10_bio);
447 }
448 }
449 }
450
/*
 * Completion handler for a write bio sent to one device (or to its
 * replacement).  Records the outcome on the r10bio, then accounts for the
 * finished write via one_write_done().
 */
static void raid10_end_write_request(struct bio *bio)
{
	struct r10bio *r10_bio = bio->bi_private;
	int dev;
	/* non-zero: drop our nr_pending reference on rdev before returning */
	int dec_rdev = 1;
	struct r10conf *conf = r10_bio->mddev->private;
	int slot, repl;
	struct md_rdev *rdev = NULL;
	/* bio to release at the very end, if any */
	struct bio *to_put = NULL;
	/* errors are ignored for bios we should not handle and for discards */
	bool ignore_error = !raid1_should_handle_error(bio) ||
		(bio->bi_status && bio_op(bio) == REQ_OP_DISCARD);

	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);

	if (repl)
		rdev = conf->mirrors[dev].replacement;
	/* no replacement present: fall back to the main rdev of this mirror */
	if (!rdev) {
		smp_rmb();
		repl = 0;
		rdev = conf->mirrors[dev].rdev;
	}
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	if (bio->bi_status && !ignore_error) {
		if (repl)
			/* Never record new bad blocks to replacement,
			 * just fail it.
			 */
			md_error(rdev->mddev, rdev);
		else {
			set_bit(WriteErrorSeen,	&rdev->flags);
			if (!test_and_set_bit(WantReplacement, &rdev->flags))
				set_bit(MD_RECOVERY_NEEDED,
					&rdev->mddev->recovery);

			/* keep the rdev reference while the error is pending */
			dec_rdev = 0;
			if (test_bit(FailFast, &rdev->flags) &&
			    (bio->bi_opf & MD_FAILFAST)) {
				md_error(rdev->mddev, rdev);
			}

			/*
			 * When the device is faulty, it is not necessary to
			 * handle write error.
			 */
			if (!test_bit(Faulty, &rdev->flags))
				set_bit(R10BIO_WriteError, &r10_bio->state);
			else {
				/* Fail the request */
				r10_bio->devs[slot].bio = NULL;
				to_put = bio;
				dec_rdev = 1;
			}
		}
	} else {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 *
		 * Do not set R10BIO_Uptodate if the current device is
		 * rebuilding or Faulty. This is because we cannot use
		 * such device for properly reading the data back (we could
		 * potentially use it, if the current write would have fallen
		 * before rdev->recovery_offset, but for simplicity we don't
		 * check this here).
		 */
		if (test_bit(In_sync, &rdev->flags) &&
		    !test_bit(Faulty, &rdev->flags))
			set_bit(R10BIO_Uptodate, &r10_bio->state);

		/* Maybe we can clear some bad blocks. */
		if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
				      r10_bio->sectors) &&
		    !ignore_error) {
			bio_put(bio);
			/* mark the slot; the rdev reference is kept for now */
			if (repl)
				r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
			else
				r10_bio->devs[slot].bio = IO_MADE_GOOD;
			dec_rdev = 0;
			set_bit(R10BIO_MadeGood, &r10_bio->state);
		}
	}

	/*
	 *
	 * Let's see if all mirrored write operations have finished
	 * already.
	 */
	one_write_done(r10_bio);
	if (dec_rdev)
		rdev_dec_pending(rdev, conf->mddev);
	if (to_put)
		bio_put(to_put);
}
552
553 /*
554 * RAID10 layout manager
555 * As well as the chunksize and raid_disks count, there are two
556 * parameters: near_copies and far_copies.
557 * near_copies * far_copies must be <= raid_disks.
558 * Normally one of these will be 1.
559 * If both are 1, we get raid0.
560 * If near_copies == raid_disks, we get raid1.
561 *
562 * Chunks are laid out in raid0 style with near_copies copies of the
563 * first chunk, followed by near_copies copies of the next chunk and
564 * so on.
565 * If far_copies > 1, then after 1/far_copies of the array has been assigned
566 * as described above, we start again with a device offset of near_copies.
567 * So we effectively have another copy of the whole array further down all
568 * the drives, but with blocks on different drives.
 * With this layout, a block is never stored twice on the same device.
570 *
571 * raid10_find_phys finds the sector offset of a given virtual sector
572 * on each device that it is on.
573 *
574 * raid10_find_virt does the reverse mapping, from a device and a
575 * sector offset to a virtual address
576 */
577
/*
 * Fill r10bio->devs[] with the device number and device sector of every
 * copy of the data at r10bio->sector, using the layout described by @geo.
 * Slots are filled in order: each near copy is followed directly by its
 * (far_copies - 1) far copies.
 */
static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
{
	int n,f;
	sector_t sector;
	sector_t chunk;
	sector_t stripe;
	int dev;
	int slot = 0;
	int last_far_set_start, last_far_set_size;

	/*
	 * The last far set starts at the beginning of the last complete
	 * set and also takes the leftover devices (raid_disks modulo
	 * far_set_size), so it may be larger than the other sets.
	 */
	last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
	last_far_set_start *= geo->far_set_size;

	last_far_set_size = geo->far_set_size;
	last_far_set_size += (geo->raid_disks % geo->far_set_size);

	/* now calculate first sector/dev */
	chunk = r10bio->sector >> geo->chunk_shift;
	sector = r10bio->sector & geo->chunk_mask;

	chunk *= geo->near_copies;
	stripe = chunk;
	/* sector_div() leaves the quotient in 'stripe' and returns the remainder */
	dev = sector_div(stripe, geo->raid_disks);
	if (geo->far_offset)
		stripe *= geo->far_copies;

	sector += stripe << geo->chunk_shift;

	/* and calculate all the others */
	for (n = 0; n < geo->near_copies; n++) {
		int d = dev;
		int set;
		sector_t s = sector;
		r10bio->devs[slot].devnum = d;
		r10bio->devs[slot].addr = s;
		slot++;

		for (f = 1; f < geo->far_copies; f++) {
			set = d / geo->far_set_size;
			/* each far copy is shifted by near_copies devices ... */
			d += geo->near_copies;

			/* ... wrapping around inside the far set it belongs to */
			if ((geo->raid_disks % geo->far_set_size) &&
			    (d > last_far_set_start)) {
				d -= last_far_set_start;
				d %= last_far_set_size;
				d += last_far_set_start;
			} else {
				d %= geo->far_set_size;
				d += geo->far_set_size * set;
			}
			s += geo->stride;
			r10bio->devs[slot].devnum = d;
			r10bio->devs[slot].addr = s;
			slot++;
		}
		dev++;
		/* past the last device: wrap to device 0, one chunk further on */
		if (dev >= geo->raid_disks) {
			dev = 0;
			sector += (geo->chunk_mask + 1);
		}
	}
}
640
raid10_find_phys(struct r10conf * conf,struct r10bio * r10bio)641 static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
642 {
643 struct geom *geo = &conf->geo;
644
645 if (conf->reshape_progress != MaxSector &&
646 ((r10bio->sector >= conf->reshape_progress) !=
647 conf->mddev->reshape_backwards)) {
648 set_bit(R10BIO_Previous, &r10bio->state);
649 geo = &conf->prev;
650 } else
651 clear_bit(R10BIO_Previous, &r10bio->state);
652
653 __raid10_find_phys(geo, r10bio);
654 }
655
/*
 * Reverse mapping: given a device number @dev and a sector offset
 * @sector on that device, return the virtual (array) sector stored there,
 * according to the current geometry.
 */
static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
{
	sector_t offset, chunk, vchunk;
	/* Never use conf->prev as this is only called during resync
	 * or recovery, so reshape isn't happening
	 */
	struct geom *geo = &conf->geo;
	/* first device and size of the far set that @dev belongs to */
	int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
	int far_set_size = geo->far_set_size;
	int last_far_set_start;

	/*
	 * When raid_disks is not a multiple of far_set_size, the last far
	 * set also contains the leftover devices and is therefore larger.
	 */
	if (geo->raid_disks % geo->far_set_size) {
		last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
		last_far_set_start *= geo->far_set_size;

		if (dev >= last_far_set_start) {
			far_set_size = geo->far_set_size;
			far_set_size += (geo->raid_disks % geo->far_set_size);
			far_set_start = last_far_set_start;
		}
	}

	/* offset within the chunk is the same in both address spaces */
	offset = sector & geo->chunk_mask;
	if (geo->far_offset) {
		int fc;
		chunk = sector >> geo->chunk_shift;
		/* fc: remainder of chunk / far_copies; chunk becomes the quotient */
		fc = sector_div(chunk, geo->far_copies);
		/* undo the per-copy device shift, wrapping inside the far set */
		dev -= fc * geo->near_copies;
		if (dev < far_set_start)
			dev += far_set_size;
	} else {
		/* step back one far section at a time, undoing its device shift */
		while (sector >= geo->stride) {
			sector -= geo->stride;
			if (dev < (geo->near_copies + far_set_start))
				dev += far_set_size - geo->near_copies;
			else
				dev -= geo->near_copies;
		}
		chunk = sector >> geo->chunk_shift;
	}
	vchunk = chunk * geo->raid_disks + dev;
	sector_div(vchunk, geo->near_copies);
	return (vchunk << geo->chunk_shift) + offset;
}
700
701 /*
702 * This routine returns the disk from which the requested read should
703 * be done. There is a per-array 'next expected sequential IO' sector
704 * number - if this matches on the next IO then we use the last disk.
 * There is also a per-disk 'last known head position' sector that is
706 * maintained from IRQ contexts, both the normal and the resync IO
707 * completion handlers update this position correctly. If there is no
708 * perfect sequential match then we pick the disk whose head is closest.
709 *
710 * If there are 2 mirrors in the same 2 devices, performance degrades
711 * because position is mirror, not device based.
712 *
713 * The rdev for the device selected will have nr_pending incremented.
714 */
715
716 /*
717 * FIXME: possibly should rethink readbalancing and do it differently
718 * depending on near_copies / far_copies geometry.
719 */
/*
 * Pick the device to read r10_bio from.
 *
 * Returns the chosen rdev with its nr_pending incremented and
 * r10_bio->read_slot set, or NULL when no usable device was found.
 * *max_sectors is set to the number of sectors that can be read from
 * the chosen device (0 when nothing usable was seen).
 */
static struct md_rdev *read_balance(struct r10conf *conf,
				    struct r10bio *r10_bio,
				    int *max_sectors)
{
	const sector_t this_sector = r10_bio->sector;
	int disk, slot;
	int sectors = r10_bio->sectors;
	int best_good_sectors;
	sector_t new_distance, best_dist;
	struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL;
	int do_balance;
	/*
	 * NOTE(review): best_pending_slot has no initial value; it is only
	 * read when has_nonrot_disk is set, which presumably implies it was
	 * assigned in the loop - confirm.
	 */
	int best_dist_slot, best_pending_slot;
	bool has_nonrot_disk = false;
	unsigned int min_pending;
	struct geom *geo = &conf->geo;

	raid10_find_phys(conf, r10_bio);
	best_dist_slot = -1;
	min_pending = UINT_MAX;
	best_dist_rdev = NULL;
	best_pending_rdev = NULL;
	best_dist = MaxSector;
	best_good_sectors = 0;
	do_balance = 1;
	clear_bit(R10BIO_FailFast, &r10_bio->state);

	/* no balancing: take the first usable device in slot order */
	if (raid1_should_read_first(conf->mddev, this_sector, sectors))
		do_balance = 0;

	for (slot = 0; slot < conf->copies ; slot++) {
		sector_t first_bad;
		int bad_sectors;
		sector_t dev_sector;
		unsigned int pending;
		bool nonrot;

		if (r10_bio->devs[slot].bio == IO_BLOCKED)
			continue;
		disk = r10_bio->devs[slot].devnum;
		/*
		 * Prefer the replacement unless it is missing, Faulty, or
		 * not yet recovered far enough to cover this request.
		 */
		rdev = conf->mirrors[disk].replacement;
		if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
		    r10_bio->devs[slot].addr + sectors >
		    rdev->recovery_offset)
			rdev = conf->mirrors[disk].rdev;
		if (rdev == NULL ||
		    test_bit(Faulty, &rdev->flags))
			continue;
		if (!test_bit(In_sync, &rdev->flags) &&
		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
			continue;

		dev_sector = r10_bio->devs[slot].addr;
		if (is_badblock(rdev, dev_sector, sectors,
				&first_bad, &bad_sectors)) {
			if (best_dist < MaxSector)
				/* Already have a better slot */
				continue;
			if (first_bad <= dev_sector) {
				/* Cannot read here.  If this is the
				 * 'primary' device, then we must not read
				 * beyond 'bad_sectors' from another device.
				 */
				bad_sectors -= (dev_sector - first_bad);
				if (!do_balance && sectors > bad_sectors)
					sectors = bad_sectors;
				if (best_good_sectors > sectors)
					best_good_sectors = sectors;
			} else {
				/* the start of the range is readable here */
				sector_t good_sectors =
					first_bad - dev_sector;
				if (good_sectors > best_good_sectors) {
					best_good_sectors = good_sectors;
					best_dist_slot = slot;
					best_dist_rdev = rdev;
				}
				if (!do_balance)
					/* Must read from here */
					break;
			}
			continue;
		} else
			best_good_sectors = sectors;

		/* fully readable device found and no balancing wanted */
		if (!do_balance)
			break;

		/* among non-rotational devices track the least busy one */
		nonrot = bdev_nonrot(rdev->bdev);
		has_nonrot_disk |= nonrot;
		pending = atomic_read(&rdev->nr_pending);
		if (min_pending > pending && nonrot) {
			min_pending = pending;
			best_pending_slot = slot;
			best_pending_rdev = rdev;
		}

		if (best_dist_slot >= 0)
			/* At least 2 disks to choose from so failfast is OK */
			set_bit(R10BIO_FailFast, &r10_bio->state);
		/* This optimisation is debatable, and completely destroys
		 * sequential read speed for 'far copies' arrays.  So only
		 * keep it for 'near' arrays, and review those later.
		 */
		if (geo->near_copies > 1 && !pending)
			new_distance = 0;

		/* for far > 1 always use the lowest address */
		else if (geo->far_copies > 1)
			new_distance = r10_bio->devs[slot].addr;
		else
			new_distance = abs(r10_bio->devs[slot].addr -
					   conf->mirrors[disk].head_position);

		if (new_distance < best_dist) {
			best_dist = new_distance;
			best_dist_slot = slot;
			best_dist_rdev = rdev;
		}
	}
	/*
	 * The loop ran to completion (no 'break'): use the remembered best
	 * candidate.  With any non-rotational device present the least
	 * pending one wins, otherwise the closest one.
	 */
	if (slot >= conf->copies) {
		if (has_nonrot_disk) {
			slot = best_pending_slot;
			rdev = best_pending_rdev;
		} else {
			slot = best_dist_slot;
			rdev = best_dist_rdev;
		}
	}

	if (slot >= 0) {
		atomic_inc(&rdev->nr_pending);
		r10_bio->read_slot = slot;
	} else
		rdev = NULL;
	*max_sectors = best_good_sectors;

	return rdev;
}
857
flush_pending_writes(struct r10conf * conf)858 static void flush_pending_writes(struct r10conf *conf)
859 {
860 /* Any writes that have been queued but are awaiting
861 * bitmap updates get flushed here.
862 */
863 spin_lock_irq(&conf->device_lock);
864
865 if (conf->pending_bio_list.head) {
866 struct blk_plug plug;
867 struct bio *bio;
868
869 bio = bio_list_get(&conf->pending_bio_list);
870 spin_unlock_irq(&conf->device_lock);
871
872 /*
873 * As this is called in a wait_event() loop (see freeze_array),
874 * current->state might be TASK_UNINTERRUPTIBLE which will
875 * cause a warning when we prepare to wait again. As it is
876 * rare that this path is taken, it is perfectly safe to force
877 * us to go around the wait_event() loop again, so the warning
878 * is a false-positive. Silence the warning by resetting
879 * thread state
880 */
881 __set_current_state(TASK_RUNNING);
882
883 blk_start_plug(&plug);
884 raid1_prepare_flush_writes(conf->mddev);
885 wake_up(&conf->wait_barrier);
886
887 while (bio) { /* submit pending writes */
888 struct bio *next = bio->bi_next;
889
890 raid1_submit_write(bio);
891 bio = next;
892 cond_resched();
893 }
894 blk_finish_plug(&plug);
895 } else
896 spin_unlock_irq(&conf->device_lock);
897 }
898
899 /* Barriers....
900 * Sometimes we need to suspend IO while we do something else,
901 * either some resync/recovery, or reconfigure the array.
902 * To do this we raise a 'barrier'.
903 * The 'barrier' is a counter that can be raised multiple times
904 * to count how many activities are happening which preclude
905 * normal IO.
906 * We can only raise the barrier if there is no pending IO.
907 * i.e. if nr_pending == 0.
908 * We choose only to raise the barrier if no-one is waiting for the
909 * barrier to go down. This means that as soon as an IO request
910 * is ready, no other operations which require a barrier will start
911 * until the IO request has had a chance.
912 *
 * So: regular IO calls 'wait_barrier'.  When that returns there
 *    is no background IO happening.  It must arrange to call
 *    allow_barrier when it has finished its IO.
 * Background IO must call raise_barrier.  Once that returns
 *    there is no normal IO happening.  It must arrange to call
 *    lower_barrier when the particular background IO completes.
919 */
920
/*
 * Raise the barrier by one on behalf of background IO and wait until all
 * pending normal IO has completed.
 *
 * @force: skip waiting for blocked normal IO (nr_waiting) before raising.
 *         Only honoured when the barrier is already raised; otherwise a
 *         one-time warning is emitted and @force is ignored.
 */
static void raise_barrier(struct r10conf *conf, int force)
{
	write_seqlock_irq(&conf->resync_lock);

	if (WARN_ON_ONCE(force && !conf->barrier))
		force = false;

	/* Wait until no block IO is waiting (unless 'force') */
	wait_event_barrier(conf, force || !conf->nr_waiting);

	/* block any new IO from starting */
	WRITE_ONCE(conf->barrier, conf->barrier + 1);

	/*
	 * Now wait for all pending IO to complete; also wait while the
	 * barrier count has reached RESYNC_DEPTH
	 */
	wait_event_barrier(conf, !atomic_read(&conf->nr_pending) &&
				 conf->barrier < RESYNC_DEPTH);

	write_sequnlock_irq(&conf->resync_lock);
}
940
lower_barrier(struct r10conf * conf)941 static void lower_barrier(struct r10conf *conf)
942 {
943 unsigned long flags;
944
945 write_seqlock_irqsave(&conf->resync_lock, flags);
946 WRITE_ONCE(conf->barrier, conf->barrier - 1);
947 write_sequnlock_irqrestore(&conf->resync_lock, flags);
948 wake_up(&conf->wait_barrier);
949 }
950
stop_waiting_barrier(struct r10conf * conf)951 static bool stop_waiting_barrier(struct r10conf *conf)
952 {
953 struct bio_list *bio_list = current->bio_list;
954 struct md_thread *thread;
955
956 /* barrier is dropped */
957 if (!conf->barrier)
958 return true;
959
960 /*
961 * If there are already pending requests (preventing the barrier from
962 * rising completely), and the pre-process bio queue isn't empty, then
963 * don't wait, as we need to empty that queue to get the nr_pending
964 * count down.
965 */
966 if (atomic_read(&conf->nr_pending) && bio_list &&
967 (!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1])))
968 return true;
969
970 /* daemon thread must exist while handling io */
971 thread = rcu_dereference_protected(conf->mddev->thread, true);
972 /*
973 * move on if io is issued from raid10d(), nr_pending is not released
974 * from original io(see handle_read_error()). All raise barrier is
975 * blocked until this io is done.
976 */
977 if (thread->tsk == current) {
978 WARN_ON_ONCE(atomic_read(&conf->nr_pending) == 0);
979 return true;
980 }
981
982 return false;
983 }
984
/*
 * Try to account for a new normal IO without taking the write side of
 * resync_lock.  Returns true with nr_pending incremented when no barrier
 * was raised and resync_lock was not written in the meantime; returns
 * false (nr_pending unchanged overall) otherwise.
 */
static bool wait_barrier_nolock(struct r10conf *conf)
{
	unsigned int seq = read_seqbegin(&conf->resync_lock);

	if (READ_ONCE(conf->barrier))
		return false;

	atomic_inc(&conf->nr_pending);
	/* nobody wrote resync_lock since read_seqbegin(): we are done */
	if (!read_seqretry(&conf->resync_lock, seq))
		return true;

	/* raced with a writer: undo the increment and wake waiters if idle */
	if (atomic_dec_and_test(&conf->nr_pending))
		wake_up_barrier(conf);

	return false;
}
1001
/*
 * Account for a new normal IO, waiting for the barrier if necessary.
 *
 * @nowait: do not sleep; return false instead when the barrier is raised.
 *
 * Returns true with nr_pending incremented, or false (only possible with
 * @nowait) with nr_pending untouched.
 */
static bool wait_barrier(struct r10conf *conf, bool nowait)
{
	bool ret = true;

	/* fast path: no barrier raised */
	if (wait_barrier_nolock(conf))
		return true;

	write_seqlock_irq(&conf->resync_lock);
	if (conf->barrier) {
		/* Return false when nowait flag is set */
		if (nowait) {
			ret = false;
		} else {
			/* nr_waiting counts IO blocked on the barrier */
			conf->nr_waiting++;
			mddev_add_trace_msg(conf->mddev, "raid10 wait barrier");
			wait_event_barrier(conf, stop_waiting_barrier(conf));
			conf->nr_waiting--;
		}
		/* raise_barrier() may be waiting for nr_waiting to reach zero */
		if (!conf->nr_waiting)
			wake_up(&conf->wait_barrier);
	}
	/* Only increment nr_pending when we wait */
	if (ret)
		atomic_inc(&conf->nr_pending);
	write_sequnlock_irq(&conf->resync_lock);
	return ret;
}
1029
allow_barrier(struct r10conf * conf)1030 static void allow_barrier(struct r10conf *conf)
1031 {
1032 if ((atomic_dec_and_test(&conf->nr_pending)) ||
1033 (conf->array_freeze_pending))
1034 wake_up_barrier(conf);
1035 }
1036
freeze_array(struct r10conf * conf,int extra)1037 static void freeze_array(struct r10conf *conf, int extra)
1038 {
1039 /* stop syncio and normal IO and wait for everything to
1040 * go quiet.
1041 * We increment barrier and nr_waiting, and then
1042 * wait until nr_pending match nr_queued+extra
1043 * This is called in the context of one normal IO request
1044 * that has failed. Thus any sync request that might be pending
1045 * will be blocked by nr_pending, and we need to wait for
1046 * pending IO requests to complete or be queued for re-try.
1047 * Thus the number queued (nr_queued) plus this request (extra)
1048 * must match the number of pending IOs (nr_pending) before
1049 * we continue.
1050 */
1051 write_seqlock_irq(&conf->resync_lock);
1052 conf->array_freeze_pending++;
1053 WRITE_ONCE(conf->barrier, conf->barrier + 1);
1054 conf->nr_waiting++;
1055 wait_event_barrier_cmd(conf, atomic_read(&conf->nr_pending) ==
1056 conf->nr_queued + extra, flush_pending_writes(conf));
1057 conf->array_freeze_pending--;
1058 write_sequnlock_irq(&conf->resync_lock);
1059 }
1060
unfreeze_array(struct r10conf * conf)1061 static void unfreeze_array(struct r10conf *conf)
1062 {
1063 /* reverse the effect of the freeze */
1064 write_seqlock_irq(&conf->resync_lock);
1065 WRITE_ONCE(conf->barrier, conf->barrier - 1);
1066 conf->nr_waiting--;
1067 wake_up(&conf->wait_barrier);
1068 write_sequnlock_irq(&conf->resync_lock);
1069 }
1070
choose_data_offset(struct r10bio * r10_bio,struct md_rdev * rdev)1071 static sector_t choose_data_offset(struct r10bio *r10_bio,
1072 struct md_rdev *rdev)
1073 {
1074 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1075 test_bit(R10BIO_Previous, &r10_bio->state))
1076 return rdev->data_offset;
1077 else
1078 return rdev->new_data_offset;
1079 }
1080
/*
 * Plug callback for raid10 writes (registered through
 * raid1_add_bio_to_plug() by raid10_write_one_disk()).
 *
 * @cb:            plug callback embedded in a struct raid1_plug_cb.
 * @from_schedule: true when invoked from the scheduler; the queued writes
 *                 are then handed over to the md thread instead of being
 *                 submitted here.
 *
 * The plug structure is freed in both cases.
 */
static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb, cb);
	struct mddev *mddev = plug->cb.data;
	struct r10conf *conf = mddev->private;
	struct bio *bio, *next;

	if (from_schedule) {
		/* Defer: move everything onto the array's pending list. */
		spin_lock_irq(&conf->device_lock);
		bio_list_merge(&conf->pending_bio_list, &plug->pending);
		spin_unlock_irq(&conf->device_lock);
		wake_up_barrier(conf);
		md_wakeup_thread(mddev->thread);
		kfree(plug);
		return;
	}

	/* Not scheduling, so the write-out can be done right here. */
	bio = bio_list_get(&plug->pending);
	raid1_prepare_flush_writes(mddev);
	wake_up_barrier(conf);

	for (; bio; bio = next) {
		/* Fetch the link first; the bio is gone once submitted. */
		next = bio->bi_next;
		raid1_submit_write(bio);
		cond_resched();
	}
	kfree(plug);
}
1112
/*
 * 1. Register the new request and wait if the reconstruction thread has put
 *    up a barrier for new requests. Continue immediately if no resync is
 *    currently active.
 * 2. If the IO spans the reshape position, wait for the reshape to pass it.
 */
regular_request_wait(struct mddev * mddev,struct r10conf * conf,struct bio * bio,sector_t sectors)1119 static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
1120 struct bio *bio, sector_t sectors)
1121 {
1122 /* Bail out if REQ_NOWAIT is set for the bio */
1123 if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) {
1124 bio_wouldblock_error(bio);
1125 return false;
1126 }
1127 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1128 bio->bi_iter.bi_sector < conf->reshape_progress &&
1129 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
1130 allow_barrier(conf);
1131 if (bio->bi_opf & REQ_NOWAIT) {
1132 bio_wouldblock_error(bio);
1133 return false;
1134 }
1135 mddev_add_trace_msg(conf->mddev, "raid10 wait reshape");
1136 wait_event(conf->wait_barrier,
1137 conf->reshape_progress <= bio->bi_iter.bi_sector ||
1138 conf->reshape_progress >= bio->bi_iter.bi_sector +
1139 sectors);
1140 wait_barrier(conf, false);
1141 }
1142 return true;
1143 }
1144
/*
 * Issue a read for @r10_bio to one member device.
 *
 * @mddev:         array the request belongs to.
 * @bio:           the original read bio.
 * @r10_bio:       per-request state; a non-negative read_slot with an rdev
 *                 recorded marks this call as a retry after a read error.
 * @io_accounting: when true, run the bio through md_account_bio() before
 *                 cloning it.
 *
 * If the chosen device cannot serve the whole range, the bio is split: the
 * remainder is resubmitted and only the first max_sectors are read here.
 */
static void raid10_read_request(struct mddev *mddev, struct bio *bio,
				struct r10bio *r10_bio, bool io_accounting)
{
	struct r10conf *conf = mddev->private;
	struct md_rdev *err_rdev = NULL;
	struct md_rdev *rdev;
	struct bio *read_bio;
	char err_name[BDEVNAME_SIZE];
	gfp_t gfp = GFP_NOIO;
	int max_sectors;
	int slot = r10_bio->read_slot;

	if (slot >= 0 && r10_bio->devs[slot].rdev) {
		/*
		 * Error retry. The rdev stored in the r10_bio cannot be
		 * dereferenced safely, so look the device up in conf
		 * instead. If it has been disconnected already (unlikely)
		 * the device name is simply missing from the messages.
		 */
		int disk = r10_bio->devs[slot].devnum;

		/* raid10 is blocked on us, so __GFP_HIGH is a bit safer. */
		gfp = GFP_NOIO | __GFP_HIGH;

		err_rdev = conf->mirrors[disk].rdev;
		if (err_rdev) {
			snprintf(err_name, sizeof(err_name), "%pg",
				 err_rdev->bdev);
		} else {
			strcpy(err_name, "???");
			/* Only used as a flag from here on, never dereferenced. */
			err_rdev = r10_bio->devs[slot].rdev;
		}
	}

	if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) {
		/*
		 * NOTE(review): on failure regular_request_wait() has already
		 * completed the bio via bio_wouldblock_error() and holds no
		 * barrier reference - confirm raid_end_bio_io() is safe to
		 * call in that state.
		 */
		raid_end_bio_io(r10_bio);
		return;
	}

	rdev = read_balance(conf, r10_bio, &max_sectors);
	if (!rdev) {
		/* No device can serve this read. */
		if (err_rdev) {
			pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
					    mdname(mddev), err_name,
					    (unsigned long long)r10_bio->sector);
		}
		raid_end_bio_io(r10_bio);
		return;
	}
	if (err_rdev)
		pr_err_ratelimited("md/raid10:%s: %pg: redirecting sector %llu to another mirror\n",
				   mdname(mddev),
				   rdev->bdev,
				   (unsigned long long)r10_bio->sector);

	if (max_sectors < bio_sectors(bio)) {
		/* Read the head now, send the tail round again. */
		struct bio *head = bio_split(bio, max_sectors,
					     gfp, &conf->bio_split);

		bio_chain(head, bio);
		allow_barrier(conf);
		submit_bio_noacct(bio);
		wait_barrier(conf, false);
		bio = head;
		r10_bio->master_bio = bio;
		r10_bio->sectors = max_sectors;
	}
	/* Re-read the slot now that read_balance() has run. */
	slot = r10_bio->read_slot;

	if (io_accounting) {
		md_account_bio(mddev, &bio);
		r10_bio->master_bio = bio;
	}

	read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set);
	/* Member bios never carry REQ_NOWAIT. */
	read_bio->bi_opf &= ~REQ_NOWAIT;

	r10_bio->devs[slot].bio = read_bio;
	r10_bio->devs[slot].rdev = rdev;

	read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
		choose_data_offset(r10_bio, rdev);
	read_bio->bi_end_io = raid10_end_read_request;
	if (test_bit(FailFast, &rdev->flags) &&
	    test_bit(R10BIO_FailFast, &r10_bio->state))
		read_bio->bi_opf |= MD_FAILFAST;
	read_bio->bi_private = r10_bio;
	mddev_trace_remap(mddev, read_bio, r10_bio->sector);
	submit_bio_noacct(read_bio);
}
1237
/*
 * Clone @bio for one copy of a write and queue the clone for submission.
 *
 * @mddev:       array the request belongs to.
 * @r10_bio:     per-request state; devs[@n_copy] receives the clone.
 * @bio:         master bio being written.
 * @replacement: true to target the replacement device of the slot, false
 *               for the regular device.
 * @n_copy:      index into r10_bio->devs[] of the copy to write.
 *
 * The clone is added to the current plug if there is one, otherwise it is
 * put on conf->pending_bio_list and the md thread is woken.
 */
static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
				  struct bio *bio, bool replacement,
				  int n_copy)
{
	struct r10conf *conf = mddev->private;
	int devnum = r10_bio->devs[n_copy].devnum;
	struct md_rdev *rdev;
	struct bio *mbio;
	unsigned long flags;

	if (replacement)
		rdev = conf->mirrors[devnum].replacement;
	else
		rdev = conf->mirrors[devnum].rdev;

	mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, &mddev->bio_set);
	/* Member bios never carry REQ_NOWAIT. */
	mbio->bi_opf &= ~REQ_NOWAIT;
	if (replacement)
		r10_bio->devs[n_copy].repl_bio = mbio;
	else
		r10_bio->devs[n_copy].bio = mbio;

	mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
				   choose_data_offset(r10_bio, rdev));
	mbio->bi_end_io = raid10_end_write_request;
	if (!replacement &&
	    test_bit(FailFast, &conf->mirrors[devnum].rdev->flags) &&
	    enough(conf, devnum))
		mbio->bi_opf |= MD_FAILFAST;
	mbio->bi_private = r10_bio;
	mddev_trace_remap(mddev, mbio, r10_bio->sector);
	/*
	 * flush_pending_writes() needs the rdev, so it is smuggled through
	 * bi_bdev until the bio is really submitted.
	 */
	mbio->bi_bdev = (void *)rdev;

	atomic_inc(&r10_bio->remaining);

	if (raid1_add_bio_to_plug(mddev, mbio, raid10_unplug, conf->copies))
		return;

	/* No plug available: hand the bio to the md thread. */
	spin_lock_irqsave(&conf->device_lock, flags);
	bio_list_add(&conf->pending_bio_list, mbio);
	spin_unlock_irqrestore(&conf->device_lock, flags);
	md_wakeup_thread(mddev->thread);
}
1279
wait_blocked_dev(struct mddev * mddev,struct r10bio * r10_bio)1280 static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
1281 {
1282 int i;
1283 struct r10conf *conf = mddev->private;
1284 struct md_rdev *blocked_rdev;
1285
1286 retry_wait:
1287 blocked_rdev = NULL;
1288 for (i = 0; i < conf->copies; i++) {
1289 struct md_rdev *rdev, *rrdev;
1290
1291 rdev = conf->mirrors[i].rdev;
1292 rrdev = conf->mirrors[i].replacement;
1293 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1294 atomic_inc(&rdev->nr_pending);
1295 blocked_rdev = rdev;
1296 break;
1297 }
1298 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1299 atomic_inc(&rrdev->nr_pending);
1300 blocked_rdev = rrdev;
1301 break;
1302 }
1303
1304 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1305 sector_t dev_sector = r10_bio->devs[i].addr;
1306
1307 /*
1308 * Discard request doesn't care the write result
1309 * so it doesn't need to wait blocked disk here.
1310 */
1311 if (!r10_bio->sectors)
1312 continue;
1313
1314 if (rdev_has_badblock(rdev, dev_sector,
1315 r10_bio->sectors) < 0) {
1316 /*
1317 * Mustn't write here until the bad block
1318 * is acknowledged
1319 */
1320 atomic_inc(&rdev->nr_pending);
1321 set_bit(BlockedBadBlocks, &rdev->flags);
1322 blocked_rdev = rdev;
1323 break;
1324 }
1325 }
1326 }
1327
1328 if (unlikely(blocked_rdev)) {
1329 /* Have to wait for this device to get unblocked, then retry */
1330 allow_barrier(conf);
1331 mddev_add_trace_msg(conf->mddev,
1332 "raid10 %s wait rdev %d blocked",
1333 __func__, blocked_rdev->raid_disk);
1334 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1335 wait_barrier(conf, false);
1336 goto retry_wait;
1337 }
1338 }
1339
raid10_write_request(struct mddev * mddev,struct bio * bio,struct r10bio * r10_bio)1340 static void raid10_write_request(struct mddev *mddev, struct bio *bio,
1341 struct r10bio *r10_bio)
1342 {
1343 struct r10conf *conf = mddev->private;
1344 int i;
1345 sector_t sectors;
1346 int max_sectors;
1347
1348 if ((mddev_is_clustered(mddev) &&
1349 md_cluster_ops->area_resyncing(mddev, WRITE,
1350 bio->bi_iter.bi_sector,
1351 bio_end_sector(bio)))) {
1352 DEFINE_WAIT(w);
1353 /* Bail out if REQ_NOWAIT is set for the bio */
1354 if (bio->bi_opf & REQ_NOWAIT) {
1355 bio_wouldblock_error(bio);
1356 return;
1357 }
1358 for (;;) {
1359 prepare_to_wait(&conf->wait_barrier,
1360 &w, TASK_IDLE);
1361 if (!md_cluster_ops->area_resyncing(mddev, WRITE,
1362 bio->bi_iter.bi_sector, bio_end_sector(bio)))
1363 break;
1364 schedule();
1365 }
1366 finish_wait(&conf->wait_barrier, &w);
1367 }
1368
1369 sectors = r10_bio->sectors;
1370 if (!regular_request_wait(mddev, conf, bio, sectors)) {
1371 raid_end_bio_io(r10_bio);
1372 return;
1373 }
1374
1375 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1376 (mddev->reshape_backwards
1377 ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
1378 bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
1379 : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
1380 bio->bi_iter.bi_sector < conf->reshape_progress))) {
1381 /* Need to update reshape_position in metadata */
1382 mddev->reshape_position = conf->reshape_progress;
1383 set_mask_bits(&mddev->sb_flags, 0,
1384 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1385 md_wakeup_thread(mddev->thread);
1386 if (bio->bi_opf & REQ_NOWAIT) {
1387 allow_barrier(conf);
1388 bio_wouldblock_error(bio);
1389 return;
1390 }
1391 mddev_add_trace_msg(conf->mddev,
1392 "raid10 wait reshape metadata");
1393 wait_event(mddev->sb_wait,
1394 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
1395
1396 conf->reshape_safe = mddev->reshape_position;
1397 }
1398
1399 /* first select target devices under rcu_lock and
1400 * inc refcount on their rdev. Record them by setting
1401 * bios[x] to bio
1402 * If there are known/acknowledged bad blocks on any device
1403 * on which we have seen a write error, we want to avoid
1404 * writing to those blocks. This potentially requires several
1405 * writes to write around the bad blocks. Each set of writes
1406 * gets its own r10_bio with a set of bios attached.
1407 */
1408
1409 r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
1410 raid10_find_phys(conf, r10_bio);
1411
1412 wait_blocked_dev(mddev, r10_bio);
1413
1414 max_sectors = r10_bio->sectors;
1415
1416 for (i = 0; i < conf->copies; i++) {
1417 int d = r10_bio->devs[i].devnum;
1418 struct md_rdev *rdev, *rrdev;
1419
1420 rdev = conf->mirrors[d].rdev;
1421 rrdev = conf->mirrors[d].replacement;
1422 if (rdev && (test_bit(Faulty, &rdev->flags)))
1423 rdev = NULL;
1424 if (rrdev && (test_bit(Faulty, &rrdev->flags)))
1425 rrdev = NULL;
1426
1427 r10_bio->devs[i].bio = NULL;
1428 r10_bio->devs[i].repl_bio = NULL;
1429
1430 if (!rdev && !rrdev)
1431 continue;
1432 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1433 sector_t first_bad;
1434 sector_t dev_sector = r10_bio->devs[i].addr;
1435 int bad_sectors;
1436 int is_bad;
1437
1438 is_bad = is_badblock(rdev, dev_sector, max_sectors,
1439 &first_bad, &bad_sectors);
1440 if (is_bad && first_bad <= dev_sector) {
1441 /* Cannot write here at all */
1442 bad_sectors -= (dev_sector - first_bad);
1443 if (bad_sectors < max_sectors)
1444 /* Mustn't write more than bad_sectors
1445 * to other devices yet
1446 */
1447 max_sectors = bad_sectors;
1448 continue;
1449 }
1450 if (is_bad) {
1451 int good_sectors = first_bad - dev_sector;
1452 if (good_sectors < max_sectors)
1453 max_sectors = good_sectors;
1454 }
1455 }
1456 if (rdev) {
1457 r10_bio->devs[i].bio = bio;
1458 atomic_inc(&rdev->nr_pending);
1459 }
1460 if (rrdev) {
1461 r10_bio->devs[i].repl_bio = bio;
1462 atomic_inc(&rrdev->nr_pending);
1463 }
1464 }
1465
1466 if (max_sectors < r10_bio->sectors)
1467 r10_bio->sectors = max_sectors;
1468
1469 if (r10_bio->sectors < bio_sectors(bio)) {
1470 struct bio *split = bio_split(bio, r10_bio->sectors,
1471 GFP_NOIO, &conf->bio_split);
1472 bio_chain(split, bio);
1473 allow_barrier(conf);
1474 submit_bio_noacct(bio);
1475 wait_barrier(conf, false);
1476 bio = split;
1477 r10_bio->master_bio = bio;
1478 }
1479
1480 md_account_bio(mddev, &bio);
1481 r10_bio->master_bio = bio;
1482 atomic_set(&r10_bio->remaining, 1);
1483
1484 for (i = 0; i < conf->copies; i++) {
1485 if (r10_bio->devs[i].bio)
1486 raid10_write_one_disk(mddev, r10_bio, bio, false, i);
1487 if (r10_bio->devs[i].repl_bio)
1488 raid10_write_one_disk(mddev, r10_bio, bio, true, i);
1489 }
1490 one_write_done(r10_bio);
1491 }
1492
__make_request(struct mddev * mddev,struct bio * bio,int sectors)1493 static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
1494 {
1495 struct r10conf *conf = mddev->private;
1496 struct r10bio *r10_bio;
1497
1498 r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
1499
1500 r10_bio->master_bio = bio;
1501 r10_bio->sectors = sectors;
1502
1503 r10_bio->mddev = mddev;
1504 r10_bio->sector = bio->bi_iter.bi_sector;
1505 r10_bio->state = 0;
1506 r10_bio->read_slot = -1;
1507 memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) *
1508 conf->geo.raid_disks);
1509
1510 if (bio_data_dir(bio) == READ)
1511 raid10_read_request(mddev, bio, r10_bio, true);
1512 else
1513 raid10_write_request(mddev, bio, r10_bio);
1514 }
1515
raid_end_discard_bio(struct r10bio * r10bio)1516 static void raid_end_discard_bio(struct r10bio *r10bio)
1517 {
1518 struct r10conf *conf = r10bio->mddev->private;
1519 struct r10bio *first_r10bio;
1520
1521 while (atomic_dec_and_test(&r10bio->remaining)) {
1522
1523 allow_barrier(conf);
1524
1525 if (!test_bit(R10BIO_Discard, &r10bio->state)) {
1526 first_r10bio = (struct r10bio *)r10bio->master_bio;
1527 free_r10bio(r10bio);
1528 r10bio = first_r10bio;
1529 } else {
1530 md_write_end(r10bio->mddev);
1531 bio_endio(r10bio->master_bio);
1532 free_r10bio(r10bio);
1533 break;
1534 }
1535 }
1536 }
1537
raid10_end_discard_request(struct bio * bio)1538 static void raid10_end_discard_request(struct bio *bio)
1539 {
1540 struct r10bio *r10_bio = bio->bi_private;
1541 struct r10conf *conf = r10_bio->mddev->private;
1542 struct md_rdev *rdev = NULL;
1543 int dev;
1544 int slot, repl;
1545
1546 /*
1547 * We don't care the return value of discard bio
1548 */
1549 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
1550 set_bit(R10BIO_Uptodate, &r10_bio->state);
1551
1552 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1553 rdev = repl ? conf->mirrors[dev].replacement :
1554 conf->mirrors[dev].rdev;
1555
1556 raid_end_discard_bio(r10_bio);
1557 rdev_dec_pending(rdev, conf->mddev);
1558 }
1559
/*
 * There are some limitations on which discard bios are handled here:
 * 1st, the discard size must be at least stripe_size*2.
 * 2nd, if a reshape is in progress, we use the old way to handle the
 * discard bio.
 */
raid10_handle_discard(struct mddev * mddev,struct bio * bio)1566 static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
1567 {
1568 struct r10conf *conf = mddev->private;
1569 struct geom *geo = &conf->geo;
1570 int far_copies = geo->far_copies;
1571 bool first_copy = true;
1572 struct r10bio *r10_bio, *first_r10bio;
1573 struct bio *split;
1574 int disk;
1575 sector_t chunk;
1576 unsigned int stripe_size;
1577 unsigned int stripe_data_disks;
1578 sector_t split_size;
1579 sector_t bio_start, bio_end;
1580 sector_t first_stripe_index, last_stripe_index;
1581 sector_t start_disk_offset;
1582 unsigned int start_disk_index;
1583 sector_t end_disk_offset;
1584 unsigned int end_disk_index;
1585 unsigned int remainder;
1586
1587 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
1588 return -EAGAIN;
1589
1590 if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) {
1591 bio_wouldblock_error(bio);
1592 return 0;
1593 }
1594 wait_barrier(conf, false);
1595
1596 /*
1597 * Check reshape again to avoid reshape happens after checking
1598 * MD_RECOVERY_RESHAPE and before wait_barrier
1599 */
1600 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
1601 goto out;
1602
1603 if (geo->near_copies)
1604 stripe_data_disks = geo->raid_disks / geo->near_copies +
1605 geo->raid_disks % geo->near_copies;
1606 else
1607 stripe_data_disks = geo->raid_disks;
1608
1609 stripe_size = stripe_data_disks << geo->chunk_shift;
1610
1611 bio_start = bio->bi_iter.bi_sector;
1612 bio_end = bio_end_sector(bio);
1613
1614 /*
1615 * Maybe one discard bio is smaller than strip size or across one
1616 * stripe and discard region is larger than one stripe size. For far
1617 * offset layout, if the discard region is not aligned with stripe
1618 * size, there is hole when we submit discard bio to member disk.
1619 * For simplicity, we only handle discard bio which discard region
1620 * is bigger than stripe_size * 2
1621 */
1622 if (bio_sectors(bio) < stripe_size*2)
1623 goto out;
1624
1625 /*
1626 * Keep bio aligned with strip size.
1627 */
1628 div_u64_rem(bio_start, stripe_size, &remainder);
1629 if (remainder) {
1630 split_size = stripe_size - remainder;
1631 split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split);
1632 bio_chain(split, bio);
1633 allow_barrier(conf);
1634 /* Resend the fist split part */
1635 submit_bio_noacct(split);
1636 wait_barrier(conf, false);
1637 }
1638 div_u64_rem(bio_end, stripe_size, &remainder);
1639 if (remainder) {
1640 split_size = bio_sectors(bio) - remainder;
1641 split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split);
1642 bio_chain(split, bio);
1643 allow_barrier(conf);
1644 /* Resend the second split part */
1645 submit_bio_noacct(bio);
1646 bio = split;
1647 wait_barrier(conf, false);
1648 }
1649
1650 bio_start = bio->bi_iter.bi_sector;
1651 bio_end = bio_end_sector(bio);
1652
1653 /*
1654 * Raid10 uses chunk as the unit to store data. It's similar like raid0.
1655 * One stripe contains the chunks from all member disk (one chunk from
1656 * one disk at the same HBA address). For layout detail, see 'man md 4'
1657 */
1658 chunk = bio_start >> geo->chunk_shift;
1659 chunk *= geo->near_copies;
1660 first_stripe_index = chunk;
1661 start_disk_index = sector_div(first_stripe_index, geo->raid_disks);
1662 if (geo->far_offset)
1663 first_stripe_index *= geo->far_copies;
1664 start_disk_offset = (bio_start & geo->chunk_mask) +
1665 (first_stripe_index << geo->chunk_shift);
1666
1667 chunk = bio_end >> geo->chunk_shift;
1668 chunk *= geo->near_copies;
1669 last_stripe_index = chunk;
1670 end_disk_index = sector_div(last_stripe_index, geo->raid_disks);
1671 if (geo->far_offset)
1672 last_stripe_index *= geo->far_copies;
1673 end_disk_offset = (bio_end & geo->chunk_mask) +
1674 (last_stripe_index << geo->chunk_shift);
1675
1676 retry_discard:
1677 r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
1678 r10_bio->mddev = mddev;
1679 r10_bio->state = 0;
1680 r10_bio->sectors = 0;
1681 memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
1682 wait_blocked_dev(mddev, r10_bio);
1683
1684 /*
1685 * For far layout it needs more than one r10bio to cover all regions.
1686 * Inspired by raid10_sync_request, we can use the first r10bio->master_bio
1687 * to record the discard bio. Other r10bio->master_bio record the first
1688 * r10bio. The first r10bio only release after all other r10bios finish.
1689 * The discard bio returns only first r10bio finishes
1690 */
1691 if (first_copy) {
1692 md_account_bio(mddev, &bio);
1693 r10_bio->master_bio = bio;
1694 set_bit(R10BIO_Discard, &r10_bio->state);
1695 first_copy = false;
1696 first_r10bio = r10_bio;
1697 } else
1698 r10_bio->master_bio = (struct bio *)first_r10bio;
1699
1700 /*
1701 * first select target devices under rcu_lock and
1702 * inc refcount on their rdev. Record them by setting
1703 * bios[x] to bio
1704 */
1705 for (disk = 0; disk < geo->raid_disks; disk++) {
1706 struct md_rdev *rdev, *rrdev;
1707
1708 rdev = conf->mirrors[disk].rdev;
1709 rrdev = conf->mirrors[disk].replacement;
1710 r10_bio->devs[disk].bio = NULL;
1711 r10_bio->devs[disk].repl_bio = NULL;
1712
1713 if (rdev && (test_bit(Faulty, &rdev->flags)))
1714 rdev = NULL;
1715 if (rrdev && (test_bit(Faulty, &rrdev->flags)))
1716 rrdev = NULL;
1717 if (!rdev && !rrdev)
1718 continue;
1719
1720 if (rdev) {
1721 r10_bio->devs[disk].bio = bio;
1722 atomic_inc(&rdev->nr_pending);
1723 }
1724 if (rrdev) {
1725 r10_bio->devs[disk].repl_bio = bio;
1726 atomic_inc(&rrdev->nr_pending);
1727 }
1728 }
1729
1730 atomic_set(&r10_bio->remaining, 1);
1731 for (disk = 0; disk < geo->raid_disks; disk++) {
1732 sector_t dev_start, dev_end;
1733 struct bio *mbio, *rbio = NULL;
1734
1735 /*
1736 * Now start to calculate the start and end address for each disk.
1737 * The space between dev_start and dev_end is the discard region.
1738 *
1739 * For dev_start, it needs to consider three conditions:
1740 * 1st, the disk is before start_disk, you can imagine the disk in
1741 * the next stripe. So the dev_start is the start address of next
1742 * stripe.
1743 * 2st, the disk is after start_disk, it means the disk is at the
1744 * same stripe of first disk
1745 * 3st, the first disk itself, we can use start_disk_offset directly
1746 */
1747 if (disk < start_disk_index)
1748 dev_start = (first_stripe_index + 1) * mddev->chunk_sectors;
1749 else if (disk > start_disk_index)
1750 dev_start = first_stripe_index * mddev->chunk_sectors;
1751 else
1752 dev_start = start_disk_offset;
1753
1754 if (disk < end_disk_index)
1755 dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
1756 else if (disk > end_disk_index)
1757 dev_end = last_stripe_index * mddev->chunk_sectors;
1758 else
1759 dev_end = end_disk_offset;
1760
1761 /*
1762 * It only handles discard bio which size is >= stripe size, so
1763 * dev_end > dev_start all the time.
1764 * It doesn't need to use rcu lock to get rdev here. We already
1765 * add rdev->nr_pending in the first loop.
1766 */
1767 if (r10_bio->devs[disk].bio) {
1768 struct md_rdev *rdev = conf->mirrors[disk].rdev;
1769 mbio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO,
1770 &mddev->bio_set);
1771 mbio->bi_end_io = raid10_end_discard_request;
1772 mbio->bi_private = r10_bio;
1773 r10_bio->devs[disk].bio = mbio;
1774 r10_bio->devs[disk].devnum = disk;
1775 atomic_inc(&r10_bio->remaining);
1776 md_submit_discard_bio(mddev, rdev, mbio,
1777 dev_start + choose_data_offset(r10_bio, rdev),
1778 dev_end - dev_start);
1779 bio_endio(mbio);
1780 }
1781 if (r10_bio->devs[disk].repl_bio) {
1782 struct md_rdev *rrdev = conf->mirrors[disk].replacement;
1783 rbio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO,
1784 &mddev->bio_set);
1785 rbio->bi_end_io = raid10_end_discard_request;
1786 rbio->bi_private = r10_bio;
1787 r10_bio->devs[disk].repl_bio = rbio;
1788 r10_bio->devs[disk].devnum = disk;
1789 atomic_inc(&r10_bio->remaining);
1790 md_submit_discard_bio(mddev, rrdev, rbio,
1791 dev_start + choose_data_offset(r10_bio, rrdev),
1792 dev_end - dev_start);
1793 bio_endio(rbio);
1794 }
1795 }
1796
1797 if (!geo->far_offset && --far_copies) {
1798 first_stripe_index += geo->stride >> geo->chunk_shift;
1799 start_disk_offset += geo->stride;
1800 last_stripe_index += geo->stride >> geo->chunk_shift;
1801 end_disk_offset += geo->stride;
1802 atomic_inc(&first_r10bio->remaining);
1803 raid_end_discard_bio(r10_bio);
1804 wait_barrier(conf, false);
1805 goto retry_discard;
1806 }
1807
1808 raid_end_discard_bio(r10_bio);
1809
1810 return 0;
1811 out:
1812 allow_barrier(conf);
1813 return -EAGAIN;
1814 }
1815
raid10_make_request(struct mddev * mddev,struct bio * bio)1816 static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
1817 {
1818 struct r10conf *conf = mddev->private;
1819 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1820 int chunk_sects = chunk_mask + 1;
1821 int sectors = bio_sectors(bio);
1822
1823 if (unlikely(bio->bi_opf & REQ_PREFLUSH)
1824 && md_flush_request(mddev, bio))
1825 return true;
1826
1827 md_write_start(mddev, bio);
1828
1829 if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
1830 if (!raid10_handle_discard(mddev, bio))
1831 return true;
1832
1833 /*
1834 * If this request crosses a chunk boundary, we need to split
1835 * it.
1836 */
1837 if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
1838 sectors > chunk_sects
1839 && (conf->geo.near_copies < conf->geo.raid_disks
1840 || conf->prev.near_copies <
1841 conf->prev.raid_disks)))
1842 sectors = chunk_sects -
1843 (bio->bi_iter.bi_sector &
1844 (chunk_sects - 1));
1845 __make_request(mddev, bio, sectors);
1846
1847 /* In case raid10d snuck in to freeze_array */
1848 wake_up_barrier(conf);
1849 return true;
1850 }
1851
raid10_status(struct seq_file * seq,struct mddev * mddev)1852 static void raid10_status(struct seq_file *seq, struct mddev *mddev)
1853 {
1854 struct r10conf *conf = mddev->private;
1855 int i;
1856
1857 lockdep_assert_held(&mddev->lock);
1858
1859 if (conf->geo.near_copies < conf->geo.raid_disks)
1860 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1861 if (conf->geo.near_copies > 1)
1862 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1863 if (conf->geo.far_copies > 1) {
1864 if (conf->geo.far_offset)
1865 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1866 else
1867 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1868 if (conf->geo.far_set_size != conf->geo.raid_disks)
1869 seq_printf(seq, " %d devices per set", conf->geo.far_set_size);
1870 }
1871 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1872 conf->geo.raid_disks - mddev->degraded);
1873 for (i = 0; i < conf->geo.raid_disks; i++) {
1874 struct md_rdev *rdev = READ_ONCE(conf->mirrors[i].rdev);
1875
1876 seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
1877 }
1878 seq_printf(seq, "]");
1879 }
1880
/* check if there are enough drives for
 * every block to appear on at least one.
 * Don't consider the device numbered 'ignore'
 * as we might be about to remove it.
 */
_enough(struct r10conf * conf,int previous,int ignore)1886 static int _enough(struct r10conf *conf, int previous, int ignore)
1887 {
1888 int first = 0;
1889 int has_enough = 0;
1890 int disks, ncopies;
1891 if (previous) {
1892 disks = conf->prev.raid_disks;
1893 ncopies = conf->prev.near_copies;
1894 } else {
1895 disks = conf->geo.raid_disks;
1896 ncopies = conf->geo.near_copies;
1897 }
1898
1899 do {
1900 int n = conf->copies;
1901 int cnt = 0;
1902 int this = first;
1903 while (n--) {
1904 struct md_rdev *rdev;
1905 if (this != ignore &&
1906 (rdev = conf->mirrors[this].rdev) &&
1907 test_bit(In_sync, &rdev->flags))
1908 cnt++;
1909 this = (this+1) % disks;
1910 }
1911 if (cnt == 0)
1912 goto out;
1913 first = (first + ncopies) % disks;
1914 } while (first != 0);
1915 has_enough = 1;
1916 out:
1917 return has_enough;
1918 }
1919
/*
 * Return 1 when, ignoring device @ignore, both the current and the previous
 * geometry still have every block on at least one in-sync device, else 0.
 *
 * Both 'prev' and 'geo' must be stable while this runs, which is ensured
 * if ->reconfig_mutex or ->device_lock is held.
 */
static int enough(struct r10conf *conf, int ignore)
{
	if (!_enough(conf, 0, ignore))
		return 0;

	return !!_enough(conf, 1, ignore);
}
1930
1931 /**
1932 * raid10_error() - RAID10 error handler.
1933 * @mddev: affected md device.
1934 * @rdev: member device to fail.
1935 *
1936 * The routine acknowledges &rdev failure and determines new @mddev state.
1937 * If it failed, then:
1938 * - &MD_BROKEN flag is set in &mddev->flags.
1939 * Otherwise, it must be degraded:
1940 * - recovery is interrupted.
1941 * - &mddev->degraded is bumped.
1942 *
1943 * @rdev is marked as &Faulty excluding case when array is failed and
1944 * &mddev->fail_last_dev is off.
1945 */
raid10_error(struct mddev * mddev,struct md_rdev * rdev)1946 static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
1947 {
1948 struct r10conf *conf = mddev->private;
1949 unsigned long flags;
1950
1951 spin_lock_irqsave(&conf->device_lock, flags);
1952
1953 if (test_bit(In_sync, &rdev->flags) && !enough(conf, rdev->raid_disk)) {
1954 set_bit(MD_BROKEN, &mddev->flags);
1955
1956 if (!mddev->fail_last_dev) {
1957 spin_unlock_irqrestore(&conf->device_lock, flags);
1958 return;
1959 }
1960 }
1961 if (test_and_clear_bit(In_sync, &rdev->flags))
1962 mddev->degraded++;
1963
1964 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1965 set_bit(Blocked, &rdev->flags);
1966 set_bit(Faulty, &rdev->flags);
1967 set_mask_bits(&mddev->sb_flags, 0,
1968 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1969 spin_unlock_irqrestore(&conf->device_lock, flags);
1970 pr_crit("md/raid10:%s: Disk failure on %pg, disabling device.\n"
1971 "md/raid10:%s: Operation continuing on %d devices.\n",
1972 mdname(mddev), rdev->bdev,
1973 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1974 }
1975
print_conf(struct r10conf * conf)1976 static void print_conf(struct r10conf *conf)
1977 {
1978 int i;
1979 struct md_rdev *rdev;
1980
1981 pr_debug("RAID10 conf printout:\n");
1982 if (!conf) {
1983 pr_debug("(!conf)\n");
1984 return;
1985 }
1986 pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1987 conf->geo.raid_disks);
1988
1989 lockdep_assert_held(&conf->mddev->reconfig_mutex);
1990 for (i = 0; i < conf->geo.raid_disks; i++) {
1991 rdev = conf->mirrors[i].rdev;
1992 if (rdev)
1993 pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n",
1994 i, !test_bit(In_sync, &rdev->flags),
1995 !test_bit(Faulty, &rdev->flags),
1996 rdev->bdev);
1997 }
1998 }
1999
close_sync(struct r10conf * conf)2000 static void close_sync(struct r10conf *conf)
2001 {
2002 wait_barrier(conf, false);
2003 allow_barrier(conf);
2004
2005 mempool_exit(&conf->r10buf_pool);
2006 }
2007
raid10_spare_active(struct mddev * mddev)2008 static int raid10_spare_active(struct mddev *mddev)
2009 {
2010 int i;
2011 struct r10conf *conf = mddev->private;
2012 struct raid10_info *tmp;
2013 int count = 0;
2014 unsigned long flags;
2015
2016 /*
2017 * Find all non-in_sync disks within the RAID10 configuration
2018 * and mark them in_sync
2019 */
2020 for (i = 0; i < conf->geo.raid_disks; i++) {
2021 tmp = conf->mirrors + i;
2022 if (tmp->replacement
2023 && tmp->replacement->recovery_offset == MaxSector
2024 && !test_bit(Faulty, &tmp->replacement->flags)
2025 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
2026 /* Replacement has just become active */
2027 if (!tmp->rdev
2028 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
2029 count++;
2030 if (tmp->rdev) {
2031 /* Replaced device not technically faulty,
2032 * but we need to be sure it gets removed
2033 * and never re-added.
2034 */
2035 set_bit(Faulty, &tmp->rdev->flags);
2036 sysfs_notify_dirent_safe(
2037 tmp->rdev->sysfs_state);
2038 }
2039 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
2040 } else if (tmp->rdev
2041 && tmp->rdev->recovery_offset == MaxSector
2042 && !test_bit(Faulty, &tmp->rdev->flags)
2043 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
2044 count++;
2045 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
2046 }
2047 }
2048 spin_lock_irqsave(&conf->device_lock, flags);
2049 mddev->degraded -= count;
2050 spin_unlock_irqrestore(&conf->device_lock, flags);
2051
2052 print_conf(conf);
2053 return count;
2054 }
2055
/*
 * Hot-add @rdev to the array.
 *
 * The device is placed in the first empty mirror slot in [first, last];
 * if no empty slot is found but some slot's device has WantReplacement
 * set and no replacement yet, @rdev is installed as that slot's
 * replacement instead.
 *
 * Returns 0 on success, -EBUSY while the array is not in sync
 * (recovery_cp < MaxSector), -EINVAL when the _enough() check fails for a
 * device without a saved slot, -EEXIST when no slot could be used, or the
 * error returned by mddev_stack_new_rdev().
 */
static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r10conf *conf = mddev->private;
	int err = -EEXIST;
	int mirror, repl_slot = -1;
	int first = 0;
	int last = conf->geo.raid_disks - 1;
	struct raid10_info *p;

	if (mddev->recovery_cp < MaxSector)
		/* only hot-add to in-sync arrays, as recovery is
		 * very different from resync
		 */
		return -EBUSY;
	if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
		return -EINVAL;

	/* A preset raid_disk restricts the search to exactly that slot. */
	if (rdev->raid_disk >= 0)
		first = last = rdev->raid_disk;

	/* Start at the device's previous slot when it is in range and empty. */
	if (rdev->saved_raid_disk >= first &&
	    rdev->saved_raid_disk < conf->geo.raid_disks &&
	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
		mirror = rdev->saved_raid_disk;
	else
		mirror = first;
	for ( ; mirror <= last ; mirror++) {
		p = &conf->mirrors[mirror];
		/* Skip slots on which recovery is currently disabled. */
		if (p->recovery_disabled == mddev->recovery_disabled)
			continue;
		if (p->rdev) {
			/* Occupied: remember the first slot that wants a replacement. */
			if (test_bit(WantReplacement, &p->rdev->flags) &&
			    p->replacement == NULL && repl_slot < 0)
				repl_slot = mirror;
			continue;
		}

		err = mddev_stack_new_rdev(mddev, rdev);
		if (err)
			return err;
		p->head_position = 0;
		/* Guarantee recovery_disabled differs from mddev's value. */
		p->recovery_disabled = mddev->recovery_disabled - 1;
		rdev->raid_disk = mirror;
		err = 0;
		/* A device landing in a different slot than before needs a full sync. */
		if (rdev->saved_raid_disk != mirror)
			conf->fullsync = 1;
		WRITE_ONCE(p->rdev, rdev);
		break;
	}

	/* No empty slot was used: fall back to acting as a replacement. */
	if (err && repl_slot >= 0) {
		p = &conf->mirrors[repl_slot];
		clear_bit(In_sync, &rdev->flags);
		set_bit(Replacement, &rdev->flags);
		rdev->raid_disk = repl_slot;
		err = mddev_stack_new_rdev(mddev, rdev);
		if (err)
			return err;
		conf->fullsync = 1;
		WRITE_ONCE(p->replacement, rdev);
	}

	print_conf(conf);
	return err;
}
2121
raid10_remove_disk(struct mddev * mddev,struct md_rdev * rdev)2122 static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
2123 {
2124 struct r10conf *conf = mddev->private;
2125 int err = 0;
2126 int number = rdev->raid_disk;
2127 struct md_rdev **rdevp;
2128 struct raid10_info *p;
2129
2130 print_conf(conf);
2131 if (unlikely(number >= mddev->raid_disks))
2132 return 0;
2133 p = conf->mirrors + number;
2134 if (rdev == p->rdev)
2135 rdevp = &p->rdev;
2136 else if (rdev == p->replacement)
2137 rdevp = &p->replacement;
2138 else
2139 return 0;
2140
2141 if (test_bit(In_sync, &rdev->flags) ||
2142 atomic_read(&rdev->nr_pending)) {
2143 err = -EBUSY;
2144 goto abort;
2145 }
2146 /* Only remove non-faulty devices if recovery
2147 * is not possible.
2148 */
2149 if (!test_bit(Faulty, &rdev->flags) &&
2150 mddev->recovery_disabled != p->recovery_disabled &&
2151 (!p->replacement || p->replacement == rdev) &&
2152 number < conf->geo.raid_disks &&
2153 enough(conf, -1)) {
2154 err = -EBUSY;
2155 goto abort;
2156 }
2157 WRITE_ONCE(*rdevp, NULL);
2158 if (p->replacement) {
2159 /* We must have just cleared 'rdev' */
2160 WRITE_ONCE(p->rdev, p->replacement);
2161 clear_bit(Replacement, &p->replacement->flags);
2162 WRITE_ONCE(p->replacement, NULL);
2163 }
2164
2165 clear_bit(WantReplacement, &rdev->flags);
2166 err = md_integrity_register(mddev);
2167
2168 abort:
2169
2170 print_conf(conf);
2171 return err;
2172 }
2173
__end_sync_read(struct r10bio * r10_bio,struct bio * bio,int d)2174 static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
2175 {
2176 struct r10conf *conf = r10_bio->mddev->private;
2177
2178 if (!bio->bi_status)
2179 set_bit(R10BIO_Uptodate, &r10_bio->state);
2180 else
2181 /* The write handler will notice the lack of
2182 * R10BIO_Uptodate and record any errors etc
2183 */
2184 atomic_add(r10_bio->sectors,
2185 &conf->mirrors[d].rdev->corrected_errors);
2186
2187 /* for reconstruct, we always reschedule after a read.
2188 * for resync, only after all reads
2189 */
2190 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
2191 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
2192 atomic_dec_and_test(&r10_bio->remaining)) {
2193 /* we have read all the blocks,
2194 * do the comparison in process context in raid10d
2195 */
2196 reschedule_retry(r10_bio);
2197 }
2198 }
2199
end_sync_read(struct bio * bio)2200 static void end_sync_read(struct bio *bio)
2201 {
2202 struct r10bio *r10_bio = get_resync_r10bio(bio);
2203 struct r10conf *conf = r10_bio->mddev->private;
2204 int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
2205
2206 __end_sync_read(r10_bio, bio, d);
2207 }
2208
end_reshape_read(struct bio * bio)2209 static void end_reshape_read(struct bio *bio)
2210 {
2211 /* reshape read bio isn't allocated from r10buf_pool */
2212 struct r10bio *r10_bio = bio->bi_private;
2213
2214 __end_sync_read(r10_bio, bio, r10_bio->read_slot);
2215 }
2216
end_sync_request(struct r10bio * r10_bio)2217 static void end_sync_request(struct r10bio *r10_bio)
2218 {
2219 struct mddev *mddev = r10_bio->mddev;
2220
2221 while (atomic_dec_and_test(&r10_bio->remaining)) {
2222 if (r10_bio->master_bio == NULL) {
2223 /* the primary of several recovery bios */
2224 sector_t s = r10_bio->sectors;
2225 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2226 test_bit(R10BIO_WriteError, &r10_bio->state))
2227 reschedule_retry(r10_bio);
2228 else
2229 put_buf(r10_bio);
2230 md_done_sync(mddev, s, 1);
2231 break;
2232 } else {
2233 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
2234 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2235 test_bit(R10BIO_WriteError, &r10_bio->state))
2236 reschedule_retry(r10_bio);
2237 else
2238 put_buf(r10_bio);
2239 r10_bio = r10_bio2;
2240 }
2241 }
2242 }
2243
end_sync_write(struct bio * bio)2244 static void end_sync_write(struct bio *bio)
2245 {
2246 struct r10bio *r10_bio = get_resync_r10bio(bio);
2247 struct mddev *mddev = r10_bio->mddev;
2248 struct r10conf *conf = mddev->private;
2249 int d;
2250 int slot;
2251 int repl;
2252 struct md_rdev *rdev = NULL;
2253
2254 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
2255 if (repl)
2256 rdev = conf->mirrors[d].replacement;
2257 else
2258 rdev = conf->mirrors[d].rdev;
2259
2260 if (bio->bi_status) {
2261 if (repl)
2262 md_error(mddev, rdev);
2263 else {
2264 set_bit(WriteErrorSeen, &rdev->flags);
2265 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2266 set_bit(MD_RECOVERY_NEEDED,
2267 &rdev->mddev->recovery);
2268 set_bit(R10BIO_WriteError, &r10_bio->state);
2269 }
2270 } else if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
2271 r10_bio->sectors)) {
2272 set_bit(R10BIO_MadeGood, &r10_bio->state);
2273 }
2274
2275 rdev_dec_pending(rdev, mddev);
2276
2277 end_sync_request(r10_bio);
2278 }
2279
/*
 * Note: sync and recover are handled very differently for raid10
 * This code is for resync.
 * For resync, we read through virtual addresses and read all blocks.
 * If there is any error, we schedule a write.  The lowest numbered
 * drive is authoritative.
 * However requests come for physical address, so we need to map.
 * For every physical address there are raid_disks/copies virtual addresses,
 * which is always at least one, but is not necessarily an integer.
 * This means that a physical address can span multiple chunks, so we may
 * have to submit multiple io requests for a single sync request.
 */
/*
 * We check if all blocks are in-sync and only write to blocks that
 * aren't in sync
 */
/*
 * Write phase of a resync r10_bio (called from raid10d once the reads have
 * completed).  The first copy whose read succeeded is used as the reference;
 * every other copy that was read is compared against it and rewritten when
 * it differs or could not be read.  Prepared replacement bios are submitted
 * afterwards.  r10_bio->remaining counts the writes in flight plus one
 * reference held by this function.
 */
static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{
	struct r10conf *conf = mddev->private;
	int i, first;
	struct bio *tbio, *fbio;
	int vcnt;
	struct page **tpages, **fpages;

	/* Hold one reference so completions cannot finish the r10_bio early. */
	atomic_set(&r10_bio->remaining, 1);

	/* find the first device with a block */
	for (i=0; i<conf->copies; i++)
		if (!r10_bio->devs[i].bio->bi_status)
			break;

	/* No copy could be read: nothing to compare or write. */
	if (i == conf->copies)
		goto done;

	first = i;
	fbio = r10_bio->devs[i].bio;
	fbio->bi_iter.bi_size = r10_bio->sectors << 9;
	fbio->bi_iter.bi_idx = 0;
	fpages = get_resync_pages(fbio)->pages;

	/* Number of pages needed to cover r10_bio->sectors, rounded up. */
	vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
	/* now find blocks with errors */
	for (i=0 ; i < conf->copies ; i++) {
		int j, d;
		struct md_rdev *rdev;
		struct resync_pages *rp;

		tbio = r10_bio->devs[i].bio;

		/* Only slots that were actually read take part. */
		if (tbio->bi_end_io != end_sync_read)
			continue;
		if (i == first)
			continue;

		tpages = get_resync_pages(tbio)->pages;
		d = r10_bio->devs[i].devnum;
		rdev = conf->mirrors[d].rdev;
		if (!r10_bio->devs[i].bio->bi_status) {
			/* We know that the bi_io_vec layout is the same for
			 * both 'first' and 'i', so we just compare them.
			 * All vec entries are PAGE_SIZE;
			 */
			int sectors = r10_bio->sectors;
			for (j = 0; j < vcnt; j++) {
				int len = PAGE_SIZE;
				/* The final page may be only partly used. */
				if (sectors < (len / 512))
					len = sectors * 512;
				if (memcmp(page_address(fpages[j]),
					   page_address(tpages[j]),
					   len))
					break;
				sectors -= len/512;
			}
			/* Every page matched: this copy is already in sync. */
			if (j == vcnt)
				continue;
			atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
			if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
				/* Don't fix anything. */
				continue;
		} else if (test_bit(FailFast, &rdev->flags)) {
			/* Just give up on this device */
			md_error(rdev->mddev, rdev);
			continue;
		}
		/* Ok, we need to write this bio, either to correct an
		 * inconsistency or to correct an unreadable block.
		 * First we need to fixup bv_offset, bv_len and
		 * bi_vecs, as the read request might have corrupted these
		 */
		rp = get_resync_pages(tbio);
		bio_reset(tbio, conf->mirrors[d].rdev->bdev, REQ_OP_WRITE);

		md_bio_reset_resync_pages(tbio, rp, fbio->bi_iter.bi_size);

		/* bio_reset() cleared the bio; restore the private linkage. */
		rp->raid_bio = r10_bio;
		tbio->bi_private = rp;
		tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
		tbio->bi_end_io = end_sync_write;

		bio_copy_data(tbio, fbio);

		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
		atomic_inc(&r10_bio->remaining);
		md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));

		if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
			tbio->bi_opf |= MD_FAILFAST;
		tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
		submit_bio_noacct(tbio);
	}

	/* Now write out to any replacement devices
	 * that are active
	 */
	for (i = 0; i < conf->copies; i++) {
		int d;

		tbio = r10_bio->devs[i].repl_bio;
		if (!tbio || !tbio->bi_end_io)
			continue;
		/*
		 * Copy the reference data unless this slot's primary bio is
		 * being rewritten above or is the reference bio itself.
		 */
		if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
		    && r10_bio->devs[i].bio != fbio)
			bio_copy_data(tbio, fbio);
		d = r10_bio->devs[i].devnum;
		atomic_inc(&r10_bio->remaining);
		md_sync_acct(conf->mirrors[d].replacement->bdev,
			     bio_sectors(tbio));
		submit_bio_noacct(tbio);
	}

done:
	/* Drop our own reference; finish here if no write is outstanding. */
	if (atomic_dec_and_test(&r10_bio->remaining)) {
		md_done_sync(mddev, r10_bio->sectors, 1);
		put_buf(r10_bio);
	}
}
2416
/*
 * Now for the recovery code.
 * Recovery happens across physical sectors.
 * We recover all non-in_sync drives by finding the virtual address of
 * each, and then choose a working drive that also has that virt address.
 * There is a separate r10_bio for each non-in_sync drive.
 * Only the first two slots are in use. The first for reading,
 * the second for writing.
 *
 */
/*
 * Retry a failed recovery read page by page.  Slot 0 of @r10_bio names the
 * device being read (dr), slot 1 the device being written (dw).
 */
static void fix_recovery_read_error(struct r10bio *r10_bio)
{
	/* We got a read error during recovery.
	 * We repeat the read in smaller page-sized sections.
	 * If a read succeeds, write it to the new device or record
	 * a bad block if we cannot.
	 * If a read fails, record a bad block on both old and
	 * new devices.
	 */
	struct mddev *mddev = r10_bio->mddev;
	struct r10conf *conf = mddev->private;
	struct bio *bio = r10_bio->devs[0].bio;
	sector_t sect = 0;
	int sectors = r10_bio->sectors;
	int idx = 0;
	int dr = r10_bio->devs[0].devnum;
	int dw = r10_bio->devs[1].devnum;
	struct page **pages = get_resync_pages(bio)->pages;

	while (sectors) {
		int s = sectors;
		struct md_rdev *rdev;
		sector_t addr;
		int ok;

		/* Handle at most one page worth of sectors per iteration. */
		if (s > (PAGE_SIZE>>9))
			s = PAGE_SIZE >> 9;

		rdev = conf->mirrors[dr].rdev;
		addr = r10_bio->devs[0].addr + sect;
		ok = sync_page_io(rdev,
				  addr,
				  s << 9,
				  pages[idx],
				  REQ_OP_READ, false);
		if (ok) {
			/*
			 * From here on rdev/addr describe the destination, so
			 * a failure below is attributed to the write target.
			 */
			rdev = conf->mirrors[dw].rdev;
			addr = r10_bio->devs[1].addr + sect;
			ok = sync_page_io(rdev,
					  addr,
					  s << 9,
					  pages[idx],
					  REQ_OP_WRITE, false);
			if (!ok) {
				set_bit(WriteErrorSeen, &rdev->flags);
				if (!test_and_set_bit(WantReplacement,
						      &rdev->flags))
					set_bit(MD_RECOVERY_NEEDED,
						&rdev->mddev->recovery);
			}
		}
		if (!ok) {
			/* We don't worry if we cannot set a bad block -
			 * it really is bad so there is no loss in not
			 * recording it yet
			 */
			rdev_set_badblocks(rdev, addr, s, 0);

			/* True only when it was the read (source) that failed. */
			if (rdev != conf->mirrors[dw].rdev) {
				/* need bad block on destination too */
				struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
				addr = r10_bio->devs[1].addr + sect;
				ok = rdev_set_badblocks(rdev2, addr, s, 0);
				if (!ok) {
					/* just abort the recovery */
					pr_notice("md/raid10:%s: recovery aborted due to read error\n",
						  mdname(mddev));

					conf->mirrors[dw].recovery_disabled
						= mddev->recovery_disabled;
					set_bit(MD_RECOVERY_INTR,
						&mddev->recovery);
					break;
				}
			}
		}

		sectors -= s;
		sect += s;
		idx++;
	}
}
2509
recovery_request_write(struct mddev * mddev,struct r10bio * r10_bio)2510 static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2511 {
2512 struct r10conf *conf = mddev->private;
2513 int d;
2514 struct bio *wbio = r10_bio->devs[1].bio;
2515 struct bio *wbio2 = r10_bio->devs[1].repl_bio;
2516
2517 /* Need to test wbio2->bi_end_io before we call
2518 * submit_bio_noacct as if the former is NULL,
2519 * the latter is free to free wbio2.
2520 */
2521 if (wbio2 && !wbio2->bi_end_io)
2522 wbio2 = NULL;
2523
2524 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2525 fix_recovery_read_error(r10_bio);
2526 if (wbio->bi_end_io)
2527 end_sync_request(r10_bio);
2528 if (wbio2)
2529 end_sync_request(r10_bio);
2530 return;
2531 }
2532
2533 /*
2534 * share the pages with the first bio
2535 * and submit the write request
2536 */
2537 d = r10_bio->devs[1].devnum;
2538 if (wbio->bi_end_io) {
2539 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2540 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
2541 submit_bio_noacct(wbio);
2542 }
2543 if (wbio2) {
2544 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2545 md_sync_acct(conf->mirrors[d].replacement->bdev,
2546 bio_sectors(wbio2));
2547 submit_bio_noacct(wbio2);
2548 }
2549 }
2550
/*
 * Synchronous single-page I/O helper used while correcting read errors.
 *
 * Returns -1 when the I/O is not attempted because the range overlaps a
 * known bad block (always for reads; for writes only once WriteErrorSeen is
 * set on the device), 1 when the I/O succeeded and 0 when it failed.  On
 * failure a bad block is recorded, or the device is failed with md_error()
 * if that is not possible.
 */
static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
			    int sectors, struct page *page, enum req_op op)
{
	int known_bad = rdev_has_badblock(rdev, sector, sectors);

	if (known_bad &&
	    (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags)))
		return -1;

	if (!sync_page_io(rdev, sector, sectors << 9, page, op, false)) {
		if (op == REQ_OP_WRITE) {
			set_bit(WriteErrorSeen, &rdev->flags);
			if (!test_and_set_bit(WantReplacement, &rdev->flags))
				set_bit(MD_RECOVERY_NEEDED,
					&rdev->mddev->recovery);
		}
		/* need to record an error - either for the block or the device */
		if (!rdev_set_badblocks(rdev, sector, sectors, 0))
			md_error(rdev->mddev, rdev);
		return 0;
	}

	/* success */
	return 1;
}
2571
/*
 * This is a kernel thread which:
 *
 *	1.	Retries failed read operations on working mirrors.
 *	2.	Updates the raid superblock when problems are encountered.
 *	3.	Performs writes following reads for array synchronising.
 */
2579
/*
 * Try to repair a failed read of @r10_bio, at most one page at a time:
 * find a copy that can be read into conf->tmppage, write that data back to
 * the other usable copies (walking backwards from the good slot to the slot
 * that failed) and then read those copies back to verify the fix.  If no
 * copy can be read, a bad block is recorded on the device that failed.
 */
static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
{
	int sect = 0; /* Offset from r10_bio->sector */
	int sectors = r10_bio->sectors, slot = r10_bio->read_slot;
	struct md_rdev *rdev;
	int d = r10_bio->devs[slot].devnum;

	/* still own a reference to this rdev, so it cannot
	 * have been cleared recently.
	 */
	rdev = conf->mirrors[d].rdev;

	if (test_bit(Faulty, &rdev->flags))
		/* drive has already been failed, just ignore any
		   more fix_read_error() attempts */
		return;

	/* Too many read errors: stop using this slot for the request. */
	if (exceed_read_errors(mddev, rdev)) {
		r10_bio->devs[slot].bio = IO_BLOCKED;
		return;
	}

	while(sectors) {
		int s = sectors;
		int sl = slot;
		int success = 0;
		int start;

		/* Work on at most one page worth of sectors per pass. */
		if (s > (PAGE_SIZE>>9))
			s = PAGE_SIZE >> 9;

		/* Search the copies, starting at the failed slot, for a readable one. */
		do {
			d = r10_bio->devs[sl].devnum;
			rdev = conf->mirrors[d].rdev;
			if (rdev &&
			    test_bit(In_sync, &rdev->flags) &&
			    !test_bit(Faulty, &rdev->flags) &&
			    rdev_has_badblock(rdev,
					      r10_bio->devs[sl].addr + sect,
					      s) == 0) {
				atomic_inc(&rdev->nr_pending);
				success = sync_page_io(rdev,
						       r10_bio->devs[sl].addr +
						       sect,
						       s<<9,
						       conf->tmppage,
						       REQ_OP_READ, false);
				rdev_dec_pending(rdev, mddev);
				if (success)
					break;
			}
			sl++;
			if (sl == conf->copies)
				sl = 0;
		} while (sl != slot);

		if (!success) {
			/* Cannot read from anywhere, just mark the block
			 * as bad on the first device to discourage future
			 * reads.
			 */
			int dn = r10_bio->devs[slot].devnum;
			rdev = conf->mirrors[dn].rdev;

			if (!rdev_set_badblocks(
				    rdev,
				    r10_bio->devs[slot].addr
				    + sect,
				    s, 0)) {
				md_error(mddev, rdev);
				r10_bio->devs[slot].bio
					= IO_BLOCKED;
			}
			break;
		}

		/* Remember the good slot so the read-back pass can start there too. */
		start = sl;
		/* write it back and re-read */
		while (sl != slot) {
			if (sl==0)
				sl = conf->copies;
			sl--;
			d = r10_bio->devs[sl].devnum;
			rdev = conf->mirrors[d].rdev;
			if (!rdev ||
			    test_bit(Faulty, &rdev->flags) ||
			    !test_bit(In_sync, &rdev->flags))
				continue;

			atomic_inc(&rdev->nr_pending);
			if (r10_sync_page_io(rdev,
					     r10_bio->devs[sl].addr +
					     sect,
					     s, conf->tmppage, REQ_OP_WRITE)
			    == 0) {
				/* Well, this device is dead */
				pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %pg)\n",
					  mdname(mddev), s,
					  (unsigned long long)(
						  sect +
						  choose_data_offset(r10_bio,
								     rdev)),
					  rdev->bdev);
				pr_notice("md/raid10:%s: %pg: failing drive\n",
					  mdname(mddev),
					  rdev->bdev);
			}
			rdev_dec_pending(rdev, mddev);
		}
		/* Second pass over the same slots: verify by reading back. */
		sl = start;
		while (sl != slot) {
			if (sl==0)
				sl = conf->copies;
			sl--;
			d = r10_bio->devs[sl].devnum;
			rdev = conf->mirrors[d].rdev;
			if (!rdev ||
			    test_bit(Faulty, &rdev->flags) ||
			    !test_bit(In_sync, &rdev->flags))
				continue;

			atomic_inc(&rdev->nr_pending);
			/* A return of -1 (I/O not attempted) is silently ignored. */
			switch (r10_sync_page_io(rdev,
					     r10_bio->devs[sl].addr +
					     sect,
					     s, conf->tmppage, REQ_OP_READ)) {
			case 0:
				/* Well, this device is dead */
				pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %pg)\n",
				       mdname(mddev), s,
				       (unsigned long long)(
					       sect +
					       choose_data_offset(r10_bio, rdev)),
				       rdev->bdev);
				pr_notice("md/raid10:%s: %pg: failing drive\n",
				       mdname(mddev),
				       rdev->bdev);
				break;
			case 1:
				pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %pg)\n",
				       mdname(mddev), s,
				       (unsigned long long)(
					       sect +
					       choose_data_offset(r10_bio, rdev)),
				       rdev->bdev);
				atomic_add(s, &rdev->corrected_errors);
			}

			rdev_dec_pending(rdev, mddev);
		}

		sectors -= s;
		sect += s;
	}
}
2735
narrow_write_error(struct r10bio * r10_bio,int i)2736 static int narrow_write_error(struct r10bio *r10_bio, int i)
2737 {
2738 struct bio *bio = r10_bio->master_bio;
2739 struct mddev *mddev = r10_bio->mddev;
2740 struct r10conf *conf = mddev->private;
2741 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2742 /* bio has the data to be written to slot 'i' where
2743 * we just recently had a write error.
2744 * We repeatedly clone the bio and trim down to one block,
2745 * then try the write. Where the write fails we record
2746 * a bad block.
2747 * It is conceivable that the bio doesn't exactly align with
2748 * blocks. We must handle this.
2749 *
2750 * We currently own a reference to the rdev.
2751 */
2752
2753 int block_sectors;
2754 sector_t sector;
2755 int sectors;
2756 int sect_to_write = r10_bio->sectors;
2757 int ok = 1;
2758
2759 if (rdev->badblocks.shift < 0)
2760 return 0;
2761
2762 block_sectors = roundup(1 << rdev->badblocks.shift,
2763 bdev_logical_block_size(rdev->bdev) >> 9);
2764 sector = r10_bio->sector;
2765 sectors = ((r10_bio->sector + block_sectors)
2766 & ~(sector_t)(block_sectors - 1))
2767 - sector;
2768
2769 while (sect_to_write) {
2770 struct bio *wbio;
2771 sector_t wsector;
2772 if (sectors > sect_to_write)
2773 sectors = sect_to_write;
2774 /* Write at 'sector' for 'sectors' */
2775 wbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO,
2776 &mddev->bio_set);
2777 bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
2778 wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
2779 wbio->bi_iter.bi_sector = wsector +
2780 choose_data_offset(r10_bio, rdev);
2781 wbio->bi_opf = REQ_OP_WRITE;
2782
2783 if (submit_bio_wait(wbio) < 0)
2784 /* Failure! */
2785 ok = rdev_set_badblocks(rdev, wsector,
2786 sectors, 0)
2787 && ok;
2788
2789 bio_put(wbio);
2790 sect_to_write -= sectors;
2791 sector += sectors;
2792 sectors = block_sectors;
2793 }
2794 return ok;
2795 }
2796
handle_read_error(struct mddev * mddev,struct r10bio * r10_bio)2797 static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2798 {
2799 int slot = r10_bio->read_slot;
2800 struct bio *bio;
2801 struct r10conf *conf = mddev->private;
2802 struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2803
2804 /* we got a read error. Maybe the drive is bad. Maybe just
2805 * the block and we can fix it.
2806 * We freeze all other IO, and try reading the block from
2807 * other devices. When we find one, we re-write
2808 * and check it that fixes the read error.
2809 * This is all done synchronously while the array is
2810 * frozen.
2811 */
2812 bio = r10_bio->devs[slot].bio;
2813 bio_put(bio);
2814 r10_bio->devs[slot].bio = NULL;
2815
2816 if (mddev->ro)
2817 r10_bio->devs[slot].bio = IO_BLOCKED;
2818 else if (!test_bit(FailFast, &rdev->flags)) {
2819 freeze_array(conf, 1);
2820 fix_read_error(conf, mddev, r10_bio);
2821 unfreeze_array(conf);
2822 } else
2823 md_error(mddev, rdev);
2824
2825 rdev_dec_pending(rdev, mddev);
2826 r10_bio->state = 0;
2827 raid10_read_request(mddev, r10_bio->master_bio, r10_bio, false);
2828 /*
2829 * allow_barrier after re-submit to ensure no sync io
2830 * can be issued while regular io pending.
2831 */
2832 allow_barrier(conf);
2833 }
2834
/*
 * Post-process an r10_bio whose writes set R10BIO_MadeGood and/or
 * R10BIO_WriteError.  Two cases are distinguished by the r10_bio state:
 * sync/recovery r10_bios (released with put_buf() at the end) and normal
 * write r10_bios (completed with raid_end_bio_io(), possibly deferred to
 * raid10d through conf->bio_end_io_list).
 */
static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
{
	/* Some sort of write request has finished and it
	 * succeeded in writing where we thought there was a
	 * bad block.  So forget the bad block.
	 * Or possibly if failed and we need to record
	 * a bad block.
	 */
	int m;
	struct md_rdev *rdev;

	if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
	    test_bit(R10BIO_IsRecover, &r10_bio->state)) {
		for (m = 0; m < conf->copies; m++) {
			int dev = r10_bio->devs[m].devnum;
			rdev = conf->mirrors[dev].rdev;
			/*
			 * NOTE(review): this continue also skips the
			 * replacement handling below for the same slot -
			 * confirm that is intended when only repl_bio was
			 * in use.
			 */
			if (r10_bio->devs[m].bio == NULL ||
				r10_bio->devs[m].bio->bi_end_io == NULL)
				continue;
			/* Success clears the bad-block record, failure adds one. */
			if (!r10_bio->devs[m].bio->bi_status) {
				rdev_clear_badblocks(
					rdev,
					r10_bio->devs[m].addr,
					r10_bio->sectors, 0);
			} else {
				if (!rdev_set_badblocks(
					    rdev,
					    r10_bio->devs[m].addr,
					    r10_bio->sectors, 0))
					md_error(conf->mddev, rdev);
			}
			/* Same treatment for the slot's replacement device. */
			rdev = conf->mirrors[dev].replacement;
			if (r10_bio->devs[m].repl_bio == NULL ||
				r10_bio->devs[m].repl_bio->bi_end_io == NULL)
				continue;

			if (!r10_bio->devs[m].repl_bio->bi_status) {
				rdev_clear_badblocks(
					rdev,
					r10_bio->devs[m].addr,
					r10_bio->sectors, 0);
			} else {
				if (!rdev_set_badblocks(
					    rdev,
					    r10_bio->devs[m].addr,
					    r10_bio->sectors, 0))
					md_error(conf->mddev, rdev);
			}
		}
		put_buf(r10_bio);
	} else {
		bool fail = false;
		for (m = 0; m < conf->copies; m++) {
			int dev = r10_bio->devs[m].devnum;
			struct bio *bio = r10_bio->devs[m].bio;
			rdev = conf->mirrors[dev].rdev;
			if (bio == IO_MADE_GOOD) {
				/* Write landed on a recorded bad block: forget it. */
				rdev_clear_badblocks(
					rdev,
					r10_bio->devs[m].addr,
					r10_bio->sectors, 0);
				rdev_dec_pending(rdev, conf->mddev);
			} else if (bio != NULL && bio->bi_status) {
				/* Retry in small pieces; fail the device if that is no use. */
				fail = true;
				if (!narrow_write_error(r10_bio, m))
					md_error(conf->mddev, rdev);
				rdev_dec_pending(rdev, conf->mddev);
			}
			bio = r10_bio->devs[m].repl_bio;
			rdev = conf->mirrors[dev].replacement;
			if (rdev && bio == IO_MADE_GOOD) {
				rdev_clear_badblocks(
					rdev,
					r10_bio->devs[m].addr,
					r10_bio->sectors, 0);
				rdev_dec_pending(rdev, conf->mddev);
			}
		}
		if (fail) {
			/* Hand the r10_bio to raid10d, which completes it later. */
			spin_lock_irq(&conf->device_lock);
			list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
			conf->nr_queued++;
			spin_unlock_irq(&conf->device_lock);
			/*
			 * In case freeze_array() is waiting for condition
			 * nr_pending == nr_queued + extra to be true.
			 */
			wake_up(&conf->wait_barrier);
			md_wakeup_thread(conf->mddev->thread);
		} else {
			if (test_bit(R10BIO_WriteError,
				     &r10_bio->state))
				close_write(r10_bio);
			raid_end_bio_io(r10_bio);
		}
	}
}
2932
raid10d(struct md_thread * thread)2933 static void raid10d(struct md_thread *thread)
2934 {
2935 struct mddev *mddev = thread->mddev;
2936 struct r10bio *r10_bio;
2937 unsigned long flags;
2938 struct r10conf *conf = mddev->private;
2939 struct list_head *head = &conf->retry_list;
2940 struct blk_plug plug;
2941
2942 md_check_recovery(mddev);
2943
2944 if (!list_empty_careful(&conf->bio_end_io_list) &&
2945 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2946 LIST_HEAD(tmp);
2947 spin_lock_irqsave(&conf->device_lock, flags);
2948 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2949 while (!list_empty(&conf->bio_end_io_list)) {
2950 list_move(conf->bio_end_io_list.prev, &tmp);
2951 conf->nr_queued--;
2952 }
2953 }
2954 spin_unlock_irqrestore(&conf->device_lock, flags);
2955 while (!list_empty(&tmp)) {
2956 r10_bio = list_first_entry(&tmp, struct r10bio,
2957 retry_list);
2958 list_del(&r10_bio->retry_list);
2959
2960 if (test_bit(R10BIO_WriteError,
2961 &r10_bio->state))
2962 close_write(r10_bio);
2963 raid_end_bio_io(r10_bio);
2964 }
2965 }
2966
2967 blk_start_plug(&plug);
2968 for (;;) {
2969
2970 flush_pending_writes(conf);
2971
2972 spin_lock_irqsave(&conf->device_lock, flags);
2973 if (list_empty(head)) {
2974 spin_unlock_irqrestore(&conf->device_lock, flags);
2975 break;
2976 }
2977 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
2978 list_del(head->prev);
2979 conf->nr_queued--;
2980 spin_unlock_irqrestore(&conf->device_lock, flags);
2981
2982 mddev = r10_bio->mddev;
2983 conf = mddev->private;
2984 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2985 test_bit(R10BIO_WriteError, &r10_bio->state))
2986 handle_write_completed(conf, r10_bio);
2987 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2988 reshape_request_write(mddev, r10_bio);
2989 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2990 sync_request_write(mddev, r10_bio);
2991 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2992 recovery_request_write(mddev, r10_bio);
2993 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2994 handle_read_error(mddev, r10_bio);
2995 else
2996 WARN_ON_ONCE(1);
2997
2998 cond_resched();
2999 if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
3000 md_check_recovery(mddev);
3001 }
3002 blk_finish_plug(&plug);
3003 }
3004
init_resync(struct r10conf * conf)3005 static int init_resync(struct r10conf *conf)
3006 {
3007 int ret, buffs, i;
3008
3009 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
3010 BUG_ON(mempool_initialized(&conf->r10buf_pool));
3011 conf->have_replacement = 0;
3012 for (i = 0; i < conf->geo.raid_disks; i++)
3013 if (conf->mirrors[i].replacement)
3014 conf->have_replacement = 1;
3015 ret = mempool_init(&conf->r10buf_pool, buffs,
3016 r10buf_pool_alloc, r10buf_pool_free, conf);
3017 if (ret)
3018 return ret;
3019 conf->next_resync = 0;
3020 return 0;
3021 }
3022
raid10_alloc_init_r10buf(struct r10conf * conf)3023 static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
3024 {
3025 struct r10bio *r10bio = mempool_alloc(&conf->r10buf_pool, GFP_NOIO);
3026 struct rsync_pages *rp;
3027 struct bio *bio;
3028 int nalloc;
3029 int i;
3030
3031 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
3032 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
3033 nalloc = conf->copies; /* resync */
3034 else
3035 nalloc = 2; /* recovery */
3036
3037 for (i = 0; i < nalloc; i++) {
3038 bio = r10bio->devs[i].bio;
3039 rp = bio->bi_private;
3040 bio_reset(bio, NULL, 0);
3041 bio->bi_private = rp;
3042 bio = r10bio->devs[i].repl_bio;
3043 if (bio) {
3044 rp = bio->bi_private;
3045 bio_reset(bio, NULL, 0);
3046 bio->bi_private = rp;
3047 }
3048 }
3049 return r10bio;
3050 }
3051
3052 /*
3053 * Set cluster_sync_high since we need other nodes to add the
3054 * range [cluster_sync_low, cluster_sync_high] to suspend list.
3055 */
raid10_set_cluster_sync_high(struct r10conf * conf)3056 static void raid10_set_cluster_sync_high(struct r10conf *conf)
3057 {
3058 sector_t window_size;
3059 int extra_chunk, chunks;
3060
3061 /*
3062 * First, here we define "stripe" as a unit which across
3063 * all member devices one time, so we get chunks by use
3064 * raid_disks / near_copies. Otherwise, if near_copies is
3065 * close to raid_disks, then resync window could increases
3066 * linearly with the increase of raid_disks, which means
3067 * we will suspend a really large IO window while it is not
3068 * necessary. If raid_disks is not divisible by near_copies,
3069 * an extra chunk is needed to ensure the whole "stripe" is
3070 * covered.
3071 */
3072
3073 chunks = conf->geo.raid_disks / conf->geo.near_copies;
3074 if (conf->geo.raid_disks % conf->geo.near_copies == 0)
3075 extra_chunk = 0;
3076 else
3077 extra_chunk = 1;
3078 window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors;
3079
3080 /*
3081 * At least use a 32M window to align with raid1's resync window
3082 */
3083 window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ?
3084 CLUSTER_RESYNC_WINDOW_SECTORS : window_size;
3085
3086 conf->cluster_sync_high = conf->cluster_sync_low + window_size;
3087 }
3088
3089 /*
3090 * perform a "sync" on one "block"
3091 *
3092 * We need to make sure that no normal I/O request - particularly write
3093 * requests - conflict with active sync requests.
3094 *
3095 * This is achieved by tracking pending requests and a 'barrier' concept
3096 * that can be installed to exclude normal IO requests.
3097 *
3098 * Resync and recovery are handled very differently.
3099 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
3100 *
3101 * For resync, we iterate over virtual addresses, read all copies,
3102 * and update if there are differences. If only one copy is live,
3103 * skip it.
3104 * For recovery, we iterate over physical addresses, read a good
3105 * value for each non-in_sync drive, and over-write.
3106 *
3107 * So, for recovery we may have several outstanding complex requests for a
3108 * given address, one for each out-of-sync device. We model this by allocating
3109 * a number of r10_bio structures, one for each out-of-sync device.
3110 * As we setup these structures, we collect all bio's together into a list
3111 * which we then process collectively to add pages, and then process again
3112 * to pass to submit_bio_noacct.
3113 *
3114 * The r10_bio structures are linked using a borrowed master_bio pointer.
3115 * This link is counted in ->remaining. When the r10_bio that points to NULL
3116 * has its remaining count decremented to 0, the whole complex operation
3117 * is complete.
3118 *
3119 */
3120
/*
 * raid10_sync_request - build and submit one resync/recovery request
 * @mddev:	array being synchronised
 * @sector_nr:	first sector to handle; an array (virtual) address for
 *		resync, a device address for recovery
 * @max_sector:	sector at which this sync pass ends
 * @skipped:	set to 1 when the returned range was skipped without I/O
 *
 * Returns the number of sectors covered by this call (submitted plus
 * skipped).  Returns 0 when the resync buffer pool cannot be set up,
 * when a reshape pass has reached @max_sector, or when recovery gave up
 * on every chunk because of a disk recorded in error_disk.  Reshape
 * requests are delegated to reshape_request().
 */
static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
				    sector_t max_sector, int *skipped)
{
	struct r10conf *conf = mddev->private;
	struct r10bio *r10_bio;
	struct bio *biolist = NULL, *bio;
	sector_t nr_sectors;
	int i;
	int max_sync;
	sector_t sync_blocks;
	sector_t sectors_skipped = 0;
	int chunks_skipped = 0;
	sector_t chunk_mask = conf->geo.chunk_mask;
	int page_idx = 0;
	int error_disk = -1;

	/*
	 * Allow skipping a full rebuild for incremental assembly
	 * of a clean array, like RAID1 does.
	 */
	if (mddev->bitmap == NULL &&
	    mddev->recovery_cp == MaxSector &&
	    mddev->reshape_position == MaxSector &&
	    !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    conf->fullsync == 0) {
		*skipped = 1;
		return mddev->dev_sectors - sector_nr;
	}

	if (!mempool_initialized(&conf->r10buf_pool))
		if (init_resync(conf))
			return 0;

 skipped:
	if (sector_nr >= max_sector) {
		conf->cluster_sync_low = 0;
		conf->cluster_sync_high = 0;

		/* If we aborted, we need to abort the
		 * sync on the 'current' bitmap chunks (there can
		 * be several when recovering multiple devices).
		 * as we may have started syncing it but not finished.
		 * We can find the current address in
		 * mddev->curr_resync, but for recovery,
		 * we need to convert that to several
		 * virtual addresses.
		 */
		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
			end_reshape(conf);
			close_sync(conf);
			return 0;
		}

		if (mddev->curr_resync < max_sector) { /* aborted */
			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
				mddev->bitmap_ops->end_sync(mddev,
							    mddev->curr_resync,
							    &sync_blocks);
			} else {
				for (i = 0; i < conf->geo.raid_disks; i++) {
					sector_t sect =
						raid10_find_virt(conf, mddev->curr_resync, i);

					mddev->bitmap_ops->end_sync(mddev, sect,
								    &sync_blocks);
				}
			}
		} else {
			/* completed sync */
			if ((!mddev->bitmap || conf->fullsync)
			    && conf->have_replacement
			    && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
				/* Completed a full sync so the replacements
				 * are now fully recovered.
				 */
				for (i = 0; i < conf->geo.raid_disks; i++) {
					struct md_rdev *rdev =
						conf->mirrors[i].replacement;

					if (rdev)
						rdev->recovery_offset = MaxSector;
				}
			}
			conf->fullsync = 0;
		}
		mddev->bitmap_ops->close_sync(mddev);
		close_sync(conf);
		*skipped = 1;
		return sectors_skipped;
	}

	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		return reshape_request(mddev, sector_nr, skipped);

	if (chunks_skipped >= conf->geo.raid_disks) {
		pr_err("md/raid10:%s: %s fails\n", mdname(mddev),
			test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?  "resync" : "recovery");
		if (error_disk >= 0 &&
		    !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
			/*
			 * recovery fails, set mirrors.recovery_disabled,
			 * device shouldn't be added to there.
			 */
			conf->mirrors[error_disk].recovery_disabled =
						mddev->recovery_disabled;
			return 0;
		}
		/*
		 * if there has been nothing to do on any drive,
		 * then there is nothing to do at all.
		 */
		*skipped = 1;
		return (max_sector - sector_nr) + sectors_skipped;
	}

	if (max_sector > mddev->resync_max)
		max_sector = mddev->resync_max; /* Don't do IO beyond here */

	/* make sure whole request will fit in a chunk - if chunks
	 * are meaningful
	 */
	if (conf->geo.near_copies < conf->geo.raid_disks &&
	    max_sector > (sector_nr | chunk_mask))
		max_sector = (sector_nr | chunk_mask) + 1;

	/*
	 * If there is non-resync activity waiting for a turn, then let it
	 * through before starting on this new sync request.
	 */
	if (conf->nr_waiting)
		schedule_timeout_uninterruptible(1);

	/* Again, very different code for resync and recovery.
	 * Both must result in an r10bio with a list of bios that
	 * have bi_end_io, bi_sector, bi_bdev set,
	 * and bi_private set to the r10bio.
	 * For recovery, we may actually create several r10bios
	 * with 2 bios in each, that correspond to the bios in the main one.
	 * In this case, the subordinate r10bios link back through a
	 * borrowed master_bio pointer, and the counter in the master
	 * includes a ref from each subordinate.
	 */
	/* First, we decide what to do and set ->bi_end_io
	 * To end_sync_read if we want to read, and
	 * end_sync_write if we will want to write.
	 */

	max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
	if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		/* recovery... the complicated one */
		int j;
		r10_bio = NULL;

		for (i = 0 ; i < conf->geo.raid_disks; i++) {
			bool still_degraded;
			struct r10bio *rb2;
			sector_t sect;
			bool must_sync;
			int any_working;
			struct raid10_info *mirror = &conf->mirrors[i];
			struct md_rdev *mrdev, *mreplace;

			mrdev = mirror->rdev;
			mreplace = mirror->replacement;

			if (mrdev && (test_bit(Faulty, &mrdev->flags) ||
			    test_bit(In_sync, &mrdev->flags)))
				mrdev = NULL;
			if (mreplace && test_bit(Faulty, &mreplace->flags))
				mreplace = NULL;

			if (!mrdev && !mreplace)
				continue;

			still_degraded = false;
			/* want to reconstruct this device */
			rb2 = r10_bio;
			sect = raid10_find_virt(conf, sector_nr, i);
			if (sect >= mddev->resync_max_sectors)
				/* last stripe is not complete - don't
				 * try to recover this sector.
				 */
				continue;
			/* Unless we are doing a full sync, or a replacement
			 * we only need to recover the block if it is set in
			 * the bitmap
			 */
			must_sync = mddev->bitmap_ops->start_sync(mddev, sect,
								  &sync_blocks,
								  true);
			if (sync_blocks < max_sync)
				max_sync = sync_blocks;
			if (!must_sync &&
			    mreplace == NULL &&
			    !conf->fullsync) {
				/* yep, skip the sync_blocks here, but don't assume
				 * that there will never be anything to do here
				 */
				chunks_skipped = -1;
				continue;
			}
			if (mrdev)
				atomic_inc(&mrdev->nr_pending);
			if (mreplace)
				atomic_inc(&mreplace->nr_pending);

			r10_bio = raid10_alloc_init_r10buf(conf);
			r10_bio->state = 0;
			raise_barrier(conf, rb2 != NULL);
			atomic_set(&r10_bio->remaining, 0);

			r10_bio->master_bio = (struct bio*)rb2;
			if (rb2)
				atomic_inc(&rb2->remaining);
			r10_bio->mddev = mddev;
			set_bit(R10BIO_IsRecover, &r10_bio->state);
			r10_bio->sector = sect;

			raid10_find_phys(conf, r10_bio);

			/* Need to check if the array will still be
			 * degraded: any slot without a working rdev means
			 * it is, and start_sync() below must be told so.
			 */
			for (j = 0; j < conf->geo.raid_disks; j++) {
				struct md_rdev *rdev = conf->mirrors[j].rdev;

				if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
					still_degraded = true;
					break;
				}
			}

			must_sync = mddev->bitmap_ops->start_sync(mddev, sect,
						&sync_blocks, still_degraded);

			any_working = 0;
			for (j=0; j<conf->copies;j++) {
				int k;
				int d = r10_bio->devs[j].devnum;
				sector_t from_addr, to_addr;
				struct md_rdev *rdev = conf->mirrors[d].rdev;
				sector_t sector, first_bad;
				int bad_sectors;
				if (!rdev ||
				    !test_bit(In_sync, &rdev->flags))
					continue;
				/* This is where we read from */
				any_working = 1;
				sector = r10_bio->devs[j].addr;

				if (is_badblock(rdev, sector, max_sync,
						&first_bad, &bad_sectors)) {
					if (first_bad > sector)
						max_sync = first_bad - sector;
					else {
						bad_sectors -= (sector
								- first_bad);
						if (max_sync > bad_sectors)
							max_sync = bad_sectors;
						continue;
					}
				}
				bio = r10_bio->devs[0].bio;
				bio->bi_next = biolist;
				biolist = bio;
				bio->bi_end_io = end_sync_read;
				bio->bi_opf = REQ_OP_READ;
				if (test_bit(FailFast, &rdev->flags))
					bio->bi_opf |= MD_FAILFAST;
				from_addr = r10_bio->devs[j].addr;
				bio->bi_iter.bi_sector = from_addr +
					rdev->data_offset;
				bio_set_dev(bio, rdev->bdev);
				atomic_inc(&rdev->nr_pending);
				/* and we write to 'i' (if not in_sync) */

				for (k=0; k<conf->copies; k++)
					if (r10_bio->devs[k].devnum == i)
						break;
				BUG_ON(k == conf->copies);
				to_addr = r10_bio->devs[k].addr;
				r10_bio->devs[0].devnum = d;
				r10_bio->devs[0].addr = from_addr;
				r10_bio->devs[1].devnum = i;
				r10_bio->devs[1].addr = to_addr;

				if (mrdev) {
					bio = r10_bio->devs[1].bio;
					bio->bi_next = biolist;
					biolist = bio;
					bio->bi_end_io = end_sync_write;
					bio->bi_opf = REQ_OP_WRITE;
					bio->bi_iter.bi_sector = to_addr
						+ mrdev->data_offset;
					bio_set_dev(bio, mrdev->bdev);
					atomic_inc(&r10_bio->remaining);
				} else
					r10_bio->devs[1].bio->bi_end_io = NULL;

				/* and maybe write to replacement */
				bio = r10_bio->devs[1].repl_bio;
				if (bio)
					bio->bi_end_io = NULL;
				/* Note: if replace is not NULL, then bio
				 * cannot be NULL as r10buf_pool_alloc will
				 * have allocated it.
				 */
				if (!mreplace)
					break;
				bio->bi_next = biolist;
				biolist = bio;
				bio->bi_end_io = end_sync_write;
				bio->bi_opf = REQ_OP_WRITE;
				bio->bi_iter.bi_sector = to_addr +
					mreplace->data_offset;
				bio_set_dev(bio, mreplace->bdev);
				atomic_inc(&r10_bio->remaining);
				break;
			}
			if (j == conf->copies) {
				/* Cannot recover, so abort the recovery or
				 * record a bad block */
				if (any_working) {
					/* problem is that there are bad blocks
					 * on other device(s)
					 */
					int k;
					for (k = 0; k < conf->copies; k++)
						if (r10_bio->devs[k].devnum == i)
							break;
					if (mrdev && !test_bit(In_sync,
						      &mrdev->flags)
					    && !rdev_set_badblocks(
						    mrdev,
						    r10_bio->devs[k].addr,
						    max_sync, 0))
						any_working = 0;
					if (mreplace &&
					    !rdev_set_badblocks(
						    mreplace,
						    r10_bio->devs[k].addr,
						    max_sync, 0))
						any_working = 0;
				}
				if (!any_working)  {
					if (!test_and_set_bit(MD_RECOVERY_INTR,
							      &mddev->recovery))
						pr_warn("md/raid10:%s: insufficient working devices for recovery.\n",
						       mdname(mddev));
					mirror->recovery_disabled
						= mddev->recovery_disabled;
				} else {
					error_disk = i;
				}
				put_buf(r10_bio);
				if (rb2)
					atomic_dec(&rb2->remaining);
				r10_bio = rb2;
				if (mrdev)
					rdev_dec_pending(mrdev, mddev);
				if (mreplace)
					rdev_dec_pending(mreplace, mddev);
				break;
			}
			if (mrdev)
				rdev_dec_pending(mrdev, mddev);
			if (mreplace)
				rdev_dec_pending(mreplace, mddev);
			if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) {
				/* Only want this if there is elsewhere to
				 * read from. 'j' is currently the first
				 * readable copy.
				 */
				int targets = 1;
				for (; j < conf->copies; j++) {
					int d = r10_bio->devs[j].devnum;
					if (conf->mirrors[d].rdev &&
					    test_bit(In_sync,
						      &conf->mirrors[d].rdev->flags))
						targets++;
				}
				if (targets == 1)
					r10_bio->devs[0].bio->bi_opf
						&= ~MD_FAILFAST;
			}
		}
		if (biolist == NULL) {
			while (r10_bio) {
				struct r10bio *rb2 = r10_bio;
				r10_bio = (struct r10bio*) rb2->master_bio;
				rb2->master_bio = NULL;
				put_buf(rb2);
			}
			goto giveup;
		}
	} else {
		/* resync. Schedule a read for every block at this virt offset */
		int count = 0;

		/*
		 * Since curr_resync_completed could probably not update in
		 * time, and we will set cluster_sync_low based on it.
		 * Let's check against "sector_nr + 2 * RESYNC_SECTORS" for
		 * safety reason, which ensures curr_resync_completed is
		 * updated in bitmap_cond_end_sync.
		 */
		mddev->bitmap_ops->cond_end_sync(mddev, sector_nr,
					mddev_is_clustered(mddev) &&
					(sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));

		if (!mddev->bitmap_ops->start_sync(mddev, sector_nr,
						   &sync_blocks,
						   mddev->degraded) &&
		    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
						 &mddev->recovery)) {
			/* We can skip this block */
			*skipped = 1;
			return sync_blocks + sectors_skipped;
		}
		if (sync_blocks < max_sync)
			max_sync = sync_blocks;
		r10_bio = raid10_alloc_init_r10buf(conf);
		r10_bio->state = 0;

		r10_bio->mddev = mddev;
		atomic_set(&r10_bio->remaining, 0);
		raise_barrier(conf, 0);
		conf->next_resync = sector_nr;

		r10_bio->master_bio = NULL;
		r10_bio->sector = sector_nr;
		set_bit(R10BIO_IsSync, &r10_bio->state);
		raid10_find_phys(conf, r10_bio);
		r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;

		for (i = 0; i < conf->copies; i++) {
			int d = r10_bio->devs[i].devnum;
			sector_t first_bad, sector;
			int bad_sectors;
			struct md_rdev *rdev;

			if (r10_bio->devs[i].repl_bio)
				r10_bio->devs[i].repl_bio->bi_end_io = NULL;

			bio = r10_bio->devs[i].bio;
			bio->bi_status = BLK_STS_IOERR;
			rdev = conf->mirrors[d].rdev;
			if (rdev == NULL || test_bit(Faulty, &rdev->flags))
				continue;

			sector = r10_bio->devs[i].addr;
			if (is_badblock(rdev, sector, max_sync,
					&first_bad, &bad_sectors)) {
				if (first_bad > sector)
					max_sync = first_bad - sector;
				else {
					bad_sectors -= (sector - first_bad);
					if (max_sync > bad_sectors)
						max_sync = bad_sectors;
					continue;
				}
			}
			atomic_inc(&rdev->nr_pending);
			atomic_inc(&r10_bio->remaining);
			bio->bi_next = biolist;
			biolist = bio;
			bio->bi_end_io = end_sync_read;
			bio->bi_opf = REQ_OP_READ;
			if (test_bit(FailFast, &rdev->flags))
				bio->bi_opf |= MD_FAILFAST;
			bio->bi_iter.bi_sector = sector + rdev->data_offset;
			bio_set_dev(bio, rdev->bdev);
			count++;

			rdev = conf->mirrors[d].replacement;
			if (rdev == NULL || test_bit(Faulty, &rdev->flags))
				continue;

			atomic_inc(&rdev->nr_pending);

			/* Need to set up for writing to the replacement */
			bio = r10_bio->devs[i].repl_bio;
			bio->bi_status = BLK_STS_IOERR;

			sector = r10_bio->devs[i].addr;
			bio->bi_next = biolist;
			biolist = bio;
			bio->bi_end_io = end_sync_write;
			bio->bi_opf = REQ_OP_WRITE;
			if (test_bit(FailFast, &rdev->flags))
				bio->bi_opf |= MD_FAILFAST;
			bio->bi_iter.bi_sector = sector + rdev->data_offset;
			bio_set_dev(bio, rdev->bdev);
			count++;
		}

		if (count < 2) {
			/* Fewer than two bios queued: nothing to compare or
			 * copy, so drop the references taken above.
			 */
			for (i=0; i<conf->copies; i++) {
				int d = r10_bio->devs[i].devnum;
				if (r10_bio->devs[i].bio->bi_end_io)
					rdev_dec_pending(conf->mirrors[d].rdev,
							 mddev);
				if (r10_bio->devs[i].repl_bio &&
				    r10_bio->devs[i].repl_bio->bi_end_io)
					rdev_dec_pending(
						conf->mirrors[d].replacement,
						mddev);
			}
			put_buf(r10_bio);
			biolist = NULL;
			goto giveup;
		}
	}

	nr_sectors = 0;
	if (sector_nr + max_sync < max_sector)
		max_sector = sector_nr + max_sync;
	do {
		struct page *page;
		int len = PAGE_SIZE;
		if (sector_nr + (len>>9) > max_sector)
			len = (max_sector - sector_nr) << 9;
		if (len == 0)
			break;
		for (bio= biolist ; bio ; bio=bio->bi_next) {
			struct resync_pages *rp = get_resync_pages(bio);
			page = resync_fetch_page(rp, page_idx);
			if (WARN_ON(!bio_add_page(bio, page, len, 0))) {
				bio->bi_status = BLK_STS_RESOURCE;
				bio_endio(bio);
				goto giveup;
			}
		}
		nr_sectors += len>>9;
		sector_nr += len>>9;
	} while (++page_idx < RESYNC_PAGES);
	r10_bio->sectors = nr_sectors;

	if (mddev_is_clustered(mddev) &&
	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		/* It is resync not recovery */
		if (conf->cluster_sync_high < sector_nr + nr_sectors) {
			conf->cluster_sync_low = mddev->curr_resync_completed;
			raid10_set_cluster_sync_high(conf);
			/* Send resync message */
			md_cluster_ops->resync_info_update(mddev,
						conf->cluster_sync_low,
						conf->cluster_sync_high);
		}
	} else if (mddev_is_clustered(mddev)) {
		/* This is recovery not resync */
		sector_t sect_va1, sect_va2;
		bool broadcast_msg = false;

		for (i = 0; i < conf->geo.raid_disks; i++) {
			/*
			 * sector_nr is a device address for recovery, so we
			 * need translate it to array address before compare
			 * with cluster_sync_high.
			 */
			sect_va1 = raid10_find_virt(conf, sector_nr, i);

			if (conf->cluster_sync_high < sect_va1 + nr_sectors) {
				broadcast_msg = true;
				/*
				 * curr_resync_completed is similar as
				 * sector_nr, so make the translation too.
				 */
				sect_va2 = raid10_find_virt(conf,
					mddev->curr_resync_completed, i);

				if (conf->cluster_sync_low == 0 ||
				    conf->cluster_sync_low > sect_va2)
					conf->cluster_sync_low = sect_va2;
			}
		}
		if (broadcast_msg) {
			raid10_set_cluster_sync_high(conf);
			md_cluster_ops->resync_info_update(mddev,
						conf->cluster_sync_low,
						conf->cluster_sync_high);
		}
	}

	/* Only the reads are submitted here; write bios stay prepared. */
	while (biolist) {
		bio = biolist;
		biolist = biolist->bi_next;

		bio->bi_next = NULL;
		r10_bio = get_resync_r10bio(bio);
		r10_bio->sectors = nr_sectors;

		if (bio->bi_end_io == end_sync_read) {
			md_sync_acct_bio(bio, nr_sectors);
			bio->bi_status = 0;
			submit_bio_noacct(bio);
		}
	}

	if (sectors_skipped)
		/* pretend they weren't skipped, it makes
		 * no important difference in this case
		 */
		md_done_sync(mddev, sectors_skipped, 1);

	return sectors_skipped + nr_sectors;
 giveup:
	/* There is nowhere to write, so all non-sync
	 * drives must be failed or in resync, all drives
	 * have a bad block, so try the next chunk...
	 */
	if (sector_nr + max_sync < max_sector)
		max_sector = sector_nr + max_sync;

	sectors_skipped += (max_sector - sector_nr);
	chunks_skipped ++;
	sector_nr = max_sector;
	goto skipped;
}
3740
3741 static sector_t
raid10_size(struct mddev * mddev,sector_t sectors,int raid_disks)3742 raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3743 {
3744 sector_t size;
3745 struct r10conf *conf = mddev->private;
3746
3747 if (!raid_disks)
3748 raid_disks = min(conf->geo.raid_disks,
3749 conf->prev.raid_disks);
3750 if (!sectors)
3751 sectors = conf->dev_sectors;
3752
3753 size = sectors >> conf->geo.chunk_shift;
3754 sector_div(size, conf->geo.far_copies);
3755 size = size * raid_disks;
3756 sector_div(size, conf->geo.near_copies);
3757
3758 return size << conf->geo.chunk_shift;
3759 }
3760
/*
 * Derive the number of sectors of each device that will actually be
 * used, storing the result in conf->dev_sectors, and set
 * conf->geo.stride to match.
 *
 * @size: available sectors per device
 */
static void calc_sectors(struct r10conf *conf, sector_t size)
{
	struct geom *geo = &conf->geo;
	sector_t chunks = size >> geo->chunk_shift;

	/* Number of chunks in the whole array. */
	sector_div(chunks, geo->far_copies);
	chunks *= geo->raid_disks;
	sector_div(chunks, geo->near_copies);

	/* Chunks used on each device, rounded up when dividing by raid_disks. */
	chunks *= conf->copies;
	chunks = DIV_ROUND_UP_SECTOR_T(chunks, geo->raid_disks);

	conf->dev_sectors = chunks << geo->chunk_shift;

	if (geo->far_offset) {
		geo->stride = 1 << geo->chunk_shift;
		return;
	}

	sector_div(chunks, geo->far_copies);
	geo->stride = chunks << geo->chunk_shift;
}
3790
3791 enum geo_type {geo_new, geo_old, geo_start};
setup_geo(struct geom * geo,struct mddev * mddev,enum geo_type new)3792 static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3793 {
3794 int nc, fc, fo;
3795 int layout, chunk, disks;
3796 switch (new) {
3797 case geo_old:
3798 layout = mddev->layout;
3799 chunk = mddev->chunk_sectors;
3800 disks = mddev->raid_disks - mddev->delta_disks;
3801 break;
3802 case geo_new:
3803 layout = mddev->new_layout;
3804 chunk = mddev->new_chunk_sectors;
3805 disks = mddev->raid_disks;
3806 break;
3807 default: /* avoid 'may be unused' warnings */
3808 case geo_start: /* new when starting reshape - raid_disks not
3809 * updated yet. */
3810 layout = mddev->new_layout;
3811 chunk = mddev->new_chunk_sectors;
3812 disks = mddev->raid_disks + mddev->delta_disks;
3813 break;
3814 }
3815 if (layout >> 19)
3816 return -1;
3817 if (chunk < (PAGE_SIZE >> 9) ||
3818 !is_power_of_2(chunk))
3819 return -2;
3820 nc = layout & 255;
3821 fc = (layout >> 8) & 255;
3822 fo = layout & (1<<16);
3823 geo->raid_disks = disks;
3824 geo->near_copies = nc;
3825 geo->far_copies = fc;
3826 geo->far_offset = fo;
3827 switch (layout >> 17) {
3828 case 0: /* original layout. simple but not always optimal */
3829 geo->far_set_size = disks;
3830 break;
3831 case 1: /* "improved" layout which was buggy. Hopefully no-one is
3832 * actually using this, but leave code here just in case.*/
3833 geo->far_set_size = disks/fc;
3834 WARN(geo->far_set_size < fc,
3835 "This RAID10 layout does not provide data safety - please backup and create new array\n");
3836 break;
3837 case 2: /* "improved" layout fixed to match documentation */
3838 geo->far_set_size = fc * nc;
3839 break;
3840 default: /* Not a valid layout */
3841 return -1;
3842 }
3843 geo->chunk_mask = chunk - 1;
3844 geo->chunk_shift = ffz(~chunk);
3845 return nc*fc;
3846 }
3847
raid10_free_conf(struct r10conf * conf)3848 static void raid10_free_conf(struct r10conf *conf)
3849 {
3850 if (!conf)
3851 return;
3852
3853 mempool_exit(&conf->r10bio_pool);
3854 kfree(conf->mirrors);
3855 kfree(conf->mirrors_old);
3856 kfree(conf->mirrors_new);
3857 safe_put_page(conf->tmppage);
3858 bioset_exit(&conf->bio_split);
3859 kfree(conf);
3860 }
3861
setup_conf(struct mddev * mddev)3862 static struct r10conf *setup_conf(struct mddev *mddev)
3863 {
3864 struct r10conf *conf = NULL;
3865 int err = -EINVAL;
3866 struct geom geo;
3867 int copies;
3868
3869 copies = setup_geo(&geo, mddev, geo_new);
3870
3871 if (copies == -2) {
3872 pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n",
3873 mdname(mddev), PAGE_SIZE);
3874 goto out;
3875 }
3876
3877 if (copies < 2 || copies > mddev->raid_disks) {
3878 pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3879 mdname(mddev), mddev->new_layout);
3880 goto out;
3881 }
3882
3883 err = -ENOMEM;
3884 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3885 if (!conf)
3886 goto out;
3887
3888 /* FIXME calc properly */
3889 conf->mirrors = kcalloc(mddev->raid_disks + max(0, -mddev->delta_disks),
3890 sizeof(struct raid10_info),
3891 GFP_KERNEL);
3892 if (!conf->mirrors)
3893 goto out;
3894
3895 conf->tmppage = alloc_page(GFP_KERNEL);
3896 if (!conf->tmppage)
3897 goto out;
3898
3899 conf->geo = geo;
3900 conf->copies = copies;
3901 err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc,
3902 rbio_pool_free, conf);
3903 if (err)
3904 goto out;
3905
3906 err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
3907 if (err)
3908 goto out;
3909
3910 calc_sectors(conf, mddev->dev_sectors);
3911 if (mddev->reshape_position == MaxSector) {
3912 conf->prev = conf->geo;
3913 conf->reshape_progress = MaxSector;
3914 } else {
3915 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3916 err = -EINVAL;
3917 goto out;
3918 }
3919 conf->reshape_progress = mddev->reshape_position;
3920 if (conf->prev.far_offset)
3921 conf->prev.stride = 1 << conf->prev.chunk_shift;
3922 else
3923 /* far_copies must be 1 */
3924 conf->prev.stride = conf->dev_sectors;
3925 }
3926 conf->reshape_safe = conf->reshape_progress;
3927 spin_lock_init(&conf->device_lock);
3928 INIT_LIST_HEAD(&conf->retry_list);
3929 INIT_LIST_HEAD(&conf->bio_end_io_list);
3930
3931 seqlock_init(&conf->resync_lock);
3932 init_waitqueue_head(&conf->wait_barrier);
3933 atomic_set(&conf->nr_pending, 0);
3934
3935 err = -ENOMEM;
3936 rcu_assign_pointer(conf->thread,
3937 md_register_thread(raid10d, mddev, "raid10"));
3938 if (!conf->thread)
3939 goto out;
3940
3941 conf->mddev = mddev;
3942 return conf;
3943
3944 out:
3945 raid10_free_conf(conf);
3946 return ERR_PTR(err);
3947 }
3948
raid10_nr_stripes(struct r10conf * conf)3949 static unsigned int raid10_nr_stripes(struct r10conf *conf)
3950 {
3951 unsigned int raid_disks = conf->geo.raid_disks;
3952
3953 if (conf->geo.raid_disks % conf->geo.near_copies)
3954 return raid_disks;
3955 return raid_disks / conf->geo.near_copies;
3956 }
3957
raid10_set_queue_limits(struct mddev * mddev)3958 static int raid10_set_queue_limits(struct mddev *mddev)
3959 {
3960 struct r10conf *conf = mddev->private;
3961 struct queue_limits lim;
3962 int err;
3963
3964 md_init_stacking_limits(&lim);
3965 lim.max_write_zeroes_sectors = 0;
3966 lim.io_min = mddev->chunk_sectors << 9;
3967 lim.chunk_sectors = mddev->chunk_sectors;
3968 lim.io_opt = lim.io_min * raid10_nr_stripes(conf);
3969 err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
3970 if (err)
3971 return err;
3972 return queue_limits_set(mddev->gendisk->queue, &lim);
3973 }
3974
raid10_run(struct mddev * mddev)3975 static int raid10_run(struct mddev *mddev)
3976 {
3977 struct r10conf *conf;
3978 int i, disk_idx;
3979 struct raid10_info *disk;
3980 struct md_rdev *rdev;
3981 sector_t size;
3982 sector_t min_offset_diff = 0;
3983 int first = 1;
3984 int ret = -EIO;
3985
3986 if (mddev->private == NULL) {
3987 conf = setup_conf(mddev);
3988 if (IS_ERR(conf))
3989 return PTR_ERR(conf);
3990 mddev->private = conf;
3991 }
3992 conf = mddev->private;
3993 if (!conf)
3994 goto out;
3995
3996 rcu_assign_pointer(mddev->thread, conf->thread);
3997 rcu_assign_pointer(conf->thread, NULL);
3998
3999 if (mddev_is_clustered(conf->mddev)) {
4000 int fc, fo;
4001
4002 fc = (mddev->layout >> 8) & 255;
4003 fo = mddev->layout & (1<<16);
4004 if (fc > 1 || fo > 0) {
4005 pr_err("only near layout is supported by clustered"
4006 " raid10\n");
4007 goto out_free_conf;
4008 }
4009 }
4010
4011 rdev_for_each(rdev, mddev) {
4012 long long diff;
4013
4014 disk_idx = rdev->raid_disk;
4015 if (disk_idx < 0)
4016 continue;
4017 if (disk_idx >= conf->geo.raid_disks &&
4018 disk_idx >= conf->prev.raid_disks)
4019 continue;
4020 disk = conf->mirrors + disk_idx;
4021
4022 if (test_bit(Replacement, &rdev->flags)) {
4023 if (disk->replacement)
4024 goto out_free_conf;
4025 disk->replacement = rdev;
4026 } else {
4027 if (disk->rdev)
4028 goto out_free_conf;
4029 disk->rdev = rdev;
4030 }
4031 diff = (rdev->new_data_offset - rdev->data_offset);
4032 if (!mddev->reshape_backwards)
4033 diff = -diff;
4034 if (diff < 0)
4035 diff = 0;
4036 if (first || diff < min_offset_diff)
4037 min_offset_diff = diff;
4038
4039 disk->head_position = 0;
4040 first = 0;
4041 }
4042
4043 if (!mddev_is_dm(conf->mddev)) {
4044 int err = raid10_set_queue_limits(mddev);
4045
4046 if (err) {
4047 ret = err;
4048 goto out_free_conf;
4049 }
4050 }
4051
4052 /* need to check that every block has at least one working mirror */
4053 if (!enough(conf, -1)) {
4054 pr_err("md/raid10:%s: not enough operational mirrors.\n",
4055 mdname(mddev));
4056 goto out_free_conf;
4057 }
4058
4059 if (conf->reshape_progress != MaxSector) {
4060 /* must ensure that shape change is supported */
4061 if (conf->geo.far_copies != 1 &&
4062 conf->geo.far_offset == 0)
4063 goto out_free_conf;
4064 if (conf->prev.far_copies != 1 &&
4065 conf->prev.far_offset == 0)
4066 goto out_free_conf;
4067 }
4068
4069 mddev->degraded = 0;
4070 for (i = 0;
4071 i < conf->geo.raid_disks
4072 || i < conf->prev.raid_disks;
4073 i++) {
4074
4075 disk = conf->mirrors + i;
4076
4077 if (!disk->rdev && disk->replacement) {
4078 /* The replacement is all we have - use it */
4079 disk->rdev = disk->replacement;
4080 disk->replacement = NULL;
4081 clear_bit(Replacement, &disk->rdev->flags);
4082 }
4083
4084 if (!disk->rdev ||
4085 !test_bit(In_sync, &disk->rdev->flags)) {
4086 disk->head_position = 0;
4087 mddev->degraded++;
4088 if (disk->rdev &&
4089 disk->rdev->saved_raid_disk < 0)
4090 conf->fullsync = 1;
4091 }
4092
4093 if (disk->replacement &&
4094 !test_bit(In_sync, &disk->replacement->flags) &&
4095 disk->replacement->saved_raid_disk < 0) {
4096 conf->fullsync = 1;
4097 }
4098
4099 disk->recovery_disabled = mddev->recovery_disabled - 1;
4100 }
4101
4102 if (mddev->recovery_cp != MaxSector)
4103 pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n",
4104 mdname(mddev));
4105 pr_info("md/raid10:%s: active with %d out of %d devices\n",
4106 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
4107 conf->geo.raid_disks);
4108 /*
4109 * Ok, everything is just fine now
4110 */
4111 mddev->dev_sectors = conf->dev_sectors;
4112 size = raid10_size(mddev, 0, 0);
4113 md_set_array_sectors(mddev, size);
4114 mddev->resync_max_sectors = size;
4115 set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
4116
4117 if (md_integrity_register(mddev))
4118 goto out_free_conf;
4119
4120 if (conf->reshape_progress != MaxSector) {
4121 unsigned long before_length, after_length;
4122
4123 before_length = ((1 << conf->prev.chunk_shift) *
4124 conf->prev.far_copies);
4125 after_length = ((1 << conf->geo.chunk_shift) *
4126 conf->geo.far_copies);
4127
4128 if (max(before_length, after_length) > min_offset_diff) {
4129 /* This cannot work */
4130 pr_warn("md/raid10: offset difference not enough to continue reshape\n");
4131 goto out_free_conf;
4132 }
4133 conf->offset_diff = min_offset_diff;
4134
4135 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4136 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4137 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4138 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4139 }
4140
4141 return 0;
4142
4143 out_free_conf:
4144 md_unregister_thread(mddev, &mddev->thread);
4145 raid10_free_conf(conf);
4146 mddev->private = NULL;
4147 out:
4148 return ret;
4149 }
4150
/*
 * Personality ->free hook: release the raid10 configuration passed in
 * @priv. @mddev is not used here.
 */
static void raid10_free(struct mddev *mddev, void *priv)
{
	struct r10conf *conf = priv;

	raid10_free_conf(conf);
}
4155
raid10_quiesce(struct mddev * mddev,int quiesce)4156 static void raid10_quiesce(struct mddev *mddev, int quiesce)
4157 {
4158 struct r10conf *conf = mddev->private;
4159
4160 if (quiesce)
4161 raise_barrier(conf, 0);
4162 else
4163 lower_barrier(conf);
4164 }
4165
/*
 * Change the number of sectors used on each device to @sectors.
 *
 * Resizing a 'far' array is not supported, so far_copies > 1 without
 * far_offset fails with -EINVAL.  For 'near' and 'offset' arrays the
 * usable size has to be an appropriate multiple of the chunk size
 * (far_copies * chunksize for 'offset', based on the LCM of near_copies
 * and raid_disks for 'near'); that rounding is mostly done by
 * raid10_size().
 *
 * Returns 0 on success, -EBUSY while a reshape position is recorded,
 * -EINVAL for unsupported requests, or the error from the bitmap resize.
 */
static int raid10_resize(struct mddev *mddev, sector_t sectors)
{
	struct r10conf *conf = mddev->private;
	sector_t prev_size, new_size;
	int err;

	if (mddev->reshape_position != MaxSector)
		return -EBUSY;

	if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
		return -EINVAL;

	prev_size = raid10_size(mddev, 0, 0);
	new_size = raid10_size(mddev, sectors, 0);

	/* An externally imposed array size must still fit afterwards. */
	if (mddev->external_size && mddev->array_sectors > new_size)
		return -EINVAL;

	err = mddev->bitmap_ops->resize(mddev, new_size, 0, false);
	if (err)
		return err;

	md_set_array_sectors(mddev, new_size);

	/* When growing, the area beyond the old size needs a resync. */
	if (sectors > mddev->dev_sectors && mddev->recovery_cp > prev_size) {
		mddev->recovery_cp = prev_size;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	}

	calc_sectors(conf, sectors);
	mddev->dev_sectors = conf->dev_sectors;
	mddev->resync_max_sectors = new_size;
	return 0;
}
4211
/*
 * Prepare @mddev for conversion from raid0 to raid10.
 *
 * @size is divided by @devs to obtain the per-device size.  The new
 * layout uses near_copies = 2 and far_copies = 1, the disk count is
 * doubled, and every existing member is moved to slot raid_disk * 2.
 *
 * Returns the new configuration from setup_conf() (which may itself be
 * an ERR_PTR), or ERR_PTR(-EINVAL) when the raid0 array is degraded.
 */
static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
{
	struct r10conf *conf;
	struct md_rdev *rdev;

	if (mddev->degraded > 0) {
		pr_warn("md/raid10:%s: Error: degraded raid0!\n",
			mdname(mddev));
		return ERR_PTR(-EINVAL);
	}
	sector_div(size, devs);

	/* Describe the target geometry. */
	mddev->new_level = 10;
	/* far_copies = 1 in the second byte, near_copies = 2 in the low byte */
	mddev->new_layout = (1 << 8) | 2;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->delta_disks = mddev->raid_disks;
	mddev->raid_disks *= 2;
	/* keep the array from being treated as dirty */
	mddev->recovery_cp = MaxSector;
	mddev->dev_sectors = size;

	conf = setup_conf(mddev);
	if (IS_ERR(conf))
		return conf;

	rdev_for_each(rdev, mddev) {
		if (rdev->raid_disk < 0)
			continue;
		rdev->new_raid_disk = rdev->raid_disk * 2;
		rdev->sectors = size;
	}

	return conf;
}
4246
raid10_takeover(struct mddev * mddev)4247 static void *raid10_takeover(struct mddev *mddev)
4248 {
4249 struct r0conf *raid0_conf;
4250
4251 /* raid10 can take over:
4252 * raid0 - providing it has only two drives
4253 */
4254 if (mddev->level == 0) {
4255 /* for raid0 takeover only one zone is supported */
4256 raid0_conf = mddev->private;
4257 if (raid0_conf->nr_strip_zones > 1) {
4258 pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n",
4259 mdname(mddev));
4260 return ERR_PTR(-EINVAL);
4261 }
4262 return raid10_takeover_raid0(mddev,
4263 raid0_conf->strip_zone->zone_end,
4264 raid0_conf->strip_zone->nb_dev);
4265 }
4266 return ERR_PTR(-EINVAL);
4267 }
4268
raid10_check_reshape(struct mddev * mddev)4269 static int raid10_check_reshape(struct mddev *mddev)
4270 {
4271 /* Called when there is a request to change
4272 * - layout (to ->new_layout)
4273 * - chunk size (to ->new_chunk_sectors)
4274 * - raid_disks (by delta_disks)
4275 * or when trying to restart a reshape that was ongoing.
4276 *
4277 * We need to validate the request and possibly allocate
4278 * space if that might be an issue later.
4279 *
4280 * Currently we reject any reshape of a 'far' mode array,
4281 * allow chunk size to change if new is generally acceptable,
4282 * allow raid_disks to increase, and allow
4283 * a switch between 'near' mode and 'offset' mode.
4284 */
4285 struct r10conf *conf = mddev->private;
4286 struct geom geo;
4287
4288 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
4289 return -EINVAL;
4290
4291 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
4292 /* mustn't change number of copies */
4293 return -EINVAL;
4294 if (geo.far_copies > 1 && !geo.far_offset)
4295 /* Cannot switch to 'far' mode */
4296 return -EINVAL;
4297
4298 if (mddev->array_sectors & geo.chunk_mask)
4299 /* not factor of array size */
4300 return -EINVAL;
4301
4302 if (!enough(conf, -1))
4303 return -EINVAL;
4304
4305 kfree(conf->mirrors_new);
4306 conf->mirrors_new = NULL;
4307 if (mddev->delta_disks > 0) {
4308 /* allocate new 'mirrors' list */
4309 conf->mirrors_new =
4310 kcalloc(mddev->raid_disks + mddev->delta_disks,
4311 sizeof(struct raid10_info),
4312 GFP_KERNEL);
4313 if (!conf->mirrors_new)
4314 return -ENOMEM;
4315 }
4316 return 0;
4317 }
4318
4319 /*
4320 * Need to check if array has failed when deciding whether to:
4321 * - start an array
4322 * - remove non-faulty devices
4323 * - add a spare
4324 * - allow a reshape
4325 * This determination is simple when no reshape is happening.
4326 * However if there is a reshape, we need to carefully check
4327 * both the before and after sections.
4328 * This is because some failed devices may only affect one
4329 * of the two sections, and some non-in_sync devices may
4330 * be insync in the section most affected by failed devices.
4331 */
calc_degraded(struct r10conf * conf)4332 static int calc_degraded(struct r10conf *conf)
4333 {
4334 int degraded, degraded2;
4335 int i;
4336
4337 degraded = 0;
4338 /* 'prev' section first */
4339 for (i = 0; i < conf->prev.raid_disks; i++) {
4340 struct md_rdev *rdev = conf->mirrors[i].rdev;
4341
4342 if (!rdev || test_bit(Faulty, &rdev->flags))
4343 degraded++;
4344 else if (!test_bit(In_sync, &rdev->flags))
4345 /* When we can reduce the number of devices in
4346 * an array, this might not contribute to
4347 * 'degraded'. It does now.
4348 */
4349 degraded++;
4350 }
4351 if (conf->geo.raid_disks == conf->prev.raid_disks)
4352 return degraded;
4353 degraded2 = 0;
4354 for (i = 0; i < conf->geo.raid_disks; i++) {
4355 struct md_rdev *rdev = conf->mirrors[i].rdev;
4356
4357 if (!rdev || test_bit(Faulty, &rdev->flags))
4358 degraded2++;
4359 else if (!test_bit(In_sync, &rdev->flags)) {
4360 /* If reshape is increasing the number of devices,
4361 * this section has already been recovered, so
4362 * it doesn't contribute to degraded.
4363 * else it does.
4364 */
4365 if (conf->geo.raid_disks <= conf->prev.raid_disks)
4366 degraded2++;
4367 }
4368 }
4369 if (degraded2 > degraded)
4370 return degraded2;
4371 return degraded;
4372 }
4373
/*
 * Personality ->start_reshape hook: commit a requested reshape and mark
 * it as needing to run.
 *
 * Returns 0 on success, -EBUSY if MD_RECOVERY_RUNNING is already set,
 * -EINVAL if the request cannot be honoured, or the error returned by
 * the bitmap resize calls (in which case the 'abort' path restores the
 * previous geometry).
 */
static int raid10_start_reshape(struct mddev *mddev)
{
	/* A 'reshape' has been requested. This commits
	 * the various 'new' fields and sets MD_RECOVERY_RESHAPE
	 * This also checks if there are enough spares and adds them
	 * to the array.
	 * We currently require enough spares to make the final
	 * array non-degraded. We also require that the difference
	 * between old and new data_offset - on each device - is
	 * enough that we never risk over-writing.
	 */

	unsigned long before_length, after_length;
	sector_t min_offset_diff = 0;
	int first = 1;
	struct geom new;
	struct r10conf *conf = mddev->private;
	struct md_rdev *rdev;
	int spares = 0;
	int ret;

	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return -EBUSY;

	/* The requested geometry must keep the same number of copies. */
	if (setup_geo(&new, mddev, geo_start) != conf->copies)
		return -EINVAL;

	/*
	 * Chunk size times far_copies for each geometry.
	 * NOTE(review): both values are taken from conf->prev and conf->geo;
	 * the freshly computed 'new' geometry is not consulted here - confirm
	 * that is intended.
	 */
	before_length = ((1 << conf->prev.chunk_shift) *
			 conf->prev.far_copies);
	after_length = ((1 << conf->geo.chunk_shift) *
			conf->geo.far_copies);

	/*
	 * Count the devices that are neither In_sync nor Faulty as spares,
	 * and find the smallest data_offset shift among active members,
	 * measured in the direction of the reshape and clamped at zero.
	 */
	rdev_for_each(rdev, mddev) {
		if (!test_bit(In_sync, &rdev->flags)
		    && !test_bit(Faulty, &rdev->flags))
			spares++;
		if (rdev->raid_disk >= 0) {
			long long diff = (rdev->new_data_offset
					  - rdev->data_offset);
			if (!mddev->reshape_backwards)
				diff = -diff;
			if (diff < 0)
				diff = 0;
			if (first || diff < min_offset_diff)
				min_offset_diff = diff;
			first = 0;
		}
	}

	/* The offset shift must cover the larger of the two lengths. */
	if (max(before_length, after_length) > min_offset_diff)
		return -EINVAL;

	if (spares < mddev->delta_disks)
		return -EINVAL;

	conf->offset_diff = min_offset_diff;
	spin_lock_irq(&conf->device_lock);
	if (conf->mirrors_new) {
		/*
		 * Switch to the enlarged mirrors table: copy the existing
		 * entries across, keep the old table in ->mirrors_old and
		 * free whatever ->mirrors_old held before.
		 */
		memcpy(conf->mirrors_new, conf->mirrors,
		       sizeof(struct raid10_info)*conf->prev.raid_disks);
		smp_mb();
		kfree(conf->mirrors_old);
		conf->mirrors_old = conf->mirrors;
		conf->mirrors = conf->mirrors_new;
		conf->mirrors_new = NULL;
	}
	/* Commit the new geometry into conf->geo. */
	setup_geo(&conf->geo, mddev, geo_start);
	smp_mb();
	if (mddev->reshape_backwards) {
		sector_t size = raid10_size(mddev, 0, 0);
		if (size < mddev->array_sectors) {
			/*
			 * NOTE(review): conf->geo (and possibly conf->mirrors)
			 * have already been switched at this point and are not
			 * restored before returning - confirm this is handled
			 * by the caller.
			 */
			spin_unlock_irq(&conf->device_lock);
			pr_warn("md/raid10:%s: array size must be reduce before number of disks\n",
				mdname(mddev));
			return -EINVAL;
		}
		/* A backwards reshape starts at the end of the array. */
		mddev->resync_max_sectors = size;
		conf->reshape_progress = size;
	} else
		conf->reshape_progress = 0;
	conf->reshape_safe = conf->reshape_progress;
	spin_unlock_irq(&conf->device_lock);

	if (mddev->delta_disks && mddev->bitmap) {
		struct mdp_superblock_1 *sb = NULL;
		sector_t oldsize, newsize;

		oldsize = raid10_size(mddev, 0, 0);
		newsize = raid10_size(mddev, 0, conf->geo.raid_disks);

		/* Non-clustered arrays only need the local bitmap resized. */
		if (!mddev_is_clustered(mddev)) {
			ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);
			if (ret)
				goto abort;
			else
				goto out;
		}

		/* Pick up the superblock of an active, non-Faulty member. */
		rdev_for_each(rdev, mddev) {
			if (rdev->raid_disk > -1 &&
			    !test_bit(Faulty, &rdev->flags))
				sb = page_address(rdev->sb_page);
		}

		/*
		 * some node is already performing reshape, and no need to
		 * call bitmap_ops->resize again since it should be called when
		 * receiving BITMAP_RESIZE msg
		 */
		if ((sb && (le32_to_cpu(sb->feature_map) &
			    MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
			goto out;

		ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);
		if (ret)
			goto abort;

		ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
		if (ret) {
			/* Undo the local resize before giving up. */
			mddev->bitmap_ops->resize(mddev, oldsize, 0, false);
			goto abort;
		}
	}
out:
	if (mddev->delta_disks > 0) {
		rdev_for_each(rdev, mddev)
			if (rdev->raid_disk < 0 &&
			    !test_bit(Faulty, &rdev->flags)) {
				/*
				 * Try to add this unassigned device. If it
				 * lands in one of the newly added slots it is
				 * marked In_sync; otherwise its recovery
				 * offset is reset to zero.
				 */
				if (raid10_add_disk(mddev, rdev) == 0) {
					if (rdev->raid_disk >=
					    conf->prev.raid_disks)
						set_bit(In_sync, &rdev->flags);
					else
						rdev->recovery_offset = 0;

					/* Failure here is OK */
					sysfs_link_rdev(mddev, rdev);
				}
			} else if (rdev->raid_disk >= conf->prev.raid_disks
				   && !test_bit(Faulty, &rdev->flags)) {
				/* This is a spare that was manually added */
				set_bit(In_sync, &rdev->flags);
			}
	}
	/* When a reshape changes the number of devices,
	 * ->degraded is measured against the larger of the
	 * pre and post numbers.
	 */
	spin_lock_irq(&conf->device_lock);
	mddev->degraded = calc_degraded(conf);
	spin_unlock_irq(&conf->device_lock);
	mddev->raid_disks = conf->geo.raid_disks;
	mddev->reshape_position = conf->reshape_progress;
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);

	/* Replace any sync/check request with a reshape request. */
	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	conf->reshape_checkpoint = jiffies;
	md_new_event();
	return 0;

abort:
	/*
	 * Roll back: restore the previous geometry and disk count, reset
	 * every device's new_data_offset, and clear the reshape position.
	 */
	mddev->recovery = 0;
	spin_lock_irq(&conf->device_lock);
	conf->geo = conf->prev;
	mddev->raid_disks = conf->geo.raid_disks;
	rdev_for_each(rdev, mddev)
		rdev->new_data_offset = rdev->data_offset;
	smp_wmb();
	conf->reshape_progress = MaxSector;
	conf->reshape_safe = MaxSector;
	mddev->reshape_position = MaxSector;
	spin_unlock_irq(&conf->device_lock);
	return ret;
}
4552
4553 /* Calculate the last device-address that could contain
4554 * any block from the chunk that includes the array-address 's'
4555 * and report the next address.
4556 * i.e. the address returned will be chunk-aligned and after
4557 * any data that is in the chunk containing 's'.
4558 */
last_dev_address(sector_t s,struct geom * geo)4559 static sector_t last_dev_address(sector_t s, struct geom *geo)
4560 {
4561 s = (s | geo->chunk_mask) + 1;
4562 s >>= geo->chunk_shift;
4563 s *= geo->near_copies;
4564 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4565 s *= geo->far_copies;
4566 s <<= geo->chunk_shift;
4567 return s;
4568 }
4569
4570 /* Calculate the first device-address that could contain
4571 * any block from the chunk that includes the array-address 's'.
4572 * This too will be the start of a chunk
4573 */
first_dev_address(sector_t s,struct geom * geo)4574 static sector_t first_dev_address(sector_t s, struct geom *geo)
4575 {
4576 s >>= geo->chunk_shift;
4577 s *= geo->near_copies;
4578 sector_div(s, geo->raid_disks);
4579 s *= geo->far_copies;
4580 s <<= geo->chunk_shift;
4581 return s;
4582 }
4583
/*
 * Perform one step of a reshape: read up to one chunk (the smaller of
 * the old and new chunk sizes) in the old layout and prepare the bios
 * that will write it out in the new layout.
 *
 * @sector_nr is only used on the first call (== 0) to skip over work
 * that was already done before a restart; after that the position is
 * taken from conf->reshape_progress.  *@skipped is set to 1 when such a
 * skip is reported.
 *
 * Returns the number of sectors handled (or skipped) by this call.
 */
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
				int *skipped)
{
	/* We simply copy at most one chunk (smallest of old and new)
	 * at a time, possibly less if that exceeds RESYNC_PAGES,
	 * or we hit a bad block or something.
	 * This might mean we pause for normal IO in the middle of
	 * a chunk, but that is not a problem as mddev->reshape_position
	 * can record any location.
	 *
	 * If we will want to write to a location that isn't
	 * yet recorded as 'safe' (i.e. in metadata on disk) then
	 * we need to flush all reshape requests and update the metadata.
	 *
	 * When reshaping forwards (e.g. to more devices), we interpret
	 * 'safe' as the earliest block which might not have been copied
	 * down yet. We divide this by previous stripe size and multiply
	 * by previous stripe length to get lowest device offset that we
	 * cannot write to yet.
	 * We interpret 'sector_nr' as an address that we want to write to.
	 * From this we use last_dev_address() to find where we might
	 * write to, and first_dev_address() on the 'safe' position.
	 * If this 'next' write position is after the 'safe' position,
	 * we must update the metadata to increase the 'safe' position.
	 *
	 * When reshaping backwards, we round in the opposite direction
	 * and perform the reverse test: next write position must not be
	 * less than current safe position.
	 *
	 * In all this the minimum difference in data offsets
	 * (conf->offset_diff - always positive) allows a bit of slack,
	 * so next can be after 'safe', but not by more than offset_diff
	 *
	 * We need to prepare all the bios here before we start any IO
	 * to ensure the size we choose is acceptable to all devices.
	 * The means one for each copy for write-out and an extra one for
	 * read-in.
	 * We store the read-in bio in ->master_bio and the others in
	 * ->devs[x].bio and ->devs[x].repl_bio.
	 */
	struct r10conf *conf = mddev->private;
	struct r10bio *r10_bio;
	sector_t next, safe, last;
	int max_sectors;
	int nr_sectors;
	int s;
	struct md_rdev *rdev;
	int need_flush = 0;
	struct bio *blist;
	struct bio *bio, *read_bio;
	int sectors_done = 0;
	struct page **pages;

	if (sector_nr == 0) {
		/* If restarting in the middle, skip the initial sectors */
		if (mddev->reshape_backwards &&
		    conf->reshape_progress < raid10_size(mddev, 0, 0)) {
			sector_nr = (raid10_size(mddev, 0, 0)
				     - conf->reshape_progress);
		} else if (!mddev->reshape_backwards &&
			   conf->reshape_progress > 0)
			sector_nr = conf->reshape_progress;
		if (sector_nr) {
			mddev->curr_resync_completed = sector_nr;
			sysfs_notify_dirent_safe(mddev->sysfs_completed);
			*skipped = 1;
			return sector_nr;
		}
	}

	/* We don't use sector_nr to track where we are up to
	 * as that doesn't work well for ->reshape_backwards.
	 * So just use ->reshape_progress.
	 */
	if (mddev->reshape_backwards) {
		/* 'next' is the earliest device address that we might
		 * write to for this chunk in the new layout
		 */
		next = first_dev_address(conf->reshape_progress - 1,
					 &conf->geo);

		/* 'safe' is the last device address that we might read from
		 * in the old layout after a restart
		 */
		safe = last_dev_address(conf->reshape_safe - 1,
					&conf->prev);

		if (next + conf->offset_diff < safe)
			need_flush = 1;

		/*
		 * Work on the range ending at reshape_progress - 1, starting
		 * at the boundary of the smaller chunk size but covering no
		 * more than RESYNC_SECTORS.
		 */
		last = conf->reshape_progress - 1;
		sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
					       & conf->prev.chunk_mask);
		if (sector_nr + RESYNC_SECTORS < last)
			sector_nr = last + 1 - RESYNC_SECTORS;
	} else {
		/* 'next' is after the last device address that we
		 * might write to for this chunk in the new layout
		 */
		next = last_dev_address(conf->reshape_progress, &conf->geo);

		/* 'safe' is the earliest device address that we might
		 * read from in the old layout after a restart
		 */
		safe = first_dev_address(conf->reshape_safe, &conf->prev);

		/* Need to update metadata if 'next' might be beyond 'safe'
		 * as that would possibly corrupt data
		 */
		if (next > safe + conf->offset_diff)
			need_flush = 1;

		/*
		 * Work on the range starting at reshape_progress, ending at
		 * the boundary of the smaller chunk size but covering no
		 * more than RESYNC_SECTORS.
		 */
		sector_nr = conf->reshape_progress;
		last = sector_nr | (conf->geo.chunk_mask
				    & conf->prev.chunk_mask);

		if (sector_nr + RESYNC_SECTORS <= last)
			last = sector_nr + RESYNC_SECTORS - 1;
	}

	if (need_flush ||
	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
		/* Need to update reshape_position in metadata */
		wait_barrier(conf, false);
		mddev->reshape_position = conf->reshape_progress;
		if (mddev->reshape_backwards)
			mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
				- conf->reshape_progress;
		else
			mddev->curr_resync_completed = conf->reshape_progress;
		conf->reshape_checkpoint = jiffies;
		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		md_wakeup_thread(mddev->thread);
		/* Wait for the superblock update, unless interrupted. */
		wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
			   test_bit(MD_RECOVERY_INTR, &mddev->recovery));
		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
			allow_barrier(conf);
			return sectors_done;
		}
		conf->reshape_safe = mddev->reshape_position;
		allow_barrier(conf);
	}

	raise_barrier(conf, 0);
read_more:
	/* Now schedule reads for blocks from sector_nr to last */
	r10_bio = raid10_alloc_init_r10buf(conf);
	r10_bio->state = 0;
	raise_barrier(conf, 1);
	atomic_set(&r10_bio->remaining, 0);
	r10_bio->mddev = mddev;
	r10_bio->sector = sector_nr;
	set_bit(R10BIO_IsReshape, &r10_bio->state);
	r10_bio->sectors = last - sector_nr + 1;
	rdev = read_balance(conf, r10_bio, &max_sectors);
	BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));

	if (!rdev) {
		/* Cannot read from here, so need to record bad blocks
		 * on all the target devices.
		 */
		// FIXME
		/*
		 * NOTE(review): this return follows the raise_barrier()
		 * calls above with no matching lower_barrier() visible in
		 * this function - confirm the barrier is released elsewhere
		 * on this path.
		 */
		mempool_free(r10_bio, &conf->r10buf_pool);
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		return sectors_done;
	}

	read_bio = bio_alloc_bioset(rdev->bdev, RESYNC_PAGES, REQ_OP_READ,
				    GFP_KERNEL, &mddev->bio_set);
	read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
				       + rdev->data_offset);
	read_bio->bi_private = r10_bio;
	read_bio->bi_end_io = end_reshape_read;
	r10_bio->master_bio = read_bio;
	/* From here on ->read_slot holds the device number, not the slot. */
	r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;

	/*
	 * Broadcast RESYNC message to other nodes, so all nodes would not
	 * write to the region to avoid conflict.
	 */
	if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) {
		struct mdp_superblock_1 *sb = NULL;
		/*
		 * Must be as wide as the on-disk 64-bit reshape_position;
		 * a plain int would truncate it before the comparison with
		 * cluster_sync_low below.
		 */
		sector_t sb_reshape_pos = 0;

		conf->cluster_sync_low = sector_nr;
		conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS;
		sb = page_address(rdev->sb_page);
		if (sb) {
			sb_reshape_pos = le64_to_cpu(sb->reshape_position);
			/*
			 * Set cluster_sync_low again if next address for array
			 * reshape is less than cluster_sync_low. Since we can't
			 * update cluster_sync_low until it has finished reshape.
			 */
			if (sb_reshape_pos < conf->cluster_sync_low)
				conf->cluster_sync_low = sb_reshape_pos;
		}

		md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
						   conf->cluster_sync_high);
	}

	/* Now find the locations in the new layout */
	__raid10_find_phys(&conf->geo, r10_bio);

	/* Chain the read bio and every usable write bio through ->bi_next. */
	blist = read_bio;
	read_bio->bi_next = NULL;

	/* Even 's' selects the main device of a copy, odd 's' its replacement. */
	for (s = 0; s < conf->copies*2; s++) {
		struct bio *b;
		int d = r10_bio->devs[s/2].devnum;
		struct md_rdev *rdev2;
		if (s&1) {
			rdev2 = conf->mirrors[d].replacement;
			b = r10_bio->devs[s/2].repl_bio;
		} else {
			rdev2 = conf->mirrors[d].rdev;
			b = r10_bio->devs[s/2].bio;
		}
		if (!rdev2 || test_bit(Faulty, &rdev2->flags))
			continue;

		bio_set_dev(b, rdev2->bdev);
		b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
			rdev2->new_data_offset;
		b->bi_end_io = end_reshape_write;
		b->bi_opf = REQ_OP_WRITE;
		b->bi_next = blist;
		blist = b;
	}

	/* Now add as many pages as possible to all of these bios. */

	nr_sectors = 0;
	/* All bios of this request share the pages of ->devs[0].bio. */
	pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
	for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
		struct page *page = pages[s / (PAGE_SIZE >> 9)];
		int len = (max_sectors - s) << 9;
		if (len > PAGE_SIZE)
			len = PAGE_SIZE;
		for (bio = blist; bio ; bio = bio->bi_next) {
			if (WARN_ON(!bio_add_page(bio, page, len, 0))) {
				bio->bi_status = BLK_STS_RESOURCE;
				bio_endio(bio);
				return sectors_done;
			}
		}
		sector_nr += len >> 9;
		nr_sectors += len >> 9;
	}
	r10_bio->sectors = nr_sectors;

	/* Now submit the read */
	md_sync_acct_bio(read_bio, r10_bio->sectors);
	atomic_inc(&r10_bio->remaining);
	read_bio->bi_next = NULL;
	submit_bio_noacct(read_bio);
	sectors_done += nr_sectors;
	if (sector_nr <= last)
		goto read_more;

	lower_barrier(conf);

	/* Now that we have done the whole section we can
	 * update reshape_progress
	 */
	if (mddev->reshape_backwards)
		conf->reshape_progress -= sectors_done;
	else
		conf->reshape_progress += sectors_done;

	return sectors_done;
}
4857
4858 static void end_reshape_request(struct r10bio *r10_bio);
4859 static int handle_reshape_read_error(struct mddev *mddev,
4860 struct r10bio *r10_bio);
reshape_request_write(struct mddev * mddev,struct r10bio * r10_bio)4861 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4862 {
4863 /* Reshape read completed. Hopefully we have a block
4864 * to write out.
4865 * If we got a read error then we do sync 1-page reads from
4866 * elsewhere until we find the data - or give up.
4867 */
4868 struct r10conf *conf = mddev->private;
4869 int s;
4870
4871 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4872 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4873 /* Reshape has been aborted */
4874 md_done_sync(mddev, r10_bio->sectors, 0);
4875 return;
4876 }
4877
4878 /* We definitely have the data in the pages, schedule the
4879 * writes.
4880 */
4881 atomic_set(&r10_bio->remaining, 1);
4882 for (s = 0; s < conf->copies*2; s++) {
4883 struct bio *b;
4884 int d = r10_bio->devs[s/2].devnum;
4885 struct md_rdev *rdev;
4886 if (s&1) {
4887 rdev = conf->mirrors[d].replacement;
4888 b = r10_bio->devs[s/2].repl_bio;
4889 } else {
4890 rdev = conf->mirrors[d].rdev;
4891 b = r10_bio->devs[s/2].bio;
4892 }
4893 if (!rdev || test_bit(Faulty, &rdev->flags))
4894 continue;
4895
4896 atomic_inc(&rdev->nr_pending);
4897 md_sync_acct_bio(b, r10_bio->sectors);
4898 atomic_inc(&r10_bio->remaining);
4899 b->bi_next = NULL;
4900 submit_bio_noacct(b);
4901 }
4902 end_reshape_request(r10_bio);
4903 }
4904
end_reshape(struct r10conf * conf)4905 static void end_reshape(struct r10conf *conf)
4906 {
4907 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4908 return;
4909
4910 spin_lock_irq(&conf->device_lock);
4911 conf->prev = conf->geo;
4912 md_finish_reshape(conf->mddev);
4913 smp_wmb();
4914 conf->reshape_progress = MaxSector;
4915 conf->reshape_safe = MaxSector;
4916 spin_unlock_irq(&conf->device_lock);
4917
4918 mddev_update_io_opt(conf->mddev, raid10_nr_stripes(conf));
4919 conf->fullsync = 0;
4920 }
4921
raid10_update_reshape_pos(struct mddev * mddev)4922 static void raid10_update_reshape_pos(struct mddev *mddev)
4923 {
4924 struct r10conf *conf = mddev->private;
4925 sector_t lo, hi;
4926
4927 md_cluster_ops->resync_info_get(mddev, &lo, &hi);
4928 if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo))
4929 || mddev->reshape_position == MaxSector)
4930 conf->reshape_progress = mddev->reshape_position;
4931 else
4932 WARN_ON_ONCE(1);
4933 }
4934
handle_reshape_read_error(struct mddev * mddev,struct r10bio * r10_bio)4935 static int handle_reshape_read_error(struct mddev *mddev,
4936 struct r10bio *r10_bio)
4937 {
4938 /* Use sync reads to get the blocks from somewhere else */
4939 int sectors = r10_bio->sectors;
4940 struct r10conf *conf = mddev->private;
4941 struct r10bio *r10b;
4942 int slot = 0;
4943 int idx = 0;
4944 struct page **pages;
4945
4946 r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO);
4947 if (!r10b) {
4948 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4949 return -ENOMEM;
4950 }
4951
4952 /* reshape IOs share pages from .devs[0].bio */
4953 pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
4954
4955 r10b->sector = r10_bio->sector;
4956 __raid10_find_phys(&conf->prev, r10b);
4957
4958 while (sectors) {
4959 int s = sectors;
4960 int success = 0;
4961 int first_slot = slot;
4962
4963 if (s > (PAGE_SIZE >> 9))
4964 s = PAGE_SIZE >> 9;
4965
4966 while (!success) {
4967 int d = r10b->devs[slot].devnum;
4968 struct md_rdev *rdev = conf->mirrors[d].rdev;
4969 sector_t addr;
4970 if (rdev == NULL ||
4971 test_bit(Faulty, &rdev->flags) ||
4972 !test_bit(In_sync, &rdev->flags))
4973 goto failed;
4974
4975 addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
4976 atomic_inc(&rdev->nr_pending);
4977 success = sync_page_io(rdev,
4978 addr,
4979 s << 9,
4980 pages[idx],
4981 REQ_OP_READ, false);
4982 rdev_dec_pending(rdev, mddev);
4983 if (success)
4984 break;
4985 failed:
4986 slot++;
4987 if (slot >= conf->copies)
4988 slot = 0;
4989 if (slot == first_slot)
4990 break;
4991 }
4992 if (!success) {
4993 /* couldn't read this block, must give up */
4994 set_bit(MD_RECOVERY_INTR,
4995 &mddev->recovery);
4996 kfree(r10b);
4997 return -EIO;
4998 }
4999 sectors -= s;
5000 idx++;
5001 }
5002 kfree(r10b);
5003 return 0;
5004 }
5005
end_reshape_write(struct bio * bio)5006 static void end_reshape_write(struct bio *bio)
5007 {
5008 struct r10bio *r10_bio = get_resync_r10bio(bio);
5009 struct mddev *mddev = r10_bio->mddev;
5010 struct r10conf *conf = mddev->private;
5011 int d;
5012 int slot;
5013 int repl;
5014 struct md_rdev *rdev = NULL;
5015
5016 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
5017 rdev = repl ? conf->mirrors[d].replacement :
5018 conf->mirrors[d].rdev;
5019
5020 if (bio->bi_status) {
5021 /* FIXME should record badblock */
5022 md_error(mddev, rdev);
5023 }
5024
5025 rdev_dec_pending(rdev, mddev);
5026 end_reshape_request(r10_bio);
5027 }
5028
end_reshape_request(struct r10bio * r10_bio)5029 static void end_reshape_request(struct r10bio *r10_bio)
5030 {
5031 if (!atomic_dec_and_test(&r10_bio->remaining))
5032 return;
5033 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
5034 bio_put(r10_bio->master_bio);
5035 put_buf(r10_bio);
5036 }
5037
raid10_finish_reshape(struct mddev * mddev)5038 static void raid10_finish_reshape(struct mddev *mddev)
5039 {
5040 struct r10conf *conf = mddev->private;
5041
5042 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5043 return;
5044
5045 if (mddev->delta_disks > 0) {
5046 if (mddev->recovery_cp > mddev->resync_max_sectors) {
5047 mddev->recovery_cp = mddev->resync_max_sectors;
5048 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5049 }
5050 mddev->resync_max_sectors = mddev->array_sectors;
5051 } else {
5052 int d;
5053 for (d = conf->geo.raid_disks ;
5054 d < conf->geo.raid_disks - mddev->delta_disks;
5055 d++) {
5056 struct md_rdev *rdev = conf->mirrors[d].rdev;
5057 if (rdev)
5058 clear_bit(In_sync, &rdev->flags);
5059 rdev = conf->mirrors[d].replacement;
5060 if (rdev)
5061 clear_bit(In_sync, &rdev->flags);
5062 }
5063 }
5064 mddev->layout = mddev->new_layout;
5065 mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
5066 mddev->reshape_position = MaxSector;
5067 mddev->delta_disks = 0;
5068 mddev->reshape_backwards = 0;
5069 }
5070
/*
 * Operations table for the raid10 personality (level 10).  It is
 * registered with md in raid_init() and unregistered in raid_exit().
 */
static struct md_personality raid10_personality =
{
	.name = "raid10",
	.level = 10,
	.owner = THIS_MODULE,
	.make_request = raid10_make_request,
	.run = raid10_run,
	.free = raid10_free,
	.status = raid10_status,
	.error_handler = raid10_error,
	.hot_add_disk = raid10_add_disk,
	.hot_remove_disk= raid10_remove_disk,
	.spare_active = raid10_spare_active,
	.sync_request = raid10_sync_request,
	.quiesce = raid10_quiesce,
	.size = raid10_size,
	.resize = raid10_resize,
	.takeover = raid10_takeover,
	/* reshape support */
	.check_reshape = raid10_check_reshape,
	.start_reshape = raid10_start_reshape,
	.finish_reshape = raid10_finish_reshape,
	.update_reshape_pos = raid10_update_reshape_pos,
};
5094
raid_init(void)5095 static int __init raid_init(void)
5096 {
5097 return register_md_personality(&raid10_personality);
5098 }
5099
raid_exit(void)5100 static void raid_exit(void)
5101 {
5102 unregister_md_personality(&raid10_personality);
5103 }
5104
/* Register the module's init and exit routines and describe the module. */
module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
/* Alternative names under which this module is known. */
MODULE_ALIAS("md-personality-9"); /* RAID10 */
MODULE_ALIAS("md-raid10");
MODULE_ALIAS("md-level-10");
5112