source: mainline/uspace/srv/bd/hr/raid5.c@ a3486f2

Last change on this file since a3486f2 was a3486f2, checked in by Miroslav Cimerman <mc@…>, 7 weeks ago

hr: parallel RAID 5

/*
 * Copyright (c) 2025 Miroslav Cimerman
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 * - The name of the author may not be used to endorse or promote products
 *   derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/** @addtogroup hr
 * @{
 */
/**
 * @file
 */

#include <abi/ipc/ipc.h>
#include <assert.h>
#include <bd_srv.h>
#include <block.h>
#include <errno.h>
#include <fibril_synch.h>
#include <hr.h>
#include <inttypes.h>
#include <io/log.h>
#include <ipc/hr.h>
#include <ipc/services.h>
#include <loc.h>
#include <mem.h>
#include <stdatomic.h>
#include <task.h>
#include <stdio.h>
#include <stdlib.h>
#include <str_error.h>

#include "io.h"
#include "parity_stripe.h"
#include "superblock.h"
#include "util.h"
#include "var.h"

static void hr_raid5_vol_state_eval_forced(hr_volume_t *);

static size_t hr_raid5_parity_extent(hr_level_t, hr_layout_t, size_t,
    uint64_t);
static size_t hr_raid5_data_extent(hr_level_t, hr_layout_t, size_t, uint64_t,
    uint64_t);

static errno_t hr_raid5_rebuild(void *);

/* bdops */
static errno_t hr_raid5_bd_open(bd_srvs_t *, bd_srv_t *);
static errno_t hr_raid5_bd_close(bd_srv_t *);
static errno_t hr_raid5_bd_read_blocks(bd_srv_t *, aoff64_t, size_t, void *,
    size_t);
static errno_t hr_raid5_bd_sync_cache(bd_srv_t *, aoff64_t, size_t);
static errno_t hr_raid5_bd_write_blocks(bd_srv_t *, aoff64_t, size_t,
    const void *, size_t);
static errno_t hr_raid5_bd_get_block_size(bd_srv_t *, size_t *);
static errno_t hr_raid5_bd_get_num_blocks(bd_srv_t *, aoff64_t *);

static bd_ops_t hr_raid5_bd_ops = {
    .open = hr_raid5_bd_open,
    .close = hr_raid5_bd_close,
    .sync_cache = hr_raid5_bd_sync_cache,
    .read_blocks = hr_raid5_bd_read_blocks,
    .write_blocks = hr_raid5_bd_write_blocks,
    .get_block_size = hr_raid5_bd_get_block_size,
    .get_num_blocks = hr_raid5_bd_get_num_blocks
};

extern loc_srv_t *hr_srv;

errno_t hr_raid5_create(hr_volume_t *new_volume)
{
    HR_DEBUG("%s()", __func__);

    if (new_volume->level != HR_LVL_5 && new_volume->level != HR_LVL_4)
        return EINVAL;

    if (new_volume->extent_no < 3) {
        HR_ERROR("RAID 5 volume needs at least 3 devices\n");
        return EINVAL;
    }

    bd_srvs_init(&new_volume->hr_bds);
    new_volume->hr_bds.ops = &hr_raid5_bd_ops;
    new_volume->hr_bds.sarg = new_volume;

    hr_raid5_vol_state_eval_forced(new_volume);

    fibril_rwlock_read_lock(&new_volume->states_lock);
    hr_vol_state_t state = new_volume->state;
    fibril_rwlock_read_unlock(&new_volume->states_lock);
    if (state == HR_VOL_FAULTY || state == HR_VOL_NONE) {
        HR_NOTE("\"%s\": unusable state, not creating\n",
            new_volume->devname);
        return EINVAL;
    }

    return EOK;
}

/*
 * Called only once in a volume's lifetime.
 */
errno_t hr_raid5_init(hr_volume_t *vol)
{
    HR_DEBUG("%s()", __func__);

    if (vol->level != HR_LVL_5 && vol->level != HR_LVL_4)
        return EINVAL;

    vol->data_offset = vol->meta_ops->get_data_offset();

    uint64_t single_sz = vol->truncated_blkno - vol->meta_ops->get_size();
    vol->data_blkno = single_sz * (vol->extent_no - 1);

    vol->strip_size = HR_STRIP_SIZE;

    if (vol->level == HR_LVL_4)
        vol->layout = HR_LAYOUT_RAID4_N;
    else
        vol->layout = HR_LAYOUT_RAID5_NR;

    return EOK;
}
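
/*
 * Capacity sketch (illustrative numbers, not from the source): with
 * extent_no = 4 and single_sz = 1000 usable blocks per extent after
 * metadata, one strip of every stripe holds parity, so the exported
 * capacity is data_blkno = 1000 * (4 - 1) = 3000 blocks.
 */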

void hr_raid5_vol_state_eval(hr_volume_t *vol)
{
    HR_DEBUG("%s()", __func__);

    bool exp = true;
    if (!atomic_compare_exchange_strong(&vol->state_dirty, &exp, false))
        return;

    vol->meta_ops->inc_counter(vol);
    (void)vol->meta_ops->save(vol, WITH_STATE_CALLBACK);

    hr_raid5_vol_state_eval_forced(vol);
}

errno_t hr_raid5_add_hotspare(hr_volume_t *vol, service_id_t hotspare)
{
    HR_DEBUG("%s()", __func__);

    errno_t rc = hr_util_add_hotspare(vol, hotspare);

    hr_raid5_vol_state_eval(vol);

    return rc;
}

void hr_raid5_ext_state_cb(hr_volume_t *vol, size_t extent, errno_t rc)
{
    HR_DEBUG("%s()", __func__);

    assert(fibril_rwlock_is_locked(&vol->extents_lock));

    if (rc == EOK)
        return;

    fibril_rwlock_write_lock(&vol->states_lock);

    switch (rc) {
    case ENOMEM:
        hr_update_ext_state(vol, extent, HR_EXT_INVALID);
        break;
    case ENOENT:
        hr_update_ext_state(vol, extent, HR_EXT_MISSING);
        break;
    default:
        hr_update_ext_state(vol, extent, HR_EXT_FAILED);
    }

    hr_mark_vol_state_dirty(vol);

    fibril_rwlock_write_unlock(&vol->states_lock);
}

static errno_t hr_raid5_bd_open(bd_srvs_t *bds, bd_srv_t *bd)
{
    HR_DEBUG("%s()\n", __func__);

    hr_volume_t *vol = bd->srvs->sarg;

    atomic_fetch_add_explicit(&vol->open_cnt, 1, memory_order_relaxed);

    return EOK;
}

static errno_t hr_raid5_bd_close(bd_srv_t *bd)
{
    HR_DEBUG("%s()\n", __func__);

    hr_volume_t *vol = bd->srvs->sarg;

    atomic_fetch_sub_explicit(&vol->open_cnt, 1, memory_order_relaxed);

    return EOK;
}

static errno_t hr_raid5_bd_sync_cache(bd_srv_t *bd, aoff64_t ba, size_t cnt)
{
    /* XXX */
    return EOK;
}

static errno_t hr_raid5_bd_read_blocks(bd_srv_t *bd, aoff64_t ba, size_t cnt,
    void *data_read, size_t size)
{
    hr_volume_t *vol = bd->srvs->sarg;
    errno_t rc;

    if (size < cnt * vol->bsize)
        return EINVAL;

    fibril_rwlock_read_lock(&vol->states_lock);
    hr_vol_state_t vol_state = vol->state;
    fibril_rwlock_read_unlock(&vol->states_lock);

    if (vol_state == HR_VOL_FAULTY || vol_state == HR_VOL_NONE)
        return EIO;

    rc = hr_check_ba_range(vol, cnt, ba);
    if (rc != EOK)
        return rc;

    uint64_t strip_size = vol->strip_size / vol->bsize; /* in blocks */
    uint64_t strip_no = ba / strip_size;

    /* calculate number of stripes touched */
    uint64_t last_ba = ba + cnt - 1;
    uint64_t end_strip_no = last_ba / strip_size;
    uint64_t start_stripe = strip_no / (vol->extent_no - 1);
    uint64_t end_stripe = end_strip_no / (vol->extent_no - 1);
    size_t stripes_cnt = end_stripe - start_stripe + 1;
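
    /*
     * Worked example (illustrative numbers): with extent_no = 4,
     * 512 B blocks and 64 KiB strips, strip_size = 128 blocks and
     * each stripe holds 3 data strips. A read of ba = 100,
     * cnt = 400 ends at last_ba = 499, touching strips 0..3 and
     * therefore stripes 0..1, so stripes_cnt = 2.
     */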

    hr_stripe_t *stripes = hr_create_stripes(vol, stripes_cnt, false);
    if (stripes == NULL)
        return ENOMEM;

    /*
     * Pre-allocate range locks, because after group creation and
     * firing off IO requests there is no easy consistent ENOMEM error
     * path.
     */
    hr_range_lock_t **rlps = malloc_waitok(stripes_cnt * sizeof(*rlps));
    for (size_t i = 0; i < stripes_cnt; i++)
        rlps[i] = malloc_waitok(sizeof(**rlps));

    /*
     * extent order has to be locked for the whole IO duration,
     * so that workers have consistent targets
     */
    fibril_rwlock_read_lock(&vol->extents_lock);

    for (uint64_t s = start_stripe; s <= end_stripe; s++) {
        uint64_t relative = s - start_stripe;
        hr_range_lock_acquire_noalloc(rlps[relative], vol, s, 1);
    }

    uint64_t phys_block, len;
    size_t left;

    hr_layout_t layout = vol->layout;
    hr_level_t level = vol->level;

    /* parity extent */
    size_t p_extent = hr_raid5_parity_extent(level, layout,
        vol->extent_no, strip_no);

    uint64_t strip_off = ba % strip_size;

    left = cnt;

    while (left != 0) {
        if (level == HR_LVL_5) {
            p_extent = hr_raid5_parity_extent(level, layout,
                vol->extent_no, strip_no);
        }

        size_t extent = hr_raid5_data_extent(level, layout,
            vol->extent_no, strip_no, p_extent);

        uint64_t stripe_no = strip_no / (vol->extent_no - 1);
        size_t relative_si = stripe_no - start_stripe; /* relative stripe index */
        hr_stripe_t *stripe = &stripes[relative_si];
        stripe->p_extent = p_extent;

        stripe->strips_touched++;

        phys_block = stripe_no * strip_size + strip_off;
        cnt = min(left, strip_size - strip_off);
        len = vol->bsize * cnt;
        hr_add_data_offset(vol, &phys_block);

        stripe->extent_span[extent].range.start = phys_block;
        stripe->extent_span[extent].range.end = phys_block + cnt - 1;
        stripe->extent_span[extent].cnt = cnt;
        stripe->extent_span[extent].data_read = data_read;
        stripe->extent_span[extent].strip_off = strip_off;

        data_read += len;
        left -= cnt;
        strip_off = 0;
        strip_no++;
    }
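
    /*
     * Continuing the worked example above (illustrative): the
     * 400-block read is split into strip-sized spans of 28, 128,
     * 128 and 116 blocks, each recorded in the extent_span of its
     * owning stripe.
     */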

retry:
    size_t bad_extent = vol->extent_no;

    uint64_t rebuild_pos = atomic_load_explicit(&vol->rebuild_blk,
        memory_order_relaxed);

    fibril_rwlock_read_lock(&vol->states_lock);

    for (size_t e = 0; e < vol->extent_no; e++) {
        hr_ext_state_t s = vol->extents[e].state;
        if ((vol->state == HR_VOL_DEGRADED && s != HR_EXT_ONLINE) ||
            (s == HR_EXT_REBUILD && rebuild_pos < start_stripe)) {
            bad_extent = e;
            break;
        }
    }

    fibril_rwlock_read_unlock(&vol->states_lock);
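
    /*
     * bad_extent == vol->extent_no means no extent needs to be
     * worked around; any lower value marks the one extent whose
     * data the stripe workers must reconstruct (a degraded extent,
     * or a rebuild target that has not yet reached these stripes).
     */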

    for (size_t s = 0; s < stripes_cnt; s++) {
        if (stripes[s].done)
            continue;
        execute_stripe(&stripes[s], bad_extent);
    }

    for (size_t s = 0; s < stripes_cnt; s++) {
        if (stripes[s].done)
            continue;
        wait_for_stripe(&stripes[s]);
    }

    hr_raid5_vol_state_eval(vol);

    rc = EOK;

    fibril_rwlock_read_lock(&vol->states_lock);

    if (vol->state == HR_VOL_FAULTY) {
        fibril_rwlock_read_unlock(&vol->states_lock);
        rc = EIO;
        goto end;
    }

    fibril_rwlock_read_unlock(&vol->states_lock);

    for (size_t s = 0; s < stripes_cnt; s++)
        if (stripes[s].rc == EAGAIN)
            goto retry;

    /* all stripes are done */
end:
    fibril_rwlock_read_unlock(&vol->extents_lock);

    for (size_t i = 0; i < stripes_cnt; i++)
        hr_range_lock_release(rlps[i]);
    free(rlps);

    hr_destroy_stripes(stripes, stripes_cnt);

    return rc;
}

static errno_t hr_raid5_bd_write_blocks(bd_srv_t *bd, aoff64_t ba, size_t cnt,
    const void *data_write, size_t size)
{
    hr_volume_t *vol = bd->srvs->sarg;
    errno_t rc;

    if (size < cnt * vol->bsize)
        return EINVAL;

    fibril_rwlock_read_lock(&vol->states_lock);
    hr_vol_state_t vol_state = vol->state;
    fibril_rwlock_read_unlock(&vol->states_lock);

    if (vol_state == HR_VOL_FAULTY || vol_state == HR_VOL_NONE)
        return EIO;

    /* increment metadata counter only on first write */
    bool exp = false;
    if (atomic_compare_exchange_strong(&vol->first_write, &exp, true)) {
        vol->meta_ops->inc_counter(vol);
        vol->meta_ops->save(vol, WITH_STATE_CALLBACK);
    }

    rc = hr_check_ba_range(vol, cnt, ba);
    if (rc != EOK)
        return rc;

    uint64_t strip_size = vol->strip_size / vol->bsize; /* in blocks */
    uint64_t strip_no = ba / strip_size;

    /* calculate number of stripes touched */
    uint64_t last_ba = ba + cnt - 1;
    uint64_t end_strip_no = last_ba / strip_size;
    uint64_t start_stripe = strip_no / (vol->extent_no - 1);
    uint64_t end_stripe = end_strip_no / (vol->extent_no - 1);
    size_t stripes_cnt = end_stripe - start_stripe + 1;

    hr_stripe_t *stripes = hr_create_stripes(vol, stripes_cnt, true);
    if (stripes == NULL)
        return ENOMEM;

    uint64_t stripe_size = strip_size * (vol->extent_no - 1);

    for (uint64_t stripe = start_stripe; stripe <= end_stripe; stripe++) {
        uint64_t relative_stripe = stripe - start_stripe;

        uint64_t s_start = stripe * stripe_size;
        uint64_t s_end = s_start + stripe_size - 1;

        uint64_t overlap_start;
        if (ba > s_start)
            overlap_start = ba;
        else
            overlap_start = s_start;

        uint64_t overlap_end;
        if (last_ba < s_end)
            overlap_end = last_ba;
        else
            overlap_end = s_end;

        uint64_t start_strip_index =
            (overlap_start - s_start) / strip_size;
        uint64_t end_strip_index = (overlap_end - s_start) / strip_size;
        size_t strips_touched = end_strip_index - start_strip_index + 1;

        uint64_t first_offset = (overlap_start - s_start) % strip_size;
        uint64_t last_offset = (overlap_end - s_start) % strip_size;

        size_t partials = 0;
        if (first_offset != 0)
            partials++;
        if (last_offset != strip_size - 1)
            partials++;
        if (start_strip_index == end_strip_index && partials == 2)
            partials = 1;

        stripes[relative_stripe].strips_touched = strips_touched;
        stripes[relative_stripe].partial_strips_touched = partials;

        if (strips_touched < (vol->extent_no - 1) / 2)
            stripes[relative_stripe].subtract = true;
    }
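
    /*
     * Note (semantics inferred from the fields above): "subtract"
     * requests a read-modify-write parity update (read the old data
     * and old parity, XOR the old data out and the new data in),
     * which pays off when fewer than half of the data strips are
     * touched; otherwise parity is reconstructed from the whole
     * stripe.
     */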

    /*
     * Pre-allocate range locks, because after group creation and
     * firing off IO requests there is no easy consistent ENOMEM error
     * path.
     */
    hr_range_lock_t **rlps = malloc_waitok(stripes_cnt * sizeof(*rlps));
    for (size_t i = 0; i < stripes_cnt; i++)
        rlps[i] = malloc_waitok(sizeof(**rlps));

    /*
     * extent order has to be locked for the whole IO duration,
     * so that workers have consistent targets
     */
    fibril_rwlock_read_lock(&vol->extents_lock);

    for (uint64_t s = start_stripe; s <= end_stripe; s++) {
        uint64_t relative = s - start_stripe;
        hr_range_lock_acquire_noalloc(rlps[relative], vol, s, 1);
    }

    uint64_t phys_block, len;
    size_t left;

    hr_layout_t layout = vol->layout;
    hr_level_t level = vol->level;

    /* parity extent */
    size_t p_extent = hr_raid5_parity_extent(level, layout,
        vol->extent_no, strip_no);

    uint64_t strip_off = ba % strip_size;

    left = cnt;

    while (left != 0) {
        if (level == HR_LVL_5) {
            p_extent = hr_raid5_parity_extent(level, layout,
                vol->extent_no, strip_no);
        }

        size_t extent = hr_raid5_data_extent(level, layout,
            vol->extent_no, strip_no, p_extent);

        uint64_t stripe_no = strip_no / (vol->extent_no - 1);
        size_t relative_si = stripe_no - start_stripe; /* relative stripe index */
        hr_stripe_t *stripe = &stripes[relative_si];
        stripe->p_extent = p_extent;

        phys_block = stripe_no * strip_size + strip_off;
        cnt = min(left, strip_size - strip_off);
        len = vol->bsize * cnt;
        hr_add_data_offset(vol, &phys_block);

        stripe->extent_span[extent].range.start = phys_block;
        stripe->extent_span[extent].range.end = phys_block + cnt - 1;
        stripe->extent_span[extent].cnt = cnt;
        stripe->extent_span[extent].data_write = data_write;
        stripe->extent_span[extent].strip_off = strip_off;

        data_write += len;
        left -= cnt;
        strip_off = 0;
        strip_no++;
    }

retry:
    size_t bad_extent = vol->extent_no;

    uint64_t rebuild_pos = atomic_load_explicit(&vol->rebuild_blk,
        memory_order_relaxed);

    fibril_rwlock_read_lock(&vol->states_lock);

    for (size_t e = 0; e < vol->extent_no; e++) {
        hr_ext_state_t s = vol->extents[e].state;
        if ((vol->state == HR_VOL_DEGRADED && s != HR_EXT_ONLINE) ||
            (s == HR_EXT_REBUILD && rebuild_pos < start_stripe)) {
            bad_extent = e;
            break;
        }
    }

    fibril_rwlock_read_unlock(&vol->states_lock);

    for (size_t s = 0; s < stripes_cnt; s++) {
        if (stripes[s].done)
            continue;
        execute_stripe(&stripes[s], bad_extent);
    }

    for (size_t s = 0; s < stripes_cnt; s++) {
        if (stripes[s].done)
            continue;
        wait_for_stripe(&stripes[s]);
    }

    hr_raid5_vol_state_eval(vol);

    rc = EOK;

    fibril_rwlock_read_lock(&vol->states_lock);

    if (vol->state == HR_VOL_FAULTY) {
        fibril_rwlock_read_unlock(&vol->states_lock);
        rc = EIO;
        goto end;
    }

    fibril_rwlock_read_unlock(&vol->states_lock);

    for (size_t s = 0; s < stripes_cnt; s++)
        if (stripes[s].rc == EAGAIN)
            goto retry;

    /* all stripes are done */
end:
    fibril_rwlock_read_unlock(&vol->extents_lock);

    for (size_t i = 0; i < stripes_cnt; i++)
        hr_range_lock_release(rlps[i]);
    free(rlps);

    hr_destroy_stripes(stripes, stripes_cnt);

    return rc;
}

static errno_t hr_raid5_bd_get_block_size(bd_srv_t *bd, size_t *rsize)
{
    hr_volume_t *vol = bd->srvs->sarg;

    *rsize = vol->bsize;
    return EOK;
}

static errno_t hr_raid5_bd_get_num_blocks(bd_srv_t *bd, aoff64_t *rnb)
{
    hr_volume_t *vol = bd->srvs->sarg;

    *rnb = vol->data_blkno;
    return EOK;
}

static void hr_raid5_vol_state_eval_forced(hr_volume_t *vol)
{
    fibril_rwlock_read_lock(&vol->extents_lock);
    fibril_rwlock_write_lock(&vol->states_lock);

    hr_vol_state_t state = vol->state;

    size_t bad = 0;
    for (size_t i = 0; i < vol->extent_no; i++)
        if (vol->extents[i].state != HR_EXT_ONLINE)
            bad++;

    switch (bad) {
    case 0:
        if (state != HR_VOL_ONLINE)
            hr_update_vol_state(vol, HR_VOL_ONLINE);
        break;
    case 1:
        if (state != HR_VOL_DEGRADED && state != HR_VOL_REBUILD)
            hr_update_vol_state(vol, HR_VOL_DEGRADED);

        if (state != HR_VOL_REBUILD) {
            /* XXX: allow REBUILD on INVALID extents */
            fibril_mutex_lock(&vol->hotspare_lock);
            size_t hs_no = vol->hotspare_no;
            fibril_mutex_unlock(&vol->hotspare_lock);
            if (hs_no > 0) {
                fid_t fib = fibril_create(hr_raid5_rebuild,
                    vol);
                if (fib == 0)
                    break;
                fibril_start(fib);
                fibril_detach(fib);
            }
        }
        break;
    default:
        if (state != HR_VOL_FAULTY)
            hr_update_vol_state(vol, HR_VOL_FAULTY);
        break;
    }

    fibril_rwlock_write_unlock(&vol->states_lock);
    fibril_rwlock_read_unlock(&vol->extents_lock);
}

static void xor(void *dst, const void *src, size_t size)
{
    size_t i;
    uint64_t *d = dst;
    const uint64_t *s = src;

    for (i = 0; i < size / sizeof(uint64_t); ++i)
        *d++ ^= *s++;
}
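
/*
 * Note: xor() walks whole 64-bit words and silently ignores any tail,
 * so callers are expected to pass a size that is a multiple of
 * sizeof(uint64_t); the block-sized buffers used here satisfy that
 * for the usual power-of-two block sizes.
 */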

static size_t hr_raid5_parity_extent(hr_level_t level,
    hr_layout_t layout, size_t extent_no, uint64_t strip_no)
{
    switch (level) {
    case HR_LVL_4:
        switch (layout) {
        case HR_LAYOUT_RAID4_0:
            return (0);
        case HR_LAYOUT_RAID4_N:
            return (extent_no - 1);
        default:
            assert(0 && "invalid layout configuration");
        }
    case HR_LVL_5:
        switch (layout) {
        case HR_LAYOUT_RAID5_0R:
            return ((strip_no / (extent_no - 1)) % extent_no);
        case HR_LAYOUT_RAID5_NR:
        case HR_LAYOUT_RAID5_NC:
            return ((extent_no - 1) -
                (strip_no / (extent_no - 1)) % extent_no);
        default:
            assert(0 && "invalid layout configuration");
        }
    default:
        assert(0 && "invalid level");
    }
}
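
/*
 * Example (illustrative): RAID5_NR with extent_no = 4 rotates parity
 * backwards one extent per stripe: stripes 0, 1, 2, 3 place parity on
 * extents 3, 2, 1, 0, after which the pattern repeats.
 */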

static size_t hr_raid5_data_extent(hr_level_t level,
    hr_layout_t layout, size_t extent_no, uint64_t strip_no, size_t p_extent)
{
    switch (level) {
    case HR_LVL_4:
        switch (layout) {
        case HR_LAYOUT_RAID4_0:
            return ((strip_no % (extent_no - 1)) + 1);
        case HR_LAYOUT_RAID4_N:
            return (strip_no % (extent_no - 1));
        default:
            assert(0 && "invalid layout configuration");
        }
    case HR_LVL_5:
        switch (layout) {
        case HR_LAYOUT_RAID5_0R:
        case HR_LAYOUT_RAID5_NR:
            if ((strip_no % (extent_no - 1)) < p_extent)
                return (strip_no % (extent_no - 1));
            else
                return ((strip_no % (extent_no - 1)) + 1);
        case HR_LAYOUT_RAID5_NC:
            return (((strip_no % (extent_no - 1)) + p_extent + 1) %
                extent_no);
        default:
            assert(0 && "invalid layout configuration");
        }
    default:
        assert(0 && "invalid level");
    }
}
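
/*
 * Example (illustrative): RAID5_NR, extent_no = 4, a stripe with
 * p_extent = 2. Its three data strips map to extents 0, 1 and 3:
 * indices below the parity extent stay put, the rest shift up by one
 * to skip it.
 */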

static errno_t hr_raid5_rebuild(void *arg)
{
    HR_DEBUG("hr_raid5_rebuild()\n");

    hr_volume_t *vol = arg;
    errno_t rc = EOK;
    void *buf = NULL, *xorbuf = NULL;

    fibril_rwlock_read_lock(&vol->extents_lock);
    fibril_rwlock_write_lock(&vol->states_lock);

    if (vol->hotspare_no == 0) {
        HR_WARN("hr_raid5_rebuild(): no free hotspares on \"%s\", "
            "aborting rebuild\n", vol->devname);
        /* retval isn't checked for now */
        goto end;
    }

    size_t bad = vol->extent_no;
    for (size_t i = 0; i < vol->extent_no; i++) {
        if (vol->extents[i].state == HR_EXT_FAILED) {
            bad = i;
            break;
        }
    }

    if (bad == vol->extent_no) {
        HR_WARN("hr_raid5_rebuild(): no bad extent on \"%s\", "
            "aborting rebuild\n", vol->devname);
        /* retval isn't checked for now */
        goto end;
    }

    size_t hotspare_idx = vol->hotspare_no - 1;

    hr_ext_state_t hs_state = vol->hotspares[hotspare_idx].state;
    if (hs_state != HR_EXT_HOTSPARE) {
        HR_ERROR("hr_raid5_rebuild(): invalid hotspare state \"%s\", "
            "aborting rebuild\n", hr_get_ext_state_str(hs_state));
        rc = EINVAL;
        goto end;
    }

    HR_DEBUG("hr_raid5_rebuild(): swapping in hotspare\n");

    block_fini(vol->extents[bad].svc_id);

    vol->extents[bad].svc_id = vol->hotspares[hotspare_idx].svc_id;
    hr_update_ext_state(vol, bad, HR_EXT_HOTSPARE);

    vol->hotspares[hotspare_idx].svc_id = 0;
    fibril_mutex_lock(&vol->hotspare_lock);
    hr_update_hotspare_state(vol, hotspare_idx, HR_EXT_MISSING);
    fibril_mutex_unlock(&vol->hotspare_lock);

    vol->hotspare_no--;

    hr_extent_t *rebuild_ext = &vol->extents[bad];

    HR_DEBUG("hr_raid5_rebuild(): starting rebuild on (%" PRIun ")\n",
        rebuild_ext->svc_id);

    hr_update_ext_state(vol, bad, HR_EXT_REBUILD);
    hr_update_vol_state(vol, HR_VOL_REBUILD);

    uint64_t max_blks = DATA_XFER_LIMIT / vol->bsize;
    uint64_t left = vol->data_blkno / (vol->extent_no - 1);
    buf = malloc(max_blks * vol->bsize);
    xorbuf = malloc(max_blks * vol->bsize);
    if (buf == NULL || xorbuf == NULL) {
        rc = ENOMEM;
        goto end;
    }

    uint64_t ba = 0, cnt;
    hr_add_data_offset(vol, &ba);

    while (left != 0) {
        cnt = min(left, max_blks);

        /*
         * Almost the same as read_degraded, but we don't want
         * to allocate a new xorbuf for each rebuilt batch of
         * blocks.
         */
        bool first = true;
        for (size_t i = 0; i < vol->extent_no; i++) {
            if (i == bad)
                continue;
            if (first)
                rc = block_read_direct(vol->extents[i].svc_id,
                    ba, cnt, xorbuf);
            else
                rc = block_read_direct(vol->extents[i].svc_id,
                    ba, cnt, buf);
            if (rc != EOK) {
                hr_raid5_ext_state_cb(vol, i, rc);
                HR_ERROR("rebuild on \"%s\" (%" PRIun ") "
                    "failed due to a failed ONLINE extent, "
                    "number %zu\n",
                    vol->devname, vol->svc_id, i);
                goto end;
            }

            if (!first)
                xor(xorbuf, buf, cnt * vol->bsize);
            else
                first = false;
        }
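
        /*
         * At this point xorbuf holds the XOR of this block range
         * across all surviving extents, i.e. the reconstructed
         * contents of the bad extent (data and parity strips
         * alike).
         */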

        rc = block_write_direct(rebuild_ext->svc_id, ba, cnt, xorbuf);
        if (rc != EOK) {
            hr_raid5_ext_state_cb(vol, bad, rc);
            HR_ERROR("rebuild on \"%s\" (%" PRIun ") failed due to "
                "the rebuilt extent number %zu failing\n",
                vol->devname, vol->svc_id, bad);
            goto end;
        }

        ba += cnt;
        left -= cnt;

        /*
         * Let other IO requests be served
         * during rebuild.
         */

        /*
         * fibril_rwlock_write_unlock(&vol->states_lock);
         * fibril_mutex_unlock(&vol->lock);
         * fibril_mutex_lock(&vol->lock);
         * fibril_rwlock_write_lock(&vol->states_lock);
         */
    }

    HR_DEBUG("hr_raid5_rebuild(): rebuild finished on \"%s\" (%" PRIun "), "
        "extent number %zu\n", vol->devname, vol->svc_id, bad);

    hr_update_ext_state(vol, bad, HR_EXT_ONLINE);

    fibril_rwlock_write_unlock(&vol->states_lock);
    fibril_rwlock_read_unlock(&vol->extents_lock);

    rc = vol->meta_ops->save(vol, WITH_STATE_CALLBACK);

    fibril_rwlock_read_lock(&vol->extents_lock);
    fibril_rwlock_write_lock(&vol->states_lock);

end:
    hr_raid5_vol_state_eval_forced(vol);

    fibril_rwlock_write_unlock(&vol->states_lock);
    fibril_rwlock_read_unlock(&vol->extents_lock);

    if (buf != NULL)
        free(buf);

    if (xorbuf != NULL)
        free(xorbuf);

    return rc;
}

/** @}
 */