source: mainline/uspace/srv/bd/hr/raid5.c@ f0360ec

Last change on this file since f0360ec was f0360ec, checked in by Miroslav Cimerman <mc@…>, 7 weeks ago

hr: RAID 0, 1: use ENOMEM safe primitives

  • Property mode set to 100644
File size: 22.4 KB
Line 
1/*
2 * Copyright (c) 2025 Miroslav Cimerman
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** @addtogroup hr
30 * @{
31 */
32/**
33 * @file
34 */
35
36#include <abi/ipc/ipc.h>
37#include <bd_srv.h>
38#include <block.h>
39#include <errno.h>
40#include <hr.h>
41#include <inttypes.h>
42#include <io/log.h>
43#include <ipc/hr.h>
44#include <ipc/services.h>
45#include <loc.h>
46#include <mem.h>
47#include <task.h>
48#include <stdio.h>
49#include <stdlib.h>
50#include <str_error.h>
51
52#include "io.h"
53#include "parity_stripe.h"
54#include "superblock.h"
55#include "util.h"
56#include "var.h"
57
58static void hr_raid5_vol_state_eval_forced(hr_volume_t *);
59
60static size_t hr_raid5_parity_extent(hr_level_t, hr_layout_t, size_t,
61 uint64_t);
62static size_t hr_raid5_data_extent(hr_level_t, hr_layout_t, size_t, uint64_t,
63 uint64_t);
64
65static errno_t hr_raid5_rebuild(void *);
66
67/* bdops */
68static errno_t hr_raid5_bd_open(bd_srvs_t *, bd_srv_t *);
69static errno_t hr_raid5_bd_close(bd_srv_t *);
70static errno_t hr_raid5_bd_read_blocks(bd_srv_t *, aoff64_t, size_t, void *,
71 size_t);
72static errno_t hr_raid5_bd_sync_cache(bd_srv_t *, aoff64_t, size_t);
73static errno_t hr_raid5_bd_write_blocks(bd_srv_t *, aoff64_t, size_t,
74 const void *, size_t);
75static errno_t hr_raid5_bd_get_block_size(bd_srv_t *, size_t *);
76static errno_t hr_raid5_bd_get_num_blocks(bd_srv_t *, aoff64_t *);
77
78static bd_ops_t hr_raid5_bd_ops = {
79 .open = hr_raid5_bd_open,
80 .close = hr_raid5_bd_close,
81 .sync_cache = hr_raid5_bd_sync_cache,
82 .read_blocks = hr_raid5_bd_read_blocks,
83 .write_blocks = hr_raid5_bd_write_blocks,
84 .get_block_size = hr_raid5_bd_get_block_size,
85 .get_num_blocks = hr_raid5_bd_get_num_blocks
86};
87
88extern loc_srv_t *hr_srv;
89
90errno_t hr_raid5_create(hr_volume_t *new_volume)
91{
92 HR_DEBUG("%s()", __func__);
93
94 if (new_volume->level != HR_LVL_5 && new_volume->level != HR_LVL_4)
95 return EINVAL;
96
97 if (new_volume->extent_no < 3) {
98 HR_ERROR("RAID 5 volume needs at least 3 devices\n");
99 return EINVAL;
100 }
101
102 bd_srvs_init(&new_volume->hr_bds);
103 new_volume->hr_bds.ops = &hr_raid5_bd_ops;
104 new_volume->hr_bds.sarg = new_volume;
105
106 hr_raid5_vol_state_eval_forced(new_volume);
107
108 fibril_rwlock_read_lock(&new_volume->states_lock);
109 hr_vol_state_t state = new_volume->state;
110 fibril_rwlock_read_unlock(&new_volume->states_lock);
111 if (state == HR_VOL_FAULTY || state == HR_VOL_NONE) {
112 HR_NOTE("\"%s\": unusable state, not creating\n",
113 new_volume->devname);
114 return EINVAL;
115 }
116
117 return EOK;
118}
119
120/*
121 * Called only once in volume's lifetime.
122 */
123errno_t hr_raid5_init(hr_volume_t *vol)
124{
125 HR_DEBUG("%s()", __func__);
126
127 if (vol->level != HR_LVL_5 && vol->level != HR_LVL_4)
128 return EINVAL;
129
130 vol->data_offset = vol->meta_ops->get_data_offset();
131
132 uint64_t single_sz = vol->truncated_blkno - vol->meta_ops->get_size();
133 vol->data_blkno = single_sz * (vol->extent_no - 1);
134
135 vol->strip_size = HR_STRIP_SIZE;
136
137 if (vol->level == HR_LVL_4)
138 vol->layout = HR_LAYOUT_RAID4_N;
139 else
140 vol->layout = HR_LAYOUT_RAID5_NR;
141
142 return EOK;
143}
144
145void hr_raid5_vol_state_eval(hr_volume_t *vol)
146{
147 HR_DEBUG("%s()", __func__);
148
149 bool exp = true;
150 if (!atomic_compare_exchange_strong(&vol->state_dirty, &exp, false))
151 return;
152
153 vol->meta_ops->inc_counter(vol);
154 (void)vol->meta_ops->save(vol, WITH_STATE_CALLBACK);
155
156 hr_raid5_vol_state_eval_forced(vol);
157}
158
159errno_t hr_raid5_add_hotspare(hr_volume_t *vol, service_id_t hotspare)
160{
161 HR_DEBUG("%s()", __func__);
162
163 errno_t rc = hr_util_add_hotspare(vol, hotspare);
164
165 hr_raid5_vol_state_eval(vol);
166
167 return rc;
168}
169
170void hr_raid5_ext_state_cb(hr_volume_t *vol, size_t extent, errno_t rc)
171{
172 HR_DEBUG("%s()", __func__);
173
174 assert(fibril_rwlock_is_locked(&vol->extents_lock));
175
176 if (rc == EOK)
177 return;
178
179 fibril_rwlock_write_lock(&vol->states_lock);
180
181 switch (rc) {
182 case ENOENT:
183 hr_update_ext_state(vol, extent, HR_EXT_MISSING);
184 break;
185 default:
186 hr_update_ext_state(vol, extent, HR_EXT_FAILED);
187 }
188
189 hr_mark_vol_state_dirty(vol);
190
191 fibril_rwlock_write_unlock(&vol->states_lock);
192}
193
194static errno_t hr_raid5_bd_open(bd_srvs_t *bds, bd_srv_t *bd)
195{
196 HR_DEBUG("%s()\n", __func__);
197
198 hr_volume_t *vol = bd->srvs->sarg;
199
200 atomic_fetch_add_explicit(&vol->open_cnt, 1, memory_order_relaxed);
201
202 return EOK;
203}
204
205static errno_t hr_raid5_bd_close(bd_srv_t *bd)
206{
207 HR_DEBUG("%s()\n", __func__);
208
209 hr_volume_t *vol = bd->srvs->sarg;
210
211 atomic_fetch_sub_explicit(&vol->open_cnt, 1, memory_order_relaxed);
212
213 return EOK;
214}
215
216static errno_t hr_raid5_bd_sync_cache(bd_srv_t *bd, aoff64_t ba, size_t cnt)
217{
218 hr_volume_t *vol = bd->srvs->sarg;
219
220 return hr_sync_extents(vol);
221}
222
223static errno_t hr_raid5_bd_read_blocks(bd_srv_t *bd, uint64_t ba, size_t cnt,
224 void *data_read, size_t size)
225{
226 hr_volume_t *vol = bd->srvs->sarg;
227 errno_t rc;
228
229 if (size < cnt * vol->bsize)
230 return EINVAL;
231
232 fibril_rwlock_read_lock(&vol->states_lock);
233 hr_vol_state_t vol_state = vol->state;
234 fibril_rwlock_read_unlock(&vol->states_lock);
235
236 if (vol_state == HR_VOL_FAULTY || vol_state == HR_VOL_NONE)
237 return EIO;
238
239 rc = hr_check_ba_range(vol, cnt, ba);
240 if (rc != EOK)
241 return rc;
242
243 uint64_t strip_size = vol->strip_size / vol->bsize; /* in blocks */
244 uint64_t strip_no = ba / strip_size;
245
246 /* calculate number of stripes touched */
247 uint64_t last_ba = ba + cnt - 1;
248 uint64_t end_strip_no = last_ba / strip_size;
249 uint64_t start_stripe = strip_no / (vol->extent_no - 1);
250 uint64_t end_stripe = end_strip_no / (vol->extent_no - 1);
251 size_t stripes_cnt = end_stripe - start_stripe + 1;
252
253 hr_stripe_t *stripes = hr_create_stripes(vol, stripes_cnt, false);
254 if (stripes == NULL)
255 return ENOMEM;
256
257 /*
258 * Pre-allocate range locks, because after group creation and
259 * firing off IO requests there is no easy consistent ENOMEM error
260 * path.
261 */
262 hr_range_lock_t **rlps = malloc_waitok(stripes_cnt * sizeof(*rlps));
263 for (size_t i = 0; i < stripes_cnt; i++)
264 rlps[i] = malloc_waitok(sizeof(**rlps));
265
266 /*
267 * extent order has to be locked for the whole IO duration,
268 * so that workers have consistent targets
269 */
270 fibril_rwlock_read_lock(&vol->extents_lock);
271
272 for (uint64_t s = start_stripe; s <= end_stripe; s++) {
273 uint64_t relative = s - start_stripe;
274 hr_range_lock_acquire_noalloc(rlps[relative], vol, s, 1);
275 }
276
277 uint64_t phys_block, len;
278 size_t left;
279
280 hr_layout_t layout = vol->layout;
281 hr_level_t level = vol->level;
282
283 /* parity extent */
284 size_t p_extent = hr_raid5_parity_extent(level, layout,
285 vol->extent_no, strip_no);
286
287 uint64_t strip_off = ba % strip_size;
288
289 left = cnt;
290
291 while (left != 0) {
292 if (level == HR_LVL_5) {
293 p_extent = hr_raid5_parity_extent(level, layout,
294 vol->extent_no, strip_no);
295 }
296
297 size_t extent = hr_raid5_data_extent(level, layout,
298 vol->extent_no, strip_no, p_extent);
299
300 uint64_t stripe_no = strip_no / (vol->extent_no - 1);
301 size_t relative_si = stripe_no - start_stripe; /* relative stripe index */
302 hr_stripe_t *stripe = &stripes[relative_si];
303 stripe->p_extent = p_extent;
304
305 stripe->strips_touched++;
306
307 phys_block = stripe_no * strip_size + strip_off;
308 cnt = min(left, strip_size - strip_off);
309 len = vol->bsize * cnt;
310 hr_add_data_offset(vol, &phys_block);
311
312 stripe->extent_span[extent].range.start = phys_block;
313 stripe->extent_span[extent].range.end = phys_block + cnt - 1;
314 stripe->extent_span[extent].cnt = cnt;
315 stripe->extent_span[extent].data_read = data_read;
316 stripe->extent_span[extent].strip_off = strip_off;
317
318 data_read += len;
319 left -= cnt;
320 strip_off = 0;
321 strip_no++;
322 }
323
324retry:
325 size_t bad_extent = vol->extent_no;
326
327 uint64_t rebuild_pos = atomic_load_explicit(&vol->rebuild_blk,
328 memory_order_relaxed);
329
330 fibril_rwlock_read_lock(&vol->states_lock);
331
332 for (size_t e = 0; e < vol->extent_no; e++) {
333 hr_ext_state_t s = vol->extents[e].state;
334 if ((vol->state == HR_VOL_DEGRADED && s != HR_EXT_ONLINE) ||
335 (s == HR_EXT_REBUILD && rebuild_pos < start_stripe)) {
336 bad_extent = e;
337 break;
338 }
339 }
340
341 fibril_rwlock_read_unlock(&vol->states_lock);
342
343 for (size_t s = 0; s < stripes_cnt; s++) {
344 if (stripes[s].done)
345 continue;
346 execute_stripe(&stripes[s], bad_extent);
347 }
348
349 for (size_t s = 0; s < stripes_cnt; s++) {
350 if (stripes[s].done)
351 continue;
352 wait_for_stripe(&stripes[s]);
353 }
354
355 hr_raid5_vol_state_eval(vol);
356
357 rc = EOK;
358
359 fibril_rwlock_read_lock(&vol->states_lock);
360
361 if (vol->state == HR_VOL_FAULTY) {
362 fibril_rwlock_read_unlock(&vol->states_lock);
363 rc = EIO;
364 goto end;
365 }
366
367 fibril_rwlock_read_unlock(&vol->states_lock);
368
369 for (size_t s = 0; s < stripes_cnt; s++)
370 if (stripes[s].rc == EAGAIN)
371 goto retry;
372
373 /* all stripes are done */
374end:
375 fibril_rwlock_read_unlock(&vol->extents_lock);
376
377 for (size_t i = 0; i < stripes_cnt; i++)
378 hr_range_lock_release(rlps[i]);
379
380 hr_destroy_stripes(stripes, stripes_cnt);
381
382 return rc;
383}
384
385static errno_t hr_raid5_bd_write_blocks(bd_srv_t *bd, aoff64_t ba, size_t cnt,
386 const void *data_write, size_t size)
387{
388 hr_volume_t *vol = bd->srvs->sarg;
389 errno_t rc;
390
391 if (size < cnt * vol->bsize)
392 return EINVAL;
393
394 fibril_rwlock_read_lock(&vol->states_lock);
395 hr_vol_state_t vol_state = vol->state;
396 fibril_rwlock_read_unlock(&vol->states_lock);
397
398 if (vol_state == HR_VOL_FAULTY || vol_state == HR_VOL_NONE)
399 return EIO;
400
401 /* increment metadata counter only on first write */
402 bool exp = false;
403 if (atomic_compare_exchange_strong(&vol->first_write, &exp, true)) {
404 vol->meta_ops->inc_counter(vol);
405 vol->meta_ops->save(vol, WITH_STATE_CALLBACK);
406 }
407
408 rc = hr_check_ba_range(vol, cnt, ba);
409 if (rc != EOK)
410 return rc;
411
412 uint64_t strip_size = vol->strip_size / vol->bsize; /* in blocks */
413 uint64_t strip_no = ba / strip_size;
414
415 /* calculate number of stripes touched */
416 uint64_t last_ba = ba + cnt - 1;
417 uint64_t end_strip_no = last_ba / strip_size;
418 uint64_t start_stripe = strip_no / (vol->extent_no - 1);
419 uint64_t end_stripe = end_strip_no / (vol->extent_no - 1);
420 size_t stripes_cnt = end_stripe - start_stripe + 1;
421
422 hr_stripe_t *stripes = hr_create_stripes(vol, stripes_cnt, true);
423 if (stripes == NULL)
424 return ENOMEM;
425
426 uint64_t stripe_size = strip_size * (vol->extent_no - 1);
427
428 for (uint64_t stripe = start_stripe; stripe <= end_stripe; stripe++) {
429 uint64_t relative_stripe = stripe - start_stripe;
430
431 uint64_t s_start = stripe * stripe_size;
432 uint64_t s_end = s_start + stripe_size - 1;
433
434 uint64_t overlap_start;
435 if (ba > s_start)
436 overlap_start = ba;
437 else
438 overlap_start = s_start;
439
440 uint64_t overlap_end;
441 if (last_ba < s_end)
442 overlap_end = last_ba;
443 else
444 overlap_end = s_end;
445
446 uint64_t start_strip_index =
447 (overlap_start - s_start) / strip_size;
448 uint64_t end_strip_index = (overlap_end - s_start) / strip_size;
449 size_t strips_touched = end_strip_index - start_strip_index + 1;
450
451 stripes[relative_stripe].strips_touched = strips_touched;
452
453 uint64_t first_offset = (overlap_start - s_start) % strip_size;
454 uint64_t last_offset = (overlap_end - s_start) % strip_size;
455
456 size_t partials = 0;
457 if (first_offset != 0)
458 partials++;
459 if (last_offset != strip_size - 1)
460 partials++;
461 if (start_strip_index == end_strip_index && partials == 2)
462 partials = 1;
463
464 stripes[relative_stripe].strips_touched = strips_touched;
465 stripes[relative_stripe].partial_strips_touched = partials;
466
467 if (strips_touched < (vol->extent_no - 1) / 2)
468 stripes[relative_stripe].subtract = true;
469 }
470
471 /*
472 * Pre-allocate range locks, because after group creation and
473 * firing off IO requests there is no easy consistent ENOMEM error
474 * path.
475 */
476 hr_range_lock_t **rlps = malloc_waitok(stripes_cnt * sizeof(*rlps));
477 for (size_t i = 0; i < stripes_cnt; i++)
478 rlps[i] = malloc_waitok(sizeof(**rlps));
479
480 /*
481 * extent order has to be locked for the whole IO duration,
482 * so that workers have consistent targets
483 */
484 fibril_rwlock_read_lock(&vol->extents_lock);
485
486 for (uint64_t s = start_stripe; s <= end_stripe; s++) {
487 uint64_t relative = s - start_stripe;
488 hr_range_lock_acquire_noalloc(rlps[relative], vol, s, 1);
489 }
490
491 uint64_t phys_block, len;
492 size_t left;
493
494 hr_layout_t layout = vol->layout;
495 hr_level_t level = vol->level;
496
497 /* parity extent */
498 size_t p_extent = hr_raid5_parity_extent(level, layout,
499 vol->extent_no, strip_no);
500
501 uint64_t strip_off = ba % strip_size;
502
503 left = cnt;
504
505 while (left != 0) {
506 if (level == HR_LVL_5) {
507 p_extent = hr_raid5_parity_extent(level, layout,
508 vol->extent_no, strip_no);
509 }
510
511 size_t extent = hr_raid5_data_extent(level, layout,
512 vol->extent_no, strip_no, p_extent);
513
514 uint64_t stripe_no = strip_no / (vol->extent_no - 1);
515 size_t relative_si = stripe_no - start_stripe; /* relative stripe index */
516 hr_stripe_t *stripe = &stripes[relative_si];
517 stripe->p_extent = p_extent;
518
519 phys_block = stripe_no * strip_size + strip_off;
520 cnt = min(left, strip_size - strip_off);
521 len = vol->bsize * cnt;
522 hr_add_data_offset(vol, &phys_block);
523
524 stripe->extent_span[extent].range.start = phys_block;
525 stripe->extent_span[extent].range.end = phys_block + cnt - 1;
526 stripe->extent_span[extent].cnt = cnt;
527 stripe->extent_span[extent].data_write = data_write;
528 stripe->extent_span[extent].strip_off = strip_off;
529
530 data_write += len;
531 left -= cnt;
532 strip_off = 0;
533 strip_no++;
534 }
535
536retry:
537 size_t bad_extent = vol->extent_no;
538
539 uint64_t rebuild_pos = atomic_load_explicit(&vol->rebuild_blk,
540 memory_order_relaxed);
541
542 fibril_rwlock_read_lock(&vol->states_lock);
543
544 for (size_t e = 0; e < vol->extent_no; e++) {
545 hr_ext_state_t s = vol->extents[e].state;
546 if ((vol->state == HR_VOL_DEGRADED && s != HR_EXT_ONLINE) ||
547 (s == HR_EXT_REBUILD && rebuild_pos < start_stripe)) {
548 bad_extent = e;
549 break;
550 }
551 }
552
553 fibril_rwlock_read_unlock(&vol->states_lock);
554
555 for (size_t s = 0; s < stripes_cnt; s++) {
556 if (stripes[s].done)
557 continue;
558 execute_stripe(&stripes[s], bad_extent);
559 }
560
561 for (size_t s = 0; s < stripes_cnt; s++) {
562 if (stripes[s].done)
563 continue;
564 wait_for_stripe(&stripes[s]);
565 }
566
567 hr_raid5_vol_state_eval(vol);
568
569 rc = EOK;
570
571 fibril_rwlock_read_lock(&vol->states_lock);
572
573 if (vol->state == HR_VOL_FAULTY) {
574 fibril_rwlock_read_unlock(&vol->states_lock);
575 rc = EIO;
576 goto end;
577 }
578
579 fibril_rwlock_read_unlock(&vol->states_lock);
580
581 for (size_t s = 0; s < stripes_cnt; s++)
582 if (stripes[s].rc == EAGAIN)
583 goto retry;
584
585 /* all stripes are done */
586end:
587 fibril_rwlock_read_unlock(&vol->extents_lock);
588
589 for (size_t i = 0; i < stripes_cnt; i++)
590 hr_range_lock_release(rlps[i]);
591
592 hr_destroy_stripes(stripes, stripes_cnt);
593
594 return rc;
595}
596
597static errno_t hr_raid5_bd_get_block_size(bd_srv_t *bd, size_t *rsize)
598{
599 hr_volume_t *vol = bd->srvs->sarg;
600
601 *rsize = vol->bsize;
602 return EOK;
603}
604
605static errno_t hr_raid5_bd_get_num_blocks(bd_srv_t *bd, aoff64_t *rnb)
606{
607 hr_volume_t *vol = bd->srvs->sarg;
608
609 *rnb = vol->data_blkno;
610 return EOK;
611}
612
613static void hr_raid5_vol_state_eval_forced(hr_volume_t *vol)
614{
615 fibril_rwlock_read_lock(&vol->extents_lock);
616 fibril_rwlock_write_lock(&vol->states_lock);
617
618 hr_vol_state_t state = vol->state;
619
620 size_t bad = 0;
621 for (size_t i = 0; i < vol->extent_no; i++)
622 if (vol->extents[i].state != HR_EXT_ONLINE)
623 bad++;
624
625 switch (bad) {
626 case 0:
627 if (state != HR_VOL_ONLINE)
628 hr_update_vol_state(vol, HR_VOL_ONLINE);
629 break;
630 case 1:
631 if (state != HR_VOL_DEGRADED && state != HR_VOL_REBUILD)
632 hr_update_vol_state(vol, HR_VOL_DEGRADED);
633
634 if (state != HR_VOL_REBUILD) {
635 /* XXX: allow REBUILD on INVALID extents */
636 fibril_mutex_lock(&vol->hotspare_lock);
637 size_t hs_no = vol->hotspare_no;
638 fibril_mutex_unlock(&vol->hotspare_lock);
639 if (hs_no > 0) {
640 fid_t fib = fibril_create(hr_raid5_rebuild,
641 vol);
642 if (fib == 0)
643 break;
644 fibril_start(fib);
645 fibril_detach(fib);
646 }
647 }
648 break;
649 default:
650 if (state != HR_VOL_FAULTY)
651 hr_update_vol_state(vol, HR_VOL_FAULTY);
652 break;
653 }
654
655 fibril_rwlock_write_unlock(&vol->states_lock);
656 fibril_rwlock_read_unlock(&vol->extents_lock);
657}
658
/** XOR a source buffer into a destination buffer.
 *
 * The bulk is processed in 64-bit words, the remaining (size % 8) bytes
 * are processed one by one, so every byte of the range is covered.
 *
 * @param dst  Destination buffer, receives dst ^ src
 * @param src  Source buffer
 * @param size Number of bytes to process
 */
static void xor(void *dst, const void *src, size_t size)
{
	const size_t words = size / sizeof(uint64_t);
	const size_t tail = size % sizeof(uint64_t);

	uint64_t *dw = dst;
	const uint64_t *sw = src;

	for (size_t i = 0; i < words; i++)
		dw[i] ^= sw[i];

	/* bytes that do not fill a whole word */
	uint8_t *db = (uint8_t *)dst + words * sizeof(uint64_t);
	const uint8_t *sb = (const uint8_t *)src + words * sizeof(uint64_t);

	for (size_t i = 0; i < tail; i++)
		db[i] ^= sb[i];
}
668
669static size_t hr_raid5_parity_extent(hr_level_t level,
670 hr_layout_t layout, size_t extent_no, uint64_t strip_no)
671{
672 switch (level) {
673 case HR_LVL_4:
674 switch (layout) {
675 case HR_LAYOUT_RAID4_0:
676 return (0);
677 case HR_LAYOUT_RAID4_N:
678 return (extent_no - 1);
679 default:
680 assert(0 && "invalid layout configuration");
681 }
682 case HR_LVL_5:
683 switch (layout) {
684 case HR_LAYOUT_RAID5_0R:
685 return ((strip_no / (extent_no - 1)) % extent_no);
686 case HR_LAYOUT_RAID5_NR:
687 case HR_LAYOUT_RAID5_NC:
688 return ((extent_no - 1) -
689 (strip_no / (extent_no - 1)) % extent_no);
690 default:
691 assert(0 && "invalid layout configuration");
692 }
693 default:
694 assert(0 && "invalid layout configuration");
695 }
696}
697
698static size_t hr_raid5_data_extent(hr_level_t level,
699 hr_layout_t layout, size_t extent_no, uint64_t strip_no, size_t p_extent)
700{
701 switch (level) {
702 case HR_LVL_4:
703 switch (layout) {
704 case HR_LAYOUT_RAID4_0:
705 return ((strip_no % (extent_no - 1)) + 1);
706 case HR_LAYOUT_RAID4_N:
707 return (strip_no % (extent_no - 1));
708 default:
709 assert(0 && "invalid layout configuration");
710 }
711 case HR_LVL_5:
712 switch (layout) {
713 case HR_LAYOUT_RAID5_0R:
714 case HR_LAYOUT_RAID5_NR:
715 if ((strip_no % (extent_no - 1)) < p_extent)
716 return (strip_no % (extent_no - 1));
717 else
718 return ((strip_no % (extent_no - 1)) + 1);
719 case HR_LAYOUT_RAID5_NC:
720 return (((strip_no % (extent_no - 1)) + p_extent + 1) %
721 extent_no);
722 default:
723 assert(0 && "invalid layout configuration");
724 }
725 default:
726 assert(0 && "invalid layout configuration");
727 }
728}
729
730static errno_t hr_raid5_rebuild(void *arg)
731{
732 HR_DEBUG("hr_raid5_rebuild()\n");
733
734 hr_volume_t *vol = arg;
735 errno_t rc = EOK;
736 void *buf = NULL, *xorbuf = NULL;
737
738 fibril_rwlock_read_lock(&vol->extents_lock);
739 fibril_rwlock_write_lock(&vol->states_lock);
740
741 if (vol->hotspare_no == 0) {
742 HR_WARN("hr_raid5_rebuild(): no free hotspares on \"%s\", "
743 "aborting rebuild\n", vol->devname);
744 /* retval isn't checked for now */
745 goto end;
746 }
747
748 size_t bad = vol->extent_no;
749 for (size_t i = 0; i < vol->extent_no; i++) {
750 if (vol->extents[i].state == HR_EXT_FAILED) {
751 bad = i;
752 break;
753 }
754 }
755
756 if (bad == vol->extent_no) {
757 HR_WARN("hr_raid5_rebuild(): no bad extent on \"%s\", "
758 "aborting rebuild\n", vol->devname);
759 /* retval isn't checked for now */
760 goto end;
761 }
762
763 size_t hotspare_idx = vol->hotspare_no - 1;
764
765 hr_ext_state_t hs_state = vol->hotspares[hotspare_idx].state;
766 if (hs_state != HR_EXT_HOTSPARE) {
767 HR_ERROR("hr_raid5_rebuild(): invalid hotspare state \"%s\", "
768 "aborting rebuild\n", hr_get_ext_state_str(hs_state));
769 rc = EINVAL;
770 goto end;
771 }
772
773 HR_DEBUG("hr_raid5_rebuild(): swapping in hotspare\n");
774
775 block_fini(vol->extents[bad].svc_id);
776
777 vol->extents[bad].svc_id = vol->hotspares[hotspare_idx].svc_id;
778 hr_update_ext_state(vol, bad, HR_EXT_HOTSPARE);
779
780 vol->hotspares[hotspare_idx].svc_id = 0;
781 fibril_mutex_lock(&vol->hotspare_lock);
782 hr_update_hotspare_state(vol, hotspare_idx, HR_EXT_MISSING);
783 fibril_mutex_unlock(&vol->hotspare_lock);
784
785 vol->hotspare_no--;
786
787 hr_extent_t *rebuild_ext = &vol->extents[bad];
788
789 HR_DEBUG("hr_raid5_rebuild(): starting rebuild on (%" PRIun ")\n",
790 rebuild_ext->svc_id);
791
792 hr_update_ext_state(vol, bad, HR_EXT_REBUILD);
793 hr_update_vol_state(vol, HR_VOL_REBUILD);
794
795 uint64_t max_blks = DATA_XFER_LIMIT / vol->bsize;
796 uint64_t left = vol->data_blkno / (vol->extent_no - 1);
797 buf = malloc(max_blks * vol->bsize);
798 xorbuf = malloc(max_blks * vol->bsize);
799
800 uint64_t ba = 0, cnt;
801 hr_add_data_offset(vol, &ba);
802
803 while (left != 0) {
804 cnt = min(left, max_blks);
805
806 /*
807 * Almost the same as read_degraded,
808 * but we don't want to allocate new
809 * xorbuf each blk rebuild batch.
810 */
811 bool first = true;
812 for (size_t i = 0; i < vol->extent_no; i++) {
813 if (i == bad)
814 continue;
815 if (first)
816 rc = block_read_direct(vol->extents[i].svc_id,
817 ba, cnt, xorbuf);
818 else
819 rc = block_read_direct(vol->extents[i].svc_id,
820 ba, cnt, buf);
821 if (rc != EOK) {
822 hr_raid5_ext_state_cb(vol, i, rc);
823 HR_ERROR("rebuild on \"%s\" (%" PRIun "), "
824 "failed due to a failed ONLINE extent, "
825 "number %zu\n",
826 vol->devname, vol->svc_id, i);
827 goto end;
828 }
829
830 if (!first)
831 xor(xorbuf, buf, cnt * vol->bsize);
832 else
833 first = false;
834 }
835
836 rc = block_write_direct(rebuild_ext->svc_id, ba, cnt, xorbuf);
837 if (rc != EOK) {
838 hr_raid5_ext_state_cb(vol, bad, rc);
839 HR_ERROR("rebuild on \"%s\" (%" PRIun "), failed due to "
840 "the rebuilt extent number %zu failing\n",
841 vol->devname, vol->svc_id, bad);
842 goto end;
843 }
844
845 ba += cnt;
846 left -= cnt;
847
848 /*
849 * Let other IO requests be served
850 * during rebuild.
851 */
852
853 /*
854 * fibril_rwlock_write_unlock(&vol->states_lock);
855 * fibril_mutex_unlock(&vol->lock);
856 * fibril_mutex_lock(&vol->lock);
857 * fibril_rwlock_write_lock(&vol->states_lock);
858 */
859 }
860
861 HR_DEBUG("hr_raid5_rebuild(): rebuild finished on \"%s\" (%" PRIun "), "
862 "extent number %zu\n", vol->devname, vol->svc_id, hotspare_idx);
863
864 hr_update_ext_state(vol, bad, HR_EXT_ONLINE);
865
866 fibril_rwlock_write_unlock(&vol->states_lock);
867 fibril_rwlock_read_unlock(&vol->extents_lock);
868
869 rc = vol->meta_ops->save(vol, WITH_STATE_CALLBACK);
870
871 fibril_rwlock_read_lock(&vol->extents_lock);
872 fibril_rwlock_write_lock(&vol->states_lock);
873
874end:
875 hr_raid5_vol_state_eval_forced(vol);
876
877 fibril_rwlock_write_unlock(&vol->states_lock);
878 fibril_rwlock_read_unlock(&vol->extents_lock);
879
880 if (buf != NULL)
881 free(buf);
882
883 if (xorbuf != NULL)
884 free(xorbuf);
885
886 return rc;
887}
888
889/** @}
890 */
Note: See TracBrowser for help on using the repository browser.