source: mainline/uspace/srv/bd/hr/raid5.c

Last change on this file was c1c1c41, checked in by Miroslav Cimerman <mc@…>, 8 days ago

hr: add author's email address to RAID 5 files

  • Property mode set to 100644
File size: 21.6 KB
Line 
1/*
2 * Copyright (c) 2025 Miroslav Cimerman <mc@doas.su>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** @addtogroup hr
30 * @{
31 */
32/**
33 * @file
34 */
35
36#include <abi/ipc/ipc.h>
37#include <bd_srv.h>
38#include <block.h>
39#include <errno.h>
40#include <hr.h>
41#include <inttypes.h>
42#include <io/log.h>
43#include <ipc/hr.h>
44#include <ipc/services.h>
45#include <loc.h>
46#include <mem.h>
47#include <task.h>
48#include <stdio.h>
49#include <stdlib.h>
50#include <str_error.h>
51
52#include "io.h"
53#include "parity_stripe.h"
54#include "superblock.h"
55#include "util.h"
56#include "var.h"
57
58static void hr_raid5_vol_state_eval_forced(hr_volume_t *);
59static size_t hr_raid5_parity_extent(hr_level_t, hr_layout_t, size_t,
60 uint64_t);
61static size_t hr_raid5_data_extent(hr_level_t, hr_layout_t, size_t, uint64_t,
62 size_t);
63static errno_t hr_raid5_rebuild(void *);
64
65/* bdops */
66static errno_t hr_raid5_bd_open(bd_srvs_t *, bd_srv_t *);
67static errno_t hr_raid5_bd_close(bd_srv_t *);
68static errno_t hr_raid5_bd_read_blocks(bd_srv_t *, aoff64_t, size_t, void *,
69 size_t);
70static errno_t hr_raid5_bd_sync_cache(bd_srv_t *, aoff64_t, size_t);
71static errno_t hr_raid5_bd_write_blocks(bd_srv_t *, aoff64_t, size_t,
72 const void *, size_t);
73static errno_t hr_raid5_bd_get_block_size(bd_srv_t *, size_t *);
74static errno_t hr_raid5_bd_get_num_blocks(bd_srv_t *, aoff64_t *);
75
76static bd_ops_t hr_raid5_bd_ops = {
77 .open = hr_raid5_bd_open,
78 .close = hr_raid5_bd_close,
79 .sync_cache = hr_raid5_bd_sync_cache,
80 .read_blocks = hr_raid5_bd_read_blocks,
81 .write_blocks = hr_raid5_bd_write_blocks,
82 .get_block_size = hr_raid5_bd_get_block_size,
83 .get_num_blocks = hr_raid5_bd_get_num_blocks
84};
85
86extern loc_srv_t *hr_srv;
87
88errno_t hr_raid5_create(hr_volume_t *new_volume)
89{
90 HR_DEBUG("%s()", __func__);
91
92 if (new_volume->level != HR_LVL_5 && new_volume->level != HR_LVL_4)
93 return EINVAL;
94
95 if (new_volume->extent_no < 3) {
96 HR_ERROR("RAID 5 volume needs at least 3 devices\n");
97 return EINVAL;
98 }
99
100 hr_raid5_vol_state_eval_forced(new_volume);
101
102 fibril_rwlock_read_lock(&new_volume->states_lock);
103 hr_vol_state_t state = new_volume->state;
104 fibril_rwlock_read_unlock(&new_volume->states_lock);
105 if (state == HR_VOL_FAULTY || state == HR_VOL_NONE) {
106 HR_NOTE("\"%s\": unusable state, not creating\n",
107 new_volume->devname);
108 return EINVAL;
109 }
110
111 bd_srvs_init(&new_volume->hr_bds);
112 new_volume->hr_bds.ops = &hr_raid5_bd_ops;
113 new_volume->hr_bds.sarg = new_volume;
114
115 return EOK;
116}
117
118/*
119 * Called only once in volume's lifetime.
120 */
121errno_t hr_raid5_init(hr_volume_t *vol)
122{
123 HR_DEBUG("%s()", __func__);
124
125 if (vol->level != HR_LVL_5 && vol->level != HR_LVL_4)
126 return EINVAL;
127
128 vol->data_offset = vol->meta_ops->get_data_offset();
129
130 uint64_t single_sz = vol->truncated_blkno - vol->meta_ops->get_size();
131 vol->data_blkno = single_sz * (vol->extent_no - 1);
132
133 vol->strip_size = hr_closest_pow2(HR_STRIP_SIZE / (vol->extent_no - 1));
134
135 if (vol->level == HR_LVL_4)
136 vol->layout = HR_LAYOUT_RAID4_N;
137 else
138 vol->layout = HR_LAYOUT_RAID5_NR;
139
140 return EOK;
141}
142
143void hr_raid5_vol_state_eval(hr_volume_t *vol)
144{
145 HR_DEBUG("%s()", __func__);
146
147 bool exp = true;
148 if (!atomic_compare_exchange_strong(&vol->state_dirty, &exp, false))
149 return;
150
151 vol->meta_ops->inc_counter(vol);
152 vol->meta_ops->save(vol, WITH_STATE_CALLBACK);
153
154 hr_raid5_vol_state_eval_forced(vol);
155}
156
157void hr_raid5_ext_state_cb(hr_volume_t *vol, size_t extent, errno_t rc)
158{
159 HR_DEBUG("%s()", __func__);
160
161 assert(fibril_rwlock_is_locked(&vol->extents_lock));
162
163 if (rc == EOK)
164 return;
165
166 fibril_rwlock_write_lock(&vol->states_lock);
167
168 switch (rc) {
169 case ENOENT:
170 hr_update_ext_state(vol, extent, HR_EXT_MISSING);
171 break;
172 default:
173 hr_update_ext_state(vol, extent, HR_EXT_FAILED);
174 }
175
176 hr_mark_vol_state_dirty(vol);
177
178 fibril_rwlock_write_unlock(&vol->states_lock);
179}
180
181static errno_t hr_raid5_bd_open(bd_srvs_t *bds, bd_srv_t *bd)
182{
183 HR_DEBUG("%s()\n", __func__);
184
185 hr_volume_t *vol = bd->srvs->sarg;
186
187 atomic_fetch_add_explicit(&vol->open_cnt, 1, memory_order_relaxed);
188
189 return EOK;
190}
191
192static errno_t hr_raid5_bd_close(bd_srv_t *bd)
193{
194 HR_DEBUG("%s()\n", __func__);
195
196 hr_volume_t *vol = bd->srvs->sarg;
197
198 atomic_fetch_sub_explicit(&vol->open_cnt, 1, memory_order_relaxed);
199
200 return EOK;
201}
202
203static errno_t hr_raid5_bd_sync_cache(bd_srv_t *bd, aoff64_t ba, size_t cnt)
204{
205 hr_volume_t *vol = bd->srvs->sarg;
206
207 return hr_sync_extents(vol);
208}
209
210static errno_t hr_raid5_bd_read_blocks(bd_srv_t *bd, uint64_t ba, size_t cnt,
211 void *data_read, size_t size)
212{
213 hr_volume_t *vol = bd->srvs->sarg;
214 errno_t rc;
215
216 if (size < cnt * vol->bsize)
217 return EINVAL;
218
219 fibril_rwlock_read_lock(&vol->states_lock);
220 hr_vol_state_t vol_state = vol->state;
221 fibril_rwlock_read_unlock(&vol->states_lock);
222
223 if (vol_state == HR_VOL_FAULTY || vol_state == HR_VOL_NONE)
224 return EIO;
225
226 rc = hr_check_ba_range(vol, cnt, ba);
227 if (rc != EOK)
228 return rc;
229
230 uint64_t strip_size = vol->strip_size / vol->bsize; /* in blocks */
231 uint64_t strip_no = ba / strip_size;
232
233 /* calculate number of stripes touched */
234 uint64_t last_ba = ba + cnt - 1;
235 uint64_t end_strip_no = last_ba / strip_size;
236 uint64_t start_stripe = strip_no / (vol->extent_no - 1);
237 uint64_t end_stripe = end_strip_no / (vol->extent_no - 1);
238 size_t stripes_cnt = end_stripe - start_stripe + 1;
239
240 hr_stripe_t *stripes = hr_create_stripes(vol, vol->strip_size,
241 stripes_cnt, false);
242
243 uint64_t phys_block, len;
244 size_t left;
245
246 hr_layout_t layout = vol->layout;
247 hr_level_t level = vol->level;
248
249 /* parity extent */
250 size_t p_extent = hr_raid5_parity_extent(level, layout,
251 vol->extent_no, strip_no);
252
253 uint64_t strip_off = ba % strip_size;
254
255 left = cnt;
256
257 while (left != 0) {
258 if (level == HR_LVL_5) {
259 p_extent = hr_raid5_parity_extent(level, layout,
260 vol->extent_no, strip_no);
261 }
262
263 size_t extent = hr_raid5_data_extent(level, layout,
264 vol->extent_no, strip_no, p_extent);
265
266 uint64_t stripe_no = strip_no / (vol->extent_no - 1);
267 size_t relative_si = stripe_no - start_stripe; /* relative stripe index */
268 hr_stripe_t *stripe = &stripes[relative_si];
269 stripe->p_extent = p_extent;
270
271 stripe->strips_touched++;
272
273 phys_block = stripe_no * strip_size + strip_off;
274 cnt = min(left, strip_size - strip_off);
275 len = vol->bsize * cnt;
276 hr_add_data_offset(vol, &phys_block);
277
278 stripe->extent_span[extent].range.start = phys_block;
279 stripe->extent_span[extent].range.end = phys_block + cnt - 1;
280 stripe->extent_span[extent].cnt = cnt;
281 stripe->extent_span[extent].data_read = data_read;
282 stripe->extent_span[extent].strip_off = strip_off;
283
284 data_read += len;
285 left -= cnt;
286 strip_off = 0;
287 strip_no++;
288 }
289
290 hr_range_lock_t **rlps = hr_malloc_waitok(stripes_cnt * sizeof(*rlps));
291
292 /*
293 * extent order has to be locked for the whole IO duration,
294 * so that workers have consistent targets
295 */
296 fibril_rwlock_read_lock(&vol->extents_lock);
297
298 for (uint64_t s = start_stripe; s <= end_stripe; s++) {
299 uint64_t relative = s - start_stripe;
300 rlps[relative] = hr_range_lock_acquire(vol, s, 1);
301 }
302
303retry:
304 size_t bad_extent = vol->extent_no;
305
306 uint64_t rebuild_pos = atomic_load_explicit(&vol->rebuild_blk,
307 memory_order_relaxed);
308
309 fibril_rwlock_read_lock(&vol->states_lock);
310
311 for (size_t e = 0; e < vol->extent_no; e++) {
312 hr_ext_state_t s = vol->extents[e].state;
313 if ((vol->state == HR_VOL_DEGRADED && s != HR_EXT_ONLINE) ||
314 (s == HR_EXT_REBUILD && end_stripe >= rebuild_pos)) {
315 bad_extent = e;
316 break;
317 }
318 }
319
320 fibril_rwlock_read_unlock(&vol->states_lock);
321
322 for (size_t s = 0; s < stripes_cnt; s++) {
323 if (stripes[s].done)
324 continue;
325 hr_execute_stripe(&stripes[s], bad_extent);
326 }
327
328 for (size_t s = 0; s < stripes_cnt; s++) {
329 if (stripes[s].done)
330 continue;
331 hr_wait_for_stripe(&stripes[s]);
332 }
333
334 hr_raid5_vol_state_eval(vol);
335
336 rc = EOK;
337
338 fibril_rwlock_read_lock(&vol->states_lock);
339
340 if (vol->state == HR_VOL_FAULTY) {
341 fibril_rwlock_read_unlock(&vol->states_lock);
342 rc = EIO;
343 goto end;
344 }
345
346 fibril_rwlock_read_unlock(&vol->states_lock);
347
348 for (size_t s = 0; s < stripes_cnt; s++)
349 if (stripes[s].rc == EAGAIN)
350 goto retry;
351
352 /* all stripes are done */
353end:
354 fibril_rwlock_read_unlock(&vol->extents_lock);
355
356 for (size_t i = 0; i < stripes_cnt; i++)
357 hr_range_lock_release(rlps[i]);
358
359 free(rlps);
360
361 hr_destroy_stripes(stripes, stripes_cnt);
362
363 return rc;
364}
365
366static errno_t hr_raid5_bd_write_blocks(bd_srv_t *bd, aoff64_t ba, size_t cnt,
367 const void *data_write, size_t size)
368{
369 hr_volume_t *vol = bd->srvs->sarg;
370 errno_t rc;
371
372 if (size < cnt * vol->bsize)
373 return EINVAL;
374
375 if (vol->vflags & HR_VOL_FLAG_READ_ONLY)
376 return ENOTSUP;
377
378 fibril_rwlock_read_lock(&vol->states_lock);
379 hr_vol_state_t vol_state = vol->state;
380 fibril_rwlock_read_unlock(&vol->states_lock);
381
382 if (vol_state == HR_VOL_FAULTY || vol_state == HR_VOL_NONE)
383 return EIO;
384
385 /* increment metadata counter only on first write */
386 bool exp = false;
387 if (atomic_compare_exchange_strong(&vol->first_write, &exp, true)) {
388 vol->meta_ops->inc_counter(vol);
389 vol->meta_ops->save(vol, WITH_STATE_CALLBACK);
390 }
391
392 rc = hr_check_ba_range(vol, cnt, ba);
393 if (rc != EOK)
394 return rc;
395
396 uint64_t strip_size = vol->strip_size / vol->bsize; /* in blocks */
397 uint64_t strip_no = ba / strip_size;
398
399 /* calculate number of stripes touched */
400 uint64_t last_ba = ba + cnt - 1;
401 uint64_t end_strip_no = last_ba / strip_size;
402 uint64_t start_stripe = strip_no / (vol->extent_no - 1);
403 uint64_t end_stripe = end_strip_no / (vol->extent_no - 1);
404 size_t stripes_cnt = end_stripe - start_stripe + 1;
405
406 hr_stripe_t *stripes = hr_create_stripes(vol, vol->strip_size,
407 stripes_cnt, true);
408
409 uint64_t stripe_size = strip_size * (vol->extent_no - 1);
410
411 for (uint64_t stripe = start_stripe; stripe <= end_stripe; stripe++) {
412 uint64_t relative_stripe = stripe - start_stripe;
413
414 uint64_t s_start = stripe * stripe_size;
415 uint64_t s_end = s_start + stripe_size - 1;
416
417 uint64_t overlap_start;
418 if (ba > s_start)
419 overlap_start = ba;
420 else
421 overlap_start = s_start;
422
423 uint64_t overlap_end;
424 if (last_ba < s_end)
425 overlap_end = last_ba;
426 else
427 overlap_end = s_end;
428
429 uint64_t start_strip_index =
430 (overlap_start - s_start) / strip_size;
431 uint64_t end_strip_index = (overlap_end - s_start) / strip_size;
432 size_t strips_touched = end_strip_index - start_strip_index + 1;
433
434 stripes[relative_stripe].strips_touched = strips_touched;
435
436 uint64_t first_offset = (overlap_start - s_start) % strip_size;
437 uint64_t last_offset = (overlap_end - s_start) % strip_size;
438
439 size_t partials = 0;
440 if (first_offset != 0)
441 partials++;
442 if (last_offset != strip_size - 1)
443 partials++;
444 if (start_strip_index == end_strip_index && partials == 2)
445 partials = 1;
446
447 stripes[relative_stripe].strips_touched = strips_touched;
448 stripes[relative_stripe].partial_strips_touched = partials;
449
450 if (strips_touched < (vol->extent_no - 1) / 2)
451 stripes[relative_stripe].subtract = true;
452 }
453
454 uint64_t phys_block, len;
455 size_t left;
456
457 hr_layout_t layout = vol->layout;
458 hr_level_t level = vol->level;
459
460 /* parity extent */
461 size_t p_extent = hr_raid5_parity_extent(level, layout,
462 vol->extent_no, strip_no);
463
464 uint64_t strip_off = ba % strip_size;
465
466 left = cnt;
467
468 while (left != 0) {
469 if (level == HR_LVL_5) {
470 p_extent = hr_raid5_parity_extent(level, layout,
471 vol->extent_no, strip_no);
472 }
473
474 size_t extent = hr_raid5_data_extent(level, layout,
475 vol->extent_no, strip_no, p_extent);
476
477 uint64_t stripe_no = strip_no / (vol->extent_no - 1);
478 size_t relative_si = stripe_no - start_stripe; /* relative stripe index */
479 hr_stripe_t *stripe = &stripes[relative_si];
480 stripe->p_extent = p_extent;
481
482 phys_block = stripe_no * strip_size + strip_off;
483 cnt = min(left, strip_size - strip_off);
484 len = vol->bsize * cnt;
485 hr_add_data_offset(vol, &phys_block);
486
487 stripe->extent_span[extent].range.start = phys_block;
488 stripe->extent_span[extent].range.end = phys_block + cnt - 1;
489 stripe->extent_span[extent].cnt = cnt;
490 stripe->extent_span[extent].data_write = data_write;
491 stripe->extent_span[extent].strip_off = strip_off;
492
493 data_write += len;
494 left -= cnt;
495 strip_off = 0;
496 strip_no++;
497 }
498
499 hr_range_lock_t **rlps = hr_malloc_waitok(stripes_cnt * sizeof(*rlps));
500
501 /*
502 * extent order has to be locked for the whole IO duration,
503 * so that workers have consistent targets
504 */
505 fibril_rwlock_read_lock(&vol->extents_lock);
506
507 for (uint64_t s = start_stripe; s <= end_stripe; s++) {
508 uint64_t relative = s - start_stripe;
509 rlps[relative] = hr_range_lock_acquire(vol, s, 1);
510 }
511
512retry:
513 size_t bad_extent = vol->extent_no;
514
515 uint64_t rebuild_pos = atomic_load_explicit(&vol->rebuild_blk,
516 memory_order_relaxed);
517
518 fibril_rwlock_read_lock(&vol->states_lock);
519
520 for (size_t e = 0; e < vol->extent_no; e++) {
521 hr_ext_state_t s = vol->extents[e].state;
522 if ((vol->state == HR_VOL_DEGRADED && s != HR_EXT_ONLINE) ||
523 (s == HR_EXT_REBUILD && start_stripe > rebuild_pos)) {
524 bad_extent = e;
525 break;
526 }
527 }
528
529 fibril_rwlock_read_unlock(&vol->states_lock);
530
531 for (size_t s = 0; s < stripes_cnt; s++) {
532 if (stripes[s].done)
533 continue;
534 hr_execute_stripe(&stripes[s], bad_extent);
535 }
536
537 for (size_t s = 0; s < stripes_cnt; s++) {
538 if (stripes[s].done)
539 continue;
540 hr_wait_for_stripe(&stripes[s]);
541 }
542
543 hr_raid5_vol_state_eval(vol);
544
545 rc = EOK;
546
547 fibril_rwlock_read_lock(&vol->states_lock);
548
549 if (vol->state == HR_VOL_FAULTY) {
550 fibril_rwlock_read_unlock(&vol->states_lock);
551 rc = EIO;
552 goto end;
553 }
554
555 fibril_rwlock_read_unlock(&vol->states_lock);
556
557 for (size_t s = 0; s < stripes_cnt; s++)
558 if (stripes[s].rc == EAGAIN)
559 goto retry;
560
561 /* all stripes are done */
562end:
563 fibril_rwlock_read_unlock(&vol->extents_lock);
564
565 for (size_t i = 0; i < stripes_cnt; i++)
566 hr_range_lock_release(rlps[i]);
567
568 free(rlps);
569
570 hr_destroy_stripes(stripes, stripes_cnt);
571
572 return rc;
573}
574
575static errno_t hr_raid5_bd_get_block_size(bd_srv_t *bd, size_t *rsize)
576{
577 hr_volume_t *vol = bd->srvs->sarg;
578
579 *rsize = vol->bsize;
580 return EOK;
581}
582
583static errno_t hr_raid5_bd_get_num_blocks(bd_srv_t *bd, aoff64_t *rnb)
584{
585 hr_volume_t *vol = bd->srvs->sarg;
586
587 *rnb = vol->data_blkno;
588 return EOK;
589}
590
591static void hr_raid5_vol_state_eval_forced(hr_volume_t *vol)
592{
593 fibril_rwlock_read_lock(&vol->extents_lock);
594 fibril_rwlock_write_lock(&vol->states_lock);
595
596 hr_vol_state_t state = vol->state;
597
598 size_t bad = 0;
599 for (size_t i = 0; i < vol->extent_no; i++)
600 if (vol->extents[i].state != HR_EXT_ONLINE)
601 bad++;
602
603 size_t invalid_no = hr_count_extents(vol, HR_EXT_INVALID);
604
605 size_t rebuild_no = hr_count_extents(vol, HR_EXT_REBUILD);
606
607 fibril_mutex_lock(&vol->hotspare_lock);
608 size_t hs_no = vol->hotspare_no;
609 fibril_mutex_unlock(&vol->hotspare_lock);
610
611 switch (bad) {
612 case 0:
613 if (state != HR_VOL_OPTIMAL)
614 hr_update_vol_state(vol, HR_VOL_OPTIMAL);
615 break;
616 case 1:
617 if (state != HR_VOL_DEGRADED && state != HR_VOL_REBUILD)
618 hr_update_vol_state(vol, HR_VOL_DEGRADED);
619
620 if (state != HR_VOL_REBUILD) {
621 if (hs_no > 0 || invalid_no > 0 || rebuild_no > 0) {
622 fid_t fib = fibril_create(hr_raid5_rebuild,
623 vol);
624 if (fib == 0)
625 break;
626 fibril_start(fib);
627 fibril_detach(fib);
628 }
629 }
630 break;
631 default:
632 if (state != HR_VOL_FAULTY)
633 hr_update_vol_state(vol, HR_VOL_FAULTY);
634 break;
635 }
636
637 fibril_rwlock_write_unlock(&vol->states_lock);
638 fibril_rwlock_read_unlock(&vol->extents_lock);
639}
640
641static size_t hr_raid5_parity_extent(hr_level_t level,
642 hr_layout_t layout, size_t extent_no, uint64_t strip_no)
643{
644 switch (level) {
645 case HR_LVL_4:
646 switch (layout) {
647 case HR_LAYOUT_RAID4_0:
648 return (0);
649 case HR_LAYOUT_RAID4_N:
650 return (extent_no - 1);
651 default:
652 assert(0 && "invalid layout configuration");
653 }
654 case HR_LVL_5:
655 switch (layout) {
656 case HR_LAYOUT_RAID5_0R:
657 return ((strip_no / (extent_no - 1)) % extent_no);
658 case HR_LAYOUT_RAID5_NR:
659 case HR_LAYOUT_RAID5_NC:
660 return ((extent_no - 1) -
661 (strip_no / (extent_no - 1)) % extent_no);
662 default:
663 assert(0 && "invalid layout configuration");
664 }
665 default:
666 assert(0 && "invalid layout configuration");
667 }
668}
669
670static size_t hr_raid5_data_extent(hr_level_t level,
671 hr_layout_t layout, size_t extent_no, uint64_t strip_no, size_t p_extent)
672{
673 switch (level) {
674 case HR_LVL_4:
675 switch (layout) {
676 case HR_LAYOUT_RAID4_0:
677 return ((strip_no % (extent_no - 1)) + 1);
678 case HR_LAYOUT_RAID4_N:
679 return (strip_no % (extent_no - 1));
680 default:
681 assert(0 && "invalid layout configuration");
682 }
683 case HR_LVL_5:
684 switch (layout) {
685 case HR_LAYOUT_RAID5_0R:
686 case HR_LAYOUT_RAID5_NR:
687 if ((strip_no % (extent_no - 1)) < p_extent)
688 return (strip_no % (extent_no - 1));
689 else
690 return ((strip_no % (extent_no - 1)) + 1);
691 case HR_LAYOUT_RAID5_NC:
692 return (((strip_no % (extent_no - 1)) + p_extent + 1) %
693 extent_no);
694 default:
695 assert(0 && "invalid layout configuration");
696 }
697 default:
698 assert(0 && "invalid layout configuration");
699 }
700}
701
702static errno_t hr_raid5_rebuild(void *arg)
703{
704 HR_DEBUG("%s()", __func__);
705
706 hr_volume_t *vol = arg;
707 errno_t rc = EOK;
708 size_t rebuild_idx;
709
710 if (vol->vflags & HR_VOL_FLAG_READ_ONLY)
711 return ENOTSUP;
712 if (!(vol->meta_ops->get_flags() & HR_METADATA_ALLOW_REBUILD))
713 return ENOTSUP;
714
715 rc = hr_init_rebuild(vol, &rebuild_idx);
716 if (rc != EOK)
717 return rc;
718
719 uint64_t max_blks = DATA_XFER_LIMIT / vol->bsize;
720 uint64_t left =
721 vol->data_blkno / (vol->extent_no - 1) - vol->rebuild_blk;
722
723 uint64_t strip_size = vol->strip_size / vol->bsize; /* in blocks */
724
725 size_t cnt;
726 uint64_t ba = vol->rebuild_blk;
727 hr_add_data_offset(vol, &ba);
728
729 /*
730 * this is not necessary because a rebuild is
731 * protected by itself, i.e. there can be only
732 * one REBUILD at a time
733 */
734 fibril_rwlock_read_lock(&vol->extents_lock);
735
736 /* increment metadata counter only on first write */
737 bool exp = false;
738 if (atomic_compare_exchange_strong(&vol->first_write, &exp, true)) {
739 vol->meta_ops->inc_counter(vol);
740 vol->meta_ops->save(vol, WITH_STATE_CALLBACK);
741 }
742
743 hr_range_lock_t *rl = NULL;
744 hr_stripe_t *stripe = hr_create_stripes(vol, max_blks * vol->bsize, 1,
745 false);
746
747 HR_NOTE("\"%s\": REBUILD started on extent no. %zu at "
748 "block %" PRIu64 ".\n",
749 vol->devname, rebuild_idx, ba);
750
751 uint64_t written = 0;
752 unsigned int percent, old_percent = 100;
753 while (left != 0) {
754 cnt = min(left, max_blks);
755
756 uint64_t strip_no = ba / strip_size;
757 uint64_t last_ba = ba + cnt - 1;
758 uint64_t end_strip_no = last_ba / strip_size;
759 uint64_t start_stripe = strip_no / (vol->extent_no - 1);
760 uint64_t end_stripe = end_strip_no / (vol->extent_no - 1);
761 size_t stripes_cnt = end_stripe - start_stripe + 1;
762
763 stripe->ps_to_be_added = vol->extent_no - 1;
764 stripe->p_count_final = true;
765
766 hr_fgroup_t *worker_group =
767 hr_fgroup_create(vol->fge, vol->extent_no);
768
769 rl = hr_range_lock_acquire(vol, start_stripe, stripes_cnt);
770
771 atomic_store_explicit(&vol->rebuild_blk, ba,
772 memory_order_relaxed);
773
774 for (size_t e = 0; e < vol->extent_no; e++) {
775 if (e == rebuild_idx)
776 continue;
777
778 hr_io_raid5_t *io = hr_fgroup_alloc(worker_group);
779 io->extent = e;
780 io->ba = ba;
781 io->cnt = cnt;
782 io->strip_off = 0;
783 io->vol = vol;
784 io->stripe = stripe;
785
786 hr_fgroup_submit(worker_group,
787 hr_io_raid5_reconstruct_reader, io);
788 }
789
790 hr_io_raid5_t *io = hr_fgroup_alloc(worker_group);
791 io->extent = rebuild_idx;
792 io->ba = ba;
793 io->cnt = cnt;
794 io->strip_off = 0;
795 io->vol = vol;
796 io->stripe = stripe;
797
798 hr_fgroup_submit(worker_group, hr_io_raid5_parity_writer, io);
799
800 size_t failed;
801 (void)hr_fgroup_wait(worker_group, NULL, &failed);
802 if (failed > 0) {
803 hr_range_lock_release(rl);
804 HR_NOTE("\"%s\": REBUILD aborted.\n", vol->devname);
805 goto end;
806 }
807
808 percent = ((ba + cnt) * 100) / vol->data_blkno;
809 if (percent != old_percent) {
810 if (percent % 5 == 0)
811 HR_DEBUG("\"%s\" REBUILD progress: %u%%\n",
812 vol->devname, percent);
813 }
814
815 if (written * vol->bsize > HR_REBUILD_SAVE_BYTES) {
816 vol->meta_ops->save_ext(vol, rebuild_idx,
817 WITH_STATE_CALLBACK);
818 written = 0;
819 }
820
821 hr_range_lock_release(rl);
822 hr_reset_stripe(stripe);
823
824 written += cnt;
825 ba += cnt;
826 left -= cnt;
827 old_percent = percent;
828
829 /*
830 * Let other IO requests be served
831 * during rebuild.
832 */
833 }
834
835 HR_DEBUG("hr_raid5_rebuild(): rebuild finished on \"%s\" (%" PRIun "), "
836 "extent number %zu\n", vol->devname, vol->svc_id, rebuild_idx);
837
838 fibril_rwlock_write_lock(&vol->states_lock);
839
840 hr_update_ext_state(vol, rebuild_idx, HR_EXT_ONLINE);
841
842 atomic_store_explicit(&vol->rebuild_blk, 0, memory_order_relaxed);
843
844 hr_mark_vol_state_dirty(vol);
845
846 hr_update_vol_state(vol, HR_VOL_DEGRADED);
847
848 fibril_rwlock_write_unlock(&vol->states_lock);
849end:
850 fibril_rwlock_read_unlock(&vol->extents_lock);
851
852 hr_raid1_vol_state_eval(vol);
853
854 hr_destroy_stripes(stripe, 1);
855
856 return rc;
857}
858
859/** @}
860 */
Note: See TracBrowser for help on using the repository browser.