source: mainline/uspace/srv/bd/hr/raid5.c@ f0950d2

Last change on this file since f0950d2 was b5c95da5, checked in by Miroslav Cimerman <mc@…>, 8 weeks ago

hr: raid*_{create,init}(): replace asserts with EINVAL

  • Property mode set to 100644
File size: 21.3 KB
Line 
1/*
2 * Copyright (c) 2025 Miroslav Cimerman
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** @addtogroup hr
30 * @{
31 */
32/**
33 * @file
34 */
35
36#include <abi/ipc/ipc.h>
37#include <bd_srv.h>
38#include <block.h>
39#include <errno.h>
40#include <hr.h>
41#include <inttypes.h>
42#include <io/log.h>
43#include <ipc/hr.h>
44#include <ipc/services.h>
45#include <loc.h>
46#include <mem.h>
47#include <task.h>
48#include <stdio.h>
49#include <stdlib.h>
50#include <str_error.h>
51
52#include "superblock.h"
53#include "util.h"
54#include "var.h"
55
56static errno_t hr_raid5_vol_usable(hr_volume_t *);
57static ssize_t hr_raid5_get_bad_ext(hr_volume_t *);
58static errno_t hr_raid5_update_vol_state(hr_volume_t *);
59static void xor(void *, const void *, size_t);
60
61static errno_t hr_raid5_read_degraded(hr_volume_t *, uint64_t, uint64_t,
62 void *, size_t);
63static errno_t hr_raid5_write(hr_volume_t *, uint64_t, uint64_t, aoff64_t,
64 const void *, size_t);
65static errno_t hr_raid5_write_parity(hr_volume_t *, uint64_t, uint64_t,
66 uint64_t, const void *, size_t);
67static errno_t hr_raid5_bd_op(hr_bd_op_type_t, bd_srv_t *, aoff64_t, size_t,
68 void *, const void *, size_t);
69static errno_t hr_raid5_rebuild(void *);
70
71/* bdops */
72static errno_t hr_raid5_bd_open(bd_srvs_t *, bd_srv_t *);
73static errno_t hr_raid5_bd_close(bd_srv_t *);
74static errno_t hr_raid5_bd_read_blocks(bd_srv_t *, aoff64_t, size_t, void *,
75 size_t);
76static errno_t hr_raid5_bd_sync_cache(bd_srv_t *, aoff64_t, size_t);
77static errno_t hr_raid5_bd_write_blocks(bd_srv_t *, aoff64_t, size_t,
78 const void *, size_t);
79static errno_t hr_raid5_bd_get_block_size(bd_srv_t *, size_t *);
80static errno_t hr_raid5_bd_get_num_blocks(bd_srv_t *, aoff64_t *);
81
/** Block device server operations dispatched to this RAID 5/4 driver. */
static bd_ops_t hr_raid5_bd_ops = {
	.open = hr_raid5_bd_open,
	.close = hr_raid5_bd_close,
	.sync_cache = hr_raid5_bd_sync_cache,
	.read_blocks = hr_raid5_bd_read_blocks,
	.write_blocks = hr_raid5_bd_write_blocks,
	.get_block_size = hr_raid5_bd_get_block_size,
	.get_num_blocks = hr_raid5_bd_get_num_blocks
};
91
92extern loc_srv_t *hr_srv;
93
94errno_t hr_raid5_create(hr_volume_t *new_volume)
95{
96 HR_DEBUG("%s()", __func__);
97
98 if (new_volume->level != HR_LVL_5 && new_volume->level != HR_LVL_4)
99 return EINVAL;
100
101 if (new_volume->extent_no < 3) {
102 HR_ERROR("RAID 5 volume needs at least 3 devices\n");
103 return EINVAL;
104 }
105
106 fibril_rwlock_write_lock(&new_volume->states_lock);
107
108 errno_t rc = hr_raid5_update_vol_state(new_volume);
109 if (rc != EOK) {
110 HR_NOTE("\"%s\": unusable state, not creating\n",
111 new_volume->devname);
112 fibril_rwlock_write_unlock(&new_volume->states_lock);
113 return rc;
114 }
115
116 bd_srvs_init(&new_volume->hr_bds);
117 new_volume->hr_bds.ops = &hr_raid5_bd_ops;
118 new_volume->hr_bds.sarg = new_volume;
119
120 fibril_rwlock_write_unlock(&new_volume->states_lock);
121
122 return EOK;
123}
124
125/*
126 * Called only once in volume's lifetime.
127 */
128errno_t hr_raid5_init(hr_volume_t *vol)
129{
130 HR_DEBUG("%s()", __func__);
131
132 if (vol->level != HR_LVL_5 && vol->level != HR_LVL_4)
133 return EINVAL;
134
135 uint64_t total_blkno = vol->truncated_blkno * vol->extent_no;
136
137 vol->data_offset = vol->meta_ops->get_data_offset();
138
139 vol->data_blkno = total_blkno;
140 /* count md blocks */
141 vol->data_blkno -= vol->meta_ops->get_size() * vol->extent_no;
142 vol->data_blkno -= vol->truncated_blkno; /* count parity */
143
144 vol->strip_size = HR_STRIP_SIZE;
145
146 if (vol->level == HR_LVL_4)
147 vol->layout = HR_LAYOUT_RAID4_N;
148 else
149 vol->layout = HR_LAYOUT_RAID5_NR;
150
151 return EOK;
152}
153
/** Re-evaluate the volume state from the current extent states.
 *
 * Takes the volume lock and the states writer lock in that order, then
 * delegates to hr_raid5_update_vol_state(); the result is intentionally
 * ignored — this is a fire-and-forget re-evaluation hook.
 *
 * @param vol Volume to evaluate.
 */
void hr_raid5_vol_state_eval(hr_volume_t *vol)
{
	fibril_mutex_lock(&vol->lock);
	fibril_rwlock_write_lock(&vol->states_lock);
	(void)hr_raid5_update_vol_state(vol);
	fibril_rwlock_write_unlock(&vol->states_lock);
	fibril_mutex_unlock(&vol->lock);
}
162
/** Attach a hotspare disk to the volume.
 *
 * Registers @a hotspare via hr_util_add_hotspare() and, if the volume
 * is currently DEGRADED, immediately spawns a rebuild fibril so the new
 * spare is put to use without waiting for the next state evaluation.
 *
 * @param vol      Volume to extend.
 * @param hotspare Service id of the spare block device.
 *
 * @return EOK on success, ENOMEM if the rebuild fibril could not be
 *         created, or the error returned by hr_util_add_hotspare().
 */
errno_t hr_raid5_add_hotspare(hr_volume_t *vol, service_id_t hotspare)
{
	HR_DEBUG("%s()", __func__);

	fibril_mutex_lock(&vol->lock);

	errno_t rc = hr_util_add_hotspare(vol, hotspare);
	if (rc != EOK)
		goto end;

	/*
	 * If the volume is degraded, start rebuild right away.
	 */
	/* NOTE(review): vol->state is read here without states_lock, while
	 * other readers in this file take the rwlock — confirm intended. */
	if (vol->state == HR_VOL_DEGRADED) {
		HR_DEBUG("hr_raid5_add_hotspare(): volume in DEGRADED state, "
		    "spawning new rebuild fibril\n");
		fid_t fib = fibril_create(hr_raid5_rebuild, vol);
		if (fib == 0) {
			/* NOTE(review): hotspare_lock is not visibly acquired
			 * anywhere in this function — verify it is actually
			 * held at this point (e.g. left locked by
			 * hr_util_add_hotspare()) before unlocking it here. */
			fibril_mutex_unlock(&vol->hotspare_lock);
			fibril_mutex_unlock(&vol->lock);
			return ENOMEM;
		}
		fibril_start(fib);
		fibril_detach(fib);
	}

end:
	fibril_mutex_unlock(&vol->lock);

	return rc;
}
194
195void hr_raid5_ext_state_cb(hr_volume_t *vol, size_t extent,
196 errno_t rc)
197{
198 if (rc == ENOENT)
199 hr_update_ext_state(vol, extent, HR_EXT_MISSING);
200 else if (rc != EOK)
201 hr_update_ext_state(vol, extent, HR_EXT_FAILED);
202}
203
/** bd_ops open handler: count an open client session. */
static errno_t hr_raid5_bd_open(bd_srvs_t *bds, bd_srv_t *bd)
{
	HR_DEBUG("%s()\n", __func__);

	hr_volume_t *vol = bd->srvs->sarg;

	/* relaxed order suffices: the counter is not used for synchronization */
	atomic_fetch_add_explicit(&vol->open_cnt, 1, memory_order_relaxed);

	return EOK;
}
214
/** bd_ops close handler: drop one open client session. */
static errno_t hr_raid5_bd_close(bd_srv_t *bd)
{
	HR_DEBUG("%s()\n", __func__);

	hr_volume_t *vol = bd->srvs->sarg;

	/* relaxed order suffices: the counter is not used for synchronization */
	atomic_fetch_sub_explicit(&vol->open_cnt, 1, memory_order_relaxed);

	return EOK;
}
225
/** bd_ops sync_cache handler: forwarded to the common op dispatcher. */
static errno_t hr_raid5_bd_sync_cache(bd_srv_t *bd, aoff64_t ba, size_t cnt)
{
	return hr_raid5_bd_op(HR_BD_SYNC, bd, ba, cnt, NULL, NULL, 0);
}
230
/** bd_ops read_blocks handler: forwarded to the common op dispatcher. */
static errno_t hr_raid5_bd_read_blocks(bd_srv_t *bd, aoff64_t ba, size_t cnt,
    void *buf, size_t size)
{
	return hr_raid5_bd_op(HR_BD_READ, bd, ba, cnt, buf, NULL, size);
}
236
/** bd_ops write_blocks handler: forwarded to the common op dispatcher. */
static errno_t hr_raid5_bd_write_blocks(bd_srv_t *bd, aoff64_t ba, size_t cnt,
    const void *data, size_t size)
{
	return hr_raid5_bd_op(HR_BD_WRITE, bd, ba, cnt, NULL, data, size);
}
242
/** bd_ops get_block_size handler: report the volume's block size. */
static errno_t hr_raid5_bd_get_block_size(bd_srv_t *bd, size_t *rsize)
{
	hr_volume_t *vol = bd->srvs->sarg;

	*rsize = vol->bsize;
	return EOK;
}
250
/** bd_ops get_num_blocks handler: report the usable data block count. */
static errno_t hr_raid5_bd_get_num_blocks(bd_srv_t *bd, aoff64_t *rnb)
{
	hr_volume_t *vol = bd->srvs->sarg;

	*rnb = vol->data_blkno;
	return EOK;
}
258
259static errno_t hr_raid5_vol_usable(hr_volume_t *vol)
260{
261 if (vol->state == HR_VOL_ONLINE ||
262 vol->state == HR_VOL_DEGRADED ||
263 vol->state == HR_VOL_REBUILD)
264 return EOK;
265 return EIO;
266}
267
268/*
269 * Returns (-1) if all extents are online,
270 * else returns index of first bad one.
271 */
272static ssize_t hr_raid5_get_bad_ext(hr_volume_t *vol)
273{
274 for (size_t i = 0; i < vol->extent_no; i++)
275 if (vol->extents[i].state != HR_EXT_ONLINE)
276 return i;
277 return -1;
278}
279
280static errno_t hr_raid5_update_vol_state(hr_volume_t *vol)
281{
282 hr_vol_state_t old_state = vol->state;
283 size_t bad = 0;
284 for (size_t i = 0; i < vol->extent_no; i++)
285 if (vol->extents[i].state != HR_EXT_ONLINE)
286 bad++;
287
288 switch (bad) {
289 case 0:
290 if (old_state != HR_VOL_ONLINE)
291 hr_update_vol_state(vol, HR_VOL_ONLINE);
292 return EOK;
293 case 1:
294 if (old_state != HR_VOL_DEGRADED &&
295 old_state != HR_VOL_REBUILD) {
296
297 hr_update_vol_state(vol, HR_VOL_DEGRADED);
298
299 if (vol->hotspare_no > 0) {
300 fid_t fib = fibril_create(hr_raid5_rebuild,
301 vol);
302 if (fib == 0)
303 return ENOMEM;
304 fibril_start(fib);
305 fibril_detach(fib);
306 }
307 }
308 return EOK;
309 default:
310 if (old_state != HR_VOL_FAULTY)
311 hr_update_vol_state(vol, HR_VOL_FAULTY);
312 return EIO;
313 }
314}
315
/** XOR @a size bytes of @a src into @a dst, in-place.
 *
 * Operates on 64-bit words; @a size is expected to be a multiple of
 * sizeof(uint64_t) (block sizes are), any tail bytes are ignored.
 */
static void xor(void *dst, const void *src, size_t size)
{
	uint64_t *d = dst;
	const uint64_t *s = src;
	size_t words = size / sizeof(uint64_t);

	for (size_t i = 0; i < words; i++)
		d[i] ^= s[i];
}
325
326static errno_t hr_raid5_read_degraded(hr_volume_t *vol, uint64_t bad,
327 uint64_t block, void *data, size_t cnt)
328{
329 errno_t rc;
330 size_t i;
331 void *xorbuf;
332 void *buf;
333 uint64_t len = vol->bsize * cnt;
334
335 xorbuf = malloc(len);
336 if (xorbuf == NULL)
337 return ENOMEM;
338
339 buf = malloc(len);
340 if (buf == NULL) {
341 free(xorbuf);
342 return ENOMEM;
343 }
344
345 /* read all other extents in the stripe */
346 bool first = true;
347 for (i = 0; i < vol->extent_no; i++) {
348 if (i == bad)
349 continue;
350
351 if (first) {
352 rc = block_read_direct(vol->extents[i].svc_id, block,
353 cnt, xorbuf);
354 if (rc != EOK)
355 goto end;
356
357 first = false;
358 } else {
359 rc = block_read_direct(vol->extents[i].svc_id, block,
360 cnt, buf);
361 if (rc != EOK)
362 goto end;
363 xor(xorbuf, buf, len);
364 }
365 }
366
367 memcpy(data, xorbuf, len);
368end:
369 free(xorbuf);
370 free(buf);
371 return rc;
372}
373
/** Write one chunk of data to a stripe, keeping parity consistent.
 *
 * Three cases, depending on which extent (if any) is bad:
 *  - no bad extent, or the bad one is the parity extent: write the data
 *    first; if the parity extent is the bad one, skip the parity update
 *    entirely, otherwise recompute and write parity;
 *  - the target data extent is bad: reconstruct the new parity as the
 *    XOR of all other data extents and the new data, and write parity
 *    only (the data itself cannot be stored);
 *  - some other data extent is bad: read-modify-write, new parity =
 *    old data XOR old parity XOR new data.
 *
 * @param vol      Volume.
 * @param p_extent Index of the parity extent for this stripe.
 * @param extent   Index of the target data extent.
 * @param ba       Physical block address within the extents.
 * @param data     Data to write (cnt * vol->bsize bytes).
 * @param cnt      Number of blocks.
 *
 * @return EOK on success, ENOMEM on allocation failure, or a block
 *         layer error.
 */
static errno_t hr_raid5_write(hr_volume_t *vol, uint64_t p_extent,
    uint64_t extent, aoff64_t ba, const void *data, size_t cnt)
{
	errno_t rc;
	size_t i;
	void *xorbuf;
	void *buf;
	uint64_t len = vol->bsize * cnt;

	ssize_t bad = hr_raid5_get_bad_ext(vol);
	if (bad == -1 || (size_t)bad == p_extent) {
		rc = block_write_direct(vol->extents[extent].svc_id, ba, cnt,
		    data);
		if (rc != EOK)
			return rc;
		/*
		 * DEGRADED parity - skip parity write
		 */
		if ((size_t)bad == p_extent)
			return EOK;

		rc = hr_raid5_write_parity(vol, p_extent, extent, ba, data,
		    cnt);
		return rc;
	}

	xorbuf = malloc(len);
	if (xorbuf == NULL)
		return ENOMEM;

	buf = malloc(len);
	if (buf == NULL) {
		free(xorbuf);
		return ENOMEM;
	}

	if (extent == (size_t)bad) {
		/*
		 * new parity = read other and xor in new data
		 *
		 * write new parity
		 */
		/* seed xorbuf with the first surviving data extent */
		bool first = true;
		for (i = 0; i < vol->extent_no; i++) {
			if (i == (size_t)bad)
				continue;
			if (i == p_extent)
				continue;
			if (first) {
				rc = block_read_direct(vol->extents[i].svc_id,
				    ba, cnt, xorbuf);
				if (rc != EOK)
					goto end;

				first = false;
			} else {
				rc = block_read_direct(vol->extents[i].svc_id,
				    ba, cnt, buf);
				if (rc != EOK)
					goto end;
				xor(xorbuf, buf, len);
			}
		}
		/* fold in the new data to get the new parity */
		xor(xorbuf, data, len);
		rc = block_write_direct(vol->extents[p_extent].svc_id, ba, cnt,
		    xorbuf);
		if (rc != EOK)
			goto end;
	} else {
		/*
		 * new parity = xor original data and old parity and new data
		 *
		 * write parity, new data
		 */
		rc = block_read_direct(vol->extents[extent].svc_id, ba, cnt,
		    xorbuf);
		if (rc != EOK)
			goto end;
		rc = block_read_direct(vol->extents[p_extent].svc_id, ba, cnt,
		    buf);
		if (rc != EOK)
			goto end;

		/* cancel the old data out of the old parity... */
		xor(xorbuf, buf, len);

		/* ...and mix the new data in */
		xor(xorbuf, data, len);

		rc = block_write_direct(vol->extents[p_extent].svc_id, ba, cnt,
		    xorbuf);
		if (rc != EOK)
			goto end;
		rc = block_write_direct(vol->extents[extent].svc_id, ba, cnt,
		    data);
		if (rc != EOK)
			goto end;
	}
end:
	free(xorbuf);
	free(buf);
	return rc;
}
475
/** Recompute and write the parity block(s) of a stripe.
 *
 * Builds the parity as the XOR over all data extents, substituting the
 * caller-provided new @a data for extent @a extent instead of reading
 * the (now stale) on-disk content, then writes the result to the
 * parity extent.
 *
 * @param vol      Volume.
 * @param p_extent Index of the parity extent.
 * @param extent   Index of the data extent whose content is @a data.
 * @param block    Physical block address within the extents.
 * @param data     New data for @a extent (cnt * vol->bsize bytes).
 * @param cnt      Number of blocks.
 *
 * @return EOK on success, ENOMEM or a block layer error otherwise.
 */
static errno_t hr_raid5_write_parity(hr_volume_t *vol, uint64_t p_extent,
    uint64_t extent, uint64_t block, const void *data, size_t cnt)
{
	errno_t rc;
	size_t i;
	void *xorbuf;
	void *buf;
	uint64_t len = vol->bsize * cnt;

	xorbuf = malloc(len);
	if (xorbuf == NULL)
		return ENOMEM;

	buf = malloc(len);
	if (buf == NULL) {
		free(xorbuf);
		return ENOMEM;
	}

	/* the first data extent seeds xorbuf, the rest are folded in */
	bool first = true;
	for (i = 0; i < vol->extent_no; i++) {
		if (i == p_extent)
			continue;

		if (first) {
			if (i == extent) {
				/* use the new data, not the stale on-disk copy */
				memcpy(xorbuf, data, len);
			} else {
				rc = block_read_direct(vol->extents[i].svc_id,
				    block, cnt, xorbuf);
				if (rc != EOK)
					goto end;
			}

			first = false;
		} else {
			if (i == extent) {
				/* use the new data, not the stale on-disk copy */
				xor(xorbuf, data, len);
			} else {
				rc = block_read_direct(vol->extents[i].svc_id,
				    block, cnt, buf);
				if (rc != EOK)
					goto end;

				xor(xorbuf, buf, len);
			}
		}
	}

	rc = block_write_direct(vol->extents[p_extent].svc_id, block, cnt,
	    xorbuf);
end:
	free(xorbuf);
	free(buf);
	return rc;
}
532
533static errno_t hr_raid5_bd_op(hr_bd_op_type_t type, bd_srv_t *bd, aoff64_t ba,
534 size_t cnt, void *dst, const void *src, size_t size)
535{
536 hr_volume_t *vol = bd->srvs->sarg;
537 errno_t rc;
538 uint64_t phys_block, len;
539 size_t left;
540 const uint8_t *data_write = src;
541 uint8_t *data_read = dst;
542
543 /* propagate sync */
544 if (type == HR_BD_SYNC && ba == 0 && cnt == 0) {
545 hr_sync_all_extents(vol);
546 rc = hr_raid5_update_vol_state(vol);
547 return rc;
548 }
549
550 if (type == HR_BD_READ || type == HR_BD_WRITE)
551 if (size < cnt * vol->bsize)
552 return EINVAL;
553
554 rc = hr_check_ba_range(vol, cnt, ba);
555 if (rc != EOK)
556 return rc;
557
558 hr_layout_t layout = vol->layout;
559 hr_level_t level = vol->level;
560
561 uint64_t strip_size = vol->strip_size / vol->bsize; /* in blocks */
562 uint64_t stripe = (ba / strip_size); /* stripe number */
563
564 /* parity extent */
565 uint64_t p_extent;
566 if (level == HR_LVL_4 && layout == HR_LAYOUT_RAID4_0) {
567 p_extent = 0;
568 } else if (level == HR_LVL_4 && layout == HR_LAYOUT_RAID4_N) {
569 p_extent = vol->extent_no - 1;
570 } else if (level == HR_LVL_5 && layout == HR_LAYOUT_RAID5_0R) {
571 p_extent = (stripe / (vol->extent_no - 1)) % vol->extent_no;
572 } else if (level == HR_LVL_5 &&
573 (layout == HR_LAYOUT_RAID5_NR || layout == HR_LAYOUT_RAID5_NC)) {
574 p_extent = (vol->extent_no - 1) -
575 (stripe / (vol->extent_no - 1)) % vol->extent_no;
576 } else {
577 return EINVAL;
578 }
579
580 uint64_t extent;
581 if (level == HR_LVL_4 && layout == HR_LAYOUT_RAID4_0) {
582 extent = (stripe % (vol->extent_no - 1)) + 1;
583 } else if (level == HR_LVL_4 && layout == HR_LAYOUT_RAID4_N) {
584 extent = stripe % (vol->extent_no - 1);
585 } else if (level == HR_LVL_5 &&
586 (layout == HR_LAYOUT_RAID5_0R || layout == HR_LAYOUT_RAID5_NR)) {
587 if ((stripe % (vol->extent_no - 1)) < p_extent)
588 extent = stripe % (vol->extent_no - 1);
589 else
590 extent = (stripe % (vol->extent_no - 1)) + 1;
591 } else if (level == HR_LVL_5 && layout == HR_LAYOUT_RAID5_NC) {
592 extent =
593 ((stripe % (vol->extent_no - 1)) + p_extent + 1) %
594 vol->extent_no;
595 } else {
596 return EINVAL;
597 }
598
599 uint64_t ext_stripe = stripe / (vol->extent_no - 1); /* stripe level */
600 uint64_t strip_off = ba % strip_size; /* strip offset */
601
602 fibril_mutex_lock(&vol->lock);
603
604 rc = hr_raid5_vol_usable(vol);
605 if (rc != EOK) {
606 fibril_mutex_unlock(&vol->lock);
607 return EIO;
608 }
609
610 left = cnt;
611
612 fibril_rwlock_write_lock(&vol->states_lock);
613 while (left != 0) {
614 phys_block = ext_stripe * strip_size + strip_off;
615 cnt = min(left, strip_size - strip_off);
616 len = vol->bsize * cnt;
617 hr_add_ba_offset(vol, &phys_block);
618 switch (type) {
619 case HR_BD_SYNC:
620 if (vol->extents[extent].state != HR_EXT_ONLINE)
621 break;
622 rc = block_sync_cache(vol->extents[extent].svc_id,
623 phys_block, cnt);
624 /* allow unsupported sync */
625 if (rc == ENOTSUP)
626 rc = EOK;
627 break;
628 case HR_BD_READ:
629 retry_read:
630 ssize_t bad = hr_raid5_get_bad_ext(vol);
631 if (bad > -1 && extent == (size_t)bad) {
632 rc = hr_raid5_read_degraded(vol, bad,
633 phys_block, data_read, cnt);
634 } else {
635 rc = block_read_direct(vol->extents[extent].svc_id,
636 phys_block, cnt, data_read);
637 }
638 data_read += len;
639 break;
640 case HR_BD_WRITE:
641 retry_write:
642 rc = hr_raid5_write(vol, p_extent, extent, phys_block,
643 data_write, cnt);
644 data_write += len;
645 break;
646 default:
647 rc = EINVAL;
648 goto error;
649 }
650
651 if (rc == ENOMEM)
652 goto error;
653
654 hr_raid5_ext_state_cb(vol, extent, rc);
655
656 if (rc != EOK) {
657 rc = hr_raid5_update_vol_state(vol);
658 if (rc == EOK) {
659 /*
660 * State changed from ONLINE -> DEGRADED,
661 * rewind and retry
662 */
663 if (type == HR_BD_WRITE) {
664 data_write -= len;
665 goto retry_write;
666 } else if (type == HR_BD_WRITE) {
667 data_read -= len;
668 goto retry_read;
669 }
670 } else {
671 rc = EIO;
672 goto error;
673 }
674 }
675
676 left -= cnt;
677 strip_off = 0;
678 stripe++;
679
680 ext_stripe = stripe / (vol->extent_no - 1); /* stripe level */
681
682 if (level == HR_LVL_5 && layout == HR_LAYOUT_RAID5_0R) {
683 p_extent =
684 (stripe / (vol->extent_no - 1)) % vol->extent_no;
685 } else if (level == HR_LVL_5 &&
686 (layout == HR_LAYOUT_RAID5_NR || layout == HR_LAYOUT_RAID5_NC)) {
687 p_extent = (vol->extent_no - 1) -
688 (stripe / (vol->extent_no - 1)) % vol->extent_no;
689 }
690
691 if (level == HR_LVL_4 && layout == HR_LAYOUT_RAID4_0) {
692 extent = (stripe % (vol->extent_no - 1)) + 1;
693 } else if (level == HR_LVL_4 && layout == HR_LAYOUT_RAID4_N) {
694 extent = stripe % (vol->extent_no - 1);
695 } else if (level == HR_LVL_5 &&
696 (layout == HR_LAYOUT_RAID5_0R || layout == HR_LAYOUT_RAID5_NR)) {
697 if ((stripe % (vol->extent_no - 1)) < p_extent)
698 extent = stripe % (vol->extent_no - 1);
699 else
700 extent = (stripe % (vol->extent_no - 1)) + 1;
701 } else if (level == HR_LVL_5 && layout == HR_LAYOUT_RAID5_NC) {
702 extent =
703 ((stripe % (vol->extent_no - 1)) + p_extent + 1) %
704 vol->extent_no;
705 }
706 }
707
708error:
709 (void)hr_raid5_update_vol_state(vol);
710 fibril_rwlock_write_unlock(&vol->states_lock);
711 fibril_mutex_unlock(&vol->lock);
712 return rc;
713}
714
715static errno_t hr_raid5_rebuild(void *arg)
716{
717 HR_DEBUG("hr_raid5_rebuild()\n");
718
719 hr_volume_t *vol = arg;
720 errno_t rc = EOK;
721 void *buf = NULL, *xorbuf = NULL;
722
723 fibril_mutex_lock(&vol->lock);
724 fibril_rwlock_read_lock(&vol->extents_lock);
725 fibril_rwlock_write_lock(&vol->states_lock);
726
727 if (vol->hotspare_no == 0) {
728 HR_WARN("hr_raid5_rebuild(): no free hotspares on \"%s\", "
729 "aborting rebuild\n", vol->devname);
730 /* retval isn't checked for now */
731 goto end;
732 }
733
734 size_t bad = vol->extent_no;
735 for (size_t i = 0; i < vol->extent_no; i++) {
736 if (vol->extents[i].state == HR_EXT_FAILED) {
737 bad = i;
738 break;
739 }
740 }
741
742 if (bad == vol->extent_no) {
743 HR_WARN("hr_raid5_rebuild(): no bad extent on \"%s\", "
744 "aborting rebuild\n", vol->devname);
745 /* retval isn't checked for now */
746 goto end;
747 }
748
749 size_t hotspare_idx = vol->hotspare_no - 1;
750
751 hr_ext_state_t hs_state = vol->hotspares[hotspare_idx].state;
752 if (hs_state != HR_EXT_HOTSPARE) {
753 HR_ERROR("hr_raid5_rebuild(): invalid hotspare state \"%s\", "
754 "aborting rebuild\n", hr_get_ext_state_str(hs_state));
755 rc = EINVAL;
756 goto end;
757 }
758
759 HR_DEBUG("hr_raid5_rebuild(): swapping in hotspare\n");
760
761 block_fini(vol->extents[bad].svc_id);
762
763 vol->extents[bad].svc_id = vol->hotspares[hotspare_idx].svc_id;
764 hr_update_ext_state(vol, bad, HR_EXT_HOTSPARE);
765
766 vol->hotspares[hotspare_idx].svc_id = 0;
767 fibril_mutex_lock(&vol->hotspare_lock);
768 hr_update_hotspare_state(vol, hotspare_idx, HR_EXT_MISSING);
769 fibril_mutex_unlock(&vol->hotspare_lock);
770
771 vol->hotspare_no--;
772
773 hr_extent_t *rebuild_ext = &vol->extents[bad];
774
775 HR_DEBUG("hr_raid5_rebuild(): starting rebuild on (%" PRIun ")\n",
776 rebuild_ext->svc_id);
777
778 hr_update_ext_state(vol, bad, HR_EXT_REBUILD);
779 hr_update_vol_state(vol, HR_VOL_REBUILD);
780
781 uint64_t max_blks = DATA_XFER_LIMIT / vol->bsize;
782 uint64_t left = vol->data_blkno / (vol->extent_no - 1);
783 buf = malloc(max_blks * vol->bsize);
784 xorbuf = malloc(max_blks * vol->bsize);
785
786 uint64_t ba = 0, cnt;
787 hr_add_ba_offset(vol, &ba);
788
789 while (left != 0) {
790 cnt = min(left, max_blks);
791
792 /*
793 * Almost the same as read_degraded,
794 * but we don't want to allocate new
795 * xorbuf each blk rebuild batch.
796 */
797 bool first = true;
798 for (size_t i = 0; i < vol->extent_no; i++) {
799 if (i == bad)
800 continue;
801 if (first)
802 rc = block_read_direct(vol->extents[i].svc_id,
803 ba, cnt, xorbuf);
804 else
805 rc = block_read_direct(vol->extents[i].svc_id,
806 ba, cnt, buf);
807 if (rc != EOK) {
808 hr_raid5_ext_state_cb(vol, i, rc);
809 HR_ERROR("rebuild on \"%s\" (%" PRIun "), "
810 "failed due to a failed ONLINE extent, "
811 "number %zu\n",
812 vol->devname, vol->svc_id, i);
813 goto end;
814 }
815
816 if (!first)
817 xor(xorbuf, buf, cnt * vol->bsize);
818 else
819 first = false;
820 }
821
822 rc = block_write_direct(rebuild_ext->svc_id, ba, cnt, xorbuf);
823 if (rc != EOK) {
824 hr_raid5_ext_state_cb(vol, bad, rc);
825 HR_ERROR("rebuild on \"%s\" (%" PRIun "), failed due to "
826 "the rebuilt extent number %zu failing\n",
827 vol->devname, vol->svc_id, bad);
828 goto end;
829 }
830
831 ba += cnt;
832 left -= cnt;
833
834 /*
835 * Let other IO requests be served
836 * during rebuild.
837 */
838
839 /*
840 * fibril_rwlock_write_unlock(&vol->states_lock);
841 * fibril_mutex_unlock(&vol->lock);
842 * fibril_mutex_lock(&vol->lock);
843 * fibril_rwlock_write_lock(&vol->states_lock);
844 */
845 }
846
847 HR_DEBUG("hr_raid5_rebuild(): rebuild finished on \"%s\" (%" PRIun "), "
848 "extent number %zu\n", vol->devname, vol->svc_id, hotspare_idx);
849
850 hr_update_ext_state(vol, bad, HR_EXT_ONLINE);
851
852 fibril_rwlock_write_unlock(&vol->states_lock);
853 fibril_rwlock_read_unlock(&vol->extents_lock);
854 fibril_mutex_unlock(&vol->lock);
855
856 rc = vol->meta_ops->save(vol, WITH_STATE_CALLBACK);
857
858 fibril_mutex_lock(&vol->lock);
859 fibril_rwlock_read_lock(&vol->extents_lock);
860 fibril_rwlock_write_lock(&vol->states_lock);
861
862end:
863 (void)hr_raid5_update_vol_state(vol);
864
865 fibril_rwlock_write_unlock(&vol->states_lock);
866 fibril_rwlock_read_unlock(&vol->extents_lock);
867 fibril_mutex_unlock(&vol->lock);
868
869 if (buf != NULL)
870 free(buf);
871
872 if (xorbuf != NULL)
873 free(xorbuf);
874
875 return rc;
876}
877
878/** @}
879 */
Note: See TracBrowser for help on using the repository browser.