source: mainline/uspace/srv/bd/hr/raid5.c@ 8a65373

Last change on this file since 8a65373 was 8a65373, checked in by Miroslav Cimerman <mc@…>, 7 months ago

hr: move registering out of specific RAIDs

  • Property mode set to 100644
File size: 21.3 KB
Line 
1/*
2 * Copyright (c) 2024 Miroslav Cimerman
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** @addtogroup hr
30 * @{
31 */
32/**
33 * @file
34 */
35
36#include <abi/ipc/ipc.h>
37#include <bd_srv.h>
38#include <block.h>
39#include <errno.h>
40#include <hr.h>
41#include <io/log.h>
42#include <ipc/hr.h>
43#include <ipc/services.h>
44#include <loc.h>
45#include <mem.h>
46#include <task.h>
47#include <stdio.h>
48#include <stdlib.h>
49#include <str_error.h>
50
51#include "superblock.h"
52#include "util.h"
53#include "var.h"
54
55extern loc_srv_t *hr_srv;
56
57static errno_t hr_raid5_vol_usable(hr_volume_t *);
58static ssize_t hr_raid5_get_bad_ext(hr_volume_t *);
59static errno_t hr_raid5_update_vol_status(hr_volume_t *);
60static void hr_raid5_handle_extent_error(hr_volume_t *, size_t, errno_t);
61static void xor(void *, const void *, size_t);
62static errno_t hr_raid5_read_degraded(hr_volume_t *, uint64_t, uint64_t,
63 void *, size_t);
64static errno_t hr_raid5_write(hr_volume_t *, uint64_t, uint64_t, aoff64_t,
65 const void *, size_t);
66static errno_t hr_raid5_write_parity(hr_volume_t *, uint64_t, uint64_t,
67 uint64_t, const void *, size_t);
68static errno_t hr_raid5_bd_op(hr_bd_op_type_t, bd_srv_t *, aoff64_t, size_t,
69 void *, const void *, size_t);
70static errno_t hr_raid5_rebuild(void *);
71
72/* bdops */
73static errno_t hr_raid5_bd_open(bd_srvs_t *, bd_srv_t *);
74static errno_t hr_raid5_bd_close(bd_srv_t *);
75static errno_t hr_raid5_bd_read_blocks(bd_srv_t *, aoff64_t, size_t, void *,
76 size_t);
77static errno_t hr_raid5_bd_sync_cache(bd_srv_t *, aoff64_t, size_t);
78static errno_t hr_raid5_bd_write_blocks(bd_srv_t *, aoff64_t, size_t,
79 const void *, size_t);
80static errno_t hr_raid5_bd_get_block_size(bd_srv_t *, size_t *);
81static errno_t hr_raid5_bd_get_num_blocks(bd_srv_t *, aoff64_t *);
82
83static bd_ops_t hr_raid5_bd_ops = {
84 .open = hr_raid5_bd_open,
85 .close = hr_raid5_bd_close,
86 .sync_cache = hr_raid5_bd_sync_cache,
87 .read_blocks = hr_raid5_bd_read_blocks,
88 .write_blocks = hr_raid5_bd_write_blocks,
89 .get_block_size = hr_raid5_bd_get_block_size,
90 .get_num_blocks = hr_raid5_bd_get_num_blocks
91};
92
93errno_t hr_raid5_create(hr_volume_t *new_volume)
94{
95 assert(new_volume->level == HR_LVL_5 || new_volume->level == HR_LVL_4);
96
97 if (new_volume->extent_no < 3) {
98 HR_ERROR("RAID 5 array needs at least 3 devices\n");
99 return EINVAL;
100 }
101
102 fibril_rwlock_write_lock(&new_volume->states_lock);
103
104 errno_t rc = hr_raid5_update_vol_status(new_volume);
105 if (rc != EOK) {
106 fibril_rwlock_write_unlock(&new_volume->states_lock);
107 return rc;
108 }
109
110 bd_srvs_init(&new_volume->hr_bds);
111 new_volume->hr_bds.ops = &hr_raid5_bd_ops;
112 new_volume->hr_bds.sarg = new_volume;
113
114 fibril_rwlock_write_unlock(&new_volume->states_lock);
115
116 return EOK;
117}
118
119errno_t hr_raid5_init(hr_volume_t *vol)
120{
121 errno_t rc;
122 size_t bsize;
123 uint64_t total_blkno;
124
125 assert(vol->level == HR_LVL_5 || vol->level == HR_LVL_4);
126
127 rc = hr_check_devs(vol, &total_blkno, &bsize);
128 if (rc != EOK)
129 return rc;
130
131 vol->nblocks = total_blkno;
132 vol->bsize = bsize;
133 vol->data_offset = HR_DATA_OFF;
134 vol->data_blkno = vol->nblocks - (vol->data_offset * vol->extent_no) -
135 (vol->nblocks / vol->extent_no);
136 vol->strip_size = HR_STRIP_SIZE;
137
138 return EOK;
139}
140
/** React to an external state change event on the volume.
 *
 * Re-evaluates the overall volume state from the current extent states
 * under the volume lock and the states write lock; the result of the
 * evaluation is intentionally ignored here.
 */
void hr_raid5_status_event(hr_volume_t *vol)
{
	fibril_mutex_lock(&vol->lock);
	fibril_rwlock_write_lock(&vol->states_lock);
	(void)hr_raid5_update_vol_status(vol);
	fibril_rwlock_write_unlock(&vol->states_lock);
	fibril_mutex_unlock(&vol->lock);
}
149
150errno_t hr_raid5_add_hotspare(hr_volume_t *vol, service_id_t hotspare)
151{
152 HR_DEBUG("hr_raid5_add_hotspare()\n");
153
154 fibril_mutex_lock(&vol->lock);
155 fibril_mutex_lock(&vol->hotspare_lock);
156
157 if (vol->hotspare_no >= HR_MAX_HOTSPARES) {
158 HR_ERROR("hr_raid5_add_hotspare(): cannot add more hotspares "
159 "to \"%s\"\n", vol->devname);
160 fibril_mutex_unlock(&vol->lock);
161 return ELIMIT;
162 }
163
164 vol->hotspares[vol->hotspare_no].svc_id = hotspare;
165
166 vol->hotspare_no++;
167
168 hr_update_hotspare_status(vol, vol->hotspare_no - 1, HR_EXT_HOTSPARE);
169
170 /*
171 * If the volume is degraded, start rebuild right away.
172 */
173 if (vol->status == HR_VOL_DEGRADED) {
174 HR_DEBUG("hr_raid5_add_hotspare(): volume in DEGRADED state, "
175 "spawning new rebuild fibril\n");
176 fid_t fib = fibril_create(hr_raid5_rebuild, vol);
177 if (fib == 0) {
178 fibril_mutex_unlock(&vol->hotspare_lock);
179 fibril_mutex_unlock(&vol->lock);
180 return ENOMEM;
181 }
182 fibril_start(fib);
183 fibril_detach(fib);
184 }
185
186 fibril_mutex_unlock(&vol->hotspare_lock);
187 fibril_mutex_unlock(&vol->lock);
188
189 return EOK;
190}
191
192static errno_t hr_raid5_bd_open(bd_srvs_t *bds, bd_srv_t *bd)
193{
194 HR_DEBUG("%s()\n", __func__);
195
196 hr_volume_t *vol = bd->srvs->sarg;
197
198 atomic_fetch_add_explicit(&vol->open_cnt, 1, memory_order_relaxed);
199
200 return EOK;
201}
202
203static errno_t hr_raid5_bd_close(bd_srv_t *bd)
204{
205 HR_DEBUG("%s()\n", __func__);
206
207 hr_volume_t *vol = bd->srvs->sarg;
208
209 atomic_fetch_sub_explicit(&vol->open_cnt, 1, memory_order_relaxed);
210
211 return EOK;
212}
213
/** bd_ops sync_cache handler: forward to the common RAID 4/5 I/O path. */
static errno_t hr_raid5_bd_sync_cache(bd_srv_t *bd, aoff64_t ba, size_t cnt)
{
	return hr_raid5_bd_op(HR_BD_SYNC, bd, ba, cnt, NULL, NULL, 0);
}
218
/** bd_ops read_blocks handler: forward to the common RAID 4/5 I/O path. */
static errno_t hr_raid5_bd_read_blocks(bd_srv_t *bd, aoff64_t ba, size_t cnt,
    void *buf, size_t size)
{
	return hr_raid5_bd_op(HR_BD_READ, bd, ba, cnt, buf, NULL, size);
}
224
/** bd_ops write_blocks handler: forward to the common RAID 4/5 I/O path. */
static errno_t hr_raid5_bd_write_blocks(bd_srv_t *bd, aoff64_t ba, size_t cnt,
    const void *data, size_t size)
{
	return hr_raid5_bd_op(HR_BD_WRITE, bd, ba, cnt, NULL, data, size);
}
230
231static errno_t hr_raid5_bd_get_block_size(bd_srv_t *bd, size_t *rsize)
232{
233 hr_volume_t *vol = bd->srvs->sarg;
234
235 *rsize = vol->bsize;
236 return EOK;
237}
238
239static errno_t hr_raid5_bd_get_num_blocks(bd_srv_t *bd, aoff64_t *rnb)
240{
241 hr_volume_t *vol = bd->srvs->sarg;
242
243 *rnb = vol->data_blkno;
244 return EOK;
245}
246
247static errno_t hr_raid5_vol_usable(hr_volume_t *vol)
248{
249 if (vol->status == HR_VOL_ONLINE ||
250 vol->status == HR_VOL_DEGRADED ||
251 vol->status == HR_VOL_REBUILD)
252 return EOK;
253 return EIO;
254}
255
256/*
257 * Returns (-1) if all extents are online,
258 * else returns index of first bad one.
259 */
260static ssize_t hr_raid5_get_bad_ext(hr_volume_t *vol)
261{
262 for (size_t i = 0; i < vol->extent_no; i++)
263 if (vol->extents[i].status != HR_EXT_ONLINE)
264 return i;
265 return -1;
266}
267
268static errno_t hr_raid5_update_vol_status(hr_volume_t *vol)
269{
270 hr_vol_status_t old_state = vol->status;
271 size_t bad = 0;
272 for (size_t i = 0; i < vol->extent_no; i++)
273 if (vol->extents[i].status != HR_EXT_ONLINE)
274 bad++;
275
276 switch (bad) {
277 case 0:
278 if (old_state != HR_VOL_ONLINE)
279 hr_update_vol_status(vol, HR_VOL_ONLINE);
280 return EOK;
281 case 1:
282 if (old_state != HR_VOL_DEGRADED &&
283 old_state != HR_VOL_REBUILD) {
284
285 hr_update_vol_status(vol, HR_VOL_DEGRADED);
286
287 if (vol->hotspare_no > 0) {
288 fid_t fib = fibril_create(hr_raid5_rebuild,
289 vol);
290 if (fib == 0)
291 return ENOMEM;
292 fibril_start(fib);
293 fibril_detach(fib);
294 }
295 }
296 return EOK;
297 default:
298 if (old_state != HR_VOL_FAULTY)
299 hr_update_vol_status(vol, HR_VOL_FAULTY);
300 return EIO;
301 }
302}
303
304static void hr_raid5_handle_extent_error(hr_volume_t *vol, size_t extent,
305 errno_t rc)
306{
307 if (rc == ENOENT)
308 hr_update_ext_status(vol, extent, HR_EXT_MISSING);
309 else if (rc != EOK)
310 hr_update_ext_status(vol, extent, HR_EXT_FAILED);
311}
312
/** XOR @a src into @a dst, 64 bits at a time.
 *
 * @a size is assumed to be a multiple of sizeof(uint64_t); any
 * trailing remainder bytes are ignored.
 */
static void xor(void *dst, const void *src, size_t size)
{
	uint64_t *d = dst;
	const uint64_t *s = src;
	size_t words = size / sizeof(uint64_t);

	for (size_t i = 0; i < words; i++)
		d[i] ^= s[i];
}
322
/** Reconstruct data of an unreadable extent from the remaining extents.
 *
 * XORs together the corresponding blocks of every other extent in the
 * stripe; by the RAID 5 parity property the result equals the missing
 * data.
 *
 * @param vol   Volume to operate on.
 * @param bad   Index of the extent that cannot be read.
 * @param block Starting block address on the extents.
 * @param data  Output buffer for the reconstructed data.
 * @param cnt   Number of blocks to reconstruct.
 *
 * @return EOK on success, ENOMEM on allocation failure, or the error
 *         from a failed extent read.
 */
static errno_t hr_raid5_read_degraded(hr_volume_t *vol, uint64_t bad,
    uint64_t block, void *data, size_t cnt)
{
	errno_t rc;
	size_t i;
	void *xorbuf;
	void *buf;
	uint64_t len = vol->bsize * cnt;

	xorbuf = malloc(len);
	if (xorbuf == NULL)
		return ENOMEM;

	buf = malloc(len);
	if (buf == NULL) {
		free(xorbuf);
		return ENOMEM;
	}

	/* read all other extents in the stripe */
	bool first = true;
	for (i = 0; i < vol->extent_no; i++) {
		if (i == bad)
			continue;

		if (first) {
			/* the first healthy extent seeds the XOR accumulator */
			rc = block_read_direct(vol->extents[i].svc_id, block,
			    cnt, xorbuf);
			if (rc != EOK)
				goto end;

			first = false;
		} else {
			/* fold each further extent into the accumulator */
			rc = block_read_direct(vol->extents[i].svc_id, block,
			    cnt, buf);
			if (rc != EOK)
				goto end;
			xor(xorbuf, buf, len);
		}
	}

	/* accumulator now holds the reconstructed data */
	memcpy(data, xorbuf, len);
end:
	free(xorbuf);
	free(buf);
	return rc;
}
370
/** Write one strip of data plus updated parity, handling degraded modes.
 *
 * Three cases:
 * - all extents healthy, or only the parity extent is bad: write data
 *   directly (and parity, unless the parity extent is the bad one),
 * - the target data extent is bad: recompute parity from all other data
 *   extents plus the new data and write only the parity,
 * - some other data extent is bad: read-modify-write using old data,
 *   old parity and new data.
 *
 * @param vol      Volume to operate on.
 * @param p_extent Index of the parity extent for this stripe.
 * @param extent   Index of the data extent being written.
 * @param ba       Block address on the extents.
 * @param data     Data to write.
 * @param cnt      Number of blocks to write.
 *
 * @return EOK on success, ENOMEM on allocation failure, or the error
 *         from a failed extent access.
 */
static errno_t hr_raid5_write(hr_volume_t *vol, uint64_t p_extent,
    uint64_t extent, aoff64_t ba, const void *data, size_t cnt)
{
	errno_t rc;
	size_t i;
	void *xorbuf;
	void *buf;
	uint64_t len = vol->bsize * cnt;

	ssize_t bad = hr_raid5_get_bad_ext(vol);
	if (bad == -1 || (size_t)bad == p_extent) {
		/* healthy stripe (or only parity bad): plain data write */
		rc = block_write_direct(vol->extents[extent].svc_id, ba, cnt,
		    data);
		if (rc != EOK)
			return rc;
		/*
		 * DEGRADED parity - skip parity write
		 */
		if ((size_t)bad == p_extent)
			return EOK;

		rc = hr_raid5_write_parity(vol, p_extent, extent, ba, data,
		    cnt);
		return rc;
	}

	xorbuf = malloc(len);
	if (xorbuf == NULL)
		return ENOMEM;

	buf = malloc(len);
	if (buf == NULL) {
		free(xorbuf);
		return ENOMEM;
	}

	if (extent == (size_t)bad) {
		/*
		 * new parity = read other and xor in new data
		 *
		 * write new parity
		 */
		bool first = true;
		for (i = 0; i < vol->extent_no; i++) {
			/* skip both the bad extent and the parity extent */
			if (i == (size_t)bad)
				continue;
			if (i == p_extent)
				continue;
			if (first) {
				/* seed the XOR accumulator */
				rc = block_read_direct(vol->extents[i].svc_id,
				    ba, cnt, xorbuf);
				if (rc != EOK)
					goto end;

				first = false;
			} else {
				rc = block_read_direct(vol->extents[i].svc_id,
				    ba, cnt, buf);
				if (rc != EOK)
					goto end;
				xor(xorbuf, buf, len);
			}
		}
		/* fold in the new data; result is the new parity */
		xor(xorbuf, data, len);
		rc = block_write_direct(vol->extents[p_extent].svc_id, ba, cnt,
		    xorbuf);
		if (rc != EOK)
			goto end;
	} else {
		/*
		 * new parity = xor original data and old parity and new data
		 *
		 * write parity, new data
		 */
		rc = block_read_direct(vol->extents[extent].svc_id, ba, cnt,
		    xorbuf);
		if (rc != EOK)
			goto end;
		rc = block_read_direct(vol->extents[p_extent].svc_id, ba, cnt,
		    buf);
		if (rc != EOK)
			goto end;

		/* remove old data's contribution from the parity */
		xor(xorbuf, buf, len);

		/* add the new data's contribution */
		xor(xorbuf, data, len);

		rc = block_write_direct(vol->extents[p_extent].svc_id, ba, cnt,
		    xorbuf);
		if (rc != EOK)
			goto end;
		rc = block_write_direct(vol->extents[extent].svc_id, ba, cnt,
		    data);
		if (rc != EOK)
			goto end;
	}
end:
	free(xorbuf);
	free(buf);
	return rc;
}
472
/** Recompute and write the parity block for a stripe.
 *
 * XORs together the data of every data extent in the stripe, using the
 * caller-supplied @a data in place of the contents of @a extent (which
 * has just been written), and writes the result to the parity extent.
 *
 * @param vol      Volume to operate on.
 * @param p_extent Index of the parity extent.
 * @param extent   Index of the data extent that was just written.
 * @param block    Block address on the extents.
 * @param data     New data for @a extent.
 * @param cnt      Number of blocks.
 *
 * @return EOK on success, ENOMEM on allocation failure, or the error
 *         from a failed extent access.
 */
static errno_t hr_raid5_write_parity(hr_volume_t *vol, uint64_t p_extent,
    uint64_t extent, uint64_t block, const void *data, size_t cnt)
{
	errno_t rc;
	size_t i;
	void *xorbuf;
	void *buf;
	uint64_t len = vol->bsize * cnt;

	xorbuf = malloc(len);
	if (xorbuf == NULL)
		return ENOMEM;

	buf = malloc(len);
	if (buf == NULL) {
		free(xorbuf);
		return ENOMEM;
	}

	bool first = true;
	for (i = 0; i < vol->extent_no; i++) {
		if (i == p_extent)
			continue;

		if (first) {
			/* seed the accumulator: new data or on-disk data */
			if (i == extent) {
				memcpy(xorbuf, data, len);
			} else {
				rc = block_read_direct(vol->extents[i].svc_id,
				    block, cnt, xorbuf);
				if (rc != EOK)
					goto end;
			}

			first = false;
		} else {
			/* fold further extents into the accumulator */
			if (i == extent) {
				xor(xorbuf, data, len);
			} else {
				rc = block_read_direct(vol->extents[i].svc_id,
				    block, cnt, buf);
				if (rc != EOK)
					goto end;

				xor(xorbuf, buf, len);
			}
		}
	}

	/* accumulator now holds the new parity */
	rc = block_write_direct(vol->extents[p_extent].svc_id, block, cnt,
	    xorbuf);
end:
	free(xorbuf);
	free(buf);
	return rc;
}
529
530static errno_t hr_raid5_bd_op(hr_bd_op_type_t type, bd_srv_t *bd, aoff64_t ba,
531 size_t cnt, void *dst, const void *src, size_t size)
532{
533 hr_volume_t *vol = bd->srvs->sarg;
534 errno_t rc;
535 uint64_t phys_block, len;
536 size_t left;
537 const uint8_t *data_write = src;
538 uint8_t *data_read = dst;
539
540 /* propagate sync */
541 if (type == HR_BD_SYNC && ba == 0 && cnt == 0) {
542 hr_sync_all_extents(vol);
543 rc = hr_raid5_update_vol_status(vol);
544 return rc;
545 }
546
547 if (type == HR_BD_READ || type == HR_BD_WRITE)
548 if (size < cnt * vol->bsize)
549 return EINVAL;
550
551 rc = hr_check_ba_range(vol, cnt, ba);
552 if (rc != EOK)
553 return rc;
554
555 uint8_t layout = vol->layout;
556 hr_level_t level = vol->level;
557
558 uint64_t strip_size = vol->strip_size / vol->bsize; /* in blocks */
559 uint64_t stripe = (ba / strip_size); /* stripe number */
560
561 /* parity extent */
562 uint64_t p_extent;
563 if (level == HR_LVL_4 && layout == HR_RLQ_RAID4_0) {
564 p_extent = 0;
565 } else if (level == HR_LVL_4 && layout == HR_RLQ_RAID4_N) {
566 p_extent = vol->extent_no - 1;
567 } else if (level == HR_LVL_5 && layout == HR_RLQ_RAID5_0R) {
568 p_extent = (stripe / (vol->extent_no - 1)) % vol->extent_no;
569 } else if (level == HR_LVL_5 &&
570 (layout == HR_RLQ_RAID5_NR || layout == HR_RLQ_RAID5_NC)) {
571 p_extent = (vol->extent_no - 1) -
572 (stripe / (vol->extent_no - 1)) % vol->extent_no;
573 } else {
574 return EINVAL;
575 }
576
577 uint64_t extent;
578 if (level == HR_LVL_4 && layout == HR_RLQ_RAID4_0) {
579 extent = (stripe % (vol->extent_no - 1)) + 1;
580 } else if (level == HR_LVL_4 && layout == HR_RLQ_RAID4_N) {
581 extent = stripe % (vol->extent_no - 1);
582 } else if (level == HR_LVL_5 &&
583 (layout == HR_RLQ_RAID5_0R || layout == HR_RLQ_RAID5_NR)) {
584 if ((stripe % (vol->extent_no - 1)) < p_extent)
585 extent = stripe % (vol->extent_no - 1);
586 else
587 extent = (stripe % (vol->extent_no - 1)) + 1;
588 } else if (level == HR_LVL_5 && layout == HR_RLQ_RAID5_NC) {
589 extent =
590 ((stripe % (vol->extent_no - 1)) + p_extent + 1) %
591 vol->extent_no;
592 } else {
593 return EINVAL;
594 }
595
596 uint64_t ext_stripe = stripe / (vol->extent_no - 1); /* stripe level */
597 uint64_t strip_off = ba % strip_size; /* strip offset */
598
599 fibril_mutex_lock(&vol->lock);
600
601 rc = hr_raid5_vol_usable(vol);
602 if (rc != EOK) {
603 fibril_mutex_unlock(&vol->lock);
604 return EIO;
605 }
606
607 left = cnt;
608
609 fibril_rwlock_write_lock(&vol->states_lock);
610 while (left != 0) {
611 phys_block = ext_stripe * strip_size + strip_off;
612 cnt = min(left, strip_size - strip_off);
613 len = vol->bsize * cnt;
614 hr_add_ba_offset(vol, &phys_block);
615 switch (type) {
616 case HR_BD_SYNC:
617 if (vol->extents[extent].status != HR_EXT_ONLINE)
618 break;
619 rc = block_sync_cache(vol->extents[extent].svc_id,
620 phys_block, cnt);
621 /* allow unsupported sync */
622 if (rc == ENOTSUP)
623 rc = EOK;
624 break;
625 case HR_BD_READ:
626 retry_read:
627 ssize_t bad = hr_raid5_get_bad_ext(vol);
628 if (bad > -1 && extent == (size_t)bad) {
629 rc = hr_raid5_read_degraded(vol, bad,
630 phys_block, data_read, cnt);
631 } else {
632 rc = block_read_direct(vol->extents[extent].svc_id,
633 phys_block, cnt, data_read);
634 }
635 data_read += len;
636 break;
637 case HR_BD_WRITE:
638 retry_write:
639 rc = hr_raid5_write(vol, p_extent, extent, phys_block,
640 data_write, cnt);
641 data_write += len;
642 break;
643 default:
644 rc = EINVAL;
645 goto error;
646 }
647
648 if (rc == ENOMEM)
649 goto error;
650
651 hr_raid5_handle_extent_error(vol, extent, rc);
652
653 if (rc != EOK) {
654 rc = hr_raid5_update_vol_status(vol);
655 if (rc == EOK) {
656 /*
657 * State changed from ONLINE -> DEGRADED,
658 * rewind and retry
659 */
660 if (type == HR_BD_WRITE) {
661 data_write -= len;
662 goto retry_write;
663 } else if (type == HR_BD_WRITE) {
664 data_read -= len;
665 goto retry_read;
666 }
667 } else {
668 rc = EIO;
669 goto error;
670 }
671 }
672
673 left -= cnt;
674 strip_off = 0;
675 stripe++;
676
677 ext_stripe = stripe / (vol->extent_no - 1); /* stripe level */
678
679 if (level == HR_LVL_5 && layout == HR_RLQ_RAID5_0R) {
680 p_extent =
681 (stripe / (vol->extent_no - 1)) % vol->extent_no;
682 } else if (level == HR_LVL_5 &&
683 (layout == HR_RLQ_RAID5_NR || layout == HR_RLQ_RAID5_NC)) {
684 p_extent = (vol->extent_no - 1) -
685 (stripe / (vol->extent_no - 1)) % vol->extent_no;
686 }
687
688 if (level == HR_LVL_4 && layout == HR_RLQ_RAID4_0) {
689 extent = (stripe % (vol->extent_no - 1)) + 1;
690 } else if (level == HR_LVL_4 && layout == HR_RLQ_RAID4_N) {
691 extent = stripe % (vol->extent_no - 1);
692 } else if (level == HR_LVL_5 &&
693 (layout == HR_RLQ_RAID5_0R || layout == HR_RLQ_RAID5_NR)) {
694 if ((stripe % (vol->extent_no - 1)) < p_extent)
695 extent = stripe % (vol->extent_no - 1);
696 else
697 extent = (stripe % (vol->extent_no - 1)) + 1;
698 } else if (level == HR_LVL_5 && layout == HR_RLQ_RAID5_NC) {
699 extent =
700 ((stripe % (vol->extent_no - 1)) + p_extent + 1) %
701 vol->extent_no;
702 }
703 }
704
705error:
706 (void)hr_raid5_update_vol_status(vol);
707 fibril_rwlock_write_unlock(&vol->states_lock);
708 fibril_mutex_unlock(&vol->lock);
709 return rc;
710}
711
712static errno_t hr_raid5_rebuild(void *arg)
713{
714 HR_DEBUG("hr_raid5_rebuild()\n");
715
716 hr_volume_t *vol = arg;
717 errno_t rc = EOK;
718 void *buf = NULL, *xorbuf = NULL;
719
720 fibril_mutex_lock(&vol->lock);
721 fibril_rwlock_read_lock(&vol->extents_lock);
722 fibril_rwlock_write_lock(&vol->states_lock);
723
724 if (vol->hotspare_no == 0) {
725 HR_WARN("hr_raid5_rebuild(): no free hotspares on \"%s\", "
726 "aborting rebuild\n", vol->devname);
727 /* retval isn't checked for now */
728 goto end;
729 }
730
731 size_t bad = vol->extent_no;
732 for (size_t i = 0; i < vol->extent_no; i++) {
733 if (vol->extents[i].status == HR_EXT_FAILED) {
734 bad = i;
735 break;
736 }
737 }
738
739 if (bad == vol->extent_no) {
740 HR_WARN("hr_raid5_rebuild(): no bad extent on \"%s\", "
741 "aborting rebuild\n", vol->devname);
742 /* retval isn't checked for now */
743 goto end;
744 }
745
746 size_t hotspare_idx = vol->hotspare_no - 1;
747
748 hr_ext_status_t hs_state = vol->hotspares[hotspare_idx].status;
749 if (hs_state != HR_EXT_HOTSPARE) {
750 HR_ERROR("hr_raid5_rebuild(): invalid hotspare state \"%s\", "
751 "aborting rebuild\n", hr_get_ext_status_msg(hs_state));
752 rc = EINVAL;
753 goto end;
754 }
755
756 HR_DEBUG("hr_raid5_rebuild(): swapping in hotspare\n");
757
758 block_fini(vol->extents[bad].svc_id);
759
760 vol->extents[bad].svc_id = vol->hotspares[hotspare_idx].svc_id;
761 hr_update_ext_status(vol, bad, HR_EXT_HOTSPARE);
762
763 vol->hotspares[hotspare_idx].svc_id = 0;
764 fibril_mutex_lock(&vol->hotspare_lock);
765 hr_update_hotspare_status(vol, hotspare_idx, HR_EXT_MISSING);
766 fibril_mutex_unlock(&vol->hotspare_lock);
767
768 vol->hotspare_no--;
769
770 hr_extent_t *rebuild_ext = &vol->extents[bad];
771
772 rc = block_init(rebuild_ext->svc_id);
773 if (rc != EOK) {
774 HR_ERROR("hr_raid5_rebuild(): initing (%lu) failed, "
775 "aborting rebuild\n", rebuild_ext->svc_id);
776 goto end;
777 }
778
779 HR_DEBUG("hr_raid5_rebuild(): starting rebuild on (%lu)\n",
780 rebuild_ext->svc_id);
781
782 hr_update_ext_status(vol, bad, HR_EXT_REBUILD);
783 hr_update_vol_status(vol, HR_VOL_REBUILD);
784
785 uint64_t max_blks = DATA_XFER_LIMIT / vol->bsize;
786 uint64_t left = vol->data_blkno / (vol->extent_no - 1);
787 buf = malloc(max_blks * vol->bsize);
788 xorbuf = malloc(max_blks * vol->bsize);
789
790 uint64_t ba = 0, cnt;
791 hr_add_ba_offset(vol, &ba);
792
793 while (left != 0) {
794 cnt = min(left, max_blks);
795
796 /*
797 * Almost the same as read_degraded,
798 * but we don't want to allocate new
799 * xorbuf each blk rebuild batch.
800 */
801 bool first = true;
802 for (size_t i = 0; i < vol->extent_no; i++) {
803 if (i == bad)
804 continue;
805 if (first)
806 rc = block_read_direct(vol->extents[i].svc_id,
807 ba, cnt, xorbuf);
808 else
809 rc = block_read_direct(vol->extents[i].svc_id,
810 ba, cnt, buf);
811 if (rc != EOK) {
812 hr_raid5_handle_extent_error(vol, i, rc);
813 HR_ERROR("rebuild on \"%s\" (%lu), failed due "
814 "to a failed ONLINE extent, number %lu\n",
815 vol->devname, vol->svc_id, i);
816 goto end;
817 }
818
819 if (!first)
820 xor(xorbuf, buf, cnt * vol->bsize);
821 else
822 first = false;
823 }
824
825 rc = block_write_direct(rebuild_ext->svc_id, ba, cnt, xorbuf);
826 if (rc != EOK) {
827 hr_raid5_handle_extent_error(vol, bad, rc);
828 HR_ERROR("rebuild on \"%s\" (%lu), failed due to "
829 "the rebuilt extent number %lu failing\n",
830 vol->devname, vol->svc_id, bad);
831 goto end;
832 }
833
834 ba += cnt;
835 left -= cnt;
836
837 /*
838 * Let other IO requests be served
839 * during rebuild.
840 */
841 fibril_rwlock_write_unlock(&vol->states_lock);
842 fibril_mutex_unlock(&vol->lock);
843 fibril_mutex_lock(&vol->lock);
844 fibril_rwlock_write_lock(&vol->states_lock);
845 }
846
847 HR_DEBUG("hr_raid5_rebuild(): rebuild finished on \"%s\" (%lu), "
848 "extent number %lu\n", vol->devname, vol->svc_id, hotspare_idx);
849
850 hr_update_ext_status(vol, bad, HR_EXT_ONLINE);
851 /*
852 * For now write metadata at the end, because
853 * we don't sync metada accross extents yet.
854 */
855 hr_write_meta_to_ext(vol, bad);
856end:
857 (void)hr_raid5_update_vol_status(vol);
858
859 fibril_rwlock_write_unlock(&vol->states_lock);
860 fibril_rwlock_read_unlock(&vol->extents_lock);
861 fibril_mutex_unlock(&vol->lock);
862
863 if (buf != NULL)
864 free(buf);
865
866 if (xorbuf != NULL)
867 free(xorbuf);
868
869 return rc;
870}
871
872/** @}
873 */
Note: See TracBrowser for help on using the repository browser.