source: mainline/uspace/srv/bd/hr/raid1.c@ d773bea9

Last change on this file since d773bea9 was d773bea9, checked in by Miroslav Cimerman <mc@…>, 7 months ago

hr: RAID1: handle state edge cases in a rebuild

/*
 * Copyright (c) 2025 Miroslav Cimerman
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 * - The name of the author may not be used to endorse or promote products
 *   derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/** @addtogroup hr
 * @{
 */
/**
 * @file
 */

#include <bd_srv.h>
#include <block.h>
#include <errno.h>
#include <hr.h>
#include <inttypes.h>
#include <io/log.h>
#include <ipc/hr.h>
#include <ipc/services.h>
#include <loc.h>
#include <task.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <str_error.h>

#include "fge.h"
#include "io.h"
#include "superblock.h"
#include "util.h"
#include "var.h"

extern loc_srv_t *hr_srv;

static void process_deferred_invalidations(hr_volume_t *);
static void hr_raid1_update_vol_status(hr_volume_t *);
static void hr_raid1_ext_state_callback(hr_volume_t *, size_t, errno_t);
static size_t hr_raid1_count_good_extents(hr_volume_t *, uint64_t, size_t,
    uint64_t);
static errno_t hr_raid1_bd_op(hr_bd_op_type_t, bd_srv_t *, aoff64_t, size_t,
    void *, const void *, size_t);
static errno_t swap_hs(hr_volume_t *, size_t, size_t);
static errno_t init_rebuild(hr_volume_t *, size_t *);
static errno_t hr_raid1_rebuild(void *);

/* bdops */
static errno_t hr_raid1_bd_open(bd_srvs_t *, bd_srv_t *);
static errno_t hr_raid1_bd_close(bd_srv_t *);
static errno_t hr_raid1_bd_read_blocks(bd_srv_t *, aoff64_t, size_t, void *,
    size_t);
static errno_t hr_raid1_bd_sync_cache(bd_srv_t *, aoff64_t, size_t);
static errno_t hr_raid1_bd_write_blocks(bd_srv_t *, aoff64_t, size_t,
    const void *, size_t);
static errno_t hr_raid1_bd_get_block_size(bd_srv_t *, size_t *);
static errno_t hr_raid1_bd_get_num_blocks(bd_srv_t *, aoff64_t *);

static bd_ops_t hr_raid1_bd_ops = {
    .open = hr_raid1_bd_open,
    .close = hr_raid1_bd_close,
    .sync_cache = hr_raid1_bd_sync_cache,
    .read_blocks = hr_raid1_bd_read_blocks,
    .write_blocks = hr_raid1_bd_write_blocks,
    .get_block_size = hr_raid1_bd_get_block_size,
    .get_num_blocks = hr_raid1_bd_get_num_blocks
};

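/** Create a RAID 1 volume.
 *
 * Wires up the block device server callbacks, forces an initial
 * volume state evaluation and registers the volume with the
 * location service.
 *
 * @param new_volume Volume to create (level must be HR_LVL_1)
 *
 * @return EOK on success, EINVAL if the volume cannot be brought up
 */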
errno_t hr_raid1_create(hr_volume_t *new_volume)
{
    errno_t rc;

    assert(new_volume->level == HR_LVL_1);

    if (new_volume->extent_no < 2) {
        HR_ERROR("RAID 1 array needs at least 2 devices\n");
        return EINVAL;
    }

    bd_srvs_init(&new_volume->hr_bds);
    new_volume->hr_bds.ops = &hr_raid1_bd_ops;
    new_volume->hr_bds.sarg = new_volume;

    /* force volume state update */
    atomic_store(&new_volume->state_changed, true);
    hr_raid1_update_vol_status(new_volume);
    if (new_volume->status == HR_VOL_FAULTY)
        return EINVAL;

    rc = hr_register_volume(new_volume);

    return rc;
}

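/** Initialize the geometry of a RAID 1 volume.
 *
 * Every extent mirrors the same data, so the usable size is that of
 * a single extent minus the metadata area at HR_DATA_OFF.
 *
 * @param vol Volume to initialize
 *
 * @return EOK on success, an error code from hr_check_devs() otherwise
 */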
errno_t hr_raid1_init(hr_volume_t *vol)
{
    errno_t rc;
    size_t bsize;
    uint64_t total_blkno;

    assert(vol->level == HR_LVL_1);

    rc = hr_check_devs(vol, &total_blkno, &bsize);
    if (rc != EOK)
        return rc;

    vol->nblocks = total_blkno / vol->extent_no;
    vol->bsize = bsize;
    vol->data_offset = HR_DATA_OFF;
    vol->data_blkno = vol->nblocks - vol->data_offset;
    vol->strip_size = 0;

    return EOK;
}

void hr_raid1_status_event(hr_volume_t *vol)
{
    hr_raid1_update_vol_status(vol);
}

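/** Attach a hotspare device to a RAID 1 volume.
 *
 * Marks the new device HR_EXT_HOTSPARE and triggers a volume state
 * update, which may start a rebuild if the volume is degraded.
 *
 * @param vol      Volume to extend
 * @param hotspare Service id of the hotspare device
 *
 * @return EOK on success, ELIMIT if all hotspare slots are taken
 */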
errno_t hr_raid1_add_hotspare(hr_volume_t *vol, service_id_t hotspare)
{
    HR_DEBUG("hr_raid1_add_hotspare()\n");

    errno_t rc = EOK;

    fibril_mutex_lock(&vol->hotspare_lock);

    if (vol->hotspare_no >= HR_MAX_HOTSPARES) {
        HR_ERROR("hr_raid1_add_hotspare(): cannot add more hotspares "
            "to \"%s\"\n", vol->devname);
        rc = ELIMIT;
        goto error;
    }

    size_t hs_idx = vol->hotspare_no;

    vol->hotspare_no++;

    hr_update_hotspare_svc_id(vol, hs_idx, hotspare);
    hr_update_hotspare_status(vol, hs_idx, HR_EXT_HOTSPARE);

    atomic_store(&vol->state_changed, true);
error:
    fibril_mutex_unlock(&vol->hotspare_lock);

    hr_raid1_update_vol_status(vol);

    return rc;
}

static errno_t hr_raid1_bd_open(bd_srvs_t *bds, bd_srv_t *bd)
{
    HR_DEBUG("hr_bd_open()\n");
    return EOK;
}

static errno_t hr_raid1_bd_close(bd_srv_t *bd)
{
    HR_DEBUG("hr_bd_close()\n");
    return EOK;
}

static errno_t hr_raid1_bd_sync_cache(bd_srv_t *bd, aoff64_t ba, size_t cnt)
{
    return hr_raid1_bd_op(HR_BD_SYNC, bd, ba, cnt, NULL, NULL, 0);
}

static errno_t hr_raid1_bd_read_blocks(bd_srv_t *bd, aoff64_t ba, size_t cnt,
    void *buf, size_t size)
{
    return hr_raid1_bd_op(HR_BD_READ, bd, ba, cnt, buf, NULL, size);
}

static errno_t hr_raid1_bd_write_blocks(bd_srv_t *bd, aoff64_t ba, size_t cnt,
    const void *data, size_t size)
{
    return hr_raid1_bd_op(HR_BD_WRITE, bd, ba, cnt, NULL, data, size);
}

static errno_t hr_raid1_bd_get_block_size(bd_srv_t *bd, size_t *rsize)
{
    hr_volume_t *vol = bd->srvs->sarg;

    *rsize = vol->bsize;
    return EOK;
}

static errno_t hr_raid1_bd_get_num_blocks(bd_srv_t *bd, aoff64_t *rnb)
{
    hr_volume_t *vol = bd->srvs->sarg;

    *rnb = vol->data_blkno;
    return EOK;
}

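/** Move deferred-invalidated extents to the hotspare slots.
 *
 * Extents that were marked HR_EXT_INVALID during I/O are detached
 * here, outside the I/O hot path: each one is turned back into a
 * hotspare and its extent slot becomes HR_EXT_MISSING. Runs with
 * I/O halted and all volume locks held.
 */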
static void process_deferred_invalidations(hr_volume_t *vol)
{
    HR_DEBUG("process_deferred_invalidations()\n");

    fibril_mutex_lock(&vol->halt_lock);
    vol->halt_please = true;
    fibril_rwlock_write_lock(&vol->extents_lock);
    fibril_rwlock_write_lock(&vol->states_lock);
    fibril_mutex_lock(&vol->hotspare_lock);

    list_foreach(vol->deferred_invalidations_list, link,
        hr_deferred_invalidation_t, di) {
        assert(vol->extents[di->index].status == HR_EXT_INVALID);

        HR_DEBUG("moving invalidated extent no. %zu to hotspares\n",
            di->index);

        block_fini(di->svc_id);

        size_t hs_idx = vol->hotspare_no;

        vol->hotspare_no++;

        hr_update_hotspare_svc_id(vol, hs_idx, di->svc_id);
        hr_update_hotspare_status(vol, hs_idx, HR_EXT_HOTSPARE);

        hr_update_ext_svc_id(vol, di->index, 0);
        hr_update_ext_status(vol, di->index, HR_EXT_MISSING);

        assert(vol->hotspare_no < HR_MAX_HOTSPARES + HR_MAX_EXTENTS);
    }

    for (size_t i = 0; i < HR_MAX_EXTENTS; i++) {
        hr_deferred_invalidation_t *di = &vol->deferred_inval[i];
        if (di->svc_id != 0) {
            list_remove(&di->link);
            di->svc_id = 0;
        }
    }

    fibril_mutex_unlock(&vol->hotspare_lock);
    fibril_rwlock_write_unlock(&vol->states_lock);
    fibril_rwlock_write_unlock(&vol->extents_lock);
    vol->halt_please = false;
    fibril_mutex_unlock(&vol->halt_lock);
}

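/** Recompute the volume state from the extent states.
 *
 * Cheap no-op unless state_changed was set. Processes any pending
 * deferred invalidations first, then counts healthy extents: none
 * ONLINE means FAULTY, some missing means DEGRADED (and a rebuild
 * fibril is spawned when a hotspare is available), otherwise the
 * volume is ONLINE.
 */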
static void hr_raid1_update_vol_status(hr_volume_t *vol)
{
    bool exp = true;

    if (!atomic_compare_exchange_strong(&vol->state_changed, &exp, false))
        return;

    if (atomic_compare_exchange_strong(&vol->pending_invalidation, &exp,
        false)) {
        fibril_mutex_lock(&vol->deferred_list_lock);
        process_deferred_invalidations(vol);
        fibril_mutex_unlock(&vol->deferred_list_lock);
    }

    fibril_rwlock_read_lock(&vol->extents_lock);
    fibril_rwlock_read_lock(&vol->states_lock);

    hr_vol_status_t old_state = vol->status;
    size_t healthy = hr_count_extents(vol, HR_EXT_ONLINE);

    fibril_rwlock_read_unlock(&vol->states_lock);
    fibril_rwlock_read_unlock(&vol->extents_lock);

    if (healthy == 0) {
        if (old_state != HR_VOL_FAULTY) {
            fibril_rwlock_write_lock(&vol->states_lock);
            hr_update_vol_status(vol, HR_VOL_FAULTY);
            fibril_rwlock_write_unlock(&vol->states_lock);
        }
    } else if (healthy < vol->extent_no) {
        if (old_state != HR_VOL_REBUILD &&
            old_state != HR_VOL_DEGRADED) {
            fibril_rwlock_write_lock(&vol->states_lock);
            hr_update_vol_status(vol, HR_VOL_DEGRADED);
            fibril_rwlock_write_unlock(&vol->states_lock);
        }

        if (old_state != HR_VOL_REBUILD) {
            if (vol->hotspare_no > 0) {
                fid_t fib = fibril_create(hr_raid1_rebuild,
                    vol);
                if (fib == 0)
                    return;
                fibril_start(fib);
                fibril_detach(fib);
            }
        }
    } else {
        if (old_state != HR_VOL_ONLINE) {
            fibril_rwlock_write_lock(&vol->states_lock);
            hr_update_vol_status(vol, HR_VOL_ONLINE);
            fibril_rwlock_write_unlock(&vol->states_lock);
        }
    }
}

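/** Handle an I/O error on an extent.
 *
 * Must be called with the extents lock held. On ENOMEM the extent
 * is marked HR_EXT_INVALID and queued for deferred invalidation;
 * ENOENT marks it MISSING; any other error marks it FAILED.
 */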
static void hr_raid1_ext_state_callback(hr_volume_t *vol, size_t extent,
    errno_t rc)
{
    if (rc == EOK)
        return;

    assert(fibril_rwlock_is_locked(&vol->extents_lock));

    fibril_rwlock_write_lock(&vol->states_lock);

    switch (rc) {
    case ENOMEM:
        fibril_mutex_lock(&vol->deferred_list_lock);

        service_id_t invalid_svc_id = vol->extents[extent].svc_id;

        list_foreach(vol->deferred_invalidations_list, link,
            hr_deferred_invalidation_t, di) {
            if (di->svc_id == invalid_svc_id) {
                assert(vol->extents[extent].status ==
                    HR_EXT_INVALID);
                goto deferring_end;
            }
        }

        assert(vol->extents[extent].status != HR_EXT_INVALID);

        hr_update_ext_status(vol, extent, HR_EXT_INVALID);

        size_t i = list_count(&vol->deferred_invalidations_list);
        vol->deferred_inval[i].svc_id = invalid_svc_id;
        vol->deferred_inval[i].index = extent;

        list_append(&vol->deferred_inval[i].link,
            &vol->deferred_invalidations_list);

        atomic_store(&vol->pending_invalidation, true);
    deferring_end:

        fibril_mutex_unlock(&vol->deferred_list_lock);
        break;
    case ENOENT:
        hr_update_ext_status(vol, extent, HR_EXT_MISSING);
        break;
    default:
        hr_update_ext_status(vol, extent, HR_EXT_FAILED);
    }

    atomic_store(&vol->state_changed, true);

    fibril_rwlock_write_unlock(&vol->states_lock);
}

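/** Count extents that can serve the given block range.
 *
 * An extent qualifies if it is ONLINE, or if it is being rebuilt
 * and the range lies below the current rebuild position.
 */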
static size_t hr_raid1_count_good_extents(hr_volume_t *vol, uint64_t ba,
    size_t cnt, uint64_t rebuild_blk)
{
    assert(fibril_rwlock_is_locked(&vol->extents_lock));
    assert(fibril_rwlock_is_locked(&vol->states_lock));

    size_t count = 0;
    for (size_t i = 0; i < vol->extent_no; i++) {
        if (vol->extents[i].status == HR_EXT_ONLINE ||
            (vol->extents[i].status == HR_EXT_REBUILD &&
            ba < rebuild_blk)) {
            count++;
        }
    }

    return count;
}

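/** Common backend for all block device operations.
 *
 * Reads are served by the first extent that completes successfully.
 * Writes and cache syncs are fanned out to every usable extent
 * through a fibril group; the operation succeeds if at least one
 * extent completed it. Extents under rebuild participate only for
 * blocks below the current rebuild position.
 */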
static errno_t hr_raid1_bd_op(hr_bd_op_type_t type, bd_srv_t *bd, aoff64_t ba,
    size_t cnt, void *data_read, const void *data_write, size_t size)
{
    hr_volume_t *vol = bd->srvs->sarg;
    hr_range_lock_t *rl = NULL;
    errno_t rc;
    size_t i;
    uint64_t rebuild_blk;

    fibril_rwlock_read_lock(&vol->states_lock);
    hr_vol_status_t vol_state = vol->status;
    fibril_rwlock_read_unlock(&vol->states_lock);

    if (vol_state == HR_VOL_FAULTY)
        return EIO;

    if (type == HR_BD_READ || type == HR_BD_WRITE)
        if (size < cnt * vol->bsize)
            return EINVAL;

    rc = hr_check_ba_range(vol, cnt, ba);
    if (rc != EOK)
        return rc;

    /* allow full dev sync */
    if (type != HR_BD_SYNC || ba != 0)
        hr_add_ba_offset(vol, &ba);

    /*
     * This pause point allows adding a hotspare or starting a
     * rebuild on a very busy array. Because of how rwlocks are
     * implemented in HelenOS (no writer priority), a stream of
     * continuous readers would otherwise starve writers forever.
     */
    if (vol->halt_please) {
        fibril_mutex_lock(&vol->halt_lock);
        fibril_mutex_unlock(&vol->halt_lock);
    }

    /*
     * The extent order has to be locked for the whole duration
     * of the I/O, so that the workers have consistent targets.
     */
    fibril_rwlock_read_lock(&vol->extents_lock);

    size_t successful = 0;
    switch (type) {
    case HR_BD_READ:
        rebuild_blk = atomic_load_explicit(&vol->rebuild_blk,
            memory_order_relaxed);

        for (i = 0; i < vol->extent_no; i++) {
            fibril_rwlock_read_lock(&vol->states_lock);
            hr_ext_status_t state = vol->extents[i].status;
            fibril_rwlock_read_unlock(&vol->states_lock);

            if (state != HR_EXT_ONLINE &&
                (state != HR_EXT_REBUILD ||
                ba + cnt - 1 >= rebuild_blk)) {
                continue;
            }

            rc = block_read_direct(vol->extents[i].svc_id, ba, cnt,
                data_read);

            if (rc == ENOMEM && i + 1 == vol->extent_no)
                goto end;

            if (rc == ENOMEM)
                continue;

            if (rc != EOK) {
                hr_raid1_ext_state_callback(vol, i, rc);
            } else {
                successful++;
                break;
            }
        }
        break;
    case HR_BD_SYNC:
    case HR_BD_WRITE:
        if (type == HR_BD_WRITE) {
            rl = hr_range_lock_acquire(vol, ba, cnt);
            if (rl == NULL) {
                rc = ENOMEM;
                goto end;
            }
        }

        fibril_rwlock_read_lock(&vol->states_lock);

        rebuild_blk = atomic_load_explicit(&vol->rebuild_blk,
            memory_order_relaxed);

        size_t good = hr_raid1_count_good_extents(vol, ba, cnt,
            rebuild_blk);

        hr_fgroup_t *group = hr_fgroup_create(vol->fge, good);
        if (group == NULL) {
            if (type == HR_BD_WRITE)
                hr_range_lock_release(rl);
            rc = ENOMEM;
            fibril_rwlock_read_unlock(&vol->states_lock);
            goto end;
        }

        for (i = 0; i < vol->extent_no; i++) {
            if (vol->extents[i].status != HR_EXT_ONLINE &&
                (vol->extents[i].status != HR_EXT_REBUILD ||
                ba >= rebuild_blk)) {
                /*
                 * When the extent is being rebuilt,
                 * we only write to the part that is
                 * already rebuilt. If the I/O starts
                 * after vol->rebuild_blk, we do not
                 * proceed; the write will be replicated
                 * later by the rebuild itself.
                 */
                continue;
            }

            hr_io_t *io = hr_fgroup_alloc(group);
            io->extent = i;
            io->data_write = data_write;
            io->data_read = data_read;
            io->ba = ba;
            io->cnt = cnt;
            io->type = type;
            io->vol = vol;
            io->state_callback = hr_raid1_ext_state_callback;

            hr_fgroup_submit(group, hr_io_worker, io);
        }

        fibril_rwlock_read_unlock(&vol->states_lock);

        (void)hr_fgroup_wait(group, &successful, NULL);

        if (type == HR_BD_WRITE)
            hr_range_lock_release(rl);

        break;
    default:
        rc = EINVAL;
        goto end;
    }

    if (successful > 0)
        rc = EOK;
    else
        rc = EIO;

end:
    fibril_rwlock_read_unlock(&vol->extents_lock);

    hr_raid1_update_vol_status(vol);

    return rc;
}

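/** Swap a hotspare into a failed extent slot.
 *
 * Opens the hotspare's block device, installs it in place of the
 * bad extent and closes the faulty device. The new extent stays in
 * the HR_EXT_HOTSPARE state until the rebuild marks it
 * HR_EXT_REBUILD.
 *
 * @param vol Volume to modify
 * @param bad Index of the failed extent
 * @param hs  Index of the hotspare to swap in
 *
 * @return EOK on success, an error code from block_init() otherwise
 */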
static errno_t swap_hs(hr_volume_t *vol, size_t bad, size_t hs)
{
    HR_DEBUG("hr_raid1_rebuild(): swapping in hotspare\n");

    service_id_t faulty_svc_id = vol->extents[bad].svc_id;
    service_id_t hs_svc_id = vol->hotspares[hs].svc_id;

    errno_t rc = block_init(hs_svc_id);
    if (rc != EOK) {
        HR_ERROR("hr_raid1_rebuild(): initing hotspare (%" PRIun ") "
            "failed\n", hs_svc_id);
        return rc;
    }

    hr_update_ext_svc_id(vol, bad, hs_svc_id);
    hr_update_ext_status(vol, bad, HR_EXT_HOTSPARE);

    hr_update_hotspare_svc_id(vol, hs, 0);
    hr_update_hotspare_status(vol, hs, HR_EXT_INVALID);

    vol->hotspare_no--;

    if (faulty_svc_id != 0)
        block_fini(faulty_svc_id);

    return EOK;
}

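/** Prepare a rebuild: pick the bad extent and swap in a hotspare.
 *
 * Halts I/O, swaps the last attached hotspare into the first extent
 * that is not ONLINE and moves the volume to the REBUILD state. On
 * success *rebuild_idx holds the index of the extent to reconstruct.
 */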
static errno_t init_rebuild(hr_volume_t *vol, size_t *rebuild_idx)
{
    errno_t rc = EOK;

    fibril_mutex_lock(&vol->halt_lock);
    vol->halt_please = true;
    fibril_rwlock_write_lock(&vol->extents_lock);
    fibril_rwlock_write_lock(&vol->states_lock);
    fibril_mutex_lock(&vol->hotspare_lock);

    if (vol->hotspare_no == 0) {
        HR_WARN("hr_raid1_rebuild(): no free hotspares on \"%s\", "
            "aborting rebuild\n", vol->devname);
        rc = EINVAL;
        goto error;
    }

    size_t bad = vol->extent_no;
    for (size_t i = 0; i < vol->extent_no; i++) {
        if (vol->extents[i].status != HR_EXT_ONLINE) {
            bad = i;
            break;
        }
    }

    if (bad == vol->extent_no) {
        HR_WARN("hr_raid1_rebuild(): no bad extent on \"%s\", "
            "aborting rebuild\n", vol->devname);
        rc = EINVAL;
        goto error;
    }

    size_t hotspare_idx = vol->hotspare_no - 1;

    hr_ext_status_t hs_state = vol->hotspares[hotspare_idx].status;
    if (hs_state != HR_EXT_HOTSPARE) {
        HR_ERROR("hr_raid1_rebuild(): invalid hotspare state \"%s\", "
            "aborting rebuild\n", hr_get_ext_status_msg(hs_state));
        rc = EINVAL;
        goto error;
    }

    rc = swap_hs(vol, bad, hotspare_idx);
    if (rc != EOK) {
        HR_ERROR("hr_raid1_rebuild(): swapping hotspare failed, "
            "aborting rebuild\n");
        goto error;
    }

    hr_extent_t *rebuild_ext = &vol->extents[bad];

    HR_DEBUG("hr_raid1_rebuild(): starting REBUILD on extent no. %zu "
        "(%" PRIun ")\n", bad, rebuild_ext->svc_id);

    atomic_store_explicit(&vol->rebuild_blk, 0, memory_order_relaxed);

    hr_update_ext_status(vol, bad, HR_EXT_REBUILD);
    hr_update_vol_status(vol, HR_VOL_REBUILD);

    *rebuild_idx = bad;
error:
    fibril_mutex_unlock(&vol->hotspare_lock);
    fibril_rwlock_write_unlock(&vol->states_lock);
    fibril_rwlock_write_unlock(&vol->extents_lock);
    vol->halt_please = false;
    fibril_mutex_unlock(&vol->halt_lock);

    return rc;
}

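/** Restore one block range onto the extent being rebuilt.
 *
 * Reads the blocks from the first healthy extent that can supply
 * them and writes them to the rebuild target. A failed read falls
 * through to the next extent; running out of source extents, or a
 * failed write to the target, aborts the rebuild.
 */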
static errno_t hr_raid1_restore_blocks(hr_volume_t *vol, size_t rebuild_idx,
    uint64_t ba, size_t cnt, void *buf)
{
    HR_DEBUG("REBUILD restoring blocks (ba: %" PRIu64 ", cnt: %zu)\n",
        ba, cnt);

    assert(fibril_rwlock_is_locked(&vol->extents_lock));

    errno_t rc = ENOENT;
    hr_extent_t *ext, *rebuild_ext = &vol->extents[rebuild_idx];

    for (size_t i = 0; i < vol->extent_no; i++) {
        fibril_rwlock_read_lock(&vol->states_lock);

        ext = &vol->extents[i];
        if (ext->status != HR_EXT_ONLINE) {
            /* don't leak the states lock when skipping the extent */
            fibril_rwlock_read_unlock(&vol->states_lock);
            continue;
        }

        fibril_rwlock_read_unlock(&vol->states_lock);

        rc = block_read_direct(ext->svc_id, ba, cnt, buf);
        if (rc == EOK)
            break;

        if (rc != ENOMEM)
            hr_raid1_ext_state_callback(vol, i, rc);

        if (i + 1 >= vol->extent_no) {
            if (rc != ENOMEM) {
                HR_ERROR("rebuild on \"%s\" (%" PRIun ") failed "
                    "due to too many failed extents\n",
                    vol->devname, vol->svc_id);
            }

            /* for now we have to invalidate the rebuild extent */
            if (rc == ENOMEM) {
                HR_ERROR("rebuild on \"%s\" (%" PRIun ") failed "
                    "due to too many failed reads, caused by "
                    "a lack of memory\n",
                    vol->devname, vol->svc_id);
                hr_raid1_ext_state_callback(vol, rebuild_idx,
                    ENOMEM);
            }

            return rc;
        }
    }

    rc = block_write_direct(rebuild_ext->svc_id, ba, cnt, buf);
    if (rc != EOK) {
        /*
         * We don't handle ENOMEM specially here, because maybe
         * in the future there is going to be M_WAITOK, or we
         * are going to wait for more memory, so that we don't
         * have to invalidate the extent...
         *
         * XXX: for now we do
         */
        hr_raid1_ext_state_callback(vol, rebuild_idx, rc);

        HR_ERROR("rebuild on \"%s\" (%" PRIun ") failed due to "
            "a WRITE to the rebuilt extent no. %zu (rc: %s)\n",
            vol->devname, vol->svc_id, rebuild_idx, str_error(rc));

        return rc;
    }

    return EOK;
}

/*
 * Put the last HOTSPARE extent in place of the first extent
 * that is not ONLINE, and start the rebuild.
 */
static errno_t hr_raid1_rebuild(void *arg)
{
    HR_DEBUG("hr_raid1_rebuild()\n");

    hr_volume_t *vol = arg;
    void *buf = NULL;
    size_t rebuild_idx;
    errno_t rc;

    rc = init_rebuild(vol, &rebuild_idx);
    if (rc != EOK)
        return rc;

    size_t left = vol->data_blkno;
    size_t max_blks = DATA_XFER_LIMIT / vol->bsize;

    size_t cnt;
    uint64_t ba = 0;
    hr_add_ba_offset(vol, &ba);

    fibril_rwlock_read_lock(&vol->extents_lock);

    hr_range_lock_t *rl = NULL;

    /*
     * Allocate after taking the extents lock, so that an allocation
     * failure can bail out through the common exit path (which
     * expects the lock to be held).
     */
    buf = malloc(max_blks * vol->bsize);
    if (buf == NULL) {
        rc = ENOMEM;
        goto end;
    }

    while (left != 0) {
        if (vol->halt_please) {
            fibril_rwlock_read_unlock(&vol->extents_lock);
            fibril_mutex_lock(&vol->halt_lock);
            fibril_mutex_unlock(&vol->halt_lock);
            fibril_rwlock_read_lock(&vol->extents_lock);
        }

        cnt = min(max_blks, left);

        rl = hr_range_lock_acquire(vol, ba, cnt);
        if (rl == NULL) {
            rc = ENOMEM;
            goto end;
        }

        atomic_store_explicit(&vol->rebuild_blk, ba,
            memory_order_relaxed);

        rc = hr_raid1_restore_blocks(vol, rebuild_idx, ba, cnt, buf);

        hr_range_lock_release(rl);

        if (rc != EOK)
            goto end;

        ba += cnt;
        left -= cnt;
    }

    HR_DEBUG("hr_raid1_rebuild(): rebuild finished on \"%s\" "
        "(%" PRIun "), extent no. %zu\n", vol->devname, vol->svc_id,
        rebuild_idx);

    fibril_rwlock_write_lock(&vol->states_lock);

    hr_update_ext_status(vol, rebuild_idx, HR_EXT_ONLINE);
    /*
     * We can be optimistic here; if some extents are still
     * INVALID, FAULTY or MISSING, the volume update function
     * will pick them up and set the volume state accordingly.
     */
    hr_update_vol_status(vol, HR_VOL_ONLINE);
    atomic_store(&vol->state_changed, true);

    fibril_rwlock_write_unlock(&vol->states_lock);

    /*
     * For now, write the metadata at the end, because we
     * don't sync metadata across extents yet.
     */
    hr_write_meta_to_ext(vol, rebuild_idx);
end:
    if (rc != EOK) {
        /*
         * We can fail because:
         * - the rebuild extent failed or got invalidated,
         * - there are no ONLINE extents (the volume is FAULTY),
         * - we got ENOMEM on all reads (for now this also
         *   invalidates the rebuild extent).
         */
        fibril_rwlock_write_lock(&vol->states_lock);
        hr_update_vol_status(vol, HR_VOL_DEGRADED);
        atomic_store(&vol->state_changed, true);
        fibril_rwlock_write_unlock(&vol->states_lock);
    }

    fibril_rwlock_read_unlock(&vol->extents_lock);

    hr_raid1_update_vol_status(vol);

    if (buf != NULL)
        free(buf);

    return rc;
}


/** @}
 */