source: mainline/uspace/srv/bd/hr/raid5.c@ f0360ec

Last change on this file since f0360ec was f0360ec, checked in by Miroslav Cimerman <mc@…>, 7 weeks ago

hr: RAID 0, 1: use ENOMEM safe primitives

  • Property mode set to 100644
File size: 22.4 KB
RevLine 
[dceb6e7]1/*
[746e636]2 * Copyright (c) 2025 Miroslav Cimerman
[dceb6e7]3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** @addtogroup hr
30 * @{
31 */
32/**
33 * @file
34 */
35
36#include <abi/ipc/ipc.h>
37#include <bd_srv.h>
38#include <block.h>
39#include <errno.h>
40#include <hr.h>
[ca7fa5b]41#include <inttypes.h>
[dceb6e7]42#include <io/log.h>
43#include <ipc/hr.h>
44#include <ipc/services.h>
45#include <loc.h>
[978130a]46#include <mem.h>
[dceb6e7]47#include <task.h>
48#include <stdio.h>
49#include <stdlib.h>
50#include <str_error.h>
51
[a3486f2]52#include "io.h"
53#include "parity_stripe.h"
[dceb6e7]54#include "superblock.h"
55#include "util.h"
56#include "var.h"
57
[a3486f2]58static void hr_raid5_vol_state_eval_forced(hr_volume_t *);
59
60static size_t hr_raid5_parity_extent(hr_level_t, hr_layout_t, size_t,
61 uint64_t);
62static size_t hr_raid5_data_extent(hr_level_t, hr_layout_t, size_t, uint64_t,
63 uint64_t);
[6f13257]64
65static errno_t hr_raid5_rebuild(void *);
[733564a]66
67/* bdops */
[6f13257]68static errno_t hr_raid5_bd_open(bd_srvs_t *, bd_srv_t *);
69static errno_t hr_raid5_bd_close(bd_srv_t *);
70static errno_t hr_raid5_bd_read_blocks(bd_srv_t *, aoff64_t, size_t, void *,
[dceb6e7]71 size_t);
[6f13257]72static errno_t hr_raid5_bd_sync_cache(bd_srv_t *, aoff64_t, size_t);
73static errno_t hr_raid5_bd_write_blocks(bd_srv_t *, aoff64_t, size_t,
[dceb6e7]74 const void *, size_t);
[6f13257]75static errno_t hr_raid5_bd_get_block_size(bd_srv_t *, size_t *);
76static errno_t hr_raid5_bd_get_num_blocks(bd_srv_t *, aoff64_t *);
[dceb6e7]77
78static bd_ops_t hr_raid5_bd_ops = {
[6f13257]79 .open = hr_raid5_bd_open,
80 .close = hr_raid5_bd_close,
81 .sync_cache = hr_raid5_bd_sync_cache,
82 .read_blocks = hr_raid5_bd_read_blocks,
83 .write_blocks = hr_raid5_bd_write_blocks,
84 .get_block_size = hr_raid5_bd_get_block_size,
85 .get_num_blocks = hr_raid5_bd_get_num_blocks
[dceb6e7]86};
87
[6d0fc11]88extern loc_srv_t *hr_srv;
89
[733564a]90errno_t hr_raid5_create(hr_volume_t *new_volume)
91{
[baa4929]92 HR_DEBUG("%s()", __func__);
93
[b5c95da5]94 if (new_volume->level != HR_LVL_5 && new_volume->level != HR_LVL_4)
95 return EINVAL;
[733564a]96
[65706f1]97 if (new_volume->extent_no < 3) {
[af73327a]98 HR_ERROR("RAID 5 volume needs at least 3 devices\n");
[733564a]99 return EINVAL;
100 }
101
102 bd_srvs_init(&new_volume->hr_bds);
103 new_volume->hr_bds.ops = &hr_raid5_bd_ops;
104 new_volume->hr_bds.sarg = new_volume;
105
[a3486f2]106 hr_raid5_vol_state_eval_forced(new_volume);
107
108 fibril_rwlock_read_lock(&new_volume->states_lock);
109 hr_vol_state_t state = new_volume->state;
110 fibril_rwlock_read_unlock(&new_volume->states_lock);
111 if (state == HR_VOL_FAULTY || state == HR_VOL_NONE) {
112 HR_NOTE("\"%s\": unusable state, not creating\n",
113 new_volume->devname);
114 return EINVAL;
115 }
[f1be66bf]116
[8a65373]117 return EOK;
[733564a]118}
119
[746e636]120/*
121 * Called only once in volume's lifetime.
122 */
[733564a]123errno_t hr_raid5_init(hr_volume_t *vol)
124{
[baa4929]125 HR_DEBUG("%s()", __func__);
[733564a]126
[b5c95da5]127 if (vol->level != HR_LVL_5 && vol->level != HR_LVL_4)
128 return EINVAL;
[733564a]129
[50603405]130 vol->data_offset = vol->meta_ops->get_data_offset();
[baa4929]131
[a3486f2]132 uint64_t single_sz = vol->truncated_blkno - vol->meta_ops->get_size();
133 vol->data_blkno = single_sz * (vol->extent_no - 1);
[baa4929]134
[733564a]135 vol->strip_size = HR_STRIP_SIZE;
136
[9ee9c60b]137 if (vol->level == HR_LVL_4)
138 vol->layout = HR_LAYOUT_RAID4_N;
139 else
140 vol->layout = HR_LAYOUT_RAID5_NR;
[1cfce3f]141
[733564a]142 return EOK;
143}
144
[da80de9]145void hr_raid5_vol_state_eval(hr_volume_t *vol)
[7b359f5]146{
[a3486f2]147 HR_DEBUG("%s()", __func__);
148
149 bool exp = true;
150 if (!atomic_compare_exchange_strong(&vol->state_dirty, &exp, false))
151 return;
152
153 vol->meta_ops->inc_counter(vol);
154 (void)vol->meta_ops->save(vol, WITH_STATE_CALLBACK);
155
156 hr_raid5_vol_state_eval_forced(vol);
[7b359f5]157}
158
[aa7864b]159errno_t hr_raid5_add_hotspare(hr_volume_t *vol, service_id_t hotspare)
160{
[56214383]161 HR_DEBUG("%s()", __func__);
[aa7864b]162
[56214383]163 errno_t rc = hr_util_add_hotspare(vol, hotspare);
[f1be66bf]164
[a3486f2]165 hr_raid5_vol_state_eval(vol);
[aa7864b]166
[56214383]167 return rc;
[aa7864b]168}
169
[a3486f2]170void hr_raid5_ext_state_cb(hr_volume_t *vol, size_t extent, errno_t rc)
[da80de9]171{
[a3486f2]172 HR_DEBUG("%s()", __func__);
173
174 assert(fibril_rwlock_is_locked(&vol->extents_lock));
175
176 if (rc == EOK)
177 return;
178
179 fibril_rwlock_write_lock(&vol->states_lock);
180
181 switch (rc) {
182 case ENOENT:
[da80de9]183 hr_update_ext_state(vol, extent, HR_EXT_MISSING);
[a3486f2]184 break;
185 default:
[da80de9]186 hr_update_ext_state(vol, extent, HR_EXT_FAILED);
[a3486f2]187 }
188
189 hr_mark_vol_state_dirty(vol);
190
191 fibril_rwlock_write_unlock(&vol->states_lock);
[da80de9]192}
193
[733564a]194static errno_t hr_raid5_bd_open(bd_srvs_t *bds, bd_srv_t *bd)
195{
[7a80c63]196 HR_DEBUG("%s()\n", __func__);
197
198 hr_volume_t *vol = bd->srvs->sarg;
199
200 atomic_fetch_add_explicit(&vol->open_cnt, 1, memory_order_relaxed);
201
[733564a]202 return EOK;
203}
204
205static errno_t hr_raid5_bd_close(bd_srv_t *bd)
206{
[7a80c63]207 HR_DEBUG("%s()\n", __func__);
208
209 hr_volume_t *vol = bd->srvs->sarg;
210
211 atomic_fetch_sub_explicit(&vol->open_cnt, 1, memory_order_relaxed);
212
[733564a]213 return EOK;
214}
215
216static errno_t hr_raid5_bd_sync_cache(bd_srv_t *bd, aoff64_t ba, size_t cnt)
217{
[137f7cf5]218 hr_volume_t *vol = bd->srvs->sarg;
219
220 return hr_sync_extents(vol);
[733564a]221}
222
[a3486f2]223static errno_t hr_raid5_bd_read_blocks(bd_srv_t *bd, uint64_t ba, size_t cnt,
224 void *data_read, size_t size)
[733564a]225{
226 hr_volume_t *vol = bd->srvs->sarg;
[a3486f2]227 errno_t rc;
[733564a]228
[a3486f2]229 if (size < cnt * vol->bsize)
230 return EINVAL;
[733564a]231
[a3486f2]232 fibril_rwlock_read_lock(&vol->states_lock);
233 hr_vol_state_t vol_state = vol->state;
234 fibril_rwlock_read_unlock(&vol->states_lock);
[733564a]235
[a3486f2]236 if (vol_state == HR_VOL_FAULTY || vol_state == HR_VOL_NONE)
237 return EIO;
[733564a]238
[a3486f2]239 rc = hr_check_ba_range(vol, cnt, ba);
240 if (rc != EOK)
241 return rc;
[da0570a]242
[a3486f2]243 uint64_t strip_size = vol->strip_size / vol->bsize; /* in blocks */
244 uint64_t strip_no = ba / strip_size;
[da0570a]245
[a3486f2]246 /* calculate number of stripes touched */
247 uint64_t last_ba = ba + cnt - 1;
248 uint64_t end_strip_no = last_ba / strip_size;
249 uint64_t start_stripe = strip_no / (vol->extent_no - 1);
250 uint64_t end_stripe = end_strip_no / (vol->extent_no - 1);
251 size_t stripes_cnt = end_stripe - start_stripe + 1;
[da0570a]252
[a3486f2]253 hr_stripe_t *stripes = hr_create_stripes(vol, stripes_cnt, false);
254 if (stripes == NULL)
255 return ENOMEM;
[a0c3080]256
[a3486f2]257 /*
258 * Pre-allocate range locks, because after group creation and
259 * firing off IO requests there is no easy consistent ENOMEM error
260 * path.
261 */
262 hr_range_lock_t **rlps = malloc_waitok(stripes_cnt * sizeof(*rlps));
263 for (size_t i = 0; i < stripes_cnt; i++)
264 rlps[i] = malloc_waitok(sizeof(**rlps));
[a0c3080]265
[a3486f2]266 /*
267 * extent order has to be locked for the whole IO duration,
268 * so that workers have consistent targets
269 */
270 fibril_rwlock_read_lock(&vol->extents_lock);
271
272 for (uint64_t s = start_stripe; s <= end_stripe; s++) {
273 uint64_t relative = s - start_stripe;
274 hr_range_lock_acquire_noalloc(rlps[relative], vol, s, 1);
[da0570a]275 }
276
[a3486f2]277 uint64_t phys_block, len;
278 size_t left;
[dceb6e7]279
[a3486f2]280 hr_layout_t layout = vol->layout;
281 hr_level_t level = vol->level;
[dceb6e7]282
[a3486f2]283 /* parity extent */
284 size_t p_extent = hr_raid5_parity_extent(level, layout,
285 vol->extent_no, strip_no);
[dceb6e7]286
[a3486f2]287 uint64_t strip_off = ba % strip_size;
[dceb6e7]288
[a3486f2]289 left = cnt;
[dceb6e7]290
[a3486f2]291 while (left != 0) {
292 if (level == HR_LVL_5) {
293 p_extent = hr_raid5_parity_extent(level, layout,
294 vol->extent_no, strip_no);
295 }
[8160e4c0]296
[a3486f2]297 size_t extent = hr_raid5_data_extent(level, layout,
298 vol->extent_no, strip_no, p_extent);
[8160e4c0]299
[a3486f2]300 uint64_t stripe_no = strip_no / (vol->extent_no - 1);
301 size_t relative_si = stripe_no - start_stripe; /* relative stripe index */
302 hr_stripe_t *stripe = &stripes[relative_si];
303 stripe->p_extent = p_extent;
[978130a]304
[a3486f2]305 stripe->strips_touched++;
[da0570a]306
[a3486f2]307 phys_block = stripe_no * strip_size + strip_off;
308 cnt = min(left, strip_size - strip_off);
309 len = vol->bsize * cnt;
310 hr_add_data_offset(vol, &phys_block);
[da0570a]311
[a3486f2]312 stripe->extent_span[extent].range.start = phys_block;
313 stripe->extent_span[extent].range.end = phys_block + cnt - 1;
314 stripe->extent_span[extent].cnt = cnt;
315 stripe->extent_span[extent].data_read = data_read;
316 stripe->extent_span[extent].strip_off = strip_off;
317
318 data_read += len;
319 left -= cnt;
320 strip_off = 0;
321 strip_no++;
[da0570a]322 }
323
[a3486f2]324retry:
325 size_t bad_extent = vol->extent_no;
[da0570a]326
[a3486f2]327 uint64_t rebuild_pos = atomic_load_explicit(&vol->rebuild_blk,
328 memory_order_relaxed);
[da0570a]329
[a3486f2]330 fibril_rwlock_read_lock(&vol->states_lock);
[8160e4c0]331
[a3486f2]332 for (size_t e = 0; e < vol->extent_no; e++) {
333 hr_ext_state_t s = vol->extents[e].state;
334 if ((vol->state == HR_VOL_DEGRADED && s != HR_EXT_ONLINE) ||
335 (s == HR_EXT_REBUILD && rebuild_pos < start_stripe)) {
336 bad_extent = e;
337 break;
[dceb6e7]338 }
[a3486f2]339 }
[da0570a]340
[a3486f2]341 fibril_rwlock_read_unlock(&vol->states_lock);
[dceb6e7]342
[a3486f2]343 for (size_t s = 0; s < stripes_cnt; s++) {
344 if (stripes[s].done)
345 continue;
346 execute_stripe(&stripes[s], bad_extent);
347 }
[da0570a]348
[a3486f2]349 for (size_t s = 0; s < stripes_cnt; s++) {
350 if (stripes[s].done)
351 continue;
352 wait_for_stripe(&stripes[s]);
[da0570a]353 }
354
[a3486f2]355 hr_raid5_vol_state_eval(vol);
[da0570a]356
[a3486f2]357 rc = EOK;
[da0570a]358
[a3486f2]359 fibril_rwlock_read_lock(&vol->states_lock);
360
361 if (vol->state == HR_VOL_FAULTY) {
362 fibril_rwlock_read_unlock(&vol->states_lock);
363 rc = EIO;
364 goto end;
[978130a]365 }
[dceb6e7]366
[a3486f2]367 fibril_rwlock_read_unlock(&vol->states_lock);
[8160e4c0]368
[a3486f2]369 for (size_t s = 0; s < stripes_cnt; s++)
370 if (stripes[s].rc == EAGAIN)
371 goto retry;
[8160e4c0]372
[a3486f2]373 /* all stripes are done */
374end:
375 fibril_rwlock_read_unlock(&vol->extents_lock);
[8160e4c0]376
[a3486f2]377 for (size_t i = 0; i < stripes_cnt; i++)
378 hr_range_lock_release(rlps[i]);
379
380 hr_destroy_stripes(stripes, stripes_cnt);
[da0570a]381
[12321f8]382 return rc;
[dceb6e7]383}
384
[a3486f2]385static errno_t hr_raid5_bd_write_blocks(bd_srv_t *bd, aoff64_t ba, size_t cnt,
386 const void *data_write, size_t size)
[dceb6e7]387{
388 hr_volume_t *vol = bd->srvs->sarg;
389 errno_t rc;
[da0570a]390
[a3486f2]391 if (size < cnt * vol->bsize)
392 return EINVAL;
393
394 fibril_rwlock_read_lock(&vol->states_lock);
395 hr_vol_state_t vol_state = vol->state;
396 fibril_rwlock_read_unlock(&vol->states_lock);
[fad91b9]397
[a3486f2]398 if (vol_state == HR_VOL_FAULTY || vol_state == HR_VOL_NONE)
399 return EIO;
400
401 /* increment metadata counter only on first write */
402 bool exp = false;
403 if (atomic_compare_exchange_strong(&vol->first_write, &exp, true)) {
404 vol->meta_ops->inc_counter(vol);
405 vol->meta_ops->save(vol, WITH_STATE_CALLBACK);
406 }
[dceb6e7]407
408 rc = hr_check_ba_range(vol, cnt, ba);
409 if (rc != EOK)
410 return rc;
411
[978130a]412 uint64_t strip_size = vol->strip_size / vol->bsize; /* in blocks */
[a3486f2]413 uint64_t strip_no = ba / strip_size;
[d7768d11]414
[a3486f2]415 /* calculate number of stripes touched */
416 uint64_t last_ba = ba + cnt - 1;
417 uint64_t end_strip_no = last_ba / strip_size;
418 uint64_t start_stripe = strip_no / (vol->extent_no - 1);
419 uint64_t end_stripe = end_strip_no / (vol->extent_no - 1);
420 size_t stripes_cnt = end_stripe - start_stripe + 1;
421
422 hr_stripe_t *stripes = hr_create_stripes(vol, stripes_cnt, true);
423 if (stripes == NULL)
424 return ENOMEM;
425
426 uint64_t stripe_size = strip_size * (vol->extent_no - 1);
427
428 for (uint64_t stripe = start_stripe; stripe <= end_stripe; stripe++) {
429 uint64_t relative_stripe = stripe - start_stripe;
430
431 uint64_t s_start = stripe * stripe_size;
432 uint64_t s_end = s_start + stripe_size - 1;
[d7768d11]433
[a3486f2]434 uint64_t overlap_start;
435 if (ba > s_start)
436 overlap_start = ba;
[d7768d11]437 else
[a3486f2]438 overlap_start = s_start;
439
440 uint64_t overlap_end;
441 if (last_ba < s_end)
442 overlap_end = last_ba;
443 else
444 overlap_end = s_end;
445
446 uint64_t start_strip_index =
447 (overlap_start - s_start) / strip_size;
448 uint64_t end_strip_index = (overlap_end - s_start) / strip_size;
449 size_t strips_touched = end_strip_index - start_strip_index + 1;
450
451 stripes[relative_stripe].strips_touched = strips_touched;
452
453 uint64_t first_offset = (overlap_start - s_start) % strip_size;
454 uint64_t last_offset = (overlap_end - s_start) % strip_size;
455
456 size_t partials = 0;
457 if (first_offset != 0)
458 partials++;
459 if (last_offset != strip_size - 1)
460 partials++;
461 if (start_strip_index == end_strip_index && partials == 2)
462 partials = 1;
463
464 stripes[relative_stripe].strips_touched = strips_touched;
465 stripes[relative_stripe].partial_strips_touched = partials;
466
467 if (strips_touched < (vol->extent_no - 1) / 2)
468 stripes[relative_stripe].subtract = true;
[d7768d11]469 }
470
[a3486f2]471 /*
472 * Pre-allocate range locks, because after group creation and
473 * firing off IO requests there is no easy consistent ENOMEM error
474 * path.
475 */
476 hr_range_lock_t **rlps = malloc_waitok(stripes_cnt * sizeof(*rlps));
477 for (size_t i = 0; i < stripes_cnt; i++)
478 rlps[i] = malloc_waitok(sizeof(**rlps));
[978130a]479
[a3486f2]480 /*
481 * extent order has to be locked for the whole IO duration,
482 * so that workers have consistent targets
483 */
484 fibril_rwlock_read_lock(&vol->extents_lock);
[dceb6e7]485
[a3486f2]486 for (uint64_t s = start_stripe; s <= end_stripe; s++) {
487 uint64_t relative = s - start_stripe;
488 hr_range_lock_acquire_noalloc(rlps[relative], vol, s, 1);
[da0570a]489 }
490
[a3486f2]491 uint64_t phys_block, len;
492 size_t left;
493
494 hr_layout_t layout = vol->layout;
495 hr_level_t level = vol->level;
496
497 /* parity extent */
498 size_t p_extent = hr_raid5_parity_extent(level, layout,
499 vol->extent_no, strip_no);
500
501 uint64_t strip_off = ba % strip_size;
502
[fad91b9]503 left = cnt;
[a0c3080]504
[dceb6e7]505 while (left != 0) {
[a3486f2]506 if (level == HR_LVL_5) {
507 p_extent = hr_raid5_parity_extent(level, layout,
508 vol->extent_no, strip_no);
509 }
510
511 size_t extent = hr_raid5_data_extent(level, layout,
512 vol->extent_no, strip_no, p_extent);
513
514 uint64_t stripe_no = strip_no / (vol->extent_no - 1);
515 size_t relative_si = stripe_no - start_stripe; /* relative stripe index */
516 hr_stripe_t *stripe = &stripes[relative_si];
517 stripe->p_extent = p_extent;
518
519 phys_block = stripe_no * strip_size + strip_off;
[978130a]520 cnt = min(left, strip_size - strip_off);
[da0570a]521 len = vol->bsize * cnt;
[a3486f2]522 hr_add_data_offset(vol, &phys_block);
523
524 stripe->extent_span[extent].range.start = phys_block;
525 stripe->extent_span[extent].range.end = phys_block + cnt - 1;
526 stripe->extent_span[extent].cnt = cnt;
527 stripe->extent_span[extent].data_write = data_write;
528 stripe->extent_span[extent].strip_off = strip_off;
529
530 data_write += len;
531 left -= cnt;
532 strip_off = 0;
533 strip_no++;
534 }
535
536retry:
537 size_t bad_extent = vol->extent_no;
538
539 uint64_t rebuild_pos = atomic_load_explicit(&vol->rebuild_blk,
540 memory_order_relaxed);
541
542 fibril_rwlock_read_lock(&vol->states_lock);
543
544 for (size_t e = 0; e < vol->extent_no; e++) {
545 hr_ext_state_t s = vol->extents[e].state;
546 if ((vol->state == HR_VOL_DEGRADED && s != HR_EXT_ONLINE) ||
547 (s == HR_EXT_REBUILD && rebuild_pos < start_stripe)) {
548 bad_extent = e;
[dceb6e7]549 break;
[fad91b9]550 }
[a3486f2]551 }
[fad91b9]552
[a3486f2]553 fibril_rwlock_read_unlock(&vol->states_lock);
[fad91b9]554
[a3486f2]555 for (size_t s = 0; s < stripes_cnt; s++) {
556 if (stripes[s].done)
557 continue;
558 execute_stripe(&stripes[s], bad_extent);
559 }
[da0570a]560
[a3486f2]561 for (size_t s = 0; s < stripes_cnt; s++) {
562 if (stripes[s].done)
563 continue;
564 wait_for_stripe(&stripes[s]);
565 }
566
567 hr_raid5_vol_state_eval(vol);
568
569 rc = EOK;
570
571 fibril_rwlock_read_lock(&vol->states_lock);
572
573 if (vol->state == HR_VOL_FAULTY) {
574 fibril_rwlock_read_unlock(&vol->states_lock);
575 rc = EIO;
576 goto end;
577 }
578
579 fibril_rwlock_read_unlock(&vol->states_lock);
580
581 for (size_t s = 0; s < stripes_cnt; s++)
582 if (stripes[s].rc == EAGAIN)
583 goto retry;
584
585 /* all stripes are done */
586end:
587 fibril_rwlock_read_unlock(&vol->extents_lock);
588
589 for (size_t i = 0; i < stripes_cnt; i++)
590 hr_range_lock_release(rlps[i]);
591
592 hr_destroy_stripes(stripes, stripes_cnt);
593
594 return rc;
595}
596
597static errno_t hr_raid5_bd_get_block_size(bd_srv_t *bd, size_t *rsize)
598{
599 hr_volume_t *vol = bd->srvs->sarg;
600
601 *rsize = vol->bsize;
602 return EOK;
603}
604
605static errno_t hr_raid5_bd_get_num_blocks(bd_srv_t *bd, aoff64_t *rnb)
606{
607 hr_volume_t *vol = bd->srvs->sarg;
608
609 *rnb = vol->data_blkno;
610 return EOK;
611}
612
613static void hr_raid5_vol_state_eval_forced(hr_volume_t *vol)
614{
615 fibril_rwlock_read_lock(&vol->extents_lock);
616 fibril_rwlock_write_lock(&vol->states_lock);
617
618 hr_vol_state_t state = vol->state;
619
620 size_t bad = 0;
621 for (size_t i = 0; i < vol->extent_no; i++)
622 if (vol->extents[i].state != HR_EXT_ONLINE)
623 bad++;
624
625 switch (bad) {
626 case 0:
627 if (state != HR_VOL_ONLINE)
628 hr_update_vol_state(vol, HR_VOL_ONLINE);
629 break;
630 case 1:
631 if (state != HR_VOL_DEGRADED && state != HR_VOL_REBUILD)
632 hr_update_vol_state(vol, HR_VOL_DEGRADED);
633
634 if (state != HR_VOL_REBUILD) {
635 /* XXX: allow REBUILD on INVALID extents */
636 fibril_mutex_lock(&vol->hotspare_lock);
637 size_t hs_no = vol->hotspare_no;
638 fibril_mutex_unlock(&vol->hotspare_lock);
639 if (hs_no > 0) {
640 fid_t fib = fibril_create(hr_raid5_rebuild,
641 vol);
642 if (fib == 0)
643 break;
644 fibril_start(fib);
645 fibril_detach(fib);
[da0570a]646 }
647 }
[a3486f2]648 break;
649 default:
650 if (state != HR_VOL_FAULTY)
651 hr_update_vol_state(vol, HR_VOL_FAULTY);
652 break;
653 }
[da0570a]654
[a3486f2]655 fibril_rwlock_write_unlock(&vol->states_lock);
656 fibril_rwlock_read_unlock(&vol->extents_lock);
657}
[d7768d11]658
/** XOR a source buffer into a destination buffer (dst ^= src).
 *
 * The bulk is processed in 64-bit words, the remaining size % 8 bytes one
 * byte at a time, so sizes that are not a multiple of 8 are handled in full
 * instead of leaving the tail untouched.
 *
 * Both buffers are accessed through uint64_t pointers and therefore have to
 * be suitably aligned for that.
 *
 * @param dst  Destination buffer, updated in place
 * @param src  Source buffer
 * @param size Number of bytes to process
 */
static void xor(void *dst, const void *src, size_t size)
{
	uint64_t *d = dst;
	const uint64_t *s = src;
	size_t words = size / sizeof(uint64_t);

	for (size_t i = 0; i < words; i++)
		d[i] ^= s[i];

	/* tail that does not fill a whole word */
	uint8_t *dtail = (uint8_t *)dst + words * sizeof(uint64_t);
	const uint8_t *stail = (const uint8_t *)src + words * sizeof(uint64_t);
	size_t rest = size % sizeof(uint64_t);

	for (size_t i = 0; i < rest; i++)
		dtail[i] ^= stail[i];
}
[d7768d11]668
[a3486f2]669static size_t hr_raid5_parity_extent(hr_level_t level,
670 hr_layout_t layout, size_t extent_no, uint64_t strip_no)
671{
672 switch (level) {
673 case HR_LVL_4:
674 switch (layout) {
675 case HR_LAYOUT_RAID4_0:
676 return (0);
677 case HR_LAYOUT_RAID4_N:
678 return (extent_no - 1);
679 default:
680 assert(0 && "invalid layout configuration");
[d7768d11]681 }
[a3486f2]682 case HR_LVL_5:
683 switch (layout) {
684 case HR_LAYOUT_RAID5_0R:
685 return ((strip_no / (extent_no - 1)) % extent_no);
686 case HR_LAYOUT_RAID5_NR:
687 case HR_LAYOUT_RAID5_NC:
688 return ((extent_no - 1) -
689 (strip_no / (extent_no - 1)) % extent_no);
690 default:
691 assert(0 && "invalid layout configuration");
692 }
693 default:
694 assert(0 && "invalid layout configuration");
695 }
696}
[d7768d11]697
[a3486f2]698static size_t hr_raid5_data_extent(hr_level_t level,
699 hr_layout_t layout, size_t extent_no, uint64_t strip_no, size_t p_extent)
700{
701 switch (level) {
702 case HR_LVL_4:
703 switch (layout) {
704 case HR_LAYOUT_RAID4_0:
705 return ((strip_no % (extent_no - 1)) + 1);
706 case HR_LAYOUT_RAID4_N:
707 return (strip_no % (extent_no - 1));
708 default:
709 assert(0 && "invalid layout configuration");
710 }
711 case HR_LVL_5:
712 switch (layout) {
713 case HR_LAYOUT_RAID5_0R:
714 case HR_LAYOUT_RAID5_NR:
715 if ((strip_no % (extent_no - 1)) < p_extent)
716 return (strip_no % (extent_no - 1));
[d7768d11]717 else
[a3486f2]718 return ((strip_no % (extent_no - 1)) + 1);
719 case HR_LAYOUT_RAID5_NC:
720 return (((strip_no % (extent_no - 1)) + p_extent + 1) %
721 extent_no);
722 default:
723 assert(0 && "invalid layout configuration");
[d7768d11]724 }
[a3486f2]725 default:
726 assert(0 && "invalid layout configuration");
[dceb6e7]727 }
728}
729
[aa7864b]730static errno_t hr_raid5_rebuild(void *arg)
731{
732 HR_DEBUG("hr_raid5_rebuild()\n");
733
734 hr_volume_t *vol = arg;
735 errno_t rc = EOK;
736 void *buf = NULL, *xorbuf = NULL;
737
[f1be66bf]738 fibril_rwlock_read_lock(&vol->extents_lock);
739 fibril_rwlock_write_lock(&vol->states_lock);
[aa7864b]740
741 if (vol->hotspare_no == 0) {
742 HR_WARN("hr_raid5_rebuild(): no free hotspares on \"%s\", "
743 "aborting rebuild\n", vol->devname);
744 /* retval isn't checked for now */
745 goto end;
746 }
747
[65706f1]748 size_t bad = vol->extent_no;
749 for (size_t i = 0; i < vol->extent_no; i++) {
[56602e0]750 if (vol->extents[i].state == HR_EXT_FAILED) {
[aa7864b]751 bad = i;
752 break;
753 }
754 }
755
[65706f1]756 if (bad == vol->extent_no) {
[aa7864b]757 HR_WARN("hr_raid5_rebuild(): no bad extent on \"%s\", "
758 "aborting rebuild\n", vol->devname);
759 /* retval isn't checked for now */
760 goto end;
761 }
762
763 size_t hotspare_idx = vol->hotspare_no - 1;
764
[56602e0]765 hr_ext_state_t hs_state = vol->hotspares[hotspare_idx].state;
[a0c3080]766 if (hs_state != HR_EXT_HOTSPARE) {
767 HR_ERROR("hr_raid5_rebuild(): invalid hotspare state \"%s\", "
[155d34f]768 "aborting rebuild\n", hr_get_ext_state_str(hs_state));
[a0c3080]769 rc = EINVAL;
770 goto end;
771 }
772
773 HR_DEBUG("hr_raid5_rebuild(): swapping in hotspare\n");
774
775 block_fini(vol->extents[bad].svc_id);
776
[aa7864b]777 vol->extents[bad].svc_id = vol->hotspares[hotspare_idx].svc_id;
[56602e0]778 hr_update_ext_state(vol, bad, HR_EXT_HOTSPARE);
[aa7864b]779
780 vol->hotspares[hotspare_idx].svc_id = 0;
[f1be66bf]781 fibril_mutex_lock(&vol->hotspare_lock);
[56602e0]782 hr_update_hotspare_state(vol, hotspare_idx, HR_EXT_MISSING);
[f1be66bf]783 fibril_mutex_unlock(&vol->hotspare_lock);
[aa7864b]784
[a0c3080]785 vol->hotspare_no--;
[aa7864b]786
[a0c3080]787 hr_extent_t *rebuild_ext = &vol->extents[bad];
[aa7864b]788
[ca7fa5b]789 HR_DEBUG("hr_raid5_rebuild(): starting rebuild on (%" PRIun ")\n",
[a0c3080]790 rebuild_ext->svc_id);
791
[56602e0]792 hr_update_ext_state(vol, bad, HR_EXT_REBUILD);
793 hr_update_vol_state(vol, HR_VOL_REBUILD);
[a0c3080]794
[aa7864b]795 uint64_t max_blks = DATA_XFER_LIMIT / vol->bsize;
[65706f1]796 uint64_t left = vol->data_blkno / (vol->extent_no - 1);
[aa7864b]797 buf = malloc(max_blks * vol->bsize);
798 xorbuf = malloc(max_blks * vol->bsize);
799
800 uint64_t ba = 0, cnt;
[a3486f2]801 hr_add_data_offset(vol, &ba);
[a0c3080]802
[aa7864b]803 while (left != 0) {
804 cnt = min(left, max_blks);
805
806 /*
807 * Almost the same as read_degraded,
808 * but we don't want to allocate new
809 * xorbuf each blk rebuild batch.
810 */
811 bool first = true;
[65706f1]812 for (size_t i = 0; i < vol->extent_no; i++) {
[aa7864b]813 if (i == bad)
814 continue;
[8160e4c0]815 if (first)
816 rc = block_read_direct(vol->extents[i].svc_id,
817 ba, cnt, xorbuf);
818 else
819 rc = block_read_direct(vol->extents[i].svc_id,
820 ba, cnt, buf);
[aa7864b]821 if (rc != EOK) {
[da80de9]822 hr_raid5_ext_state_cb(vol, i, rc);
[ca7fa5b]823 HR_ERROR("rebuild on \"%s\" (%" PRIun "), "
824 "failed due to a failed ONLINE extent, "
825 "number %zu\n",
[aa7864b]826 vol->devname, vol->svc_id, i);
827 goto end;
828 }
829
[8160e4c0]830 if (!first)
[aa7864b]831 xor(xorbuf, buf, cnt * vol->bsize);
[8160e4c0]832 else
833 first = false;
[aa7864b]834 }
835
[a0c3080]836 rc = block_write_direct(rebuild_ext->svc_id, ba, cnt, xorbuf);
[aa7864b]837 if (rc != EOK) {
[da80de9]838 hr_raid5_ext_state_cb(vol, bad, rc);
[ca7fa5b]839 HR_ERROR("rebuild on \"%s\" (%" PRIun "), failed due to "
840 "the rebuilt extent number %zu failing\n",
[aa7864b]841 vol->devname, vol->svc_id, bad);
842 goto end;
843 }
844
845 ba += cnt;
846 left -= cnt;
[40bf2c6]847
848 /*
849 * Let other IO requests be served
850 * during rebuild.
851 */
[00d80c6]852
853 /*
854 * fibril_rwlock_write_unlock(&vol->states_lock);
855 * fibril_mutex_unlock(&vol->lock);
856 * fibril_mutex_lock(&vol->lock);
857 * fibril_rwlock_write_lock(&vol->states_lock);
858 */
[aa7864b]859 }
860
[ca7fa5b]861 HR_DEBUG("hr_raid5_rebuild(): rebuild finished on \"%s\" (%" PRIun "), "
862 "extent number %zu\n", vol->devname, vol->svc_id, hotspare_idx);
[aa7864b]863
[56602e0]864 hr_update_ext_state(vol, bad, HR_EXT_ONLINE);
[0277ec2]865
[00d80c6]866 fibril_rwlock_write_unlock(&vol->states_lock);
867 fibril_rwlock_read_unlock(&vol->extents_lock);
868
[50603405]869 rc = vol->meta_ops->save(vol, WITH_STATE_CALLBACK);
[0277ec2]870
[00d80c6]871 fibril_rwlock_read_lock(&vol->extents_lock);
872 fibril_rwlock_write_lock(&vol->states_lock);
873
[aa7864b]874end:
[a3486f2]875 hr_raid5_vol_state_eval_forced(vol);
[aa7864b]876
[f1be66bf]877 fibril_rwlock_write_unlock(&vol->states_lock);
878 fibril_rwlock_read_unlock(&vol->extents_lock);
[aa7864b]879
880 if (buf != NULL)
881 free(buf);
882
883 if (xorbuf != NULL)
884 free(xorbuf);
885
886 return rc;
887}
888
[dceb6e7]889/** @}
890 */
Note: See TracBrowser for help on using the repository browser.