source: mainline/uspace/srv/bd/hr/raid5.c

Last change on this file was c1c1c41, checked in by Miroslav Cimerman <mc@…>, 12 days ago

hr: add author's email address to RAID 5 files

  • Property mode set to 100644
File size: 21.6 KB
RevLine 
[dceb6e7]1/*
[c1c1c41]2 * Copyright (c) 2025 Miroslav Cimerman <mc@doas.su>
[dceb6e7]3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** @addtogroup hr
30 * @{
31 */
32/**
33 * @file
34 */
35
36#include <abi/ipc/ipc.h>
37#include <bd_srv.h>
38#include <block.h>
39#include <errno.h>
40#include <hr.h>
[ca7fa5b]41#include <inttypes.h>
[dceb6e7]42#include <io/log.h>
43#include <ipc/hr.h>
44#include <ipc/services.h>
45#include <loc.h>
[978130a]46#include <mem.h>
[dceb6e7]47#include <task.h>
48#include <stdio.h>
49#include <stdlib.h>
50#include <str_error.h>
51
[a3486f2]52#include "io.h"
53#include "parity_stripe.h"
[dceb6e7]54#include "superblock.h"
55#include "util.h"
56#include "var.h"
57
[a3486f2]58static void hr_raid5_vol_state_eval_forced(hr_volume_t *);
59static size_t hr_raid5_parity_extent(hr_level_t, hr_layout_t, size_t,
60 uint64_t);
61static size_t hr_raid5_data_extent(hr_level_t, hr_layout_t, size_t, uint64_t,
[09e01d23]62 size_t);
[6f13257]63static errno_t hr_raid5_rebuild(void *);
[733564a]64
65/* bdops */
[6f13257]66static errno_t hr_raid5_bd_open(bd_srvs_t *, bd_srv_t *);
67static errno_t hr_raid5_bd_close(bd_srv_t *);
68static errno_t hr_raid5_bd_read_blocks(bd_srv_t *, aoff64_t, size_t, void *,
[dceb6e7]69 size_t);
[6f13257]70static errno_t hr_raid5_bd_sync_cache(bd_srv_t *, aoff64_t, size_t);
71static errno_t hr_raid5_bd_write_blocks(bd_srv_t *, aoff64_t, size_t,
[dceb6e7]72 const void *, size_t);
[6f13257]73static errno_t hr_raid5_bd_get_block_size(bd_srv_t *, size_t *);
74static errno_t hr_raid5_bd_get_num_blocks(bd_srv_t *, aoff64_t *);
[dceb6e7]75
76static bd_ops_t hr_raid5_bd_ops = {
[6f13257]77 .open = hr_raid5_bd_open,
78 .close = hr_raid5_bd_close,
79 .sync_cache = hr_raid5_bd_sync_cache,
80 .read_blocks = hr_raid5_bd_read_blocks,
81 .write_blocks = hr_raid5_bd_write_blocks,
82 .get_block_size = hr_raid5_bd_get_block_size,
83 .get_num_blocks = hr_raid5_bd_get_num_blocks
[dceb6e7]84};
85
[6d0fc11]86extern loc_srv_t *hr_srv;
87
[733564a]88errno_t hr_raid5_create(hr_volume_t *new_volume)
89{
[baa4929]90 HR_DEBUG("%s()", __func__);
91
[b5c95da5]92 if (new_volume->level != HR_LVL_5 && new_volume->level != HR_LVL_4)
93 return EINVAL;
[733564a]94
[65706f1]95 if (new_volume->extent_no < 3) {
[af73327a]96 HR_ERROR("RAID 5 volume needs at least 3 devices\n");
[733564a]97 return EINVAL;
98 }
99
[a3486f2]100 hr_raid5_vol_state_eval_forced(new_volume);
101
102 fibril_rwlock_read_lock(&new_volume->states_lock);
103 hr_vol_state_t state = new_volume->state;
104 fibril_rwlock_read_unlock(&new_volume->states_lock);
105 if (state == HR_VOL_FAULTY || state == HR_VOL_NONE) {
106 HR_NOTE("\"%s\": unusable state, not creating\n",
107 new_volume->devname);
108 return EINVAL;
109 }
[f1be66bf]110
[9323bb8]111 bd_srvs_init(&new_volume->hr_bds);
112 new_volume->hr_bds.ops = &hr_raid5_bd_ops;
113 new_volume->hr_bds.sarg = new_volume;
114
[8a65373]115 return EOK;
[733564a]116}
117
[746e636]118/*
119 * Called only once in volume's lifetime.
120 */
[733564a]121errno_t hr_raid5_init(hr_volume_t *vol)
122{
[baa4929]123 HR_DEBUG("%s()", __func__);
[733564a]124
[b5c95da5]125 if (vol->level != HR_LVL_5 && vol->level != HR_LVL_4)
126 return EINVAL;
[733564a]127
[50603405]128 vol->data_offset = vol->meta_ops->get_data_offset();
[baa4929]129
[a3486f2]130 uint64_t single_sz = vol->truncated_blkno - vol->meta_ops->get_size();
131 vol->data_blkno = single_sz * (vol->extent_no - 1);
[baa4929]132
[ca212a51]133 vol->strip_size = hr_closest_pow2(HR_STRIP_SIZE / (vol->extent_no - 1));
[733564a]134
[9ee9c60b]135 if (vol->level == HR_LVL_4)
136 vol->layout = HR_LAYOUT_RAID4_N;
137 else
138 vol->layout = HR_LAYOUT_RAID5_NR;
[1cfce3f]139
[733564a]140 return EOK;
141}
142
[da80de9]143void hr_raid5_vol_state_eval(hr_volume_t *vol)
[7b359f5]144{
[a3486f2]145 HR_DEBUG("%s()", __func__);
146
147 bool exp = true;
148 if (!atomic_compare_exchange_strong(&vol->state_dirty, &exp, false))
149 return;
150
151 vol->meta_ops->inc_counter(vol);
[e0695ce]152 vol->meta_ops->save(vol, WITH_STATE_CALLBACK);
[a3486f2]153
154 hr_raid5_vol_state_eval_forced(vol);
[7b359f5]155}
156
[a3486f2]157void hr_raid5_ext_state_cb(hr_volume_t *vol, size_t extent, errno_t rc)
[da80de9]158{
[a3486f2]159 HR_DEBUG("%s()", __func__);
160
161 assert(fibril_rwlock_is_locked(&vol->extents_lock));
162
163 if (rc == EOK)
164 return;
165
166 fibril_rwlock_write_lock(&vol->states_lock);
167
168 switch (rc) {
169 case ENOENT:
[da80de9]170 hr_update_ext_state(vol, extent, HR_EXT_MISSING);
[a3486f2]171 break;
172 default:
[da80de9]173 hr_update_ext_state(vol, extent, HR_EXT_FAILED);
[a3486f2]174 }
175
176 hr_mark_vol_state_dirty(vol);
177
178 fibril_rwlock_write_unlock(&vol->states_lock);
[da80de9]179}
180
[733564a]181static errno_t hr_raid5_bd_open(bd_srvs_t *bds, bd_srv_t *bd)
182{
[7a80c63]183 HR_DEBUG("%s()\n", __func__);
184
185 hr_volume_t *vol = bd->srvs->sarg;
186
187 atomic_fetch_add_explicit(&vol->open_cnt, 1, memory_order_relaxed);
188
[733564a]189 return EOK;
190}
191
192static errno_t hr_raid5_bd_close(bd_srv_t *bd)
193{
[7a80c63]194 HR_DEBUG("%s()\n", __func__);
195
196 hr_volume_t *vol = bd->srvs->sarg;
197
198 atomic_fetch_sub_explicit(&vol->open_cnt, 1, memory_order_relaxed);
199
[733564a]200 return EOK;
201}
202
203static errno_t hr_raid5_bd_sync_cache(bd_srv_t *bd, aoff64_t ba, size_t cnt)
204{
[137f7cf5]205 hr_volume_t *vol = bd->srvs->sarg;
206
207 return hr_sync_extents(vol);
[733564a]208}
209
[a3486f2]210static errno_t hr_raid5_bd_read_blocks(bd_srv_t *bd, uint64_t ba, size_t cnt,
211 void *data_read, size_t size)
[733564a]212{
213 hr_volume_t *vol = bd->srvs->sarg;
[a3486f2]214 errno_t rc;
[733564a]215
[a3486f2]216 if (size < cnt * vol->bsize)
217 return EINVAL;
[733564a]218
[a3486f2]219 fibril_rwlock_read_lock(&vol->states_lock);
220 hr_vol_state_t vol_state = vol->state;
221 fibril_rwlock_read_unlock(&vol->states_lock);
[733564a]222
[a3486f2]223 if (vol_state == HR_VOL_FAULTY || vol_state == HR_VOL_NONE)
224 return EIO;
[733564a]225
[a3486f2]226 rc = hr_check_ba_range(vol, cnt, ba);
227 if (rc != EOK)
228 return rc;
[da0570a]229
[a3486f2]230 uint64_t strip_size = vol->strip_size / vol->bsize; /* in blocks */
231 uint64_t strip_no = ba / strip_size;
[da0570a]232
[a3486f2]233 /* calculate number of stripes touched */
234 uint64_t last_ba = ba + cnt - 1;
235 uint64_t end_strip_no = last_ba / strip_size;
236 uint64_t start_stripe = strip_no / (vol->extent_no - 1);
237 uint64_t end_stripe = end_strip_no / (vol->extent_no - 1);
238 size_t stripes_cnt = end_stripe - start_stripe + 1;
[da0570a]239
[cdfcaea]240 hr_stripe_t *stripes = hr_create_stripes(vol, vol->strip_size,
241 stripes_cnt, false);
[a0c3080]242
[a3486f2]243 uint64_t phys_block, len;
244 size_t left;
[dceb6e7]245
[a3486f2]246 hr_layout_t layout = vol->layout;
247 hr_level_t level = vol->level;
[dceb6e7]248
[a3486f2]249 /* parity extent */
250 size_t p_extent = hr_raid5_parity_extent(level, layout,
251 vol->extent_no, strip_no);
[dceb6e7]252
[a3486f2]253 uint64_t strip_off = ba % strip_size;
[dceb6e7]254
[a3486f2]255 left = cnt;
[dceb6e7]256
[a3486f2]257 while (left != 0) {
258 if (level == HR_LVL_5) {
259 p_extent = hr_raid5_parity_extent(level, layout,
260 vol->extent_no, strip_no);
261 }
[8160e4c0]262
[a3486f2]263 size_t extent = hr_raid5_data_extent(level, layout,
264 vol->extent_no, strip_no, p_extent);
[8160e4c0]265
[a3486f2]266 uint64_t stripe_no = strip_no / (vol->extent_no - 1);
267 size_t relative_si = stripe_no - start_stripe; /* relative stripe index */
268 hr_stripe_t *stripe = &stripes[relative_si];
269 stripe->p_extent = p_extent;
[978130a]270
[a3486f2]271 stripe->strips_touched++;
[da0570a]272
[a3486f2]273 phys_block = stripe_no * strip_size + strip_off;
274 cnt = min(left, strip_size - strip_off);
275 len = vol->bsize * cnt;
276 hr_add_data_offset(vol, &phys_block);
[da0570a]277
[a3486f2]278 stripe->extent_span[extent].range.start = phys_block;
279 stripe->extent_span[extent].range.end = phys_block + cnt - 1;
280 stripe->extent_span[extent].cnt = cnt;
281 stripe->extent_span[extent].data_read = data_read;
282 stripe->extent_span[extent].strip_off = strip_off;
283
284 data_read += len;
285 left -= cnt;
286 strip_off = 0;
287 strip_no++;
[da0570a]288 }
289
[9323bb8]290 hr_range_lock_t **rlps = hr_malloc_waitok(stripes_cnt * sizeof(*rlps));
291
292 /*
293 * extent order has to be locked for the whole IO duration,
294 * so that workers have consistent targets
295 */
296 fibril_rwlock_read_lock(&vol->extents_lock);
297
298 for (uint64_t s = start_stripe; s <= end_stripe; s++) {
299 uint64_t relative = s - start_stripe;
300 rlps[relative] = hr_range_lock_acquire(vol, s, 1);
301 }
302
[a3486f2]303retry:
304 size_t bad_extent = vol->extent_no;
[da0570a]305
[a3486f2]306 uint64_t rebuild_pos = atomic_load_explicit(&vol->rebuild_blk,
307 memory_order_relaxed);
[da0570a]308
[a3486f2]309 fibril_rwlock_read_lock(&vol->states_lock);
[8160e4c0]310
[a3486f2]311 for (size_t e = 0; e < vol->extent_no; e++) {
312 hr_ext_state_t s = vol->extents[e].state;
313 if ((vol->state == HR_VOL_DEGRADED && s != HR_EXT_ONLINE) ||
[cff2342]314 (s == HR_EXT_REBUILD && end_stripe >= rebuild_pos)) {
[a3486f2]315 bad_extent = e;
316 break;
[dceb6e7]317 }
[a3486f2]318 }
[da0570a]319
[a3486f2]320 fibril_rwlock_read_unlock(&vol->states_lock);
[dceb6e7]321
[a3486f2]322 for (size_t s = 0; s < stripes_cnt; s++) {
323 if (stripes[s].done)
324 continue;
[f7169a6]325 hr_execute_stripe(&stripes[s], bad_extent);
[a3486f2]326 }
[da0570a]327
[a3486f2]328 for (size_t s = 0; s < stripes_cnt; s++) {
329 if (stripes[s].done)
330 continue;
[f7169a6]331 hr_wait_for_stripe(&stripes[s]);
[da0570a]332 }
333
[a3486f2]334 hr_raid5_vol_state_eval(vol);
[da0570a]335
[a3486f2]336 rc = EOK;
[da0570a]337
[a3486f2]338 fibril_rwlock_read_lock(&vol->states_lock);
339
340 if (vol->state == HR_VOL_FAULTY) {
341 fibril_rwlock_read_unlock(&vol->states_lock);
342 rc = EIO;
343 goto end;
[978130a]344 }
[dceb6e7]345
[a3486f2]346 fibril_rwlock_read_unlock(&vol->states_lock);
[8160e4c0]347
[a3486f2]348 for (size_t s = 0; s < stripes_cnt; s++)
349 if (stripes[s].rc == EAGAIN)
350 goto retry;
[8160e4c0]351
[a3486f2]352 /* all stripes are done */
353end:
354 fibril_rwlock_read_unlock(&vol->extents_lock);
[8160e4c0]355
[a3486f2]356 for (size_t i = 0; i < stripes_cnt; i++)
357 hr_range_lock_release(rlps[i]);
358
[cdfcaea]359 free(rlps);
360
[a3486f2]361 hr_destroy_stripes(stripes, stripes_cnt);
[da0570a]362
[12321f8]363 return rc;
[dceb6e7]364}
365
[a3486f2]366static errno_t hr_raid5_bd_write_blocks(bd_srv_t *bd, aoff64_t ba, size_t cnt,
367 const void *data_write, size_t size)
[dceb6e7]368{
369 hr_volume_t *vol = bd->srvs->sarg;
370 errno_t rc;
[da0570a]371
[a3486f2]372 if (size < cnt * vol->bsize)
373 return EINVAL;
374
[95ca19d]375 if (vol->vflags & HR_VOL_FLAG_READ_ONLY)
376 return ENOTSUP;
377
[a3486f2]378 fibril_rwlock_read_lock(&vol->states_lock);
379 hr_vol_state_t vol_state = vol->state;
380 fibril_rwlock_read_unlock(&vol->states_lock);
[fad91b9]381
[a3486f2]382 if (vol_state == HR_VOL_FAULTY || vol_state == HR_VOL_NONE)
383 return EIO;
384
385 /* increment metadata counter only on first write */
386 bool exp = false;
387 if (atomic_compare_exchange_strong(&vol->first_write, &exp, true)) {
388 vol->meta_ops->inc_counter(vol);
389 vol->meta_ops->save(vol, WITH_STATE_CALLBACK);
390 }
[dceb6e7]391
392 rc = hr_check_ba_range(vol, cnt, ba);
393 if (rc != EOK)
394 return rc;
395
[978130a]396 uint64_t strip_size = vol->strip_size / vol->bsize; /* in blocks */
[a3486f2]397 uint64_t strip_no = ba / strip_size;
[d7768d11]398
[a3486f2]399 /* calculate number of stripes touched */
400 uint64_t last_ba = ba + cnt - 1;
401 uint64_t end_strip_no = last_ba / strip_size;
402 uint64_t start_stripe = strip_no / (vol->extent_no - 1);
403 uint64_t end_stripe = end_strip_no / (vol->extent_no - 1);
404 size_t stripes_cnt = end_stripe - start_stripe + 1;
405
[cdfcaea]406 hr_stripe_t *stripes = hr_create_stripes(vol, vol->strip_size,
407 stripes_cnt, true);
[a3486f2]408
409 uint64_t stripe_size = strip_size * (vol->extent_no - 1);
410
411 for (uint64_t stripe = start_stripe; stripe <= end_stripe; stripe++) {
412 uint64_t relative_stripe = stripe - start_stripe;
413
414 uint64_t s_start = stripe * stripe_size;
415 uint64_t s_end = s_start + stripe_size - 1;
[d7768d11]416
[a3486f2]417 uint64_t overlap_start;
418 if (ba > s_start)
419 overlap_start = ba;
[d7768d11]420 else
[a3486f2]421 overlap_start = s_start;
422
423 uint64_t overlap_end;
424 if (last_ba < s_end)
425 overlap_end = last_ba;
426 else
427 overlap_end = s_end;
428
429 uint64_t start_strip_index =
430 (overlap_start - s_start) / strip_size;
431 uint64_t end_strip_index = (overlap_end - s_start) / strip_size;
432 size_t strips_touched = end_strip_index - start_strip_index + 1;
433
434 stripes[relative_stripe].strips_touched = strips_touched;
435
436 uint64_t first_offset = (overlap_start - s_start) % strip_size;
437 uint64_t last_offset = (overlap_end - s_start) % strip_size;
438
439 size_t partials = 0;
440 if (first_offset != 0)
441 partials++;
442 if (last_offset != strip_size - 1)
443 partials++;
444 if (start_strip_index == end_strip_index && partials == 2)
445 partials = 1;
446
447 stripes[relative_stripe].strips_touched = strips_touched;
448 stripes[relative_stripe].partial_strips_touched = partials;
449
450 if (strips_touched < (vol->extent_no - 1) / 2)
451 stripes[relative_stripe].subtract = true;
[d7768d11]452 }
453
[a3486f2]454 uint64_t phys_block, len;
455 size_t left;
456
457 hr_layout_t layout = vol->layout;
458 hr_level_t level = vol->level;
459
460 /* parity extent */
461 size_t p_extent = hr_raid5_parity_extent(level, layout,
462 vol->extent_no, strip_no);
463
464 uint64_t strip_off = ba % strip_size;
465
[fad91b9]466 left = cnt;
[a0c3080]467
[dceb6e7]468 while (left != 0) {
[a3486f2]469 if (level == HR_LVL_5) {
470 p_extent = hr_raid5_parity_extent(level, layout,
471 vol->extent_no, strip_no);
472 }
473
474 size_t extent = hr_raid5_data_extent(level, layout,
475 vol->extent_no, strip_no, p_extent);
476
477 uint64_t stripe_no = strip_no / (vol->extent_no - 1);
478 size_t relative_si = stripe_no - start_stripe; /* relative stripe index */
479 hr_stripe_t *stripe = &stripes[relative_si];
480 stripe->p_extent = p_extent;
481
482 phys_block = stripe_no * strip_size + strip_off;
[978130a]483 cnt = min(left, strip_size - strip_off);
[da0570a]484 len = vol->bsize * cnt;
[a3486f2]485 hr_add_data_offset(vol, &phys_block);
486
487 stripe->extent_span[extent].range.start = phys_block;
488 stripe->extent_span[extent].range.end = phys_block + cnt - 1;
489 stripe->extent_span[extent].cnt = cnt;
490 stripe->extent_span[extent].data_write = data_write;
491 stripe->extent_span[extent].strip_off = strip_off;
492
493 data_write += len;
494 left -= cnt;
495 strip_off = 0;
496 strip_no++;
497 }
498
[9323bb8]499 hr_range_lock_t **rlps = hr_malloc_waitok(stripes_cnt * sizeof(*rlps));
500
501 /*
502 * extent order has to be locked for the whole IO duration,
503 * so that workers have consistent targets
504 */
505 fibril_rwlock_read_lock(&vol->extents_lock);
506
507 for (uint64_t s = start_stripe; s <= end_stripe; s++) {
508 uint64_t relative = s - start_stripe;
509 rlps[relative] = hr_range_lock_acquire(vol, s, 1);
510 }
511
[a3486f2]512retry:
513 size_t bad_extent = vol->extent_no;
514
515 uint64_t rebuild_pos = atomic_load_explicit(&vol->rebuild_blk,
516 memory_order_relaxed);
517
518 fibril_rwlock_read_lock(&vol->states_lock);
519
520 for (size_t e = 0; e < vol->extent_no; e++) {
521 hr_ext_state_t s = vol->extents[e].state;
522 if ((vol->state == HR_VOL_DEGRADED && s != HR_EXT_ONLINE) ||
[cff2342]523 (s == HR_EXT_REBUILD && start_stripe > rebuild_pos)) {
[a3486f2]524 bad_extent = e;
[dceb6e7]525 break;
[fad91b9]526 }
[a3486f2]527 }
[fad91b9]528
[a3486f2]529 fibril_rwlock_read_unlock(&vol->states_lock);
[fad91b9]530
[a3486f2]531 for (size_t s = 0; s < stripes_cnt; s++) {
532 if (stripes[s].done)
533 continue;
[f7169a6]534 hr_execute_stripe(&stripes[s], bad_extent);
[a3486f2]535 }
[da0570a]536
[a3486f2]537 for (size_t s = 0; s < stripes_cnt; s++) {
538 if (stripes[s].done)
539 continue;
[f7169a6]540 hr_wait_for_stripe(&stripes[s]);
[a3486f2]541 }
542
543 hr_raid5_vol_state_eval(vol);
544
545 rc = EOK;
546
547 fibril_rwlock_read_lock(&vol->states_lock);
548
549 if (vol->state == HR_VOL_FAULTY) {
550 fibril_rwlock_read_unlock(&vol->states_lock);
551 rc = EIO;
552 goto end;
553 }
554
555 fibril_rwlock_read_unlock(&vol->states_lock);
556
557 for (size_t s = 0; s < stripes_cnt; s++)
558 if (stripes[s].rc == EAGAIN)
559 goto retry;
560
561 /* all stripes are done */
562end:
563 fibril_rwlock_read_unlock(&vol->extents_lock);
564
565 for (size_t i = 0; i < stripes_cnt; i++)
566 hr_range_lock_release(rlps[i]);
567
[cdfcaea]568 free(rlps);
569
[a3486f2]570 hr_destroy_stripes(stripes, stripes_cnt);
571
572 return rc;
573}
574
575static errno_t hr_raid5_bd_get_block_size(bd_srv_t *bd, size_t *rsize)
576{
577 hr_volume_t *vol = bd->srvs->sarg;
578
579 *rsize = vol->bsize;
580 return EOK;
581}
582
583static errno_t hr_raid5_bd_get_num_blocks(bd_srv_t *bd, aoff64_t *rnb)
584{
585 hr_volume_t *vol = bd->srvs->sarg;
586
587 *rnb = vol->data_blkno;
588 return EOK;
589}
590
591static void hr_raid5_vol_state_eval_forced(hr_volume_t *vol)
592{
593 fibril_rwlock_read_lock(&vol->extents_lock);
594 fibril_rwlock_write_lock(&vol->states_lock);
595
596 hr_vol_state_t state = vol->state;
597
598 size_t bad = 0;
599 for (size_t i = 0; i < vol->extent_no; i++)
600 if (vol->extents[i].state != HR_EXT_ONLINE)
601 bad++;
602
[cdfcaea]603 size_t invalid_no = hr_count_extents(vol, HR_EXT_INVALID);
604
[e0695ce]605 size_t rebuild_no = hr_count_extents(vol, HR_EXT_REBUILD);
606
[cdfcaea]607 fibril_mutex_lock(&vol->hotspare_lock);
608 size_t hs_no = vol->hotspare_no;
609 fibril_mutex_unlock(&vol->hotspare_lock);
610
[a3486f2]611 switch (bad) {
612 case 0:
[263a2389]613 if (state != HR_VOL_OPTIMAL)
614 hr_update_vol_state(vol, HR_VOL_OPTIMAL);
[a3486f2]615 break;
616 case 1:
617 if (state != HR_VOL_DEGRADED && state != HR_VOL_REBUILD)
618 hr_update_vol_state(vol, HR_VOL_DEGRADED);
619
620 if (state != HR_VOL_REBUILD) {
[e0695ce]621 if (hs_no > 0 || invalid_no > 0 || rebuild_no > 0) {
[a3486f2]622 fid_t fib = fibril_create(hr_raid5_rebuild,
623 vol);
624 if (fib == 0)
625 break;
626 fibril_start(fib);
627 fibril_detach(fib);
[da0570a]628 }
629 }
[a3486f2]630 break;
631 default:
632 if (state != HR_VOL_FAULTY)
633 hr_update_vol_state(vol, HR_VOL_FAULTY);
634 break;
635 }
[da0570a]636
[a3486f2]637 fibril_rwlock_write_unlock(&vol->states_lock);
638 fibril_rwlock_read_unlock(&vol->extents_lock);
639}
[d7768d11]640
[a3486f2]641static size_t hr_raid5_parity_extent(hr_level_t level,
642 hr_layout_t layout, size_t extent_no, uint64_t strip_no)
643{
644 switch (level) {
645 case HR_LVL_4:
646 switch (layout) {
647 case HR_LAYOUT_RAID4_0:
648 return (0);
649 case HR_LAYOUT_RAID4_N:
650 return (extent_no - 1);
651 default:
652 assert(0 && "invalid layout configuration");
[d7768d11]653 }
[a3486f2]654 case HR_LVL_5:
655 switch (layout) {
656 case HR_LAYOUT_RAID5_0R:
657 return ((strip_no / (extent_no - 1)) % extent_no);
658 case HR_LAYOUT_RAID5_NR:
659 case HR_LAYOUT_RAID5_NC:
660 return ((extent_no - 1) -
661 (strip_no / (extent_no - 1)) % extent_no);
662 default:
663 assert(0 && "invalid layout configuration");
664 }
665 default:
666 assert(0 && "invalid layout configuration");
667 }
668}
[d7768d11]669
[a3486f2]670static size_t hr_raid5_data_extent(hr_level_t level,
671 hr_layout_t layout, size_t extent_no, uint64_t strip_no, size_t p_extent)
672{
673 switch (level) {
674 case HR_LVL_4:
675 switch (layout) {
676 case HR_LAYOUT_RAID4_0:
677 return ((strip_no % (extent_no - 1)) + 1);
678 case HR_LAYOUT_RAID4_N:
679 return (strip_no % (extent_no - 1));
680 default:
681 assert(0 && "invalid layout configuration");
682 }
683 case HR_LVL_5:
684 switch (layout) {
685 case HR_LAYOUT_RAID5_0R:
686 case HR_LAYOUT_RAID5_NR:
687 if ((strip_no % (extent_no - 1)) < p_extent)
688 return (strip_no % (extent_no - 1));
[d7768d11]689 else
[a3486f2]690 return ((strip_no % (extent_no - 1)) + 1);
691 case HR_LAYOUT_RAID5_NC:
692 return (((strip_no % (extent_no - 1)) + p_extent + 1) %
693 extent_no);
694 default:
695 assert(0 && "invalid layout configuration");
[d7768d11]696 }
[a3486f2]697 default:
698 assert(0 && "invalid layout configuration");
[dceb6e7]699 }
700}
701
[aa7864b]702static errno_t hr_raid5_rebuild(void *arg)
703{
[cdfcaea]704 HR_DEBUG("%s()", __func__);
[aa7864b]705
706 hr_volume_t *vol = arg;
707 errno_t rc = EOK;
[cdfcaea]708 size_t rebuild_idx;
[aa7864b]709
[95ca19d]710 if (vol->vflags & HR_VOL_FLAG_READ_ONLY)
711 return ENOTSUP;
[e5c3580]712 if (!(vol->meta_ops->get_flags() & HR_METADATA_ALLOW_REBUILD))
713 return ENOTSUP;
714
[cdfcaea]715 rc = hr_init_rebuild(vol, &rebuild_idx);
716 if (rc != EOK)
717 return rc;
[aa7864b]718
[cdfcaea]719 uint64_t max_blks = DATA_XFER_LIMIT / vol->bsize;
[e0695ce]720 uint64_t left =
721 vol->data_blkno / (vol->extent_no - 1) - vol->rebuild_blk;
[aa7864b]722
[cdfcaea]723 uint64_t strip_size = vol->strip_size / vol->bsize; /* in blocks */
[aa7864b]724
[e0695ce]725 size_t cnt;
726 uint64_t ba = vol->rebuild_blk;
[cdfcaea]727 hr_add_data_offset(vol, &ba);
[aa7864b]728
[cdfcaea]729 /*
730 * this is not necessary because a rebuild is
731 * protected by itself, i.e. there can be only
732 * one REBUILD at a time
733 */
734 fibril_rwlock_read_lock(&vol->extents_lock);
[aa7864b]735
[cdfcaea]736 /* increment metadata counter only on first write */
737 bool exp = false;
738 if (atomic_compare_exchange_strong(&vol->first_write, &exp, true)) {
739 vol->meta_ops->inc_counter(vol);
740 vol->meta_ops->save(vol, WITH_STATE_CALLBACK);
[a0c3080]741 }
742
[cdfcaea]743 hr_range_lock_t *rl = NULL;
744 hr_stripe_t *stripe = hr_create_stripes(vol, max_blks * vol->bsize, 1,
745 false);
[aa7864b]746
[09e01d23]747 HR_NOTE("\"%s\": REBUILD started on extent no. %zu at "
748 "block %" PRIu64 ".\n",
[e0695ce]749 vol->devname, rebuild_idx, ba);
750
751 uint64_t written = 0;
[cdfcaea]752 unsigned int percent, old_percent = 100;
753 while (left != 0) {
754 cnt = min(left, max_blks);
[aa7864b]755
[cdfcaea]756 uint64_t strip_no = ba / strip_size;
757 uint64_t last_ba = ba + cnt - 1;
758 uint64_t end_strip_no = last_ba / strip_size;
759 uint64_t start_stripe = strip_no / (vol->extent_no - 1);
760 uint64_t end_stripe = end_strip_no / (vol->extent_no - 1);
761 size_t stripes_cnt = end_stripe - start_stripe + 1;
[a0c3080]762
[cdfcaea]763 stripe->ps_to_be_added = vol->extent_no - 1;
764 stripe->p_count_final = true;
[a0c3080]765
[cdfcaea]766 hr_fgroup_t *worker_group =
767 hr_fgroup_create(vol->fge, vol->extent_no);
[aa7864b]768
[cdfcaea]769 rl = hr_range_lock_acquire(vol, start_stripe, stripes_cnt);
[a0c3080]770
[cdfcaea]771 atomic_store_explicit(&vol->rebuild_blk, ba,
772 memory_order_relaxed);
[aa7864b]773
[cdfcaea]774 for (size_t e = 0; e < vol->extent_no; e++) {
775 if (e == rebuild_idx)
[aa7864b]776 continue;
777
[cdfcaea]778 hr_io_raid5_t *io = hr_fgroup_alloc(worker_group);
779 io->extent = e;
780 io->ba = ba;
781 io->cnt = cnt;
782 io->strip_off = 0;
783 io->vol = vol;
784 io->stripe = stripe;
785
786 hr_fgroup_submit(worker_group,
787 hr_io_raid5_reconstruct_reader, io);
[aa7864b]788 }
789
[cdfcaea]790 hr_io_raid5_t *io = hr_fgroup_alloc(worker_group);
791 io->extent = rebuild_idx;
792 io->ba = ba;
793 io->cnt = cnt;
794 io->strip_off = 0;
795 io->vol = vol;
796 io->stripe = stripe;
797
798 hr_fgroup_submit(worker_group, hr_io_raid5_parity_writer, io);
799
800 size_t failed;
801 (void)hr_fgroup_wait(worker_group, NULL, &failed);
802 if (failed > 0) {
803 hr_range_lock_release(rl);
804 HR_NOTE("\"%s\": REBUILD aborted.\n", vol->devname);
[aa7864b]805 goto end;
806 }
807
[cdfcaea]808 percent = ((ba + cnt) * 100) / vol->data_blkno;
809 if (percent != old_percent) {
810 if (percent % 5 == 0)
811 HR_DEBUG("\"%s\" REBUILD progress: %u%%\n",
812 vol->devname, percent);
813 }
814
[e0695ce]815 if (written * vol->bsize > HR_REBUILD_SAVE_BYTES) {
[6a8c1569]816 vol->meta_ops->save_ext(vol, rebuild_idx,
817 WITH_STATE_CALLBACK);
[e0695ce]818 written = 0;
819 }
820
[cdfcaea]821 hr_range_lock_release(rl);
822 hr_reset_stripe(stripe);
823
[e0695ce]824 written += cnt;
[aa7864b]825 ba += cnt;
826 left -= cnt;
[e0695ce]827 old_percent = percent;
[40bf2c6]828
829 /*
830 * Let other IO requests be served
831 * during rebuild.
832 */
[aa7864b]833 }
834
[ca7fa5b]835 HR_DEBUG("hr_raid5_rebuild(): rebuild finished on \"%s\" (%" PRIun "), "
[cdfcaea]836 "extent number %zu\n", vol->devname, vol->svc_id, rebuild_idx);
837
838 fibril_rwlock_write_lock(&vol->states_lock);
[aa7864b]839
[cdfcaea]840 hr_update_ext_state(vol, rebuild_idx, HR_EXT_ONLINE);
[0277ec2]841
[e0695ce]842 atomic_store_explicit(&vol->rebuild_blk, 0, memory_order_relaxed);
843
[cdfcaea]844 hr_mark_vol_state_dirty(vol);
[00d80c6]845
[6aafb48]846 hr_update_vol_state(vol, HR_VOL_DEGRADED);
847
[cdfcaea]848 fibril_rwlock_write_unlock(&vol->states_lock);
[aa7864b]849end:
[f1be66bf]850 fibril_rwlock_read_unlock(&vol->extents_lock);
[aa7864b]851
[cdfcaea]852 hr_raid1_vol_state_eval(vol);
[aa7864b]853
[cdfcaea]854 hr_destroy_stripes(stripe, 1);
[aa7864b]855
856 return rc;
857}
858
[dceb6e7]859/** @}
860 */
Note: See TracBrowser for help on using the repository browser.