source: mainline/uspace/srv/bd/hr/util.c

Last change on this file was 974f9ba, checked in by Miroslav Cimerman <mc@…>, 11 days ago

hr: different RAID 1 read strategies

First - take first usable extent.
Closest - take extent with last seek position.
Round-robin - always switch extents.
Split - split I/O to multiple extents.

  • Property mode set to 100644
File size: 27.5 KB
Line 
1/*
2 * Copyright (c) 2025 Miroslav Cimerman
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** @addtogroup hr
30 * @{
31 */
32/**
33 * @file
34 */
35
36#include <adt/list.h>
37#include <block.h>
38#include <errno.h>
39#include <fibril_synch.h>
40#include <hr.h>
41#include <inttypes.h>
42#include <io/log.h>
43#include <loc.h>
44#include <mem.h>
45#include <stdatomic.h>
46#include <stdlib.h>
47#include <stdio.h>
48#include <str.h>
49#include <str_error.h>
50#include <vbd.h>
51
52#include "io.h"
53#include "superblock.h"
54#include "util.h"
55#include "var.h"
56
57static bool hr_range_lock_overlap(hr_range_lock_t *, hr_range_lock_t *);
58static errno_t hr_add_svc_linked_to_list(list_t *, service_id_t, bool, void *);
59static void free_dev_list_member(struct dev_list_member *);
60static void free_svc_id_list(list_t *);
61static errno_t hr_fill_disk_part_svcs_list(list_t *);
62static errno_t block_init_dev_list(list_t *);
63static void block_fini_dev_list(list_t *);
64static errno_t hr_util_get_matching_md_svcs_list(list_t *, list_t *,
65 service_id_t, hr_metadata_type_t, void *);
66static errno_t hr_util_assemble_from_matching_list(list_t *,
67 hr_metadata_type_t, uint8_t);
68static errno_t hr_fill_svcs_list_from_cfg(hr_config_t *, list_t *);
69static errno_t hr_swap_hs(hr_volume_t *, size_t, size_t);
70
71#define HR_RL_LIST_LOCK(vol) (fibril_mutex_lock(&(vol)->range_lock_list_lock))
72#define HR_RL_LIST_UNLOCK(vol) \
73 (fibril_mutex_unlock(&(vol)->range_lock_list_lock))
74
75extern loc_srv_t *hr_srv;
76extern list_t hr_volumes;
77extern fibril_rwlock_t hr_volumes_lock;
78
/*
 * malloc() wrapper that behaves like
 * FreeBSD malloc(9) with M_WAITOK flag.
 *
 * Return value is never NULL.
 */
void *hr_malloc_waitok(size_t size)
{
	void *p = malloc(size);

	/* Retry forever, sleeping 250 ms between attempts. */
	while (p == NULL) {
		fibril_usleep(MSEC2USEC(250));
		p = malloc(size);
	}

	return p;
}
93
/*
 * calloc() wrapper that never returns NULL: retries the
 * allocation until it succeeds (sleeping 250 ms per attempt).
 */
void *hr_calloc_waitok(size_t nmemb, size_t size)
{
	void *p;

	for (p = calloc(nmemb, size); p == NULL; p = calloc(nmemb, size))
		fibril_usleep(MSEC2USEC(250));

	return p;
}
102
103errno_t hr_create_vol_struct(hr_volume_t **rvol, hr_level_t level,
104 const char *devname, hr_metadata_type_t metadata_type, uint8_t vflags)
105{
106 HR_DEBUG("%s()", __func__);
107
108 errno_t rc;
109
110 hr_volume_t *vol = calloc(1, sizeof(hr_volume_t));
111 if (vol == NULL)
112 return ENOMEM;
113
114 str_cpy(vol->devname, HR_DEVNAME_LEN, devname);
115 vol->level = level;
116
117 vol->vflags = vflags;
118
119 vol->meta_ops = hr_get_meta_type_ops(metadata_type);
120
121 switch (level) {
122 case HR_LVL_0:
123 vol->hr_ops.create = hr_raid0_create;
124 vol->hr_ops.init = hr_raid0_init;
125 vol->hr_ops.vol_state_eval = hr_raid0_vol_state_eval;
126 vol->hr_ops.ext_state_cb = hr_raid0_ext_state_cb;
127 break;
128 case HR_LVL_1:
129 vol->hr_ops.create = hr_raid1_create;
130 vol->hr_ops.init = hr_raid1_init;
131 vol->hr_ops.vol_state_eval = hr_raid1_vol_state_eval;
132 vol->hr_ops.ext_state_cb = hr_raid1_ext_state_cb;
133 break;
134 case HR_LVL_4:
135 case HR_LVL_5:
136 vol->hr_ops.create = hr_raid5_create;
137 vol->hr_ops.init = hr_raid5_init;
138 vol->hr_ops.vol_state_eval = hr_raid5_vol_state_eval;
139 vol->hr_ops.ext_state_cb = hr_raid5_ext_state_cb;
140 break;
141 default:
142 HR_DEBUG("unkown level: %d, aborting\n", vol->level);
143 rc = EINVAL;
144 goto error;
145 }
146
147 if (level == HR_LVL_4 || level == HR_LVL_5)
148 vol->fge = hr_fpool_create(16, 32, sizeof(hr_io_raid5_t));
149 else
150 vol->fge = hr_fpool_create(16, 32, sizeof(hr_io_t));
151
152 if (vol->fge == NULL) {
153 rc = ENOMEM;
154 goto error;
155 }
156
157 vol->state = HR_VOL_NONE;
158
159 fibril_mutex_initialize(&vol->md_lock);
160
161 fibril_rwlock_initialize(&vol->extents_lock);
162 fibril_rwlock_initialize(&vol->states_lock);
163
164 fibril_mutex_initialize(&vol->hotspare_lock);
165
166 list_initialize(&vol->range_lock_list);
167 fibril_mutex_initialize(&vol->range_lock_list_lock);
168
169 atomic_init(&vol->state_dirty, false);
170 atomic_init(&vol->first_write, false);
171 for (size_t i = 0; i < HR_MAX_EXTENTS; i++)
172 atomic_init(&vol->last_ext_pos_arr[i], 0);
173 atomic_init(&vol->last_ext_used, 0);
174 atomic_init(&vol->rebuild_blk, 0);
175 atomic_init(&vol->open_cnt, 0);
176
177 *rvol = vol;
178
179 return EOK;
180error:
181 free(vol);
182 return rc;
183}
184
/** Destroy a volume structure created by hr_create_vol_struct().
 *
 * Tears down the fibril pool, closes all backing block devices and
 * frees the in-memory metadata and the structure itself. Accepts
 * NULL as a no-op.
 */
void hr_destroy_vol_struct(hr_volume_t *vol)
{
	HR_DEBUG("%s()", __func__);

	if (vol == NULL)
		return;

	hr_fpool_destroy(vol->fge);
	hr_fini_devs(vol);
	free(vol->in_mem_md);
	free(vol);
}
197
198errno_t hr_get_volume_svcs(size_t *rcnt, service_id_t **rsvcs)
199{
200 size_t i;
201 service_id_t *vol_svcs;
202
203 if (rcnt == NULL || rsvcs == NULL)
204 return EINVAL;
205
206 fibril_rwlock_read_lock(&hr_volumes_lock);
207
208 size_t vol_cnt = list_count(&hr_volumes);
209 vol_svcs = malloc(vol_cnt * sizeof(service_id_t));
210 if (vol_svcs == NULL) {
211 fibril_rwlock_read_unlock(&hr_volumes_lock);
212 return ENOMEM;
213 }
214
215 i = 0;
216 list_foreach(hr_volumes, lvolumes, hr_volume_t, iter)
217 vol_svcs[i++] = iter->svc_id;
218
219 fibril_rwlock_read_unlock(&hr_volumes_lock);
220
221 *rcnt = vol_cnt;
222 *rsvcs = vol_svcs;
223
224 return EOK;
225}
226
227hr_volume_t *hr_get_volume(service_id_t svc_id)
228{
229 HR_DEBUG("%s()", __func__);
230
231 hr_volume_t *rvol = NULL;
232
233 fibril_rwlock_read_lock(&hr_volumes_lock);
234 list_foreach(hr_volumes, lvolumes, hr_volume_t, iter) {
235 if (iter->svc_id == svc_id) {
236 rvol = iter;
237 break;
238 }
239 }
240 fibril_rwlock_read_unlock(&hr_volumes_lock);
241
242 return rvol;
243}
244
/** Deactivate and remove a volume.
 *
 * Refuses with EBUSY while any client still has the volume open.
 * On success the metadata is flushed, all devices closed and the
 * loc service unregistered.
 *
 * @param svc_id Service id of the volume to remove.
 *
 * @return EOK on success, ENOENT if no such volume, EBUSY if open,
 *         or the result of loc_service_unregister().
 */
errno_t hr_remove_volume(service_id_t svc_id)
{
	HR_DEBUG("%s()", __func__);

	hr_volume_t *vol = hr_get_volume(svc_id);
	if (vol == NULL)
		return ENOENT;

	fibril_rwlock_write_lock(&hr_volumes_lock);

	int open_cnt = atomic_load_explicit(&vol->open_cnt,
	    memory_order_relaxed);

	/*
	 * The atomicity of this if condition (and this whole
	 * operation) is provided by the write lock - no new
	 * bd connection can come, because we need to get the
	 * bd_srvs_t from the volume, which we get from the list.
	 * (see hr_client_conn() in hr.c)
	 */
	if (open_cnt > 0) {
		fibril_rwlock_write_unlock(&hr_volumes_lock);
		return EBUSY;
	}

	list_remove(&vol->lvolumes);

	fibril_rwlock_write_unlock(&hr_volumes_lock);

	/* save metadata, but we don't care about states anymore */
	vol->meta_ops->save(vol, NO_STATE_CALLBACK);

	HR_NOTE("deactivating volume \"%s\"\n", vol->devname);

	hr_destroy_vol_struct(vol);

	errno_t rc = loc_service_unregister(hr_srv, svc_id);
	return rc;
}
284
/** Initialize volume extents from an assembly/creation config.
 *
 * Opens each configured block device, verifies that all devices share
 * one block size, and records the smallest device size as the
 * truncated (usable) per-extent size. All hotspare slots are marked
 * missing.
 *
 * @param vol Volume whose extents are initialized; assumes the extent
 *            array is zero-initialized (hr_create_vol_struct() uses
 *            calloc()).
 * @param cfg Configuration holding cfg->dev_no device service ids.
 *
 * @return EOK on success; EINVAL on a zero svc id or mismatched block
 *         sizes; block layer error codes otherwise. On failure every
 *         device opened so far is closed again.
 */
errno_t hr_init_extents_from_cfg(hr_volume_t *vol, hr_config_t *cfg)
{
	HR_DEBUG("%s()", __func__);

	errno_t rc;
	uint64_t blkno, smallest_blkno = ~0ULL;
	size_t i, bsize;
	size_t last_bsize = 0;

	for (i = 0; i < cfg->dev_no; i++) {
		service_id_t svc_id = cfg->devs[i];
		if (svc_id == 0) {
			rc = EINVAL;
			goto error;
		}

		HR_DEBUG("%s(): block_init() on (%" PRIun ")\n", __func__,
		    svc_id);
		rc = block_init(svc_id);
		if (rc != EOK) {
			HR_DEBUG("%s(): initing (%" PRIun ") failed, "
			    "aborting\n", __func__, svc_id);
			goto error;
		}

		rc = block_get_nblocks(svc_id, &blkno);
		if (rc != EOK)
			goto error;

		rc = block_get_bsize(svc_id, &bsize);
		if (rc != EOK)
			goto error;

		/* all extents must share a single block size */
		if (last_bsize != 0 && bsize != last_bsize) {
			HR_DEBUG("block sizes differ\n");
			rc = EINVAL;
			goto error;
		}

		vol->extents[i].svc_id = svc_id;
		vol->extents[i].state = HR_EXT_ONLINE;

		/* the volume is truncated to its smallest member */
		if (blkno < smallest_blkno)
			smallest_blkno = blkno;
		last_bsize = bsize;
	}

	vol->bsize = last_bsize;
	vol->extent_no = cfg->dev_no;
	vol->truncated_blkno = smallest_blkno;

	for (i = 0; i < HR_MAX_HOTSPARES; i++)
		vol->hotspares[i].state = HR_EXT_MISSING;

	return EOK;

error:
	/*
	 * Only extents opened successfully above have a non-zero
	 * svc_id; the device that just failed was never stored.
	 */
	for (i = 0; i < HR_MAX_EXTENTS; i++) {
		if (vol->extents[i].svc_id != 0)
			block_fini(vol->extents[i].svc_id);
	}

	return rc;
}
349
350void hr_fini_devs(hr_volume_t *vol)
351{
352 HR_DEBUG("%s()", __func__);
353
354 size_t i;
355
356 for (i = 0; i < vol->extent_no; i++) {
357 if (vol->extents[i].svc_id != 0) {
358 HR_DEBUG("hr_fini_devs(): block_fini() on "
359 "(%" PRIun ")\n", vol->extents[i].svc_id);
360 block_fini(vol->extents[i].svc_id);
361 }
362 }
363
364 for (i = 0; i < vol->hotspare_no; i++) {
365 if (vol->hotspares[i].svc_id != 0) {
366 HR_DEBUG("hr_fini_devs(): block_fini() on "
367 "(%" PRIun ")\n",
368 vol->hotspares[i].svc_id);
369 block_fini(vol->hotspares[i].svc_id);
370 }
371 }
372}
373
374errno_t hr_register_volume(hr_volume_t *vol)
375{
376 HR_DEBUG("%s()", __func__);
377
378 errno_t rc;
379 service_id_t new_id;
380 category_id_t cat_id;
381 const char *devname = vol->devname;
382
383 rc = loc_service_register(hr_srv, devname, fallback_port_id, &new_id);
384 if (rc != EOK) {
385 HR_ERROR("unable to register device \"%s\": %s\n",
386 devname, str_error(rc));
387 return rc;
388 }
389
390 rc = loc_category_get_id("raid", &cat_id, IPC_FLAG_BLOCKING);
391 if (rc != EOK) {
392 HR_ERROR("failed resolving category \"raid\": %s\n",
393 str_error(rc));
394 goto error;
395 }
396
397 rc = loc_service_add_to_cat(hr_srv, new_id, cat_id);
398 if (rc != EOK) {
399 HR_ERROR("failed adding \"%s\" to category \"raid\": %s\n",
400 devname, str_error(rc));
401 goto error;
402 }
403
404 vol->svc_id = new_id;
405 return EOK;
406error:
407 rc = loc_service_unregister(hr_srv, new_id);
408 return rc;
409}
410
411errno_t hr_check_ba_range(hr_volume_t *vol, size_t cnt, uint64_t ba)
412{
413 if (ba + cnt > vol->data_blkno)
414 return ERANGE;
415 return EOK;
416}
417
418void hr_add_data_offset(hr_volume_t *vol, uint64_t *ba)
419{
420 *ba = *ba + vol->data_offset;
421}
422
423void hr_sub_data_offset(hr_volume_t *vol, uint64_t *ba)
424{
425 *ba = *ba - vol->data_offset;
426}
427
/** Change the state of extent @p ext_idx, logging the transition.
 *
 * Caller must hold the states lock for writing; non-RAID 0 callers
 * must also hold the extents lock (see the asserts below).
 */
void hr_update_ext_state(hr_volume_t *vol, size_t ext_idx, hr_ext_state_t s)
{
	/* RAID 0 callers are exempt from the extents lock */
	if (vol->level != HR_LVL_0)
		assert(fibril_rwlock_is_locked(&vol->extents_lock));

	assert(fibril_rwlock_is_write_locked(&vol->states_lock));

	assert(ext_idx < vol->extent_no);

	hr_ext_state_t old = vol->extents[ext_idx].state;
	HR_DEBUG("\"%s\": changing extent %zu state: %s -> %s\n",
	    vol->devname, ext_idx, hr_get_ext_state_str(old),
	    hr_get_ext_state_str(s));
	vol->extents[ext_idx].state = s;
}
443
/** Change the state of hotspare @p hs_idx, logging the transition.
 *
 * Caller must hold the hotspare lock.
 */
void hr_update_hotspare_state(hr_volume_t *vol, size_t hs_idx,
    hr_ext_state_t s)
{
	assert(fibril_mutex_is_locked(&vol->hotspare_lock));

	assert(hs_idx < vol->hotspare_no);

	hr_ext_state_t old = vol->hotspares[hs_idx].state;
	HR_DEBUG("\"%s\": changing hotspare %zu state: %s -> %s\n",
	    vol->devname, hs_idx, hr_get_ext_state_str(old),
	    hr_get_ext_state_str(s));
	vol->hotspares[hs_idx].state = s;
}
457
458void hr_update_vol_state(hr_volume_t *vol, hr_vol_state_t new)
459{
460 assert(fibril_rwlock_is_write_locked(&vol->states_lock));
461
462 HR_NOTE("\"%s\": volume state changed: %s -> %s\n", vol->devname,
463 hr_get_vol_state_str(vol->state), hr_get_vol_state_str(new));
464 vol->state = new;
465}
466
/** Change the backing service id of extent @p ext_idx, with logging.
 *
 * Non-RAID 0 callers must hold the extents lock for writing.
 */
void hr_update_ext_svc_id(hr_volume_t *vol, size_t ext_idx, service_id_t new)
{
	/* RAID 0 callers are exempt from the extents lock */
	if (vol->level != HR_LVL_0)
		assert(fibril_rwlock_is_write_locked(&vol->extents_lock));

	assert(ext_idx < vol->extent_no);

	service_id_t old = vol->extents[ext_idx].svc_id;
	HR_DEBUG("\"%s\": changing extent no. %zu svc_id: (%" PRIun ") -> "
	    "(%" PRIun ")\n", vol->devname, ext_idx, old, new);
	vol->extents[ext_idx].svc_id = new;
}
479
/** Change the backing service id of hotspare @p hs_idx, with logging.
 *
 * Caller must hold the hotspare lock.
 */
void hr_update_hotspare_svc_id(hr_volume_t *vol, size_t hs_idx,
    service_id_t new)
{
	assert(fibril_mutex_is_locked(&vol->hotspare_lock));

	assert(hs_idx < vol->hotspare_no);

	service_id_t old = vol->hotspares[hs_idx].svc_id;
	HR_DEBUG("\"%s\": changing hotspare no. %zu svc_id: (%" PRIun ") -> "
	    "(%" PRIun ")\n", vol->devname, hs_idx, old, new);
	vol->hotspares[hs_idx].svc_id = new;
}
492
493size_t hr_count_extents(hr_volume_t *vol, hr_ext_state_t state)
494{
495 if (vol->level != HR_LVL_0)
496 assert(fibril_rwlock_is_locked(&vol->extents_lock));
497 assert(fibril_rwlock_is_locked(&vol->states_lock));
498
499 size_t count = 0;
500 for (size_t i = 0; i < vol->extent_no; i++)
501 if (vol->extents[i].state == state)
502 count++;
503
504 return count;
505}
506
/** Acquire an exclusive lock over the block range [ba, ba + cnt).
 *
 * Blocks until no other active (non-ignored) range lock overlaps the
 * requested range. Never fails; the returned lock must be released
 * with hr_range_lock_release().
 */
hr_range_lock_t *hr_range_lock_acquire(hr_volume_t *vol, uint64_t ba,
    uint64_t cnt)
{
	hr_range_lock_t *rl = hr_malloc_waitok(sizeof(hr_range_lock_t));

	rl->vol = vol;
	rl->off = ba;
	rl->len = cnt;

	/* the owner itself holds one pending reference */
	rl->pending = 1;
	rl->ignore = false;

	link_initialize(&rl->link);
	fibril_mutex_initialize(&rl->lock);

	/* held for the lifetime of the lock; waiters sleep on it */
	fibril_mutex_lock(&rl->lock);

again:
	HR_RL_LIST_LOCK(vol);
	list_foreach(vol->range_lock_list, link, hr_range_lock_t, rlp) {
		if (rlp->ignore)
			continue;
		if (hr_range_lock_overlap(rlp, rl)) {
			rlp->pending++;

			HR_RL_LIST_UNLOCK(vol);

			/* sleep until the overlapping lock is released */
			fibril_mutex_lock(&rlp->lock);

			HR_RL_LIST_LOCK(vol);

			rlp->pending--;

			/*
			 * when ignore is set, after HR_RL_LIST_UNLOCK(),
			 * noone new is going to be able to start sleeping
			 * on the ignored range lock, only already waiting
			 * IOs will come through here
			 */
			rlp->ignore = true;

			fibril_mutex_unlock(&rlp->lock);

			/* the last waiter out frees the stale lock */
			if (rlp->pending == 0) {
				list_remove(&rlp->link);
				free(rlp);
			}

			HR_RL_LIST_UNLOCK(vol);
			/* re-scan: other overlaps may remain */
			goto again;
		}
	}

	list_append(&rl->link, &vol->range_lock_list);

	HR_RL_LIST_UNLOCK(vol);
	return rl;
}
565
/** Release a range lock taken with hr_range_lock_acquire().
 *
 * Wakes one waiter, if any. The lock structure is freed by whichever
 * party (owner or last waiter) drops the pending count to zero.
 * Accepts NULL as a no-op.
 */
void hr_range_lock_release(hr_range_lock_t *rl)
{
	if (rl == NULL)
		return;

	HR_RL_LIST_LOCK(rl->vol);

	rl->pending--;

	/* wake up one fibril sleeping in hr_range_lock_acquire() */
	fibril_mutex_unlock(&rl->lock);

	if (rl->pending == 0) {
		list_remove(&rl->link);
		free(rl);
	}

	HR_RL_LIST_UNLOCK(rl->vol);
}
584
585static bool hr_range_lock_overlap(hr_range_lock_t *rl1, hr_range_lock_t *rl2)
586{
587 uint64_t rl1_start = rl1->off;
588 uint64_t rl1_end = rl1->off + rl1->len - 1;
589 uint64_t rl2_start = rl2->off;
590 uint64_t rl2_end = rl2->off + rl2->len - 1;
591
592 /* one ends before the other starts */
593 if (rl1_end < rl2_start || rl2_end < rl1_start)
594 return false;
595
596 return true;
597}
598
599void hr_mark_vol_state_dirty(hr_volume_t *vol)
600{
601 atomic_store(&vol->state_dirty, true);
602}
603
604static errno_t hr_add_svc_linked_to_list(list_t *list, service_id_t svc_id,
605 bool inited, void *md)
606{
607 HR_DEBUG("%s()", __func__);
608
609 errno_t rc = EOK;
610 struct dev_list_member *to_add;
611
612 if (list == NULL)
613 return EINVAL;
614
615 to_add = malloc(sizeof(struct dev_list_member));
616 if (to_add == NULL) {
617 rc = ENOMEM;
618 goto error;
619 }
620
621 to_add->svc_id = svc_id;
622 to_add->inited = inited;
623
624 if (md != NULL) {
625 to_add->md = md;
626 to_add->md_present = true;
627 } else {
628 to_add->md_present = false;
629 }
630
631 list_append(&to_add->link, list);
632
633error:
634 return rc;
635}
636
/** Free one dev list member, including its parsed metadata (if any). */
static void free_dev_list_member(struct dev_list_member *p)
{
	HR_DEBUG("%s()", __func__);

	/* md is owned by the member only when md_present is set */
	if (p->md_present)
		free(p->md);
	free(p);
}
645
646static void free_svc_id_list(list_t *list)
647{
648 HR_DEBUG("%s()", __func__);
649
650 struct dev_list_member *dev_id;
651 while (!list_empty(list)) {
652 dev_id = list_pop(list, struct dev_list_member, link);
653
654 free_dev_list_member(dev_id);
655 }
656}
657
/** Build a list of candidate devices for assembly scanning.
 *
 * Enumerates all disks known to vbd. For labeled disks the partitions
 * are added; label-less disks (or disks whose label holds zero
 * partitions) are added whole.
 *
 * @param list Initialized empty list to fill.
 *
 * @return EOK on success, error code otherwise (list is freed).
 */
static errno_t hr_fill_disk_part_svcs_list(list_t *list)
{
	HR_DEBUG("%s()", __func__);

	errno_t rc;
	size_t disk_count;
	service_id_t *disk_svcs = NULL;
	vbd_t *vbd = NULL;

	rc = vbd_create(&vbd);
	if (rc != EOK)
		goto error;

	rc = vbd_get_disks(vbd, &disk_svcs, &disk_count);
	if (rc != EOK)
		goto error;

	for (size_t i = 0; i < disk_count; i++) {
		vbd_disk_info_t disk_info;
		rc = vbd_disk_info(vbd, disk_svcs[i], &disk_info);
		if (rc != EOK)
			goto error;

		if (disk_info.ltype != lt_none) {
			/* labeled disk: add its partitions */
			size_t part_count;
			service_id_t *part_ids = NULL;
			rc = vbd_label_get_parts(vbd, disk_svcs[i], &part_ids,
			    &part_count);
			if (rc != EOK)
				goto error;

			for (size_t j = 0; j < part_count; j++) {
				vbd_part_info_t part_info;
				rc = vbd_part_get_info(vbd, part_ids[j],
				    &part_info);
				if (rc != EOK) {
					free(part_ids);
					goto error;
				}

				rc = hr_add_svc_linked_to_list(list,
				    part_info.svc_id, false, NULL);
				if (rc != EOK) {
					free(part_ids);
					goto error;
				}
			}

			free(part_ids);

			/*
			 * vbd can detect some bogus label type, but
			 * no partitions. In that case we handle the
			 * svc_id as a label-less disk.
			 *
			 * This can happen when creating an exfat fs
			 * in FreeBSD for example.
			 */
			if (part_count == 0)
				disk_info.ltype = lt_none;
		}

		if (disk_info.ltype == lt_none) {
			/* label-less disk: add the whole device */
			rc = hr_add_svc_linked_to_list(list, disk_svcs[i],
			    false, NULL);
			if (rc != EOK)
				goto error;
		}
	}

	free(disk_svcs);
	vbd_destroy(vbd);
	return EOK;
error:
	free_svc_id_list(list);
	if (disk_svcs != NULL)
		free(disk_svcs);
	vbd_destroy(vbd);

	return rc;
}
739
/** Open the block device of every not-yet-inited list member.
 *
 * Members whose device is already open elsewhere (EEXIST, i.e. in use
 * as an extent of an active volume) are removed from the list.
 * Successfully opened members get inited/fini set so that
 * block_fini_dev_list() will close them again.
 *
 * @return EOK on success, the first hard block_init() error otherwise
 *         (already-opened members stay marked for later cleanup).
 */
static errno_t block_init_dev_list(list_t *list)
{
	HR_DEBUG("%s()", __func__);

	list_foreach_safe(*list, cur_link, next_link) {
		struct dev_list_member *iter;
		iter = list_get_instance(cur_link, struct dev_list_member,
		    link);

		if (iter->inited)
			continue;

		errno_t rc = block_init(iter->svc_id);

		/* already used as an extent of active volume */
		/* XXX: figure out how it is with hotspares too */
		if (rc == EEXIST) {
			list_remove(cur_link);
			free_dev_list_member(iter);
			continue;
		}

		if (rc != EOK)
			return rc;

		iter->inited = true;
		iter->fini = true;
	}

	return EOK;
}
771
772static void block_fini_dev_list(list_t *list)
773{
774 HR_DEBUG("%s()", __func__);
775
776 list_foreach(*list, link, struct dev_list_member, iter) {
777 if (iter->inited && iter->fini) {
778 block_fini(iter->svc_id);
779 iter->inited = false;
780 iter->fini = false;
781 }
782 }
783}
784
785static errno_t hr_util_get_matching_md_svcs_list(list_t *rlist, list_t *list,
786 service_id_t svc_id, hr_metadata_type_t type_main,
787 void *metadata_struct_main)
788{
789 HR_DEBUG("%s()", __func__);
790
791 errno_t rc = EOK;
792
793 hr_superblock_ops_t *meta_ops = hr_get_meta_type_ops(type_main);
794
795 list_foreach(*list, link, struct dev_list_member, iter) {
796 if (iter->svc_id == svc_id)
797 continue;
798
799 void *metadata_struct;
800 hr_metadata_type_t type;
801
802 rc = hr_find_metadata(iter->svc_id, &metadata_struct, &type);
803 if (rc == ENOFS)
804 continue;
805 if (rc != EOK)
806 goto error;
807
808 if (type != type_main) {
809 free(metadata_struct);
810 continue;
811 }
812
813 if (!meta_ops->compare_uuids(metadata_struct_main,
814 metadata_struct)) {
815 free(metadata_struct);
816 continue;
817 }
818
819 rc = hr_add_svc_linked_to_list(rlist, iter->svc_id, true,
820 metadata_struct);
821 if (rc != EOK)
822 goto error;
823 }
824
825 return EOK;
826error:
827 free_svc_id_list(rlist);
828 return rc;
829}
830
831static errno_t hr_util_assemble_from_matching_list(list_t *list,
832 hr_metadata_type_t type, uint8_t vflags)
833{
834 HR_DEBUG("%s()", __func__);
835
836 errno_t rc = EOK;
837
838 hr_superblock_ops_t *meta_ops = hr_get_meta_type_ops(type);
839
840 link_t *memb_l = list_first(list);
841 struct dev_list_member *memb = list_get_instance(memb_l,
842 struct dev_list_member, link);
843
844 hr_level_t level = meta_ops->get_level(memb->md);
845 const char *devname = meta_ops->get_devname(memb->md);
846
847 hr_volume_t *vol;
848 rc = hr_create_vol_struct(&vol, level, devname, type, vflags);
849 if (rc != EOK)
850 return rc;
851
852 meta_ops->init_meta2vol(list, vol);
853 if (rc != EOK)
854 goto error;
855
856 rc = vol->hr_ops.create(vol);
857 if (rc != EOK)
858 goto error;
859
860 rc = hr_register_volume(vol);
861 if (rc != EOK)
862 goto error;
863
864 fibril_rwlock_write_lock(&hr_volumes_lock);
865 list_append(&vol->lvolumes, &hr_volumes);
866 fibril_rwlock_write_unlock(&hr_volumes_lock);
867
868 HR_NOTE("assembled volume \"%s\"\n", vol->devname);
869
870 return EOK;
871error:
872 hr_destroy_vol_struct(vol);
873 return rc;
874}
875
876static errno_t hr_fill_svcs_list_from_cfg(hr_config_t *cfg, list_t *list)
877{
878 HR_DEBUG("%s()", __func__);
879
880 errno_t rc = EOK;
881 for (size_t i = 0; i < cfg->dev_no; ++i) {
882 rc = hr_add_svc_linked_to_list(list, cfg->devs[i], false,
883 NULL);
884 if (rc != EOK)
885 goto error;
886 }
887
888 return EOK;
889error:
890 free_svc_id_list(list);
891 return rc;
892}
893
/** Try to assemble volumes from devices carrying valid metadata.
 *
 * @param cfg            If non-NULL, only the devices it lists are
 *                       scanned; otherwise all disks/partitions known
 *                       to vbd are scanned.
 * @param rassembled_cnt If non-NULL, receives the number of volumes
 *                       assembled (set on both success and failure).
 *
 * @return EOK on success, error code otherwise.
 */
errno_t hr_util_try_assemble(hr_config_t *cfg, size_t *rassembled_cnt)
{
	HR_DEBUG("%s()", __func__);

	/*
	 * scan partitions or disks:
	 *
	 * When we find a metadata block with valid
	 * magic, take UUID and try to find other matching
	 * UUIDs.
	 *
	 * We ignore extents that are a part of already
	 * active volumes. (even when the counter is lower
	 * on active volumes... XXX: use timestamp as initial counter value
	 * when assembling, or writing dirty metadata?)
	 */

	size_t asm_cnt = 0;
	errno_t rc;
	list_t dev_id_list;
	uint8_t vflags = 0;

	list_initialize(&dev_id_list);

	if (cfg == NULL) {
		rc = hr_fill_disk_part_svcs_list(&dev_id_list);
	} else {
		rc = hr_fill_svcs_list_from_cfg(cfg, &dev_id_list);
		vflags = cfg->vol_flags;
	}

	if (rc != EOK)
		goto error;

	rc = block_init_dev_list(&dev_id_list);
	if (rc != EOK)
		goto error;

	struct dev_list_member *iter;
	while (!list_empty(&dev_id_list)) {
		iter = list_pop(&dev_id_list, struct dev_list_member, link);

		void *metadata_struct_main;
		hr_metadata_type_t type;

		rc = hr_find_metadata(iter->svc_id, &metadata_struct_main, &type);
		if (rc == ENOFS) {
			/* no metadata on this device - skip it */
			block_fini(iter->svc_id);
			free_dev_list_member(iter);
			rc = EOK;
			continue;
		}

		/*
		 * NOTE(review): on the error paths below, iter (and on
		 * later paths metadata_struct_main) is not freed and
		 * its device is not block_fini()ed - looks like a leak;
		 * confirm intent.
		 */
		if (rc != EOK)
			goto error;

		char *svc_name = NULL;
		rc = loc_service_get_name(iter->svc_id, &svc_name);
		if (rc != EOK)
			goto error;
		HR_DEBUG("found valid metadata on %s (type = %s), matching "
		    "other extents\n",
		    svc_name, hr_get_metadata_type_str(type));
		free(svc_name);

		list_t matching_svcs_list;
		list_initialize(&matching_svcs_list);

		/* find all other devices with the same metadata UUID */
		rc = hr_util_get_matching_md_svcs_list(&matching_svcs_list,
		    &dev_id_list, iter->svc_id, type, metadata_struct_main);
		if (rc != EOK)
			goto error;

		/* add current iter to list as well */
		rc = hr_add_svc_linked_to_list(&matching_svcs_list,
		    iter->svc_id, true, metadata_struct_main);
		if (rc != EOK) {
			free_svc_id_list(&matching_svcs_list);
			goto error;
		}

		/* remove matching list members from dev_id_list */
		list_foreach(matching_svcs_list, link, struct dev_list_member,
		    iter2) {
			struct dev_list_member *to_remove;
			list_foreach_safe(dev_id_list, cur_link, next_link) {
				to_remove = list_get_instance(cur_link,
				    struct dev_list_member, link);
				if (to_remove->svc_id == iter2->svc_id) {
					list_remove(cur_link);
					free_dev_list_member(to_remove);
				}
			}
		}

		rc = hr_util_assemble_from_matching_list(&matching_svcs_list,
		    type, vflags);
		switch (rc) {
		case EOK:
			asm_cnt++;
			break;
		case ENOMEM:
			/* out of memory - give up entirely */
			goto error;
		default:
			/* failure to assemble one volume is not fatal */
			rc = EOK;
		}
		block_fini_dev_list(&matching_svcs_list);
		free_svc_id_list(&matching_svcs_list);
	}

error:
	if (rassembled_cnt != NULL)
		*rassembled_cnt = asm_cnt;

	block_fini_dev_list(&dev_id_list);
	free_svc_id_list(&dev_id_list);

	return rc;
}
1013
/** Add a hotspare device to a volume.
 *
 * Opens the device, checks capacity against the volume's truncated
 * extent size and appends it to the hotspare array.
 *
 * @return EOK on success; ELIMIT when all hotspare slots are taken;
 *         EEXIST when the device already is a hotspare of this
 *         volume; EINVAL when the device is too small; block layer
 *         error codes otherwise.
 */
errno_t hr_util_add_hotspare(hr_volume_t *vol, service_id_t hotspare)
{
	HR_DEBUG("%s()", __func__);

	errno_t rc = EOK;

	fibril_mutex_lock(&vol->hotspare_lock);

	if (vol->hotspare_no >= HR_MAX_HOTSPARES) {
		HR_ERROR("%s(): cannot add more hotspares "
		    "to \"%s\"\n", __func__, vol->devname);
		rc = ELIMIT;
		goto error;
	}

	/* reject duplicates */
	for (size_t i = 0; i < vol->hotspare_no; i++) {
		if (vol->hotspares[i].svc_id == hotspare) {
			HR_ERROR("%s(): hotspare (%" PRIun ") already used in "
			    "%s\n", __func__, hotspare, vol->devname);
			rc = EEXIST;
			goto error;
		}
	}

	rc = block_init(hotspare);
	if (rc != EOK)
		goto error;

	uint64_t hs_blkno;
	rc = block_get_nblocks(hotspare, &hs_blkno);
	if (rc != EOK) {
		block_fini(hotspare);
		goto error;
	}

	/* must be at least as large as the truncated extent size */
	if (hs_blkno < vol->truncated_blkno) {
		HR_ERROR("%s(): hotspare (%" PRIun ") doesn't have enough "
		    "blocks\n", __func__, hotspare);

		rc = EINVAL;
		block_fini(hotspare);
		goto error;
	}

	size_t hs_idx = vol->hotspare_no;

	vol->hotspare_no++;

	hr_update_hotspare_svc_id(vol, hs_idx, hotspare);
	hr_update_hotspare_state(vol, hs_idx, HR_EXT_HOTSPARE);

	hr_mark_vol_state_dirty(vol);
error:
	fibril_mutex_unlock(&vol->hotspare_lock);
	return rc;
}
1070
/** XOR @p src into @p dst (dst ^= src) over @p size bytes.
 *
 * Processes the bulk a 64-bit word at a time, then finishes any
 * trailing bytes so that sizes which are not a multiple of 8 are
 * fully handled (previously the tail was silently ignored).
 * Buffers must be suitably aligned for uint64_t access (block
 * buffers are heap-allocated, hence aligned).
 */
void hr_raid5_xor(void *dst, const void *src, size_t size)
{
	uint64_t *d = dst;
	const uint64_t *s = src;
	size_t words = size / sizeof(uint64_t);

	for (size_t i = 0; i < words; ++i)
		*d++ ^= *s++;

	/* handle trailing size % 8 bytes */
	uint8_t *db = (uint8_t *)d;
	const uint8_t *sb = (const uint8_t *)s;
	for (size_t i = 0; i < size % sizeof(uint64_t); ++i)
		db[i] ^= sb[i];
}
1080
1081errno_t hr_sync_extents(hr_volume_t *vol)
1082{
1083 errno_t rc = EOK;
1084
1085 fibril_rwlock_read_lock(&vol->extents_lock);
1086 for (size_t e = 0; e < vol->extent_no; e++) {
1087 fibril_rwlock_read_lock(&vol->states_lock);
1088 hr_ext_state_t s = vol->extents[e].state;
1089 fibril_rwlock_read_unlock(&vol->states_lock);
1090
1091 service_id_t svc_id = vol->extents[e].svc_id;
1092
1093 if (s == HR_EXT_ONLINE || s == HR_EXT_REBUILD) {
1094 errno_t rc = hr_sync_cache(svc_id, 0, 0);
1095 if (rc != EOK && rc != ENOTSUP)
1096 vol->hr_ops.ext_state_cb(vol, e, rc);
1097 }
1098 }
1099 fibril_rwlock_read_unlock(&vol->extents_lock);
1100
1101 vol->hr_ops.vol_state_eval(vol);
1102
1103 fibril_rwlock_read_lock(&vol->states_lock);
1104 hr_vol_state_t s = vol->state;
1105 fibril_rwlock_read_unlock(&vol->states_lock);
1106
1107 if (s == HR_VOL_FAULTY)
1108 rc = EIO;
1109
1110 return rc;
1111}
1112
/** Pick an extent to rebuild and move the volume into rebuild state.
 *
 * Selection priority: an extent already in REBUILD state (resume),
 * then an INVALID extent, then any non-ONLINE extent which is
 * replaced by the last hotspare via hr_swap_hs().
 *
 * @param vol         Volume; must not be RAID 0 and must currently be
 *                    HR_VOL_DEGRADED.
 * @param rebuild_idx Place to store the index of the rebuild extent.
 *
 * @return EOK on success, EINVAL when there is nothing to rebuild or
 *         no usable hotspare is available.
 */
errno_t hr_init_rebuild(hr_volume_t *vol, size_t *rebuild_idx)
{
	HR_DEBUG("%s()", __func__);

	errno_t rc = EOK;
	size_t bad = vol->extent_no;

	/* RAID 0 has no redundancy, nothing to rebuild from */
	if (vol->level == HR_LVL_0)
		return EINVAL;

	fibril_rwlock_read_lock(&vol->states_lock);
	if (vol->state != HR_VOL_DEGRADED) {
		fibril_rwlock_read_unlock(&vol->states_lock);
		return EINVAL;
	}
	fibril_rwlock_read_unlock(&vol->states_lock);

	fibril_rwlock_write_lock(&vol->extents_lock);
	fibril_rwlock_write_lock(&vol->states_lock);
	fibril_mutex_lock(&vol->hotspare_lock);

	/* 1. an interrupted rebuild takes precedence - resume it */
	size_t rebuild = vol->extent_no;
	for (size_t i = 0; i < vol->extent_no; i++) {
		if (vol->extents[i].state == HR_EXT_REBUILD) {
			rebuild = i;
			break;
		}
	}

	if (rebuild < vol->extent_no) {
		bad = rebuild;
		goto init_rebuild;
	}

	/* 2. then an INVALID extent (still attached, data stale) */
	size_t invalid = vol->extent_no;
	for (size_t i = 0; i < vol->extent_no; i++) {
		if (vol->extents[i].state == HR_EXT_INVALID) {
			invalid = i;
			break;
		}
	}

	if (invalid < vol->extent_no) {
		bad = invalid;
		goto init_rebuild;
	}

	/* 3. otherwise any non-ONLINE extent, to be swapped for a hotspare */
	for (size_t i = 0; i < vol->extent_no; i++) {
		if (vol->extents[i].state != HR_EXT_ONLINE) {
			bad = i;
			break;
		}
	}

	if (bad == vol->extent_no || vol->hotspare_no == 0) {
		rc = EINVAL;
		goto error;
	}

	/* always consume the last hotspare slot */
	size_t hotspare_idx = vol->hotspare_no - 1;

	hr_ext_state_t hs_state = vol->hotspares[hotspare_idx].state;
	if (hs_state != HR_EXT_HOTSPARE) {
		HR_ERROR("hr_raid1_rebuild(): invalid hotspare"
		    "state \"%s\", aborting rebuild\n",
		    hr_get_ext_state_str(hs_state));
		rc = EINVAL;
		goto error;
	}

	rc = hr_swap_hs(vol, bad, hotspare_idx);
	if (rc != EOK) {
		HR_ERROR("hr_raid1_rebuild(): swapping "
		    "hotspare failed, aborting rebuild\n");
		goto error;
	}

	hr_extent_t *rebuild_ext = &vol->extents[bad];

	HR_DEBUG("hr_raid1_rebuild(): starting REBUILD on extent no. %zu "
	    "(%" PRIun ")\n", bad, rebuild_ext->svc_id);

init_rebuild:
	hr_update_ext_state(vol, bad, HR_EXT_REBUILD);
	hr_update_vol_state(vol, HR_VOL_REBUILD);

	*rebuild_idx = bad;
error:
	fibril_mutex_unlock(&vol->hotspare_lock);
	fibril_rwlock_write_unlock(&vol->states_lock);
	fibril_rwlock_write_unlock(&vol->extents_lock);

	return rc;
}
1207
/** Swap the bad extent @p bad with the hotspare at index @p hs.
 *
 * Moves the hotspare's svc_id into the extent slot (state
 * HR_EXT_HOTSPARE), empties the hotspare slot and closes the
 * replaced faulty device, if any.
 *
 * Caller must hold the extents lock (write), states lock (write) and
 * the hotspare lock - see hr_init_rebuild() and the asserts in the
 * hr_update_* helpers.
 */
static errno_t hr_swap_hs(hr_volume_t *vol, size_t bad, size_t hs)
{
	HR_DEBUG("%s()", __func__);

	service_id_t faulty_svc_id = vol->extents[bad].svc_id;
	service_id_t hs_svc_id = vol->hotspares[hs].svc_id;

	hr_update_ext_svc_id(vol, bad, hs_svc_id);
	hr_update_ext_state(vol, bad, HR_EXT_HOTSPARE);

	hr_update_hotspare_svc_id(vol, hs, 0);
	hr_update_hotspare_state(vol, hs, HR_EXT_MISSING);

	vol->hotspare_no--;

	/* close the device we just replaced */
	if (faulty_svc_id != 0)
		block_fini(faulty_svc_id);

	return EOK;
}
1228
1229/** @}
1230 */
Note: See TracBrowser for help on using the repository browser.