Changeset cdfcaea in mainline
Timestamp: 2025-06-23T13:39:56Z (4 months ago)
Children: f18e36e
Parents: 09c195e8
Location: uspace/srv/bd/hr
Files: 6 edited
Legend: in the diffs below, unmodified lines are prefixed with a space, added lines with "+", and removed lines with "-".
uspace/srv/bd/hr/io.c (r09c195e8 → rcdfcaea)

@@ -315,5 +315,5 @@
 	    stripe->parity + io->strip_off);
 	if (rc != EOK)
-		io->vol->hr_ops.ext_state_cb(io->vol, stripe->p_extent, rc);
+		io->vol->hr_ops.ext_state_cb(io->vol, io->extent, rc);
 
 	return rc;
uspace/srv/bd/hr/parity_stripe.c (r09c195e8 → rcdfcaea)

@@ -53,13 +53,11 @@
 	    range_t *);
 static size_t hr_stripe_merge_extent_spans(hr_stripe_t *, size_t, range_t [2]);
-static void hr_reset_stripe(hr_stripe_t *);
 static void hr_stripe_extend_range(range_t *, const range_t *);
 static bool hr_ranges_overlap(const range_t *, const range_t *, range_t *);
 
-hr_stripe_t *hr_create_stripes(hr_volume_t *vol, size_t cnt, bool write)
-{
-	hr_stripe_t *stripes = calloc(cnt, sizeof(*stripes));
-	if (stripes == NULL)
-		return NULL;
+hr_stripe_t *hr_create_stripes(hr_volume_t *vol, uint64_t strip_size,
+    size_t cnt, bool write)
+{
+	hr_stripe_t *stripes = hr_calloc_waitok(cnt, sizeof(*stripes));
 
 	for (size_t i = 0; i < cnt; i++) {
@@ -68,17 +66,11 @@
 		stripes[i].vol = vol;
 		stripes[i].write = write;
-		stripes[i].parity = calloc(1, vol->strip_size);
-		if (stripes[i].parity == NULL)
-			goto error;
-		stripes[i].extent_span =
-		    calloc(vol->extent_no, sizeof(*stripes[i].extent_span));
-		if (stripes[i].extent_span == NULL)
-			goto error;
+		stripes[i].parity = hr_calloc_waitok(1, strip_size);
+		stripes[i].parity_size = strip_size;
+		stripes[i].extent_span = hr_calloc_waitok(vol->extent_no,
+		    sizeof(*stripes[i].extent_span));
 	}
 
 	return stripes;
-error:
-	hr_destroy_stripes(stripes, cnt);
-	return NULL;
 }
 
@@ -96,4 +88,16 @@
 
 	free(stripes);
+}
+
+void hr_reset_stripe(hr_stripe_t *stripe)
+{
+	memset(stripe->parity, 0, stripe->parity_size);
+	stripe->ps_added = 0;
+	stripe->ps_to_be_added = 0;
+	stripe->p_count_final = false;
+
+	stripe->rc = EOK;
+	stripe->abort = false;
+	stripe->done = false;
 }
 
@@ -878,18 +882,4 @@
 }
 
-static void hr_reset_stripe(hr_stripe_t *stripe)
-{
-	printf("%s\n", __func__);
-
-	memset(stripe->parity, 0, stripe->vol->strip_size);
-	stripe->ps_added = 0;
-	stripe->ps_to_be_added = 0;
-	stripe->p_count_final = false;
-
-	stripe->rc = EOK;
-	stripe->abort = false;
-	stripe->done = false;
-}
-
 /** Extend a range.
  *
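The move from plain calloc() to hr_calloc_waitok() is what lets the error: unwind path in hr_create_stripes() disappear. A minimal sketch of the wait-OK idea, assuming the helper simply retries until the allocation succeeds; the retry loop and back-off interval below are illustrative, not the actual implementation in util.c:

	#include <stdlib.h>
	#include <fibril_synch.h>

	/*
	 * Sketch of a wait-OK allocator: it never returns NULL, blocking
	 * the calling fibril until memory becomes available. This contract
	 * is what allows hr_create_stripes() to drop its NULL checks and
	 * ENOMEM unwind path.
	 */
	static void *calloc_waitok_sketch(size_t nmemb, size_t size)
	{
		void *p;

		while ((p = calloc(nmemb, size)) == NULL)
			fibril_usleep(10000);	/* back off, then retry */

		return p;
	}

Exporting hr_reset_stripe() complements this: a caller can allocate one stripe up front and recycle it across batches, which is exactly how the reworked rebuild loop in raid5.c below uses it.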
uspace/srv/bd/hr/parity_stripe.h (r09c195e8 → rcdfcaea)

@@ -73,4 +73,5 @@
 	fibril_mutex_t parity_lock;
 	uint8_t *parity; /* the actual parity strip */
+	uint64_t parity_size;
 
 	/* parity writers waiting until this many parity commits */
@@ -104,6 +105,7 @@
 } hr_stripe_t;
 
-extern hr_stripe_t *hr_create_stripes(hr_volume_t *, size_t, bool);
+extern hr_stripe_t *hr_create_stripes(hr_volume_t *, uint64_t, size_t, bool);
 extern void hr_destroy_stripes(hr_stripe_t *, size_t);
+extern void hr_reset_stripe(hr_stripe_t *);
 extern void hr_stripe_commit_parity(hr_stripe_t *, uint64_t, const void *,
     uint64_t);
uspace/srv/bd/hr/raid5.c (r09c195e8 → rcdfcaea)

@@ -57,10 +57,8 @@
 
 static void hr_raid5_vol_state_eval_forced(hr_volume_t *);
-
 static size_t hr_raid5_parity_extent(hr_level_t, hr_layout_t, size_t,
     uint64_t);
 static size_t hr_raid5_data_extent(hr_level_t, hr_layout_t, size_t, uint64_t,
     uint64_t);
-
 static errno_t hr_raid5_rebuild(void *);
 
@@ -251,16 +249,8 @@
 	size_t stripes_cnt = end_stripe - start_stripe + 1;
 
-	hr_stripe_t *stripes = hr_create_stripes(vol, stripes_cnt, false);
-	if (stripes == NULL)
-		return ENOMEM;
-
-	/*
-	 * Pre-allocate range locks, because after group creation and
-	 * firing off IO requests there is no easy consistent ENOMEM error
-	 * path.
-	 */
+	hr_stripe_t *stripes = hr_create_stripes(vol, vol->strip_size,
+	    stripes_cnt, false);
+
 	hr_range_lock_t **rlps = hr_malloc_waitok(stripes_cnt * sizeof(*rlps));
-	for (size_t i = 0; i < stripes_cnt; i++)
-		rlps[i] = hr_malloc_waitok(sizeof(**rlps));
 
 	/*
@@ -272,5 +262,5 @@
 	for (uint64_t s = start_stripe; s <= end_stripe; s++) {
 		uint64_t relative = s - start_stripe;
-		hr_range_lock_acquire_noalloc(rlps[relative], vol, s, 1);
+		rlps[relative] = hr_range_lock_acquire(vol, s, 1);
 	}
 
@@ -378,4 +368,6 @@
 		hr_range_lock_release(rlps[i]);
 
+	free(rlps);
+
 	hr_destroy_stripes(stripes, stripes_cnt);
 
@@ -420,7 +412,6 @@
 	size_t stripes_cnt = end_stripe - start_stripe + 1;
 
-	hr_stripe_t *stripes = hr_create_stripes(vol, stripes_cnt, true);
-	if (stripes == NULL)
-		return ENOMEM;
+	hr_stripe_t *stripes = hr_create_stripes(vol, vol->strip_size,
+	    stripes_cnt, true);
 
 	uint64_t stripe_size = strip_size * (vol->extent_no - 1);
@@ -469,12 +460,5 @@
 	}
 
-	/*
-	 * Pre-allocate range locks, because after group creation and
-	 * firing off IO requests there is no easy consistent ENOMEM error
-	 * path.
-	 */
 	hr_range_lock_t **rlps = hr_malloc_waitok(stripes_cnt * sizeof(*rlps));
-	for (size_t i = 0; i < stripes_cnt; i++)
-		rlps[i] = hr_malloc_waitok(sizeof(**rlps));
 
 	/*
@@ -486,5 +470,5 @@
 	for (uint64_t s = start_stripe; s <= end_stripe; s++) {
 		uint64_t relative = s - start_stripe;
-		hr_range_lock_acquire_noalloc(rlps[relative], vol, s, 1);
+		rlps[relative] = hr_range_lock_acquire(vol, s, 1);
 	}
 
@@ -590,4 +574,6 @@
 		hr_range_lock_release(rlps[i]);
 
+	free(rlps);
+
 	hr_destroy_stripes(stripes, stripes_cnt);
 
@@ -622,4 +608,10 @@
 		if (vol->extents[i].state != HR_EXT_ONLINE)
 			bad++;
+
+	size_t invalid_no = hr_count_extents(vol, HR_EXT_INVALID);
+
+	fibril_mutex_lock(&vol->hotspare_lock);
+	size_t hs_no = vol->hotspare_no;
+	fibril_mutex_unlock(&vol->hotspare_lock);
 
 	switch (bad) {
@@ -633,9 +625,5 @@
 
 	if (state != HR_VOL_REBUILD) {
-		/* XXX: allow REBUILD on INVALID extents */
-		fibril_mutex_lock(&vol->hotspare_lock);
-		size_t hs_no = vol->hotspare_no;
-		fibril_mutex_unlock(&vol->hotspare_lock);
-		if (hs_no > 0) {
+		if (hs_no > 0 || invalid_no > 0) {
 			fid_t fib = fibril_create(hr_raid5_rebuild,
 			    vol);
@@ -655,14 +643,4 @@
 	fibril_rwlock_write_unlock(&vol->states_lock);
 	fibril_rwlock_read_unlock(&vol->extents_lock);
-}
-
-static void xor(void *dst, const void *src, size_t size)
-{
-	size_t i;
-	uint64_t *d = dst;
-	const uint64_t *s = src;
-
-	for (i = 0; i < size / sizeof(uint64_t); ++i)
-		*d++ ^= *s++;
 }
 
@@ -730,116 +708,108 @@
 static errno_t hr_raid5_rebuild(void *arg)
 {
-	HR_DEBUG("hr_raid5_rebuild()\n");
+	HR_DEBUG("%s()", __func__);
 
 	hr_volume_t *vol = arg;
 	errno_t rc = EOK;
+	size_t rebuild_idx;
 	void *buf = NULL, *xorbuf = NULL;
 
-	fibril_rwlock_read_lock(&vol->extents_lock);
-	fibril_rwlock_write_lock(&vol->states_lock);
-
-	if (vol->hotspare_no == 0) {
-		HR_WARN("hr_raid5_rebuild(): no free hotspares on \"%s\", "
-		    "aborting rebuild\n", vol->devname);
-		/* retval isn't checked for now */
-		goto end;
-	}
-
-	size_t bad = vol->extent_no;
-	for (size_t i = 0; i < vol->extent_no; i++) {
-		if (vol->extents[i].state == HR_EXT_FAILED) {
-			bad = i;
-			break;
-		}
-	}
-
-	if (bad == vol->extent_no) {
-		HR_WARN("hr_raid5_rebuild(): no bad extent on \"%s\", "
-		    "aborting rebuild\n", vol->devname);
-		/* retval isn't checked for now */
-		goto end;
-	}
-
-	size_t hotspare_idx = vol->hotspare_no - 1;
-
-	hr_ext_state_t hs_state = vol->hotspares[hotspare_idx].state;
-	if (hs_state != HR_EXT_HOTSPARE) {
-		HR_ERROR("hr_raid5_rebuild(): invalid hotspare state \"%s\", "
-		    "aborting rebuild\n", hr_get_ext_state_str(hs_state));
-		rc = EINVAL;
-		goto end;
-	}
-
-	HR_DEBUG("hr_raid5_rebuild(): swapping in hotspare\n");
-
-	block_fini(vol->extents[bad].svc_id);
-
-	vol->extents[bad].svc_id = vol->hotspares[hotspare_idx].svc_id;
-	hr_update_ext_state(vol, bad, HR_EXT_HOTSPARE);
-
-	vol->hotspares[hotspare_idx].svc_id = 0;
-	fibril_mutex_lock(&vol->hotspare_lock);
-	hr_update_hotspare_state(vol, hotspare_idx, HR_EXT_MISSING);
-	fibril_mutex_unlock(&vol->hotspare_lock);
-
-	vol->hotspare_no--;
-
-	hr_extent_t *rebuild_ext = &vol->extents[bad];
-
-	HR_DEBUG("hr_raid5_rebuild(): starting rebuild on (%" PRIun ")\n",
-	    rebuild_ext->svc_id);
-
-	hr_update_ext_state(vol, bad, HR_EXT_REBUILD);
-	hr_update_vol_state(vol, HR_VOL_REBUILD);
+	rc = hr_init_rebuild(vol, &rebuild_idx);
+	if (rc != EOK)
+		return rc;
 
 	uint64_t max_blks = DATA_XFER_LIMIT / vol->bsize;
 	uint64_t left = vol->data_blkno / (vol->extent_no - 1);
-	buf = malloc(max_blks * vol->bsize);
-	xorbuf = malloc(max_blks * vol->bsize);
+	buf = hr_malloc_waitok(max_blks * vol->bsize);
+	xorbuf = hr_malloc_waitok(max_blks * vol->bsize);
+
+	uint64_t strip_size = vol->strip_size / vol->bsize; /* in blocks */
 
 	uint64_t ba = 0, cnt;
 	hr_add_data_offset(vol, &ba);
 
+	/*
+	 * this is not necessary because a rebuild is
+	 * protected by itself, i.e. there can be only
+	 * one REBUILD at a time
+	 */
+	fibril_rwlock_read_lock(&vol->extents_lock);
+
+	/* increment metadata counter only on first write */
+	bool exp = false;
+	if (atomic_compare_exchange_strong(&vol->first_write, &exp, true)) {
+		vol->meta_ops->inc_counter(vol);
+		vol->meta_ops->save(vol, WITH_STATE_CALLBACK);
+	}
+
+	hr_range_lock_t *rl = NULL;
+	hr_stripe_t *stripe = hr_create_stripes(vol, max_blks * vol->bsize, 1,
+	    false);
+
+	unsigned int percent, old_percent = 100;
 	while (left != 0) {
 		cnt = min(left, max_blks);
 
-		/*
-		 * Almost the same as read_degraded,
-		 * but we don't want to allocate new
-		 * xorbuf each blk rebuild batch.
-		 */
-		bool first = true;
-		for (size_t i = 0; i < vol->extent_no; i++) {
-			if (i == bad)
+		uint64_t strip_no = ba / strip_size;
+		uint64_t last_ba = ba + cnt - 1;
+		uint64_t end_strip_no = last_ba / strip_size;
+		uint64_t start_stripe = strip_no / (vol->extent_no - 1);
+		uint64_t end_stripe = end_strip_no / (vol->extent_no - 1);
+		size_t stripes_cnt = end_stripe - start_stripe + 1;
+
+		stripe->ps_to_be_added = vol->extent_no - 1;
+		stripe->p_count_final = true;
+
+		hr_fgroup_t *worker_group =
+		    hr_fgroup_create(vol->fge, vol->extent_no);
+
+		rl = hr_range_lock_acquire(vol, start_stripe, stripes_cnt);
+
+		atomic_store_explicit(&vol->rebuild_blk, ba,
+		    memory_order_relaxed);
+
+		for (size_t e = 0; e < vol->extent_no; e++) {
+			if (e == rebuild_idx)
 				continue;
-			if (first)
-				rc = block_read_direct(vol->extents[i].svc_id,
-				    ba, cnt, xorbuf);
-			else
-				rc = block_read_direct(vol->extents[i].svc_id,
-				    ba, cnt, buf);
-			if (rc != EOK) {
-				hr_raid5_ext_state_cb(vol, i, rc);
-				HR_ERROR("rebuild on \"%s\" (%" PRIun "), "
-				    "failed due to a failed ONLINE extent, "
-				    "number %zu\n",
-				    vol->devname, vol->svc_id, i);
-				goto end;
-			}
-
-			if (!first)
-				xor(xorbuf, buf, cnt * vol->bsize);
-			else
-				first = false;
-		}
-
-		rc = block_write_direct(rebuild_ext->svc_id, ba, cnt, xorbuf);
-		if (rc != EOK) {
-			hr_raid5_ext_state_cb(vol, bad, rc);
-			HR_ERROR("rebuild on \"%s\" (%" PRIun "), failed due to "
-			    "the rebuilt extent number %zu failing\n",
-			    vol->devname, vol->svc_id, bad);
+
+			hr_io_raid5_t *io = hr_fgroup_alloc(worker_group);
+			io->extent = e;
+			io->ba = ba;
+			io->cnt = cnt;
+			io->strip_off = 0;
+			io->vol = vol;
+			io->stripe = stripe;
+
+			hr_fgroup_submit(worker_group,
+			    hr_io_raid5_reconstruct_reader, io);
+		}
+
+		hr_io_raid5_t *io = hr_fgroup_alloc(worker_group);
+		io->extent = rebuild_idx;
+		io->ba = ba;
+		io->cnt = cnt;
+		io->strip_off = 0;
+		io->vol = vol;
+		io->stripe = stripe;
+
+		hr_fgroup_submit(worker_group, hr_io_raid5_parity_writer, io);
+
+		size_t failed;
+		(void)hr_fgroup_wait(worker_group, NULL, &failed);
+		if (failed > 0) {
+			hr_range_lock_release(rl);
+			HR_NOTE("\"%s\": REBUILD aborted.\n", vol->devname);
 			goto end;
 		}
+
+		percent = ((ba + cnt) * 100) / vol->data_blkno;
+		if (percent != old_percent) {
+			if (percent % 5 == 0)
+				HR_DEBUG("\"%s\" REBUILD progress: %u%%\n",
+				    vol->devname, percent);
+		}
+
+		hr_range_lock_release(rl);
+		hr_reset_stripe(stripe);
 
 		ba += cnt;
@@ -850,37 +820,27 @@
 	 * during rebuild.
 	 */
-
-	/*
-	 * fibril_rwlock_write_unlock(&vol->states_lock);
-	 * fibril_mutex_unlock(&vol->lock);
-	 * fibril_mutex_lock(&vol->lock);
-	 * fibril_rwlock_write_lock(&vol->states_lock);
-	 */
 	}
 
 	HR_DEBUG("hr_raid5_rebuild(): rebuild finished on \"%s\" (%" PRIun "), "
-	    "extent number %zu\n", vol->devname, vol->svc_id, hotspare_idx);
-
-	hr_update_ext_state(vol, bad, HR_EXT_ONLINE);
+	    "extent number %zu\n", vol->devname, vol->svc_id, rebuild_idx);
+
+	fibril_rwlock_write_lock(&vol->states_lock);
+
+	hr_update_ext_state(vol, rebuild_idx, HR_EXT_ONLINE);
+
+	hr_mark_vol_state_dirty(vol);
 
 	fibril_rwlock_write_unlock(&vol->states_lock);
+
+	/* (void)vol->meta_ops->save(vol, WITH_STATE_CALLBACK); */
+
+end:
 	fibril_rwlock_read_unlock(&vol->extents_lock);
 
-	rc = vol->meta_ops->save(vol, WITH_STATE_CALLBACK);
-
-	fibril_rwlock_read_lock(&vol->extents_lock);
-	fibril_rwlock_write_lock(&vol->states_lock);
-
-end:
-	hr_raid5_vol_state_eval_forced(vol);
-
-	fibril_rwlock_write_unlock(&vol->states_lock);
-	fibril_rwlock_read_unlock(&vol->extents_lock);
-
-	if (buf != NULL)
-		free(buf);
-
-	if (xorbuf != NULL)
-		free(xorbuf);
+	hr_raid1_vol_state_eval(vol);
+
+	hr_destroy_stripes(stripe, 1);
+	free(buf);
+	free(xorbuf);
 
 	return rc;
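Taken together, the raid5.c changes replace the serial read-and-XOR rebuild (and delete the local xor() helper) with a per-batch worker group: one reconstruct reader per healthy extent plus one parity writer for the extent under rebuild, all under a range lock. A condensed sketch of the new per-batch control flow, reusing names from the changeset; the geometry arithmetic and error reporting are elided, and the reader/writer hand-off through the stripe (via ps_to_be_added and p_count_final) is inferred rather than quoted:

	/* One stripe, sized to the transfer limit, reused for every batch. */
	hr_stripe_t *stripe = hr_create_stripes(vol, max_blks * vol->bsize, 1,
	    false);

	while (left != 0) {
		cnt = min(left, max_blks);

		/* The writer proceeds once this many strips are committed. */
		stripe->ps_to_be_added = vol->extent_no - 1;
		stripe->p_count_final = true;

		hr_fgroup_t *group = hr_fgroup_create(vol->fge, vol->extent_no);
		rl = hr_range_lock_acquire(vol, start_stripe, stripes_cnt);

		/*
		 * Submit hr_io_raid5_reconstruct_reader for every healthy
		 * extent and hr_io_raid5_parity_writer for rebuild_idx.
		 */

		size_t failed;
		(void)hr_fgroup_wait(group, NULL, &failed);
		if (failed > 0)
			goto end;	/* abort the rebuild */

		hr_range_lock_release(rl);
		hr_reset_stripe(stripe);	/* zero parity, rearm counters */

		ba += cnt;
		left -= cnt;
	}

Creating the single stripe once, sized to max_blks * vol->bsize, and resetting it each iteration avoids the per-batch allocation that the old xorbuf comment worried about.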
uspace/srv/bd/hr/util.c (r09c195e8 → rcdfcaea)

@@ -55,6 +55,4 @@
 #include "var.h"
 
-static hr_range_lock_t *hr_range_lock_acquire_internal(hr_range_lock_t *,
-    hr_volume_t *, uint64_t, uint64_t);
 static bool hr_range_lock_overlap(hr_range_lock_t *, hr_range_lock_t *);
 static errno_t hr_add_svc_linked_to_list(list_t *, service_id_t, bool, void *);
@@ -515,11 +513,4 @@
 }
 
-void hr_range_lock_acquire_noalloc(hr_range_lock_t *rl, hr_volume_t *vol,
-    uint64_t ba, uint64_t cnt)
-{
-	assert(rl != NULL);
-	(void)hr_range_lock_acquire_internal(rl, vol, ba, cnt);
-}
-
 hr_range_lock_t *hr_range_lock_acquire(hr_volume_t *vol, uint64_t ba,
     uint64_t cnt)
@@ -527,10 +518,4 @@
 	hr_range_lock_t *rl = hr_malloc_waitok(sizeof(hr_range_lock_t));
 
-	return hr_range_lock_acquire_internal(rl, vol, ba, cnt);
-}
-
-static hr_range_lock_t *hr_range_lock_acquire_internal(hr_range_lock_t *rl,
-    hr_volume_t *vol, uint64_t ba, uint64_t cnt)
-{
 	rl->vol = vol;
 	rl->off = ba;
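With wait-OK allocation available, the pre-allocating hr_range_lock_acquire_noalloc() variant and the hr_range_lock_acquire_internal() helper fold into hr_range_lock_acquire() itself, since allocation can no longer fail observably. The caller pattern after this change reduces to the following sketch:

	/*
	 * hr_range_lock_acquire() now allocates internally via
	 * hr_malloc_waitok(), so callers need no pre-allocated storage
	 * and no ENOMEM handling.
	 */
	hr_range_lock_t *rl = hr_range_lock_acquire(vol, ba, cnt);

	/* ... issue I/O against the locked block range ... */

	hr_range_lock_release(rl);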
uspace/srv/bd/hr/util.h (r09c195e8 → rcdfcaea)

@@ -103,6 +103,4 @@
 extern size_t hr_count_extents(hr_volume_t *, hr_ext_state_t);
 extern void hr_mark_vol_state_dirty(hr_volume_t *);
-extern void hr_range_lock_acquire_noalloc(hr_range_lock_t *, hr_volume_t *,
-    uint64_t, uint64_t);
 extern hr_range_lock_t *hr_range_lock_acquire(hr_volume_t *, uint64_t,
     uint64_t);