Changeset cdfcaea in mainline for uspace/srv/bd/hr/raid5.c
- Timestamp:
- 2025-06-23T13:39:56Z (9 months ago)
- Children:
- f18e36e
- Parents:
- 09c195e8
- File:
-
- 1 edited
-
uspace/srv/bd/hr/raid5.c (modified) (13 diffs)
Legend:
- Unmodified
- Added
- Removed
-
uspace/srv/bd/hr/raid5.c
r09c195e8 rcdfcaea 57 57 58 58 static void hr_raid5_vol_state_eval_forced(hr_volume_t *); 59 60 59 static size_t hr_raid5_parity_extent(hr_level_t, hr_layout_t, size_t, 61 60 uint64_t); 62 61 static size_t hr_raid5_data_extent(hr_level_t, hr_layout_t, size_t, uint64_t, 63 62 uint64_t); 64 65 63 static errno_t hr_raid5_rebuild(void *); 66 64 … … 251 249 size_t stripes_cnt = end_stripe - start_stripe + 1; 252 250 253 hr_stripe_t *stripes = hr_create_stripes(vol, stripes_cnt, false); 254 if (stripes == NULL) 255 return ENOMEM; 256 257 /* 258 * Pre-allocate range locks, because after group creation and 259 * firing off IO requests there is no easy consistent ENOMEM error 260 * path. 261 */ 251 hr_stripe_t *stripes = hr_create_stripes(vol, vol->strip_size, 252 stripes_cnt, false); 253 262 254 hr_range_lock_t **rlps = hr_malloc_waitok(stripes_cnt * sizeof(*rlps)); 263 for (size_t i = 0; i < stripes_cnt; i++)264 rlps[i] = hr_malloc_waitok(sizeof(**rlps));265 255 266 256 /* … … 272 262 for (uint64_t s = start_stripe; s <= end_stripe; s++) { 273 263 uint64_t relative = s - start_stripe; 274 hr_range_lock_acquire_noalloc(rlps[relative],vol, s, 1);264 rlps[relative] = hr_range_lock_acquire(vol, s, 1); 275 265 } 276 266 … … 378 368 hr_range_lock_release(rlps[i]); 379 369 370 free(rlps); 371 380 372 hr_destroy_stripes(stripes, stripes_cnt); 381 373 … … 420 412 size_t stripes_cnt = end_stripe - start_stripe + 1; 421 413 422 hr_stripe_t *stripes = hr_create_stripes(vol, stripes_cnt, true); 423 if (stripes == NULL) 424 return ENOMEM; 414 hr_stripe_t *stripes = hr_create_stripes(vol, vol->strip_size, 415 stripes_cnt, true); 425 416 426 417 uint64_t stripe_size = strip_size * (vol->extent_no - 1); … … 469 460 } 470 461 471 /*472 * Pre-allocate range locks, because after group creation and473 * firing off IO requests there is no easy consistent ENOMEM error474 * path.475 */476 462 hr_range_lock_t **rlps = hr_malloc_waitok(stripes_cnt * sizeof(*rlps)); 477 for (size_t i = 0; i < stripes_cnt; i++)478 rlps[i] = hr_malloc_waitok(sizeof(**rlps));479 463 480 464 /* … … 486 470 for (uint64_t s = start_stripe; s <= end_stripe; s++) { 487 471 uint64_t relative = s - start_stripe; 488 hr_range_lock_acquire_noalloc(rlps[relative],vol, s, 1);472 rlps[relative] = hr_range_lock_acquire(vol, s, 1); 489 473 } 490 474 … … 590 574 hr_range_lock_release(rlps[i]); 591 575 576 free(rlps); 577 592 578 hr_destroy_stripes(stripes, stripes_cnt); 593 579 … … 622 608 if (vol->extents[i].state != HR_EXT_ONLINE) 623 609 bad++; 610 611 size_t invalid_no = hr_count_extents(vol, HR_EXT_INVALID); 612 613 fibril_mutex_lock(&vol->hotspare_lock); 614 size_t hs_no = vol->hotspare_no; 615 fibril_mutex_unlock(&vol->hotspare_lock); 624 616 625 617 switch (bad) { … … 633 625 634 626 if (state != HR_VOL_REBUILD) { 635 /* XXX: allow REBUILD on INVALID extents */ 636 fibril_mutex_lock(&vol->hotspare_lock); 637 size_t hs_no = vol->hotspare_no; 638 fibril_mutex_unlock(&vol->hotspare_lock); 639 if (hs_no > 0) { 627 if (hs_no > 0 || invalid_no > 0) { 640 628 fid_t fib = fibril_create(hr_raid5_rebuild, 641 629 vol); … … 655 643 fibril_rwlock_write_unlock(&vol->states_lock); 656 644 fibril_rwlock_read_unlock(&vol->extents_lock); 657 }658 659 static void xor(void *dst, const void *src, size_t size)660 {661 size_t i;662 uint64_t *d = dst;663 const uint64_t *s = src;664 665 for (i = 0; i < size / sizeof(uint64_t); ++i)666 *d++ ^= *s++;667 645 } 668 646 … … 730 708 static errno_t hr_raid5_rebuild(void *arg) 731 709 { 732 HR_DEBUG(" hr_raid5_rebuild()\n");710 HR_DEBUG("%s()", __func__); 733 711 734 712 hr_volume_t *vol = arg; 735 713 errno_t rc = EOK; 714 size_t rebuild_idx; 736 715 void *buf = NULL, *xorbuf = NULL; 737 716 738 fibril_rwlock_read_lock(&vol->extents_lock); 739 fibril_rwlock_write_lock(&vol->states_lock); 740 741 if (vol->hotspare_no == 0) { 742 HR_WARN("hr_raid5_rebuild(): no free hotspares on \"%s\", " 743 "aborting rebuild\n", vol->devname); 744 /* retval isn't checked for now */ 745 goto end; 746 } 747 748 size_t bad = vol->extent_no; 749 for (size_t i = 0; i < vol->extent_no; i++) { 750 if (vol->extents[i].state == HR_EXT_FAILED) { 751 bad = i; 752 break; 753 } 754 } 755 756 if (bad == vol->extent_no) { 757 HR_WARN("hr_raid5_rebuild(): no bad extent on \"%s\", " 758 "aborting rebuild\n", vol->devname); 759 /* retval isn't checked for now */ 760 goto end; 761 } 762 763 size_t hotspare_idx = vol->hotspare_no - 1; 764 765 hr_ext_state_t hs_state = vol->hotspares[hotspare_idx].state; 766 if (hs_state != HR_EXT_HOTSPARE) { 767 HR_ERROR("hr_raid5_rebuild(): invalid hotspare state \"%s\", " 768 "aborting rebuild\n", hr_get_ext_state_str(hs_state)); 769 rc = EINVAL; 770 goto end; 771 } 772 773 HR_DEBUG("hr_raid5_rebuild(): swapping in hotspare\n"); 774 775 block_fini(vol->extents[bad].svc_id); 776 777 vol->extents[bad].svc_id = vol->hotspares[hotspare_idx].svc_id; 778 hr_update_ext_state(vol, bad, HR_EXT_HOTSPARE); 779 780 vol->hotspares[hotspare_idx].svc_id = 0; 781 fibril_mutex_lock(&vol->hotspare_lock); 782 hr_update_hotspare_state(vol, hotspare_idx, HR_EXT_MISSING); 783 fibril_mutex_unlock(&vol->hotspare_lock); 784 785 vol->hotspare_no--; 786 787 hr_extent_t *rebuild_ext = &vol->extents[bad]; 788 789 HR_DEBUG("hr_raid5_rebuild(): starting rebuild on (%" PRIun ")\n", 790 rebuild_ext->svc_id); 791 792 hr_update_ext_state(vol, bad, HR_EXT_REBUILD); 793 hr_update_vol_state(vol, HR_VOL_REBUILD); 717 rc = hr_init_rebuild(vol, &rebuild_idx); 718 if (rc != EOK) 719 return rc; 794 720 795 721 uint64_t max_blks = DATA_XFER_LIMIT / vol->bsize; 796 722 uint64_t left = vol->data_blkno / (vol->extent_no - 1); 797 buf = malloc(max_blks * vol->bsize); 798 xorbuf = malloc(max_blks * vol->bsize); 723 buf = hr_malloc_waitok(max_blks * vol->bsize); 724 xorbuf = hr_malloc_waitok(max_blks * vol->bsize); 725 726 uint64_t strip_size = vol->strip_size / vol->bsize; /* in blocks */ 799 727 800 728 uint64_t ba = 0, cnt; 801 729 hr_add_data_offset(vol, &ba); 802 730 731 /* 732 * this is not necessary because a rebuild is 733 * protected by itself, i.e. there can be only 734 * one REBUILD at a time 735 */ 736 fibril_rwlock_read_lock(&vol->extents_lock); 737 738 /* increment metadata counter only on first write */ 739 bool exp = false; 740 if (atomic_compare_exchange_strong(&vol->first_write, &exp, true)) { 741 vol->meta_ops->inc_counter(vol); 742 vol->meta_ops->save(vol, WITH_STATE_CALLBACK); 743 } 744 745 hr_range_lock_t *rl = NULL; 746 hr_stripe_t *stripe = hr_create_stripes(vol, max_blks * vol->bsize, 1, 747 false); 748 749 unsigned int percent, old_percent = 100; 803 750 while (left != 0) { 804 751 cnt = min(left, max_blks); 805 752 806 /* 807 * Almost the same as read_degraded, 808 * but we don't want to allocate new 809 * xorbuf each blk rebuild batch. 810 */ 811 bool first = true; 812 for (size_t i = 0; i < vol->extent_no; i++) { 813 if (i == bad) 753 uint64_t strip_no = ba / strip_size; 754 uint64_t last_ba = ba + cnt - 1; 755 uint64_t end_strip_no = last_ba / strip_size; 756 uint64_t start_stripe = strip_no / (vol->extent_no - 1); 757 uint64_t end_stripe = end_strip_no / (vol->extent_no - 1); 758 size_t stripes_cnt = end_stripe - start_stripe + 1; 759 760 stripe->ps_to_be_added = vol->extent_no - 1; 761 stripe->p_count_final = true; 762 763 hr_fgroup_t *worker_group = 764 hr_fgroup_create(vol->fge, vol->extent_no); 765 766 rl = hr_range_lock_acquire(vol, start_stripe, stripes_cnt); 767 768 atomic_store_explicit(&vol->rebuild_blk, ba, 769 memory_order_relaxed); 770 771 for (size_t e = 0; e < vol->extent_no; e++) { 772 if (e == rebuild_idx) 814 773 continue; 815 if (first) 816 rc = block_read_direct(vol->extents[i].svc_id, 817 ba, cnt, xorbuf); 818 else 819 rc = block_read_direct(vol->extents[i].svc_id, 820 ba, cnt, buf); 821 if (rc != EOK) { 822 hr_raid5_ext_state_cb(vol, i, rc); 823 HR_ERROR("rebuild on \"%s\" (%" PRIun "), " 824 "failed due to a failed ONLINE extent, " 825 "number %zu\n", 826 vol->devname, vol->svc_id, i); 827 goto end; 828 } 829 830 if (!first) 831 xor(xorbuf, buf, cnt * vol->bsize); 832 else 833 first = false; 834 } 835 836 rc = block_write_direct(rebuild_ext->svc_id, ba, cnt, xorbuf); 837 if (rc != EOK) { 838 hr_raid5_ext_state_cb(vol, bad, rc); 839 HR_ERROR("rebuild on \"%s\" (%" PRIun "), failed due to " 840 "the rebuilt extent number %zu failing\n", 841 vol->devname, vol->svc_id, bad); 774 775 hr_io_raid5_t *io = hr_fgroup_alloc(worker_group); 776 io->extent = e; 777 io->ba = ba; 778 io->cnt = cnt; 779 io->strip_off = 0; 780 io->vol = vol; 781 io->stripe = stripe; 782 783 hr_fgroup_submit(worker_group, 784 hr_io_raid5_reconstruct_reader, io); 785 } 786 787 hr_io_raid5_t *io = hr_fgroup_alloc(worker_group); 788 io->extent = rebuild_idx; 789 io->ba = ba; 790 io->cnt = cnt; 791 io->strip_off = 0; 792 io->vol = vol; 793 io->stripe = stripe; 794 795 hr_fgroup_submit(worker_group, hr_io_raid5_parity_writer, io); 796 797 size_t failed; 798 (void)hr_fgroup_wait(worker_group, NULL, &failed); 799 if (failed > 0) { 800 hr_range_lock_release(rl); 801 HR_NOTE("\"%s\": REBUILD aborted.\n", vol->devname); 842 802 goto end; 843 803 } 804 805 percent = ((ba + cnt) * 100) / vol->data_blkno; 806 if (percent != old_percent) { 807 if (percent % 5 == 0) 808 HR_DEBUG("\"%s\" REBUILD progress: %u%%\n", 809 vol->devname, percent); 810 } 811 812 hr_range_lock_release(rl); 813 hr_reset_stripe(stripe); 844 814 845 815 ba += cnt; … … 850 820 * during rebuild. 851 821 */ 852 853 /*854 * fibril_rwlock_write_unlock(&vol->states_lock);855 * fibril_mutex_unlock(&vol->lock);856 * fibril_mutex_lock(&vol->lock);857 * fibril_rwlock_write_lock(&vol->states_lock);858 */859 822 } 860 823 861 824 HR_DEBUG("hr_raid5_rebuild(): rebuild finished on \"%s\" (%" PRIun "), " 862 "extent number %zu\n", vol->devname, vol->svc_id, hotspare_idx); 863 864 hr_update_ext_state(vol, bad, HR_EXT_ONLINE); 825 "extent number %zu\n", vol->devname, vol->svc_id, rebuild_idx); 826 827 fibril_rwlock_write_lock(&vol->states_lock); 828 829 hr_update_ext_state(vol, rebuild_idx, HR_EXT_ONLINE); 830 831 hr_mark_vol_state_dirty(vol); 865 832 866 833 fibril_rwlock_write_unlock(&vol->states_lock); 834 835 /* (void)vol->meta_ops->save(vol, WITH_STATE_CALLBACK); */ 836 837 end: 867 838 fibril_rwlock_read_unlock(&vol->extents_lock); 868 839 869 rc = vol->meta_ops->save(vol, WITH_STATE_CALLBACK); 870 871 fibril_rwlock_read_lock(&vol->extents_lock); 872 fibril_rwlock_write_lock(&vol->states_lock); 873 874 end: 875 hr_raid5_vol_state_eval_forced(vol); 876 877 fibril_rwlock_write_unlock(&vol->states_lock); 878 fibril_rwlock_read_unlock(&vol->extents_lock); 879 880 if (buf != NULL) 881 free(buf); 882 883 if (xorbuf != NULL) 884 free(xorbuf); 840 hr_raid1_vol_state_eval(vol); 841 842 hr_destroy_stripes(stripe, 1); 843 free(buf); 844 free(xorbuf); 885 845 886 846 return rc;
Note:
See TracChangeset
for help on using the changeset viewer.
