Context Navigation

source: mainline/uspace/srv/bd/hr/raid1.c@ d773bea9

Visit:

Last change on this file since d773bea9 was d773bea9, checked in by Miroslav Cimerman <mc@…>, 7 months ago
hr: RAID1: handle state edge cases in a rebuild
Property mode set to `100644`
File size: 20.6 KB

Line
1	/*
2	* Copyright (c) 2025 Miroslav Cimerman
3	* All rights reserved.
4	*
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions
7	* are met:
8	*
9	* - Redistributions of source code must retain the above copyright
10	* notice, this list of conditions and the following disclaimer.
11	* - Redistributions in binary form must reproduce the above copyright
12	* notice, this list of conditions and the following disclaimer in the
13	* documentation and/or other materials provided with the distribution.
14	* - The name of the author may not be used to endorse or promote products
15	* derived from this software without specific prior written permission.
16	*
17	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27	*/
28
29	/** @addtogroup hr
30	* @{
31	*/
32	/**
33	* @file
34	*/
35
36	#include <bd_srv.h>
37	#include <block.h>
38	#include <errno.h>
39	#include <hr.h>
40	#include <io/log.h>
41	#include <ipc/hr.h>
42	#include <ipc/services.h>
43	#include <loc.h>
44	#include <task.h>
45	#include <stdatomic.h>
46	#include <stdio.h>
47	#include <stdlib.h>
48	#include <str_error.h>
49
50	#include "fge.h"
51	#include "io.h"
52	#include "superblock.h"
53	#include "util.h"
54	#include "var.h"
55
56	extern loc_srv_t *hr_srv;
57
58	static void process_deferred_invalidations(hr_volume_t *);
59	static void hr_raid1_update_vol_status(hr_volume_t *);
60	static void hr_raid1_ext_state_callback(hr_volume_t *, size_t, errno_t);
61	static size_t hr_raid1_count_good_extents(hr_volume_t *, uint64_t, size_t,
62	uint64_t);
63	static errno_t hr_raid1_bd_op(hr_bd_op_type_t, bd_srv_t *, aoff64_t, size_t,
64	void , const void , size_t);
65	static errno_t swap_hs(hr_volume_t *, size_t, size_t);
66	static errno_t init_rebuild(hr_volume_t , size_t );
67	static errno_t hr_raid1_rebuild(void *);
68
69	/* bdops */
70	static errno_t hr_raid1_bd_open(bd_srvs_t , bd_srv_t );
71	static errno_t hr_raid1_bd_close(bd_srv_t *);
72	static errno_t hr_raid1_bd_read_blocks(bd_srv_t , aoff64_t, size_t, void ,
73	size_t);
74	static errno_t hr_raid1_bd_sync_cache(bd_srv_t *, aoff64_t, size_t);
75	static errno_t hr_raid1_bd_write_blocks(bd_srv_t *, aoff64_t, size_t,
76	const void *, size_t);
77	static errno_t hr_raid1_bd_get_block_size(bd_srv_t , size_t );
78	static errno_t hr_raid1_bd_get_num_blocks(bd_srv_t , aoff64_t );
79
80	static bd_ops_t hr_raid1_bd_ops = {
81	.open = hr_raid1_bd_open,
82	.close = hr_raid1_bd_close,
83	.sync_cache = hr_raid1_bd_sync_cache,
84	.read_blocks = hr_raid1_bd_read_blocks,
85	.write_blocks = hr_raid1_bd_write_blocks,
86	.get_block_size = hr_raid1_bd_get_block_size,
87	.get_num_blocks = hr_raid1_bd_get_num_blocks
88	};
89
90	errno_t hr_raid1_create(hr_volume_t *new_volume)
91	{
92	errno_t rc;
93
94	assert(new_volume->level == HR_LVL_1);
95
96	if (new_volume->extent_no < 2) {
97	HR_ERROR("RAID 1 array needs at least 2 devices\n");
98	return EINVAL;
99	}
100
101	bd_srvs_init(&new_volume->hr_bds);
102	new_volume->hr_bds.ops = &hr_raid1_bd_ops;
103	new_volume->hr_bds.sarg = new_volume;
104
105	/* force volume state update */
106	atomic_store(&new_volume->state_changed, true);
107	hr_raid1_update_vol_status(new_volume);
108	if (new_volume->status == HR_VOL_FAULTY)
109	return EINVAL;
110
111	rc = hr_register_volume(new_volume);
112
113	return rc;
114	}
115
116	errno_t hr_raid1_init(hr_volume_t *vol)
117	{
118	errno_t rc;
119	size_t bsize;
120	uint64_t total_blkno;
121
122	assert(vol->level == HR_LVL_1);
123
124	rc = hr_check_devs(vol, &total_blkno, &bsize);
125	if (rc != EOK)
126	return rc;
127
128	vol->nblocks = total_blkno / vol->extent_no;
129	vol->bsize = bsize;
130	vol->data_offset = HR_DATA_OFF;
131	vol->data_blkno = vol->nblocks - vol->data_offset;
132	vol->strip_size = 0;
133
134	return EOK;
135	}
136
137	void hr_raid1_status_event(hr_volume_t *vol)
138	{
139	hr_raid1_update_vol_status(vol);
140	}
141
142	errno_t hr_raid1_add_hotspare(hr_volume_t *vol, service_id_t hotspare)
143	{
144	HR_DEBUG("hr_raid1_add_hotspare()\n");
145
146	errno_t rc = EOK;
147
148	fibril_mutex_lock(&vol->hotspare_lock);
149
150	if (vol->hotspare_no >= HR_MAX_HOTSPARES) {
151	HR_ERROR("hr_raid1_add_hotspare(): cannot add more hotspares "
152	"to \"%s\"\n", vol->devname);
153	rc = ELIMIT;
154	goto error;
155	}
156
157	size_t hs_idx = vol->hotspare_no;
158
159	vol->hotspare_no++;
160
161	hr_update_hotspare_svc_id(vol, hs_idx, hotspare);
162	hr_update_hotspare_status(vol, hs_idx, HR_EXT_HOTSPARE);
163
164	atomic_store(&vol->state_changed, true);
165	error:
166	fibril_mutex_unlock(&vol->hotspare_lock);
167
168	hr_raid1_update_vol_status(vol);
169
170	return rc;
171	}
172
173	static errno_t hr_raid1_bd_open(bd_srvs_t bds, bd_srv_t bd)
174	{
175	HR_DEBUG("hr_bd_open()\n");
176	return EOK;
177	}
178
179	static errno_t hr_raid1_bd_close(bd_srv_t *bd)
180	{
181	HR_DEBUG("hr_bd_close()\n");
182	return EOK;
183	}
184
185	static errno_t hr_raid1_bd_sync_cache(bd_srv_t *bd, aoff64_t ba, size_t cnt)
186	{
187	return hr_raid1_bd_op(HR_BD_SYNC, bd, ba, cnt, NULL, NULL, 0);
188	}
189
190	static errno_t hr_raid1_bd_read_blocks(bd_srv_t *bd, aoff64_t ba, size_t cnt,
191	void *buf, size_t size)
192	{
193	return hr_raid1_bd_op(HR_BD_READ, bd, ba, cnt, buf, NULL, size);
194	}
195
196	static errno_t hr_raid1_bd_write_blocks(bd_srv_t *bd, aoff64_t ba, size_t cnt,
197	const void *data, size_t size)
198	{
199	return hr_raid1_bd_op(HR_BD_WRITE, bd, ba, cnt, NULL, data, size);
200	}
201
202	static errno_t hr_raid1_bd_get_block_size(bd_srv_t bd, size_t rsize)
203	{
204	hr_volume_t *vol = bd->srvs->sarg;
205
206	*rsize = vol->bsize;
207	return EOK;
208	}
209
210	static errno_t hr_raid1_bd_get_num_blocks(bd_srv_t bd, aoff64_t rnb)
211	{
212	hr_volume_t *vol = bd->srvs->sarg;
213
214	*rnb = vol->data_blkno;
215	return EOK;
216	}
217
218	static void process_deferred_invalidations(hr_volume_t *vol)
219	{
220	HR_DEBUG("hr_raid1_update_vol_status(): deferred invalidations\n");
221
222	fibril_mutex_lock(&vol->halt_lock);
223	vol->halt_please = true;
224	fibril_rwlock_write_lock(&vol->extents_lock);
225	fibril_rwlock_write_lock(&vol->states_lock);
226	fibril_mutex_lock(&vol->hotspare_lock);
227
228	list_foreach(vol->deferred_invalidations_list, link,
229	hr_deferred_invalidation_t, di) {
230	assert(vol->extents[di->index].status == HR_EXT_INVALID);
231
232	HR_DEBUG("moving invalidated extent no. %lu to hotspares\n",
233	di->index);
234
235	block_fini(di->svc_id);
236
237	size_t hs_idx = vol->hotspare_no;
238
239	vol->hotspare_no++;
240
241	hr_update_hotspare_svc_id(vol, hs_idx, di->svc_id);
242	hr_update_hotspare_status(vol, hs_idx, HR_EXT_HOTSPARE);
243
244	hr_update_ext_svc_id(vol, di->index, 0);
245	hr_update_ext_status(vol, di->index, HR_EXT_MISSING);
246
247	assert(vol->hotspare_no < HR_MAX_HOTSPARES + HR_MAX_EXTENTS);
248	}
249
250	for (size_t i = 0; i < HR_MAX_EXTENTS; i++) {
251	hr_deferred_invalidation_t *di = &vol->deferred_inval[i];
252	if (di->svc_id != 0) {
253	list_remove(&di->link);
254	di->svc_id = 0;
255	}
256	}
257
258	fibril_mutex_unlock(&vol->hotspare_lock);
259	fibril_rwlock_write_unlock(&vol->states_lock);
260	fibril_rwlock_write_unlock(&vol->extents_lock);
261	vol->halt_please = false;
262	fibril_mutex_unlock(&vol->halt_lock);
263	}
264
265	static void hr_raid1_update_vol_status(hr_volume_t *vol)
266	{
267	bool exp = true;
268
269	if (!atomic_compare_exchange_strong(&vol->state_changed, &exp, false))
270	return;
271
272	if (atomic_compare_exchange_strong(&vol->pending_invalidation, &exp,
273	false)) {
274	fibril_mutex_lock(&vol->deferred_list_lock);
275	process_deferred_invalidations(vol);
276	fibril_mutex_unlock(&vol->deferred_list_lock);
277	}
278
279	fibril_rwlock_read_lock(&vol->extents_lock);
280	fibril_rwlock_read_lock(&vol->states_lock);
281
282	hr_vol_status_t old_state = vol->status;
283	size_t healthy = hr_count_extents(vol, HR_EXT_ONLINE);
284
285	fibril_rwlock_read_unlock(&vol->states_lock);
286	fibril_rwlock_read_unlock(&vol->extents_lock);
287
288	if (healthy == 0) {
289	if (old_state != HR_VOL_FAULTY) {
290	fibril_rwlock_write_lock(&vol->states_lock);
291	hr_update_vol_status(vol, HR_VOL_FAULTY);
292	fibril_rwlock_write_unlock(&vol->states_lock);
293	}
294	} else if (healthy < vol->extent_no) {
295	if (old_state != HR_VOL_REBUILD &&
296	old_state != HR_VOL_DEGRADED) {
297	fibril_rwlock_write_lock(&vol->states_lock);
298	hr_update_vol_status(vol, HR_VOL_DEGRADED);
299	fibril_rwlock_write_unlock(&vol->states_lock);
300	}
301
302	if (old_state != HR_VOL_REBUILD) {
303	if (vol->hotspare_no > 0) {
304	fid_t fib = fibril_create(hr_raid1_rebuild,
305	vol);
306	if (fib == 0)
307	return;
308	fibril_start(fib);
309	fibril_detach(fib);
310	}
311	}
312	} else {
313	if (old_state != HR_VOL_ONLINE) {
314	fibril_rwlock_write_lock(&vol->states_lock);
315	hr_update_vol_status(vol, HR_VOL_ONLINE);
316	fibril_rwlock_write_unlock(&vol->states_lock);
317	}
318	}
319	}
320
321	static void hr_raid1_ext_state_callback(hr_volume_t *vol, size_t extent,
322	errno_t rc)
323	{
324	if (rc == EOK)
325	return;
326
327	assert(fibril_rwlock_is_locked(&vol->extents_lock));
328
329	fibril_rwlock_write_lock(&vol->states_lock);
330
331	switch (rc) {
332	case ENOMEM:
333	fibril_mutex_lock(&vol->deferred_list_lock);
334
335	service_id_t invalid_svc_id = vol->extents[extent].svc_id;
336
337	list_foreach(vol->deferred_invalidations_list, link,
338	hr_deferred_invalidation_t, di) {
339	if (di->svc_id == invalid_svc_id) {
340	assert(vol->extents[extent].status ==
341	HR_EXT_INVALID);
342	goto deferring_end;
343	}
344	}
345
346	assert(vol->extents[extent].svc_id != HR_EXT_INVALID);
347
348	hr_update_ext_status(vol, extent, HR_EXT_INVALID);
349
350	size_t i = list_count(&vol->deferred_invalidations_list);
351	vol->deferred_inval[i].svc_id = invalid_svc_id;
352	vol->deferred_inval[i].index = extent;
353
354	list_append(&vol->deferred_inval[i].link,
355	&vol->deferred_invalidations_list);
356
357	atomic_store(&vol->pending_invalidation, true);
358	deferring_end:
359
360	fibril_mutex_unlock(&vol->deferred_list_lock);
361	break;
362	case ENOENT:
363	hr_update_ext_status(vol, extent, HR_EXT_MISSING);
364	break;
365	default:
366	hr_update_ext_status(vol, extent, HR_EXT_FAILED);
367	}
368
369	atomic_store(&vol->state_changed, true);
370
371	fibril_rwlock_write_unlock(&vol->states_lock);
372	}
373
374	static size_t hr_raid1_count_good_extents(hr_volume_t *vol, uint64_t ba,
375	size_t cnt, uint64_t rebuild_blk)
376	{
377	assert(fibril_rwlock_is_locked(&vol->extents_lock));
378	assert(fibril_rwlock_is_locked(&vol->states_lock));
379
380	size_t count = 0;
381	for (size_t i = 0; i < vol->extent_no; i++) {
382	if (vol->extents[i].status == HR_EXT_ONLINE \|\|
383	(vol->extents[i].status == HR_EXT_REBUILD &&
384	ba < rebuild_blk)) {
385	count++;
386	}
387	}
388
389	return count;
390
391	}
392
393	static errno_t hr_raid1_bd_op(hr_bd_op_type_t type, bd_srv_t *bd, aoff64_t ba,
394	size_t cnt, void data_read, const void data_write, size_t size)
395	{
396	hr_volume_t *vol = bd->srvs->sarg;
397	hr_range_lock_t *rl = NULL;
398	errno_t rc;
399	size_t i;
400	uint64_t rebuild_blk;
401
402	fibril_rwlock_read_lock(&vol->states_lock);
403	hr_vol_status_t vol_state = vol->status;
404	fibril_rwlock_read_unlock(&vol->states_lock);
405
406	if (vol_state == HR_VOL_FAULTY)
407	return EIO;
408
409	if (type == HR_BD_READ \|\| type == HR_BD_WRITE)
410	if (size < cnt * vol->bsize)
411	return EINVAL;
412
413	rc = hr_check_ba_range(vol, cnt, ba);
414	if (rc != EOK)
415	return rc;
416
417	/* allow full dev sync */
418	if (type != HR_BD_SYNC \|\| ba != 0)
419	hr_add_ba_offset(vol, &ba);
420
421	/*
422	* this is to allow adding hotspare or start a rebuild on
423	* very busy array, because of how rwlocks are implemented
424	* in HelenOS (no writer priority, so if there are multiple
425	* continuos readers, writer will never own the lock)
426	*/
427	if (vol->halt_please) {
428	fibril_mutex_lock(&vol->halt_lock);
429	fibril_mutex_unlock(&vol->halt_lock);
430	}
431
432	/*
433	* extent order has to be locked for the whole IO duration,
434	* so that workers have consistent targets
435	*/
436	fibril_rwlock_read_lock(&vol->extents_lock);
437
438	size_t successful = 0;
439	switch (type) {
440	case HR_BD_READ:
441	rebuild_blk = atomic_load_explicit(&vol->rebuild_blk,
442	memory_order_relaxed);
443
444	for (i = 0; i < vol->extent_no; i++) {
445	fibril_rwlock_read_lock(&vol->states_lock);
446	hr_ext_status_t state = vol->extents[i].status;
447	fibril_rwlock_read_unlock(&vol->states_lock);
448
449	if (state != HR_EXT_ONLINE &&
450	(state != HR_EXT_REBUILD \|\|
451	ba + cnt - 1 >= rebuild_blk)) {
452	continue;
453	}
454
455	rc = block_read_direct(vol->extents[i].svc_id, ba, cnt,
456	data_read);
457
458	if (rc == ENOMEM && i + 1 == vol->extent_no)
459	goto end;
460
461	if (rc == ENOMEM)
462	continue;
463
464	if (rc != EOK) {
465	hr_raid1_ext_state_callback(vol, i, rc);
466	} else {
467	successful++;
468	break;
469	}
470	}
471	break;
472	case HR_BD_SYNC:
473	case HR_BD_WRITE:
474	if (type == HR_BD_WRITE) {
475	rl = hr_range_lock_acquire(vol, ba, cnt);
476	if (rl == NULL) {
477	rc = ENOMEM;
478	goto end;
479	}
480	}
481
482	fibril_rwlock_read_lock(&vol->states_lock);
483
484	rebuild_blk = atomic_load_explicit(&vol->rebuild_blk,
485	memory_order_relaxed);
486
487	size_t good = hr_raid1_count_good_extents(vol, ba, cnt,
488	rebuild_blk);
489
490	hr_fgroup_t *group = hr_fgroup_create(vol->fge, good);
491	if (group == NULL) {
492	if (type == HR_BD_WRITE)
493	hr_range_lock_release(rl);
494	rc = ENOMEM;
495	fibril_rwlock_read_unlock(&vol->states_lock);
496	goto end;
497	}
498
499	for (i = 0; i < vol->extent_no; i++) {
500	if (vol->extents[i].status != HR_EXT_ONLINE &&
501	(vol->extents[i].status != HR_EXT_REBUILD \|\|
502	ba >= rebuild_blk)) {
503	/*
504	* When the extent is being rebuilt,
505	* we only write to the part that is already
506	* rebuilt. If IO starts after vol->rebuild_blk
507	* we do not proceed, the write is going to
508	* be replicated later in the rebuild.
509	*/
510	continue;
511	}
512
513	hr_io_t *io = hr_fgroup_alloc(group);
514	io->extent = i;
515	io->data_write = data_write;
516	io->data_read = data_read;
517	io->ba = ba;
518	io->cnt = cnt;
519	io->type = type;
520	io->vol = vol;
521	io->state_callback = hr_raid1_ext_state_callback;
522
523	hr_fgroup_submit(group, hr_io_worker, io);
524	}
525
526	fibril_rwlock_read_unlock(&vol->states_lock);
527
528	(void)hr_fgroup_wait(group, &successful, NULL);
529
530	if (type == HR_BD_WRITE)
531	hr_range_lock_release(rl);
532
533	break;
534	default:
535	rc = EINVAL;
536	goto end;
537	}
538
539	if (successful > 0)
540	rc = EOK;
541	else
542	rc = EIO;
543
544	end:
545	fibril_rwlock_read_unlock(&vol->extents_lock);
546
547	hr_raid1_update_vol_status(vol);
548
549	return rc;
550	}
551
552	static errno_t swap_hs(hr_volume_t *vol, size_t bad, size_t hs)
553	{
554	HR_DEBUG("hr_raid1_rebuild(): swapping in hotspare\n");
555
556	service_id_t faulty_svc_id = vol->extents[bad].svc_id;
557	service_id_t hs_svc_id = vol->hotspares[hs].svc_id;
558
559	errno_t rc = block_init(hs_svc_id);
560	if (rc != EOK) {
561	HR_ERROR("hr_raid1_rebuild(): initing hotspare (%lu) failed\n",
562	hs_svc_id);
563	return rc;
564	}
565
566	hr_update_ext_svc_id(vol, bad, hs_svc_id);
567	hr_update_ext_status(vol, bad, HR_EXT_HOTSPARE);
568
569	hr_update_hotspare_svc_id(vol, hs, 0);
570	hr_update_hotspare_status(vol, hs, HR_EXT_INVALID);
571
572	vol->hotspare_no--;
573
574	if (faulty_svc_id != 0)
575	block_fini(faulty_svc_id);
576
577	return EOK;
578	}
579
580	static errno_t init_rebuild(hr_volume_t vol, size_t rebuild_idx)
581	{
582	errno_t rc = EOK;
583
584	fibril_mutex_lock(&vol->halt_lock);
585	vol->halt_please = true;
586	fibril_rwlock_write_lock(&vol->extents_lock);
587	fibril_rwlock_write_lock(&vol->states_lock);
588	fibril_mutex_lock(&vol->hotspare_lock);
589
590	if (vol->hotspare_no == 0) {
591	HR_WARN("hr_raid1_rebuild(): no free hotspares on \"%s\", "
592	"aborting rebuild\n", vol->devname);
593	rc = EINVAL;
594	goto error;
595	}
596
597	size_t bad = vol->extent_no;
598	for (size_t i = 0; i < vol->extent_no; i++) {
599	if (vol->extents[i].status != HR_EXT_ONLINE) {
600	bad = i;
601	break;
602	}
603	}
604
605	if (bad == vol->extent_no) {
606	HR_WARN("hr_raid1_rebuild(): no bad extent on \"%s\", "
607	"aborting rebuild\n", vol->devname);
608	rc = EINVAL;
609	goto error;
610	}
611
612	size_t hotspare_idx = vol->hotspare_no - 1;
613
614	hr_ext_status_t hs_state = vol->hotspares[hotspare_idx].status;
615	if (hs_state != HR_EXT_HOTSPARE) {
616	HR_ERROR("hr_raid1_rebuild(): invalid hotspare state \"%s\", "
617	"aborting rebuild\n", hr_get_ext_status_msg(hs_state));
618	rc = EINVAL;
619	goto error;
620	}
621
622	rc = swap_hs(vol, bad, hotspare_idx);
623	if (rc != EOK) {
624	HR_ERROR("hr_raid1_rebuild(): swapping hotspare failed, "
625	"aborting rebuild\n");
626	goto error;
627	}
628
629	hr_extent_t *rebuild_ext = &vol->extents[bad];
630
631	HR_DEBUG("hr_raid1_rebuild(): starting REBUILD on extent no. %lu (%lu)"
632	"\n", bad, rebuild_ext->svc_id);
633
634	atomic_store_explicit(&vol->rebuild_blk, 0, memory_order_relaxed);
635
636	hr_update_ext_status(vol, bad, HR_EXT_REBUILD);
637	hr_update_vol_status(vol, HR_VOL_REBUILD);
638
639	*rebuild_idx = bad;
640	error:
641	fibril_mutex_unlock(&vol->hotspare_lock);
642	fibril_rwlock_write_unlock(&vol->states_lock);
643	fibril_rwlock_write_unlock(&vol->extents_lock);
644	vol->halt_please = false;
645	fibril_mutex_unlock(&vol->halt_lock);
646
647	return rc;
648	}
649
650	static errno_t hr_raid1_restore_blocks(hr_volume_t *vol, size_t rebuild_idx,
651	uint64_t ba, size_t cnt, void *buf)
652	{
653	HR_DEBUG("REBUILD restoring blocks (ba: %lu, cnt: %lu)\n", ba, cnt);
654
655	assert(fibril_rwlock_is_locked(&vol->extents_lock));
656
657	errno_t rc = ENOENT;
658	hr_extent_t ext, rebuild_ext = &vol->extents[rebuild_idx];
659
660	for (size_t i = 0; i < vol->extent_no; i++) {
661	fibril_rwlock_read_lock(&vol->states_lock);
662
663	ext = &vol->extents[i];
664	if (ext->status != HR_EXT_ONLINE)
665	continue;
666
667	fibril_rwlock_read_unlock(&vol->states_lock);
668
669	rc = block_read_direct(ext->svc_id, ba, cnt, buf);
670	if (rc == EOK)
671	break;
672
673	if (rc != ENOMEM)
674	hr_raid1_ext_state_callback(vol, i, rc);
675
676	if (i + 1 >= vol->extent_no) {
677	if (rc != ENOMEM) {
678	HR_ERROR("rebuild on \"%s\" (%lu), failed due "
679	"to too many failed extents\n",
680	vol->devname, vol->svc_id);
681	}
682
683	/* for now we have to invalidate the rebuild extent */
684	if (rc == ENOMEM) {
685	HR_ERROR("rebuild on \"%s\" (%lu), failed due "
686	"to too many failed reads, because of not "
687	"enough memory\n",
688	vol->devname, vol->svc_id);
689	hr_raid1_ext_state_callback(vol, rebuild_idx,
690	ENOMEM);
691	}
692
693	return rc;
694	}
695	}
696
697	rc = block_write_direct(rebuild_ext->svc_id, ba, cnt, buf);
698	if (rc != EOK) {
699	/*
700	* Here we dont handle ENOMEM, because maybe in the
701	* future, there is going to be M_WAITOK, or we are
702	* going to wait for more memory, so that we don't
703	* have to invalidate it...
704	*
705	* XXX: for now we do
706	*/
707	hr_raid1_ext_state_callback(vol, rebuild_idx, rc);
708
709	HR_ERROR("rebuild on \"%s\" (%lu), failed due to "
710	"the rebuilt extent no. %lu WRITE (rc: %s)\n",
711	vol->devname, vol->svc_id, rebuild_idx, str_error(rc));
712
713	return rc;
714	}
715
716	return EOK;
717	}
718
719	/*
720	* Put the last HOTSPARE extent in place
721	* of first that != ONLINE, and start the rebuild.
722	*/
723	static errno_t hr_raid1_rebuild(void *arg)
724	{
725	HR_DEBUG("hr_raid1_rebuild()\n");
726
727	hr_volume_t *vol = arg;
728	void *buf = NULL;
729	size_t rebuild_idx;
730	errno_t rc;
731
732	rc = init_rebuild(vol, &rebuild_idx);
733	if (rc != EOK)
734	return rc;
735
736	size_t left = vol->data_blkno;
737	size_t max_blks = DATA_XFER_LIMIT / vol->bsize;
738	buf = malloc(max_blks * vol->bsize);
739
740	size_t cnt;
741	uint64_t ba = 0;
742	hr_add_ba_offset(vol, &ba);
743
744	fibril_rwlock_read_lock(&vol->extents_lock);
745
746	hr_range_lock_t *rl = NULL;
747
748	while (left != 0) {
749	if (vol->halt_please) {
750	fibril_rwlock_read_unlock(&vol->extents_lock);
751	fibril_mutex_lock(&vol->halt_lock);
752	fibril_mutex_unlock(&vol->halt_lock);
753	fibril_rwlock_read_lock(&vol->extents_lock);
754	}
755
756	cnt = min(max_blks, left);
757
758	rl = hr_range_lock_acquire(vol, ba, cnt);
759	if (rl == NULL) {
760	rc = ENOMEM;
761	goto end;
762	}
763
764	atomic_store_explicit(&vol->rebuild_blk, ba,
765	memory_order_relaxed);
766
767	rc = hr_raid1_restore_blocks(vol, rebuild_idx, ba, cnt, buf);
768
769	hr_range_lock_release(rl);
770
771	if (rc != EOK)
772	goto end;
773
774	ba += cnt;
775	left -= cnt;
776	}
777
778	HR_DEBUG("hr_raid1_rebuild(): rebuild finished on \"%s\" (%lu), "
779	"extent no. %lu\n", vol->devname, vol->svc_id, rebuild_idx);
780
781	fibril_rwlock_write_lock(&vol->states_lock);
782
783	hr_update_ext_status(vol, rebuild_idx, HR_EXT_ONLINE);
784	/*
785	* We can be optimistic here, if some extents are
786	* still INVALID, FAULTY or MISSING, the update vol
787	* function will pick them up, and set the volume
788	* state accordingly.
789	*/
790	hr_update_vol_status(vol, HR_VOL_ONLINE);
791	atomic_store(&vol->state_changed, true);
792
793	fibril_rwlock_write_unlock(&vol->states_lock);
794
795	/*
796	* For now write metadata at the end, because
797	* we don't sync metada accross extents yet.
798	*/
799	hr_write_meta_to_ext(vol, rebuild_idx);
800	end:
801	if (rc != EOK) {
802	/*
803	* We can fail either because:
804	* - the rebuild extent failing or invalidation
805	* - there is are no ONLINE extents (vol is FAULTY)
806	* - we got ENOMEM on all READs (we also invalidate the
807	* rebuild extent here, for now)
808	*/
809	fibril_rwlock_write_lock(&vol->states_lock);
810	hr_update_vol_status(vol, HR_VOL_DEGRADED);
811	atomic_store(&vol->state_changed, true);
812	fibril_rwlock_write_unlock(&vol->states_lock);
813	}
814
815	fibril_rwlock_read_unlock(&vol->extents_lock);
816
817	hr_raid1_update_vol_status(vol);
818
819	if (buf != NULL)
820	free(buf);
821
822	return rc;
823	}
824
825	/** @}
826	*/

Note: See TracBrowser for help on using the repository browser.

Download in other formats: