source: mainline/uspace/srv/bd/hr/raid4.c@ e76e12d8

Last change on this file since e76e12d8 was 7b359f5, checked in by Miroslav Cimerman <mc@…>, 9 months ago

hr: status/state event function for each RAID

  • Property mode set to 100644
File size: 12.9 KB
Line 
1/*
2 * Copyright (c) 2024 Miroslav Cimerman
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** @addtogroup hr
30 * @{
31 */
32/**
33 * @file
34 */
35
36#include <abi/ipc/ipc.h>
37#include <bd_srv.h>
38#include <block.h>
39#include <errno.h>
40#include <hr.h>
41#include <io/log.h>
42#include <ipc/hr.h>
43#include <ipc/services.h>
44#include <loc.h>
45#include <mem.h>
46#include <task.h>
47#include <stdio.h>
48#include <stdlib.h>
49#include <str_error.h>
50
51#include "superblock.h"
52#include "util.h"
53#include "var.h"
54
55extern loc_srv_t *hr_srv;
56
57static errno_t hr_raid4_vol_usable(hr_volume_t *);
58static ssize_t hr_raid4_get_bad_ext(hr_volume_t *);
59static errno_t hr_raid4_update_vol_status(hr_volume_t *);
60static void xor(void *, const void *, size_t);
61static errno_t hr_raid4_read_degraded(hr_volume_t *, uint64_t, uint64_t,
62 void *, size_t);
63static errno_t hr_raid4_write(hr_volume_t *, uint64_t, aoff64_t, const void *,
64 size_t);
65static errno_t hr_raid4_write_parity(hr_volume_t *, uint64_t, uint64_t,
66 const void *, size_t);
67static errno_t hr_raid4_bd_op(hr_bd_op_type_t, bd_srv_t *, aoff64_t, size_t,
68 void *, const void *, size_t);
69
70/* bdops */
71static errno_t hr_raid4_bd_open(bd_srvs_t *, bd_srv_t *);
72static errno_t hr_raid4_bd_close(bd_srv_t *);
73static errno_t hr_raid4_bd_read_blocks(bd_srv_t *, aoff64_t, size_t, void *,
74 size_t);
75static errno_t hr_raid4_bd_sync_cache(bd_srv_t *, aoff64_t, size_t);
76static errno_t hr_raid4_bd_write_blocks(bd_srv_t *, aoff64_t, size_t,
77 const void *, size_t);
78static errno_t hr_raid4_bd_get_block_size(bd_srv_t *, size_t *);
79static errno_t hr_raid4_bd_get_num_blocks(bd_srv_t *, aoff64_t *);
80
81static errno_t hr_raid4_write_parity(hr_volume_t *, uint64_t, uint64_t,
82 const void *, size_t);
83
84static bd_ops_t hr_raid4_bd_ops = {
85 .open = hr_raid4_bd_open,
86 .close = hr_raid4_bd_close,
87 .sync_cache = hr_raid4_bd_sync_cache,
88 .read_blocks = hr_raid4_bd_read_blocks,
89 .write_blocks = hr_raid4_bd_write_blocks,
90 .get_block_size = hr_raid4_bd_get_block_size,
91 .get_num_blocks = hr_raid4_bd_get_num_blocks
92};
93
94errno_t hr_raid4_create(hr_volume_t *new_volume)
95{
96 errno_t rc;
97
98 assert(new_volume->level == HR_LVL_4);
99
100 if (new_volume->dev_no < 3) {
101 HR_ERROR("RAID 4 array needs at least 3 devices\n");
102 return EINVAL;
103 }
104
105 rc = hr_raid4_update_vol_status(new_volume);
106 if (rc != EOK)
107 return rc;
108
109 bd_srvs_init(&new_volume->hr_bds);
110 new_volume->hr_bds.ops = &hr_raid4_bd_ops;
111 new_volume->hr_bds.sarg = new_volume;
112
113 rc = hr_register_volume(new_volume);
114
115 return rc;
116}
117
118errno_t hr_raid4_init(hr_volume_t *vol)
119{
120 errno_t rc;
121 size_t bsize;
122 uint64_t total_blkno;
123
124 assert(vol->level == HR_LVL_4);
125
126 rc = hr_check_devs(vol, &total_blkno, &bsize);
127 if (rc != EOK)
128 return rc;
129
130 vol->nblocks = total_blkno;
131 vol->bsize = bsize;
132 vol->data_offset = HR_DATA_OFF;
133 vol->data_blkno = vol->nblocks - (vol->data_offset * vol->dev_no) -
134 (vol->nblocks / vol->dev_no);
135 vol->strip_size = HR_STRIP_SIZE;
136
137 return EOK;
138}
139
140void hr_raid4_status_event(hr_volume_t *vol)
141{
142 fibril_mutex_lock(&vol->lock);
143 (void) hr_raid4_update_vol_status(vol);
144 fibril_mutex_unlock(&vol->lock);
145}
146
147static errno_t hr_raid4_bd_open(bd_srvs_t *bds, bd_srv_t *bd)
148{
149 HR_DEBUG("hr_bd_open()\n");
150 return EOK;
151}
152
153static errno_t hr_raid4_bd_close(bd_srv_t *bd)
154{
155 HR_DEBUG("hr_bd_close()\n");
156 return EOK;
157}
158
159static errno_t hr_raid4_bd_sync_cache(bd_srv_t *bd, aoff64_t ba, size_t cnt)
160{
161 return hr_raid4_bd_op(HR_BD_SYNC, bd, ba, cnt, NULL, NULL, 0);
162}
163
164static errno_t hr_raid4_bd_read_blocks(bd_srv_t *bd, aoff64_t ba, size_t cnt,
165 void *buf, size_t size)
166{
167 return hr_raid4_bd_op(HR_BD_READ, bd, ba, cnt, buf, NULL, size);
168}
169
170static errno_t hr_raid4_bd_write_blocks(bd_srv_t *bd, aoff64_t ba, size_t cnt,
171 const void *data, size_t size)
172{
173 return hr_raid4_bd_op(HR_BD_WRITE, bd, ba, cnt, NULL, data, size);
174}
175
176static errno_t hr_raid4_bd_get_block_size(bd_srv_t *bd, size_t *rsize)
177{
178 hr_volume_t *vol = bd->srvs->sarg;
179
180 *rsize = vol->bsize;
181 return EOK;
182}
183
184static errno_t hr_raid4_bd_get_num_blocks(bd_srv_t *bd, aoff64_t *rnb)
185{
186 hr_volume_t *vol = bd->srvs->sarg;
187
188 *rnb = vol->data_blkno;
189 return EOK;
190}
191
192static errno_t hr_raid4_vol_usable(hr_volume_t *vol)
193{
194 if (vol->status == HR_VOL_ONLINE ||
195 vol->status == HR_VOL_DEGRADED)
196 return EOK;
197 return EINVAL;
198}
199
200/*
201 * Returns (-1) if all extents are online,
202 * else returns index of first bad one.
203 */
204static ssize_t hr_raid4_get_bad_ext(hr_volume_t *vol)
205{
206 for (size_t i = 0; i < vol->dev_no; i++)
207 if (vol->extents[i].status != HR_EXT_ONLINE)
208 return i;
209 return -1;
210}
211
212static errno_t hr_raid4_update_vol_status(hr_volume_t *vol)
213{
214 hr_vol_status_t old_state = vol->status;
215 size_t bad = 0;
216 for (size_t i = 0; i < vol->dev_no; i++)
217 if (vol->extents[i].status != HR_EXT_ONLINE)
218 bad++;
219
220 switch (bad) {
221 case 0:
222 if (old_state != HR_VOL_ONLINE) {
223 HR_WARN("RAID 4 has all extents online, "
224 "marking \"%s\" (%lu) as ONLINE",
225 vol->devname, vol->svc_id);
226 vol->status = HR_VOL_ONLINE;
227 }
228 return EOK;
229 case 1:
230 if (old_state != HR_VOL_DEGRADED) {
231 HR_WARN("RAID 4 array \"%s\" (%lu) has 1 extent "
232 "inactive, marking as DEGRADED",
233 vol->devname, vol->svc_id);
234 vol->status = HR_VOL_DEGRADED;
235 }
236 return EOK;
237 default:
238 if (old_state != HR_VOL_FAULTY) {
239 HR_WARN("RAID 4 array \"%s\" (%lu) has more "
240 "than one 1 extent inactive, marking as FAULTY",
241 vol->devname, vol->svc_id);
242 vol->status = HR_VOL_FAULTY;
243 }
244 return EINVAL;
245 }
246}
247
/** XOR a source buffer into a destination buffer.
 *
 * Computes dst[i] ^= src[i] for every byte i in [0, size). The bulk of the
 * work is done one 64-bit word at a time; words are moved with memcpy() so
 * that neither buffer has to be 8-byte aligned, and any bytes left over
 * when @a size is not a multiple of 8 are processed individually instead
 * of being skipped.
 *
 * @param dst  Buffer that is updated in place.
 * @param src  Buffer that is XORed into @a dst.
 * @param size Number of bytes to process.
 */
static void xor(void *dst, const void *src, size_t size)
{
	uint8_t *d = dst;
	const uint8_t *s = src;
	size_t off = 0;

	/* Word-sized chunks. */
	for (; off + sizeof(uint64_t) <= size; off += sizeof(uint64_t)) {
		uint64_t dw;
		uint64_t sw;

		memcpy(&dw, d + off, sizeof(dw));
		memcpy(&sw, s + off, sizeof(sw));
		dw ^= sw;
		memcpy(d + off, &dw, sizeof(dw));
	}

	/* Remaining tail bytes (size % 8). */
	for (; off < size; off++)
		d[off] ^= s[off];
}
257
258static errno_t hr_raid4_read_degraded(hr_volume_t *vol, uint64_t bad,
259 uint64_t block, void *data, size_t cnt)
260{
261 errno_t rc;
262 size_t i;
263 void *xorbuf;
264 void *buf;
265 uint64_t len = vol->bsize * cnt;
266
267 xorbuf = malloc(len);
268 if (xorbuf == NULL)
269 return ENOMEM;
270
271 buf = malloc(len);
272 if (buf == NULL) {
273 free(xorbuf);
274 return ENOMEM;
275 }
276
277 /* read all other extents in the stripe */
278 memset(xorbuf, 0, len);
279 for (i = 0; i < vol->dev_no; i++) {
280 if (i == bad) {
281 continue;
282 } else {
283 rc = block_read_direct(vol->extents[i].svc_id, block,
284 cnt, buf);
285 if (rc != EOK)
286 goto end;
287 xor(xorbuf, buf, len);
288 }
289 }
290
291 memcpy(data, xorbuf, len);
292end:
293 free(xorbuf);
294 free(buf);
295 return rc;
296}
297
298static errno_t hr_raid4_write(hr_volume_t *vol, uint64_t extent, aoff64_t ba,
299 const void *data, size_t cnt)
300{
301 errno_t rc;
302 size_t i;
303 void *xorbuf;
304 void *buf;
305 uint64_t len = vol->bsize * cnt;
306
307 ssize_t bad = hr_raid4_get_bad_ext(vol);
308 if (bad < 1) {
309 rc = block_write_direct(vol->extents[extent].svc_id, ba, cnt,
310 data);
311 if (rc != EOK)
312 return rc;
313 /*
314 * DEGRADED parity - skip parity write
315 */
316 if (bad == 0)
317 return EOK;
318
319 rc = hr_raid4_write_parity(vol, extent, ba, data, cnt);
320 return rc;
321 }
322
323 xorbuf = malloc(len);
324 if (xorbuf == NULL)
325 return ENOMEM;
326
327 buf = malloc(len);
328 if (buf == NULL) {
329 free(xorbuf);
330 return ENOMEM;
331 }
332
333 if (extent == (size_t) bad) {
334 /*
335 * new parity = read other and xor in new data
336 *
337 * write new parity
338 */
339 memset(xorbuf, 0, len);
340 for (i = 1; i < vol->dev_no; i++) {
341 if (i == (size_t) bad) {
342 continue;
343 } else {
344 rc = block_read_direct(vol->extents[i].svc_id,
345 ba, cnt, buf);
346 if (rc != EOK)
347 goto end;
348 xor(xorbuf, buf, len);
349 }
350 }
351 xor(xorbuf, data, len);
352 rc = block_write_direct(vol->extents[0].svc_id, ba, cnt,
353 xorbuf);
354 if (rc != EOK)
355 goto end;
356 } else {
357 /*
358 * new parity = xor original data and old parity and new data
359 *
360 * write parity, new data
361 */
362 rc = block_read_direct(vol->extents[extent].svc_id, ba, cnt,
363 xorbuf);
364 if (rc != EOK)
365 goto end;
366 rc = block_read_direct(vol->extents[0].svc_id, ba, cnt, buf);
367 if (rc != EOK)
368 goto end;
369
370 xor(xorbuf, buf, len);
371
372 xor(xorbuf, data, len);
373
374 rc = block_write_direct(vol->extents[0].svc_id, ba, cnt,
375 xorbuf);
376 if (rc != EOK)
377 goto end;
378 rc = block_write_direct(vol->extents[extent].svc_id, ba, cnt,
379 data);
380 if (rc != EOK)
381 goto end;
382 }
383end:
384 free(xorbuf);
385 free(buf);
386 return rc;
387}
388
389static errno_t hr_raid4_write_parity(hr_volume_t *vol, uint64_t extent,
390 uint64_t block, const void *data, size_t cnt)
391{
392 errno_t rc;
393 size_t i;
394 void *xorbuf;
395 void *buf;
396 uint64_t len = vol->bsize * cnt;
397
398 xorbuf = malloc(len);
399 if (xorbuf == NULL)
400 return ENOMEM;
401
402 buf = malloc(len);
403 if (buf == NULL) {
404 free(xorbuf);
405 return ENOMEM;
406 }
407
408 /*
409 * parity = read and xor all other data extents, xor in new data
410 *
411 * XXX: subtract method
412 */
413 memset(xorbuf, 0, len);
414 for (i = 1; i < vol->dev_no; i++) {
415 if (i == extent) {
416 xor(xorbuf, data, vol->bsize);
417 } else {
418 rc = block_read_direct(vol->extents[i].svc_id, block,
419 cnt, buf);
420 if (rc != EOK)
421 goto end;
422 xor(xorbuf, buf, len);
423 }
424 }
425
426 rc = block_write_direct(vol->extents[0].svc_id, block, cnt, xorbuf);
427end:
428 free(xorbuf);
429 free(buf);
430 return rc;
431}
432
433static errno_t hr_raid4_bd_op(hr_bd_op_type_t type, bd_srv_t *bd, aoff64_t ba,
434 size_t cnt, void *dst, const void *src, size_t size)
435{
436 hr_volume_t *vol = bd->srvs->sarg;
437 errno_t rc;
438 uint64_t phys_block, len;
439 size_t left;
440 const uint8_t *data_write = src;
441 uint8_t *data_read = dst;
442
443 /* propagate sync */
444 if (type == HR_BD_SYNC && ba == 0 && cnt == 0) {
445 hr_sync_all_extents(vol);
446 rc = hr_raid4_update_vol_status(vol);
447 return rc;
448 }
449
450 if (type == HR_BD_READ || type == HR_BD_WRITE)
451 if (size < cnt * vol->bsize)
452 return EINVAL;
453
454 rc = hr_check_ba_range(vol, cnt, ba);
455 if (rc != EOK)
456 return rc;
457
458 uint64_t strip_size = vol->strip_size / vol->bsize; /* in blocks */
459 uint64_t stripe = (ba / strip_size); /* stripe number */
460 uint64_t extent = (stripe % (vol->dev_no - 1)) + 1;
461 uint64_t ext_stripe = stripe / (vol->dev_no - 1); /* stripe level */
462 uint64_t strip_off = ba % strip_size; /* strip offset */
463
464 fibril_mutex_lock(&vol->lock);
465
466 rc = hr_raid4_vol_usable(vol);
467 if (rc != EOK) {
468 fibril_mutex_unlock(&vol->lock);
469 return EIO;
470 }
471
472 left = cnt;
473 while (left != 0) {
474 phys_block = ext_stripe * strip_size + strip_off;
475 cnt = min(left, strip_size - strip_off);
476 len = vol->bsize * cnt;
477 hr_add_ba_offset(vol, &phys_block);
478 switch (type) {
479 case HR_BD_SYNC:
480 if (vol->extents[extent].status != HR_EXT_ONLINE)
481 break;
482 rc = block_sync_cache(vol->extents[extent].svc_id,
483 phys_block, cnt);
484 /* allow unsupported sync */
485 if (rc == ENOTSUP)
486 rc = EOK;
487 break;
488 case HR_BD_READ:
489 retry_read:
490 ssize_t bad = hr_raid4_get_bad_ext(vol);
491 if (bad > 0 && extent == (size_t) bad) {
492 rc = hr_raid4_read_degraded(vol, bad,
493 phys_block, data_read, cnt);
494 } else {
495 rc = block_read_direct(vol->extents[extent].svc_id,
496 phys_block, cnt, data_read);
497 }
498 data_read += len;
499 break;
500 case HR_BD_WRITE:
501 retry_write:
502 rc = hr_raid4_write(vol, extent, phys_block,
503 data_write, cnt);
504 data_write += len;
505 break;
506 default:
507 rc = EINVAL;
508 goto error;
509 }
510
511 if (rc == ENOMEM)
512 goto error;
513
514 if (rc == ENOENT)
515 hr_update_ext_status(vol, extent, HR_EXT_MISSING);
516 else if (rc != EOK)
517 hr_update_ext_status(vol, extent, HR_EXT_FAILED);
518
519 if (rc != EOK) {
520 rc = hr_raid4_update_vol_status(vol);
521 if (rc == EOK) {
522 /*
523 * State changed from ONLINE -> DEGRADED,
524 * rewind and retry
525 */
526 if (type == HR_BD_WRITE) {
527 data_write -= len;
528 goto retry_write;
529 } else if (type == HR_BD_WRITE) {
530 data_read -= len;
531 goto retry_read;
532 }
533 } else {
534 rc = EIO;
535 goto error;
536 }
537 }
538
539 left -= cnt;
540 strip_off = 0;
541 extent++;
542 if (extent >= vol->dev_no) {
543 ext_stripe++;
544 extent = 1;
545 }
546 }
547
548error:
549 (void) hr_raid4_update_vol_status(vol);
550 fibril_mutex_unlock(&vol->lock);
551 return rc;
552}
553
554/** @}
555 */
Note: See TracBrowser for help on using the repository browser.