source: mainline/uspace/srv/bd/hr/raid5.c@ 733564a

Last change on this file since 733564a was 733564a, checked in by Miroslav Cimerman <mc@…>, 9 months ago

hr: add all fcn prototypes in raid implementations

Also restructure the file in order: create(), init(),
public BD ops, internal util functions and io ops.

  • Property mode set to 100644
File size: 13.2 KB
Line 
1/*
2 * Copyright (c) 2024 Miroslav Cimerman
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** @addtogroup hr
30 * @{
31 */
32/**
33 * @file
34 */
35
36#include <abi/ipc/ipc.h>
37#include <bd_srv.h>
38#include <block.h>
39#include <errno.h>
40#include <hr.h>
41#include <io/log.h>
42#include <ipc/hr.h>
43#include <ipc/services.h>
44#include <loc.h>
45#include <mem.h>
46#include <task.h>
47#include <stdio.h>
48#include <stdlib.h>
49#include <str_error.h>
50
51#include "superblock.h"
52#include "util.h"
53#include "var.h"
54
55extern loc_srv_t *hr_srv;
56
57static errno_t hr_raid5_vol_usable(hr_volume_t *);
58static ssize_t hr_raid5_get_bad_ext(hr_volume_t *);
59static errno_t hr_raid5_update_vol_status(hr_volume_t *);
60static void xor(void *, const void *, size_t);
61static errno_t hr_raid5_read_degraded(hr_volume_t *, uint64_t, uint64_t,
62 void *, size_t);
63static errno_t hr_raid5_write(hr_volume_t *, uint64_t, uint64_t, aoff64_t,
64 const void *, size_t);
65static errno_t hr_raid5_write_parity(hr_volume_t *, uint64_t, uint64_t,
66 uint64_t, const void *, size_t);
67static errno_t hr_raid5_bd_op(hr_bd_op_type_t, bd_srv_t *, aoff64_t, size_t,
68 void *, const void *, size_t);
69
70/* bdops */
71static errno_t hr_raid5_bd_open(bd_srvs_t *, bd_srv_t *);
72static errno_t hr_raid5_bd_close(bd_srv_t *);
73static errno_t hr_raid5_bd_read_blocks(bd_srv_t *, aoff64_t, size_t, void *,
74 size_t);
75static errno_t hr_raid5_bd_sync_cache(bd_srv_t *, aoff64_t, size_t);
76static errno_t hr_raid5_bd_write_blocks(bd_srv_t *, aoff64_t, size_t,
77 const void *, size_t);
78static errno_t hr_raid5_bd_get_block_size(bd_srv_t *, size_t *);
79static errno_t hr_raid5_bd_get_num_blocks(bd_srv_t *, aoff64_t *);
80
81static bd_ops_t hr_raid5_bd_ops = {
82 .open = hr_raid5_bd_open,
83 .close = hr_raid5_bd_close,
84 .sync_cache = hr_raid5_bd_sync_cache,
85 .read_blocks = hr_raid5_bd_read_blocks,
86 .write_blocks = hr_raid5_bd_write_blocks,
87 .get_block_size = hr_raid5_bd_get_block_size,
88 .get_num_blocks = hr_raid5_bd_get_num_blocks
89};
90
91errno_t hr_raid5_create(hr_volume_t *new_volume)
92{
93 errno_t rc;
94
95 assert(new_volume->level == HR_LVL_5);
96
97 if (new_volume->dev_no < 3) {
98 ERR_PRINTF("RAID 5 array needs at least 3 devices");
99 return EINVAL;
100 }
101
102 rc = hr_raid5_update_vol_status(new_volume);
103 if (rc != EOK)
104 return rc;
105
106 bd_srvs_init(&new_volume->hr_bds);
107 new_volume->hr_bds.ops = &hr_raid5_bd_ops;
108 new_volume->hr_bds.sarg = new_volume;
109
110 rc = hr_register_volume(new_volume);
111
112 return rc;
113}
114
115errno_t hr_raid5_init(hr_volume_t *vol)
116{
117 errno_t rc;
118 size_t bsize;
119 uint64_t total_blkno;
120
121 assert(vol->level == HR_LVL_5);
122
123 rc = hr_check_devs(vol, &total_blkno, &bsize);
124 if (rc != EOK)
125 return rc;
126
127 vol->nblocks = total_blkno;
128 vol->bsize = bsize;
129 vol->data_offset = HR_DATA_OFF;
130 vol->data_blkno = vol->nblocks - (vol->data_offset * vol->dev_no) -
131 (vol->nblocks / vol->dev_no);
132 vol->strip_size = HR_STRIP_SIZE;
133
134 return EOK;
135}
136
137static errno_t hr_raid5_bd_open(bd_srvs_t *bds, bd_srv_t *bd)
138{
139 DPRINTF("hr_bd_open()\n");
140 return EOK;
141}
142
143static errno_t hr_raid5_bd_close(bd_srv_t *bd)
144{
145 DPRINTF("hr_bd_close()\n");
146 return EOK;
147}
148
149static errno_t hr_raid5_bd_sync_cache(bd_srv_t *bd, aoff64_t ba, size_t cnt)
150{
151 return hr_raid5_bd_op(HR_BD_SYNC, bd, ba, cnt, NULL, NULL, 0);
152}
153
154static errno_t hr_raid5_bd_read_blocks(bd_srv_t *bd, aoff64_t ba, size_t cnt,
155 void *buf, size_t size)
156{
157 return hr_raid5_bd_op(HR_BD_READ, bd, ba, cnt, buf, NULL, size);
158}
159
160static errno_t hr_raid5_bd_write_blocks(bd_srv_t *bd, aoff64_t ba, size_t cnt,
161 const void *data, size_t size)
162{
163 return hr_raid5_bd_op(HR_BD_WRITE, bd, ba, cnt, NULL, data, size);
164}
165
166static errno_t hr_raid5_bd_get_block_size(bd_srv_t *bd, size_t *rsize)
167{
168 hr_volume_t *vol = bd->srvs->sarg;
169
170 *rsize = vol->bsize;
171 return EOK;
172}
173
174static errno_t hr_raid5_bd_get_num_blocks(bd_srv_t *bd, aoff64_t *rnb)
175{
176 hr_volume_t *vol = bd->srvs->sarg;
177
178 *rnb = vol->data_blkno;
179 return EOK;
180}
181
182static errno_t hr_raid5_vol_usable(hr_volume_t *vol)
183{
184 if (vol->status == HR_VOL_ONLINE ||
185 vol->status == HR_VOL_DEGRADED)
186 return EOK;
187 return EINVAL;
188}
189
190/*
191 * Returns (-1) if all extents are online,
192 * else returns index of first bad one.
193 */
194static ssize_t hr_raid5_get_bad_ext(hr_volume_t *vol)
195{
196 for (size_t i = 0; i < vol->dev_no; i++)
197 if (vol->extents[i].status != HR_EXT_ONLINE)
198 return i;
199 return -1;
200}
201
202static errno_t hr_raid5_update_vol_status(hr_volume_t *vol)
203{
204 hr_vol_status_t old_state = vol->status;
205 size_t bad = 0;
206 for (size_t i = 0; i < vol->dev_no; i++)
207 if (vol->extents[i].status != HR_EXT_ONLINE)
208 bad++;
209
210 switch (bad) {
211 case 0:
212 if (old_state != HR_VOL_ONLINE) {
213 DPRINTF("RAID 5 has all extents online, "
214 "marking \"%s\" (%lu) as ONLINE",
215 vol->devname, vol->svc_id);
216 vol->status = HR_VOL_ONLINE;
217 }
218 return EOK;
219 case 1:
220 if (old_state != HR_VOL_DEGRADED) {
221 ERR_PRINTF("RAID 5 array \"%s\" (%lu) has 1 extent "
222 "inactive, marking as DEGRADED",
223 vol->devname, vol->svc_id);
224 vol->status = HR_VOL_DEGRADED;
225 }
226 return EOK;
227 default:
228 if (old_state != HR_VOL_FAULTY) {
229 ERR_PRINTF("RAID 5 array \"%s\" (%lu) has more "
230 "than one 1 extent inactive, marking as FAULTY",
231 vol->devname, vol->svc_id);
232 vol->status = HR_VOL_FAULTY;
233 }
234 return EINVAL;
235 }
236}
237
/** XOR a source buffer into a destination buffer.
 *
 * Computes dst[i] ^= src[i] for every one of the @a size bytes. The bulk is
 * processed in 64-bit words; memcpy() is used for the word loads and stores
 * so the buffers need not be 8-byte aligned. Any trailing size % 8 bytes are
 * handled one at a time instead of being skipped.
 *
 * @param dst  Buffer that is updated in place
 * @param src  Buffer that is XORed into @a dst
 * @param size Number of bytes to process
 */
static void xor(void *dst, const void *src, size_t size)
{
	unsigned char *d = dst;
	const unsigned char *s = src;
	size_t off = 0;

	while (size - off >= sizeof(uint64_t)) {
		uint64_t a;
		uint64_t b;

		memcpy(&a, d + off, sizeof(a));
		memcpy(&b, s + off, sizeof(b));
		a ^= b;
		memcpy(d + off, &a, sizeof(a));
		off += sizeof(uint64_t);
	}

	/* tail shorter than one word */
	for (; off < size; off++)
		d[off] ^= s[off];
}
247
248static errno_t hr_raid5_read_degraded(hr_volume_t *vol, uint64_t bad,
249 uint64_t block, void *data, size_t cnt)
250{
251 errno_t rc;
252 size_t i;
253 void *xorbuf;
254 void *buf;
255 uint64_t len = vol->bsize * cnt;
256
257 xorbuf = malloc(len);
258 if (xorbuf == NULL)
259 return ENOMEM;
260
261 buf = malloc(len);
262 if (buf == NULL) {
263 free(xorbuf);
264 return ENOMEM;
265 }
266
267 /* read all other extents in the stripe */
268 memset(xorbuf, 0, len);
269 for (i = 0; i < vol->dev_no; i++) {
270 if (i == bad) {
271 continue;
272 } else {
273 rc = block_read_direct(vol->extents[i].svc_id, block,
274 cnt, buf);
275 if (rc != EOK)
276 goto end;
277 xor(xorbuf, buf, len);
278 }
279 }
280
281 memcpy(data, xorbuf, len);
282end:
283 free(xorbuf);
284 free(buf);
285 return rc;
286}
287
288static errno_t hr_raid5_write(hr_volume_t *vol, uint64_t p_extent,
289 uint64_t extent, aoff64_t ba, const void *data, size_t cnt)
290{
291 errno_t rc;
292 size_t i;
293 void *xorbuf;
294 void *buf;
295 uint64_t len = vol->bsize * cnt;
296
297 ssize_t bad = hr_raid5_get_bad_ext(vol);
298 if (bad == -1 || (size_t)bad == p_extent) {
299 rc = block_write_direct(vol->extents[extent].svc_id, ba, cnt,
300 data);
301 if (rc != EOK)
302 return rc;
303 /*
304 * DEGRADED parity - skip parity write
305 */
306 if ((size_t)bad == p_extent)
307 return EOK;
308
309 rc = hr_raid5_write_parity(vol, p_extent, extent, ba, data,
310 cnt);
311 return rc;
312 }
313
314 xorbuf = malloc(len);
315 if (xorbuf == NULL)
316 return ENOMEM;
317
318 buf = malloc(len);
319 if (buf == NULL) {
320 free(xorbuf);
321 return ENOMEM;
322 }
323
324 if (extent == (size_t) bad) {
325 /*
326 * new parity = read other and xor in new data
327 *
328 * write new parity
329 */
330 memset(xorbuf, 0, len);
331 for (i = 1; i < vol->dev_no; i++) {
332 if (i == (size_t) bad) {
333 continue;
334 } else {
335 rc = block_read_direct(vol->extents[i].svc_id,
336 ba, cnt, buf);
337 if (rc != EOK)
338 goto end;
339 xor(xorbuf, buf, len);
340 }
341 }
342 xor(xorbuf, data, len);
343 rc = block_write_direct(vol->extents[p_extent].svc_id, ba, cnt,
344 xorbuf);
345 if (rc != EOK)
346 goto end;
347 } else {
348 /*
349 * new parity = xor original data and old parity and new data
350 *
351 * write parity, new data
352 */
353 rc = block_read_direct(vol->extents[extent].svc_id, ba, cnt,
354 xorbuf);
355 if (rc != EOK)
356 goto end;
357 rc = block_read_direct(vol->extents[p_extent].svc_id, ba, cnt,
358 buf);
359 if (rc != EOK)
360 goto end;
361
362 xor(xorbuf, buf, len);
363
364 xor(xorbuf, data, len);
365
366 rc = block_write_direct(vol->extents[p_extent].svc_id, ba, cnt,
367 xorbuf);
368 if (rc != EOK)
369 goto end;
370 rc = block_write_direct(vol->extents[extent].svc_id, ba, cnt,
371 data);
372 if (rc != EOK)
373 goto end;
374 }
375end:
376 free(xorbuf);
377 free(buf);
378 return rc;
379}
380
381static errno_t hr_raid5_write_parity(hr_volume_t *vol, uint64_t p_extent,
382 uint64_t extent, uint64_t block, const void *data, size_t cnt)
383{
384 errno_t rc;
385 size_t i;
386 void *xorbuf;
387 void *buf;
388 uint64_t len = vol->bsize * cnt;
389
390 xorbuf = malloc(len);
391 if (xorbuf == NULL)
392 return ENOMEM;
393
394 buf = malloc(len);
395 if (buf == NULL) {
396 free(xorbuf);
397 return ENOMEM;
398 }
399
400 memset(xorbuf, 0, len);
401 for (i = 0; i < vol->dev_no; i++) {
402 if (i == p_extent)
403 continue;
404 if (i == extent) {
405 xor(xorbuf, data, vol->bsize);
406 } else {
407 rc = block_read_direct(vol->extents[i].svc_id,
408 block, cnt, buf);
409 if (rc != EOK)
410 goto end;
411 xor(xorbuf, buf, vol->bsize);
412 }
413 }
414
415 rc = block_write_direct(vol->extents[p_extent].svc_id, block, cnt,
416 xorbuf);
417end:
418 free(xorbuf);
419 free(buf);
420 return rc;
421}
422
423static errno_t hr_raid5_bd_op(hr_bd_op_type_t type, bd_srv_t *bd, aoff64_t ba,
424 size_t cnt, void *dst, const void *src, size_t size)
425{
426 hr_volume_t *vol = bd->srvs->sarg;
427 errno_t rc;
428 uint64_t phys_block, len;
429 size_t left;
430 const uint8_t *data_write = src;
431 uint8_t *data_read = dst;
432
433 /* propagate sync */
434 if (type == HR_BD_SYNC && ba == 0 && cnt == 0) {
435 hr_sync_all_extents(vol);
436 rc = hr_raid5_update_vol_status(vol);
437 return rc;
438 }
439
440 if (type == HR_BD_READ || type == HR_BD_WRITE)
441 if (size < cnt * vol->bsize)
442 return EINVAL;
443
444 rc = hr_check_ba_range(vol, cnt, ba);
445 if (rc != EOK)
446 return rc;
447
448 uint64_t strip_size = vol->strip_size / vol->bsize; /* in blocks */
449 uint64_t stripe = (ba / strip_size); /* stripe number */
450 uint64_t p_extent = (stripe / (vol->dev_no - 1)) % vol->dev_no; /* parity extent */
451 uint64_t extent;
452 if ((stripe % (vol->dev_no - 1)) < p_extent)
453 extent = (stripe % (vol->dev_no - 1));
454 else
455 extent = ((stripe % (vol->dev_no - 1)) + 1);
456 uint64_t ext_stripe = stripe / (vol->dev_no - 1); /* stripe level */
457 uint64_t strip_off = ba % strip_size; /* strip offset */
458
459 fibril_mutex_lock(&vol->lock);
460
461 rc = hr_raid5_vol_usable(vol);
462 if (rc != EOK) {
463 fibril_mutex_unlock(&vol->lock);
464 return EIO;
465 }
466
467 left = cnt;
468 while (left != 0) {
469 phys_block = ext_stripe * strip_size + strip_off;
470 cnt = min(left, strip_size - strip_off);
471 len = vol->bsize * cnt;
472 hr_add_ba_offset(vol, &phys_block);
473 switch (type) {
474 case HR_BD_SYNC:
475 if (vol->extents[extent].status != HR_EXT_ONLINE)
476 break;
477 rc = block_sync_cache(vol->extents[extent].svc_id,
478 phys_block, cnt);
479 /* allow unsupported sync */
480 if (rc == ENOTSUP)
481 rc = EOK;
482 break;
483 case HR_BD_READ:
484 retry_read:
485 ssize_t bad = hr_raid5_get_bad_ext(vol);
486 if (bad > 0 && extent == (size_t) bad) {
487 rc = hr_raid5_read_degraded(vol, bad,
488 phys_block, data_read, cnt);
489 } else {
490 rc = block_read_direct(vol->extents[extent].svc_id,
491 phys_block, cnt, data_read);
492 }
493 data_read += len;
494 break;
495 case HR_BD_WRITE:
496 retry_write:
497 rc = hr_raid5_write(vol, p_extent, extent, phys_block,
498 data_write, cnt);
499 data_write += len;
500 break;
501 default:
502 rc = EINVAL;
503 goto error;
504 }
505
506 if (rc == ENOMEM)
507 goto error;
508
509 if (rc == ENOENT)
510 hr_update_ext_status(vol, extent, HR_EXT_MISSING);
511 else if (rc != EOK)
512 hr_update_ext_status(vol, extent, HR_EXT_FAILED);
513
514 if (rc != EOK) {
515 rc = hr_raid5_update_vol_status(vol);
516 if (rc == EOK) {
517 /*
518 * State changed from ONLINE -> DEGRADED,
519 * rewind and retry
520 */
521 if (type == HR_BD_WRITE) {
522 data_write -= len;
523 goto retry_write;
524 } else if (type == HR_BD_WRITE) {
525 data_read -= len;
526 goto retry_read;
527 }
528 } else {
529 rc = EIO;
530 goto error;
531 }
532 }
533
534 left -= cnt;
535 strip_off = 0;
536 if (extent + 1 >= vol->dev_no ||
537 (extent + 1 == p_extent && p_extent + 1 >= vol->dev_no))
538 ext_stripe++;
539 stripe++;
540 p_extent = (stripe / (vol->dev_no - 1)) % vol->dev_no; /* parity extent */
541 if ((stripe % (vol->dev_no - 1)) < p_extent)
542 extent = (stripe % (vol->dev_no - 1));
543 else
544 extent = ((stripe % (vol->dev_no - 1)) + 1);
545 }
546
547error:
548 (void) hr_raid5_update_vol_status(vol);
549 fibril_mutex_unlock(&vol->lock);
550 return rc;
551}
552
553/** @}
554 */
Note: See TracBrowser for help on using the repository browser.