source: mainline/common/str.c@ 9bf95d4

Last change on this file since 9bf95d4 was 1db4e2ae, checked in by Jiří Zárevúcky <zarevucky.jiri@…>, 2 months ago

Have str_sanitize also remove C0 and C1 control codes

and use it to sanitize KIO output

  • Property mode set to 100644
File size: 46.2 KB
RevLine 
[936351c1]1/*
[d066259]2 * Copyright (c) 2001-2004 Jakub Jermar
[df4ed85]3 * Copyright (c) 2005 Martin Decky
[576845ec]4 * Copyright (c) 2008 Jiri Svoboda
[22cf42d9]5 * Copyright (c) 2011 Martin Sucha
[c4bbca8]6 * Copyright (c) 2011 Oleg Romanenko
[65bf084]7 * Copyright (c) 2025 Jiří Zárevúcky
[936351c1]8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * - Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * - Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * - The name of the author may not be used to endorse or promote products
20 * derived from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
23 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
24 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
25 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
27 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
31 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
[a46da63]34/** @addtogroup libc
[b2951e2]35 * @{
36 */
[d066259]37
38/**
39 * @file
40 * @brief String functions.
41 *
42 * Strings and characters use the Universal Character Set (UCS). The standard
43 * strings, called just strings are encoded in UTF-8. Wide strings (encoded
44 * in UTF-32) are supported to a limited degree. A single character is
[28a5ebd]45 * represented as char32_t.@n
[d066259]46 *
47 * Overview of the terminology:@n
48 *
49 * Term Meaning
50 * -------------------- ----------------------------------------------------
51 * byte 8 bits stored in uint8_t (unsigned 8 bit integer)
52 *
[28a5ebd]53 * character UTF-32 encoded Unicode character, stored in char32_t
54 * (unsigned 32 bit integer), code points 0 .. 1114111
[d066259]55 * are valid
56 *
[28c39f3]57 * Note that Unicode characters do not match
58 * one-to-one with displayed characters or glyphs on
59 * screen. For that level of precision, look up
60 * Grapheme Clusters.
61 *
[d066259]62 * ASCII character 7 bit encoded ASCII character, stored in char
63 * (usually signed 8 bit integer), code points 0 .. 127
64 * are valid
65 *
66 * string UTF-8 encoded NULL-terminated Unicode string, char *
67 *
68 * wide string UTF-32 encoded NULL-terminated Unicode string,
[28a5ebd]69 * char32_t *
[d066259]70 *
71 * [wide] string size number of BYTES in a [wide] string (excluding
72 * the NULL-terminator), size_t
73 *
74 * [wide] string length number of CHARACTERS in a [wide] string (excluding
75 * the NULL-terminator), size_t
76 *
77 * [wide] string width number of display cells on a monospace display taken
78 * by a [wide] string, size_t
79 *
[28c39f3]80 * This is virtually impossible to determine exactly for
81 * all strings without knowing specifics of the display
82 * device, due to various factors affecting text output.
83 * If you have the option to query the terminal for
84 * position change caused by outputting the string,
85 * it is preferable to determine width that way.
86 *
[d066259]87 *
88 * Overview of string metrics:@n
89 *
90 * Metric Abbrev. Type Meaning
91 * ------ ------ ------ -------------------------------------------------
92 * size n size_t number of BYTES in a string (excluding the
93 * NULL-terminator)
94 *
95 * length l size_t number of CHARACTERS in a string (excluding the
96 * null terminator)
97 *
98 * width w size_t number of display cells on a monospace display
99 * taken by a string
100 *
101 *
102 * Function naming prefixes:@n
103 *
104 * chr_ operate on characters
105 * ascii_ operate on ASCII characters
106 * str_ operate on strings
107 * wstr_ operate on wide strings
108 *
109 * [w]str_[n|l|w] operate on a prefix limited by size, length
110 * or width
111 *
112 *
113 * A specific character inside a [wide] string can be referred to by:@n
114 *
[28a5ebd]115 * pointer (char *, char32_t *)
[d066259]116 * byte offset (size_t)
117 * character index (size_t)
118 *
[b2951e2]119 */
120
[19f857a]121#include <str.h>
[d066259]122
[28c39f3]123#include <align.h>
[38d150e]124#include <assert.h>
[e64c4b2]125#include <ctype.h>
[171f9a1]126#include <errno.h>
[65bf084]127#include <limits.h>
[28c39f3]128#include <macros.h>
129#include <mem.h>
[d066259]130#include <stdbool.h>
131#include <stddef.h>
132#include <stdint.h>
133#include <stdlib.h>
[28c39f3]134#include <uchar.h>
[171f9a1]135
[65bf084]136#if __STDC_HOSTED__
137#include <fibril.h>
138#endif
139
/** Report an illegal byte sequence through errno.
 *
 * The assignment is only compiled in when errno is defined as a macro;
 * otherwise this function does nothing.
 */
static void _set_ilseq(void)
{
#ifdef errno
	errno = EILSEQ;
#endif
}
146
/** Byte mask consisting of the lowest @a n bits (out of 8). */
#define LO_MASK_8(n)  ((uint8_t) ((1 << (n)) - 1))

/** Mask consisting of the lowest @a n bits (out of 32). */
#define LO_MASK_32(n)  ((uint32_t) ((1 << (n)) - 1))

/** Byte mask consisting of the highest @a n bits (out of 8). */
#define HI_MASK_8(n)  (~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte. */
#define CONT_BITS  6

/** Data bits carried by the initial byte of a 2-byte sequence (110xxxxx). */
#define UTF8_MASK_INITIAL2  0x1F
/** Data bits carried by the initial byte of a 3-byte sequence (1110xxxx). */
#define UTF8_MASK_INITIAL3  0x0F
/** Data bits carried by the initial byte of a 4-byte sequence (11110xxx). */
#define UTF8_MASK_INITIAL4  0x07
/** Data bits carried by a continuation byte (10xxxxxx). */
#define UTF8_MASK_CONT  0x3F

/** Internal marker returned by the decoder for an invalid byte sequence. */
#define CHAR_INVALID  ((char32_t) UINT_MAX)

165
/** Tell whether a byte is a single-byte (ASCII) UTF-8 character, 0xxxxxxx. */
static inline bool _is_ascii(uint8_t b)
{
	return (b & 0x80) == 0;
}
170
/** Tell whether a byte is a UTF-8 continuation byte, 10xxxxxx. */
static inline bool _is_continuation(uint8_t b)
{
	return (b >> 6) == 0x2;
}
175
/** Tell whether a byte is the initial byte of a 2-byte sequence, 110xxxxx. */
static inline bool _is_2_byte(uint8_t c)
{
	return (c >> 5) == 0x6;
}
180
/** Tell whether a byte is the initial byte of a 3-byte sequence, 1110xxxx. */
static inline bool _is_3_byte(uint8_t c)
{
	return (c >> 4) == 0xE;
}
185
/** Tell whether a byte is the initial byte of a 4-byte sequence, 11110xxx. */
static inline bool _is_4_byte(uint8_t c)
{
	return (c >> 3) == 0x1E;
}
190
/** Number of continuation bytes needed to encode a character in UTF-8.
 *
 * @param c Character to encode.
 *
 * @return 0 to 3 for characters that fit into 7, 11, 16 and 21 bits
 *         respectively, -1 for anything wider (not supported).
 */
static inline int _char_continuation_bytes(char32_t c)
{
	if (c < 0x80)
		return 0;

	if (c < 0x800)
		return 1;

	if (c < 0x10000)
		return 2;

	if (c < 0x200000)
		return 3;

	/* Codes longer than 21 bits are not supported */
	return -1;
}
208
/** Number of continuation bytes announced by an initial UTF-8 byte.
 *
 * @param b First byte of an encoded character.
 *
 * @return 0 to 3 according to the bit pattern of @a b, or -1 when @a b
 *         cannot start a character (a continuation byte or 11111xxx).
 */
static inline int _continuation_bytes(uint8_t b)
{
	/* 0xxxxxxx */
	if (b < 0x80)
		return 0;

	/* 110xxxxx 10xxxxxx */
	if ((b & 0xE0) == 0xC0)
		return 1;

	/* 1110xxxx 10xxxxxx 10xxxxxx */
	if ((b & 0xF0) == 0xE0)
		return 2;

	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	if ((b & 0xF8) == 0xF0)
		return 3;

	return -1;
}
229
[65bf084]230static bool _is_non_shortest(const mbstate_t *mb, uint8_t b)
231{
232 return (mb->state == 0b1111110000000000 && !(b & 0b00100000)) ||
233 (mb->state == 0b1111111111110000 && !(b & 0b00110000));
234}
235
[b31323f]236static bool _is_surrogate(const mbstate_t *mb, uint8_t b)
237{
238 return (mb->state == 0b1111110000001101 && b >= 0xa0);
239}
240
/** Branch prediction hint: @a expr is expected to be true. */
#define _likely(expr)    __builtin_expect((expr), true)
/** Branch prediction hint: @a expr is expected to be false. */
#define _unlikely(expr)  __builtin_expect((expr), false)

/** Set to nonzero to compile in the whole-sequence fast paths of the decoder. */
#define FAST_PATHS  1
245
246static char32_t _str_decode(const char *s, size_t *offset, size_t size, mbstate_t *mb)
247{
248 assert(s);
249 assert(offset);
250 assert(*offset <= size);
251 assert(size == STR_NO_LIMIT || s + size >= s);
252 assert(mb);
253
254 if (*offset == size)
255 return 0;
256
257 if (_likely(!mb->state)) {
258 /* Clean slate, read initial byte. */
259 uint8_t b = s[(*offset)++];
260
261 /* Fast exit for the most common case. */
262 if (_likely(_is_ascii(b)))
263 return b;
264
265 /* unexpected continuation byte */
266 if (_unlikely(_is_continuation(b)))
267 return CHAR_INVALID;
268
269 /*
270 * The value stored into `continuation` is designed to have
271 * just enough leading ones that after shifting in one less than
272 * the expected number of continuation bytes, the most significant
273 * bit becomes zero. (The field is 16b wide.)
274 */
275
276 if (_is_2_byte(b)) {
277 /* Reject non-shortest form. */
278 if (_unlikely(!(b & 0b00011110)))
279 return CHAR_INVALID;
280
281#if FAST_PATHS
282 /* We can usually take this exit. */
283 if (_likely(*offset < size && _is_continuation(s[*offset])))
284 return (b & UTF8_MASK_INITIAL2) << 6 |
285 (s[(*offset)++] & UTF8_MASK_CONT);
286#endif
287
288 /* 2 byte continuation 110xxxxx */
289 mb->state = b ^ 0b0000000011000000;
290
291 } else if (_is_3_byte(b)) {
292#if FAST_PATHS
293 /* We can usually take this exit. */
294 if (_likely(*offset + 1 < size && _is_continuation(s[*offset]) && _is_continuation(s[*offset + 1]))) {
295
296 char32_t ch = (b & UTF8_MASK_INITIAL3) << 12 |
297 (s[(*offset)] & UTF8_MASK_CONT) << 6 |
298 (s[(*offset) + 1] & UTF8_MASK_CONT);
299
300 *offset += 2;
301
302 /* Reject non-shortest form. */
303 if (_unlikely(!(ch & 0xFFFFF800)))
304 return CHAR_INVALID;
305
[b31323f]306 /* Reject surrogates */
307 if (_unlikely(ch >= 0xD800 && ch < 0xE000))
308 return CHAR_INVALID;
309
[65bf084]310 return ch;
311 }
312#endif
313
314 /* 3 byte continuation 1110xxxx */
315 mb->state = b ^ 0b1111110011100000;
316
317 } else if (_is_4_byte(b)) {
318#if FAST_PATHS
319 /* We can usually take this exit. */
320 if (_likely(*offset + 2 < size && _is_continuation(s[*offset]) &&
321 _is_continuation(s[*offset + 1]) && _is_continuation(s[*offset + 2]))) {
322
323 char32_t ch = (b & UTF8_MASK_INITIAL4) << 18 |
324 (s[(*offset)] & UTF8_MASK_CONT) << 12 |
325 (s[(*offset) + 1] & UTF8_MASK_CONT) << 6 |
326 (s[(*offset) + 2] & UTF8_MASK_CONT);
327
328 *offset += 3;
329
330 /* Reject non-shortest form. */
331 if (_unlikely(!(ch & 0xFFFF0000)))
332 return CHAR_INVALID;
333
[b31323f]334 /* Reject out-of-range characters. */
335 if (_unlikely(ch >= 0x110000))
336 return CHAR_INVALID;
337
[65bf084]338 return ch;
339 }
340#endif
341
342 /* 4 byte continuation 11110xxx */
343 mb->state = b ^ 0b1111111100000000;
344 } else {
345 return CHAR_INVALID;
346 }
347 }
348
349 /* Deal with the remaining edge and invalid cases. */
350 for (; *offset < size; (*offset)++) {
351 /* Read continuation bytes. */
352 uint8_t b = s[*offset];
353
[b31323f]354 if (!_is_continuation(b) || _is_non_shortest(mb, b) || _is_surrogate(mb, b)) {
[65bf084]355 mb->state = 0;
356 return CHAR_INVALID;
357 }
358
359 /* Top bit becomes zero when shifting in the second to last byte. */
360 if (!(mb->state & 0x8000)) {
361 char32_t c = ((char32_t) mb->state) << 6 | (b & UTF8_MASK_CONT);
362 mb->state = 0;
363 (*offset)++;
364 return c;
365 }
366
367 mb->state = mb->state << 6 | (b & UTF8_MASK_CONT);
368 }
369
370 /* Incomplete character. */
371 assert(mb->state);
372 return 0;
373}
374
375/** Standard <uchar.h> function since C11. */
376size_t mbrtoc32(char32_t *c, const char *s, size_t n, mbstate_t *mb)
377{
378#if __STDC_HOSTED__
379 static fibril_local mbstate_t global_state = { };
380
381 if (!mb)
382 mb = &global_state;
383#endif
384
385 if (!s) {
386 /* Equivalent to mbrtoc32(NULL, "", 1, mb); */
387 c = NULL;
388 s = "";
389 n = 1;
390 }
391
392 size_t offset = 0;
393 char32_t ret = _str_decode(s, &offset, n, mb);
394 if (ret == CHAR_INVALID) {
395 assert(!mb->state);
396 _set_ilseq();
397 return UCHAR_ILSEQ;
398 }
399 if (mb->state) {
400 assert(ret == 0);
401 return UCHAR_INCOMPLETE;
402 }
403
404 if (c)
405 *c = ret;
406 return ret ? offset : 0;
407}
408
[171f9a1]409/** Decode a single character from a string.
410 *
411 * Decode a single character from a string of size @a size. Decoding starts
412 * at @a offset and this offset is moved to the beginning of the next
413 * character. In case of decoding error, offset generally advances at least
414 * by one. However, offset is never moved beyond size.
415 *
416 * @param str String (not necessarily NULL-terminated).
417 * @param offset Byte offset in string where to start decoding.
418 * @param size Size of the string (in bytes).
419 *
420 * @return Value of decoded character, U_SPECIAL on decoding error or
421 * NULL if attempt to decode beyond @a size.
422 *
423 */
[28a5ebd]424char32_t str_decode(const char *str, size_t *offset, size_t size)
[171f9a1]425{
[65bf084]426 mbstate_t mb = { };
427 char32_t ch = _str_decode(str, offset, size, &mb);
[28c39f3]428
[45adeeb]429 if (ch == CHAR_INVALID || mb.state)
[0600976]430 return U_SPECIAL;
431
[171f9a1]432 return ch;
433}
434
[45adeeb]435char32_t str_decode_r(const char *str, size_t *offset, size_t size,
436 char32_t replacement, mbstate_t *mb)
437{
438 char32_t ch = _str_decode(str, offset, size, mb);
439 return (ch == CHAR_INVALID) ? replacement : ch;
440}
441
[568693b]442/** Decode a single character from a string to the left.
443 *
444 * Decode a single character from a string of size @a size. Decoding starts
445 * at @a offset and this offset is moved to the beginning of the previous
446 * character. In case of decoding error, offset generally decreases at least
447 * by one. However, offset is never moved before 0.
448 *
449 * @param str String (not necessarily NULL-terminated).
450 * @param offset Byte offset in string where to start decoding.
451 * @param size Size of the string (in bytes).
452 *
453 * @return Value of decoded character, U_SPECIAL on decoding error or
454 * NULL if attempt to decode beyond @a start of str.
455 *
456 */
[28a5ebd]457char32_t str_decode_reverse(const char *str, size_t *offset, size_t size)
[568693b]458{
459 if (*offset == 0)
460 return 0;
[a35b458]461
[28c39f3]462 int cbytes = 0;
[568693b]463 /* Continue while continuation bytes found */
[28c39f3]464 while (*offset > 0 && cbytes < 4) {
[568693b]465 uint8_t b = (uint8_t) str[--(*offset)];
[a35b458]466
[65bf084]467 if (_is_continuation(b)) {
[28c39f3]468 cbytes++;
469 continue;
[568693b]470 }
[28c39f3]471
[65bf084]472 /* Reject non-shortest form encoding. */
[28c39f3]473 if (cbytes != _continuation_bytes(b))
474 return U_SPECIAL;
475
476 /* Start byte */
477 size_t start_offset = *offset;
478 return str_decode(str, &start_offset, size);
[568693b]479 }
[28c39f3]480
[568693b]481 /* Too many continuation bytes */
482 return U_SPECIAL;
483}
484
[171f9a1]485/** Encode a single character to string representation.
486 *
487 * Encode a single character to string representation (i.e. UTF-8) and store
488 * it into a buffer at @a offset. Encoding starts at @a offset and this offset
489 * is moved to the position where the next character can be written to.
490 *
491 * @param ch Input character.
492 * @param str Output buffer.
493 * @param offset Byte offset where to start writing.
494 * @param size Size of the output buffer (in bytes).
495 *
496 * @return EOK if the character was encoded successfully, EOVERFLOW if there
[d4a3ee5]497 * was not enough space in the output buffer or EINVAL if the character
498 * code was invalid.
[171f9a1]499 */
[28c39f3]500errno_t chr_encode(char32_t ch, char *str, size_t *offset, size_t size)
[171f9a1]501{
[65bf084]502 // TODO: merge with c32rtomb()
503
[171f9a1]504 if (*offset >= size)
505 return EOVERFLOW;
[a35b458]506
[28c39f3]507 /* Fast exit for the most common case. */
508 if (ch < 0x80) {
509 str[(*offset)++] = (char) ch;
510 return EOK;
511 }
512
513 /* Codes longer than 21 bits are not supported */
[171f9a1]514 if (!chr_check(ch))
515 return EINVAL;
[a35b458]516
[171f9a1]517 /* Determine how many continuation bytes are needed */
[a35b458]518
[28c39f3]519 unsigned int cbytes = _char_continuation_bytes(ch);
520 unsigned int b0_bits = 6 - cbytes; /* Data bits in first byte */
[a35b458]521
[171f9a1]522 /* Check for available space in buffer */
523 if (*offset + cbytes >= size)
524 return EOVERFLOW;
[a35b458]525
[171f9a1]526 /* Encode continuation bytes */
527 unsigned int i;
528 for (i = cbytes; i > 0; i--) {
[28c39f3]529 str[*offset + i] = 0x80 | (ch & LO_MASK_32(CONT_BITS));
530 ch >>= CONT_BITS;
[171f9a1]531 }
[a35b458]532
[171f9a1]533 /* Encode first byte */
[28c39f3]534 str[*offset] = (ch & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);
[a35b458]535
[171f9a1]536 /* Advance offset */
537 *offset += cbytes + 1;
[a35b458]538
[171f9a1]539 return EOK;
540}
541
/** Measure the acceptable character starting at @a p.
 *
 * @param p Current position; p[0] is nonzero.
 * @param n Number of bytes available at @a p (at least 1).
 *
 * @return Length in bytes (1 to 4) of the character at @a p if it is
 *         acceptable, or 0 if the byte at @a p has to be replaced.
 */
static size_t _sanitize_accept(const uint8_t *p, size_t n)
{
	uint8_t lead = p[0];

	/* C0 control codes */
	if (lead < ' ')
		return 0;

	int tail = _continuation_bytes(lead);
	if (__builtin_expect(tail, 0) == 0)
		return 1;

	/* Not an initial byte, or the sequence does not fit into the buffer. */
	if (tail < 0 || n <= (size_t) tail)
		return 0;

	/* Check continuation bytes. */
	for (int i = 1; i <= tail; i++) {
		if (!_is_continuation(p[i]))
			return 0;
	}

	uint8_t next = p[1];

	switch (tail) {
	case 1:
		/* Non-shortest form: 0b110!!!!x 0b10xxxxxx */
		if (!(lead & 0x1E))
			return 0;
		/* C1 control codes, U+0080..U+009F */
		if (lead == 0xC2 && next < 0xA0)
			return 0;
		break;
	case 2:
		/* Non-shortest form: 0b1110!!!! 0b10!xxxxx 0b10xxxxxx */
		if (!(lead & 0x0F) && !(next & 0x20))
			return 0;
		/* Surrogate character encoding. */
		if (lead == 0xED && next >= 0xA0)
			return 0;
		break;
	case 3:
		/* Non-shortest form: 0b11110!!! 0b10!!xxxx 0b10xxxxxx 0b10xxxxxx */
		if (!(lead & 0x07) && !(next & 0x30))
			return 0;
		/* Out-of-range code points. */
		if (lead > 0xF4 || (lead == 0xF4 && next >= 0x90))
			return 0;
		break;
	default:
		break;
	}

	return (size_t) tail + 1;
}

/** Convert in place any bytes that don't form a valid character into
 * replacement.
 *
 * Processing stops at the first NUL byte or after @a n bytes, whichever comes
 * first. C0 control codes and encoded C1 control codes are replaced as well.
 * For non-shortest forms, see
 * https://www.unicode.org/versions/corrigendum1.html
 *
 * @param str         Buffer to sanitize.
 * @param n           Maximum number of bytes to process.
 * @param replacement Byte written over every rejected byte.
 *
 * @return Number of bytes that were replaced.
 */
static size_t _str_sanitize(char *str, size_t n, uint8_t replacement)
{
	uint8_t *p = (uint8_t *) str;
	size_t replaced = 0;

	while (n > 0 && p[0] != 0) {
		size_t len = _sanitize_accept(p, n);

		if (len == 0) {
			/*
			 * Only the offending byte is replaced; the bytes that
			 * follow are examined on their own.
			 */
			p[0] = replacement;
			replaced++;
			len = 1;
		}

		p += len;
		n -= len;
	}

	return replaced;
}
634
/** Replace every byte that is not part of a complete valid UTF-8 character
 * encoding with a replacement byte.
 *
 * C0 and C1 control codes are replaced as well.
 *
 * @param str         Buffer to sanitize in place.
 * @param n           Maximum number of bytes to process.
 * @param replacement Byte written over every rejected byte.
 *
 * @return Number of bytes that were replaced.
 */
size_t str_sanitize(char *str, size_t n, uint8_t replacement)
{
	size_t replaced = _str_sanitize(str, n, replacement);

	return replaced;
}
643
/** Count the bytes of a NUL-terminated string, excluding the terminator. */
static size_t _str_size(const char *str)
{
	const char *end = str;

	while (*end != 0)
		end++;

	return (size_t) (end - str);
}
653
/** Get size of string.
 *
 * Get the number of bytes which are used by the string @a str (excluding the
 * NULL-terminator).
 *
 * @param str String to consider.
 *
 * @return Number of bytes used by the string
 *
 */
size_t str_size(const char *str)
{
	size_t bytes = _str_size(str);

	return bytes;
}
668
/** Get size of wide string.
 *
 * Get the number of bytes which are used by the wide string @a str (excluding
 * the NULL-terminator).
 *
 * @param str Wide string to consider.
 *
 * @return Number of bytes used by the wide string
 *
 */
size_t wstr_size(const char32_t *str)
{
	size_t chars = wstr_length(str);

	return chars * sizeof(char32_t);
}
683
684/** Get size of string with length limit.
685 *
686 * Get the number of bytes which are used by up to @a max_len first
687 * characters in the string @a str. If @a max_len is greater than
688 * the length of @a str, the entire string is measured (excluding the
689 * NULL-terminator).
690 *
691 * @param str String to consider.
692 * @param max_len Maximum number of characters to measure.
693 *
694 * @return Number of bytes used by the characters.
695 *
696 */
[d4a3ee5]697size_t str_lsize(const char *str, size_t max_len)
[f2b8cdc]698{
[d4a3ee5]699 size_t len = 0;
[f2b8cdc]700 size_t offset = 0;
[a35b458]701
[f2b8cdc]702 while (len < max_len) {
703 if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
704 break;
[a35b458]705
[f2b8cdc]706 len++;
707 }
[a35b458]708
[f2b8cdc]709 return offset;
710}
711
/** Count the bytes of a string, looking at no more than @a max_size bytes.
 *
 * The limit is tested before each byte is read, so @a str does not need to
 * be NUL-terminated as long as it holds at least @a max_size bytes.
 *
 * @param str      String to measure.
 * @param max_size Maximum number of bytes to examine.
 *
 * @return Number of bytes before the terminator, at most @a max_size.
 */
static size_t _str_nsize(const char *str, size_t max_size)
{
	size_t size = 0;

	while (size < max_size && str[size] != 0)
		size++;

	return size;
}
721
/** Get size of string with size limit.
 *
 * Get the number of bytes which are used by the string @a str
 * (excluding the NULL-terminator), but no more than @a max_size bytes.
 *
 * @param str      String to consider.
 * @param max_size Maximum number of bytes to measure.
 *
 * @return Number of bytes used by the string
 *
 */
size_t str_nsize(const char *str, size_t max_size)
{
	size_t bytes = _str_nsize(str, max_size);

	return bytes;
}
737
/** Get size of wide string with size limit.
 *
 * Get the number of bytes which are used by the wide string @a str
 * (excluding the NULL-terminator), but no more than @a max_size bytes.
 *
 * @param str      Wide string to consider.
 * @param max_size Maximum number of bytes to measure.
 *
 * @return Number of bytes used by the wide string
 *
 */
size_t wstr_nsize(const char32_t *str, size_t max_size)
{
	size_t chars = wstr_nlength(str, max_size);

	return chars * sizeof(char32_t);
}
753
/** Get size of wide string with length limit.
 *
 * Get the number of bytes which are used by up to @a max_len first
 * wide characters in the wide string @a str. If @a max_len is greater than
 * the length of @a str, the entire wide string is measured (excluding the
 * NULL-terminator).
 *
 * @param str     Wide string to consider.
 * @param max_len Maximum number of wide characters to measure.
 *
 * @return Number of bytes used by the wide characters.
 *
 */
size_t wstr_lsize(const char32_t *str, size_t max_len)
{
	size_t len = 0;

	/*
	 * Count characters directly instead of converting the limit to bytes:
	 * max_len * sizeof(char32_t) can wrap around for large limits.
	 */
	while (len < max_len && str[len] != 0)
		len++;

	return len * sizeof(char32_t);
}
771
772/** Get number of characters in a string.
773 *
774 * @param str NULL-terminated string.
775 *
776 * @return Number of characters in string.
777 *
778 */
[d4a3ee5]779size_t str_length(const char *str)
[f2b8cdc]780{
[d4a3ee5]781 size_t len = 0;
[f2b8cdc]782 size_t offset = 0;
[a35b458]783
[f2b8cdc]784 while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
785 len++;
[a35b458]786
[f2b8cdc]787 return len;
788}
789
/** Get number of characters in a wide string.
 *
 * @param wstr NULL-terminated wide string.
 *
 * @return Number of characters in @a wstr.
 *
 */
size_t wstr_length(const char32_t *wstr)
{
	const char32_t *end = wstr;

	while (*end != 0)
		end++;

	return (size_t) (end - wstr);
}
806
/** Get number of characters in a string with size limit.
 *
 * @param str  NULL-terminated string.
 * @param size Maximum number of bytes to consider.
 *
 * @return Number of characters in string.
 *
 */
size_t str_nlength(const char *str, size_t size)
{
	size_t pos = 0;
	size_t count = 0;

	for (;;) {
		if (str_decode(str, &pos, size) == 0)
			return count;

		count++;
	}
}
825
/** Get number of characters in a wide string with size limit.
 *
 * @param str  NULL-terminated wide string.
 * @param size Maximum number of bytes to consider.
 *
 * @return Number of characters in the wide string.
 *
 */
size_t wstr_nlength(const char32_t *str, size_t size)
{
	/* Only whole characters fitting into @a size bytes are considered. */
	size_t max_chars = size / sizeof(char32_t);
	size_t len = 0;

	while (len < max_chars && str[len] != 0)
		len++;

	return len;
}
847
/** Get character display width on a character cell display.
 *
 * Every character is currently reported as one cell wide.
 *
 * @param ch Character
 * @return Width of character in cells.
 */
size_t chr_width(char32_t ch)
{
	(void) ch;

	return 1;
}
857
858/** Get string display width on a character cell display.
859 *
860 * @param str String
861 * @return Width of string in cells.
862 */
863size_t str_width(const char *str)
864{
865 size_t width = 0;
866 size_t offset = 0;
[28a5ebd]867 char32_t ch;
[a35b458]868
[be2a38ad]869 while ((ch = str_decode(str, &offset, STR_NO_LIMIT)) != 0)
870 width += chr_width(ch);
[a35b458]871
[be2a38ad]872 return width;
873}
874
/** Check whether character is plain ASCII.
 *
 * @param ch Character to check.
 *
 * @return True if character is plain ASCII (code points 0 .. 127).
 *
 */
bool ascii_check(char32_t ch)
{
	return ch < 0x80;
}
887
/** Check whether character is valid
 *
 * @param ch Character to check.
 *
 * @return True if character is a valid Unicode code point
 *         (0 .. 0x10FFFF).
 *
 */
bool chr_check(char32_t ch)
{
	return ch <= 0x10FFFF;
}
[936351c1]900
/** Compare two NULL terminated strings.
 *
 * Do a char-by-char comparison of two NULL-terminated strings.
 * The strings are considered equal iff their length is equal
 * and both strings consist of the same sequence of characters.
 *
 * A string S1 is less than another string S2 if it has a character with
 * lower value at the first character position where the strings differ.
 * If the strings differ in length, the shorter one is treated as if
 * padded by characters with a value of zero.
 *
 * @param s1 First string to compare.
 * @param s2 Second string to compare.
 *
 * @return 0 if the strings are equal, -1 if the first is less than the second,
 *         1 if the second is less than the first.
 *
 */
int str_cmp(const char *s1, const char *s2)
{
	/*
	 * UTF-8 has the nice property that lexicographic ordering on bytes is
	 * the same as the lexicographic ordering of the character sequences.
	 * That only holds for unsigned byte values, so the bytes must not be
	 * compared as plain (possibly signed) char.
	 */
	const unsigned char *a = (const unsigned char *) s1;
	const unsigned char *b = (const unsigned char *) s2;

	while (*a == *b && *a != 0) {
		a++;
		b++;
	}

	if (*a == *b)
		return 0;

	return (*a < *b) ? -1 : 1;
}
935
936/** Compare two NULL terminated strings with length limit.
937 *
938 * Do a char-by-char comparison of two NULL-terminated strings.
[4efeab5]939 * The strings are considered equal iff
940 * min(str_length(s1), max_len) == min(str_length(s2), max_len)
941 * and both strings consist of the same sequence of characters,
942 * up to max_len characters.
943 *
[1772e6d]944 * A string S1 is less than another string S2 if it has a character with
945 * lower value at the first character position where the strings differ.
946 * If the strings differ in length, the shorter one is treated as if
947 * padded by characters with a value of zero. Only the first max_len
948 * characters are considered.
[f2b8cdc]949 *
950 * @param s1 First string to compare.
951 * @param s2 Second string to compare.
952 * @param max_len Maximum number of characters to consider.
953 *
[1772e6d]954 * @return 0 if the strings are equal, -1 if the first is less than the second,
955 * 1 if the second is less than the first.
[f2b8cdc]956 *
957 */
[d4a3ee5]958int str_lcmp(const char *s1, const char *s2, size_t max_len)
[f2b8cdc]959{
[28a5ebd]960 char32_t c1 = 0;
961 char32_t c2 = 0;
[8227d63]962
[f2b8cdc]963 size_t off1 = 0;
964 size_t off2 = 0;
[8227d63]965
[d4a3ee5]966 size_t len = 0;
[f2b8cdc]967
968 while (true) {
969 if (len >= max_len)
970 break;
971
972 c1 = str_decode(s1, &off1, STR_NO_LIMIT);
973 c2 = str_decode(s2, &off2, STR_NO_LIMIT);
974
[8227d63]975 if (c1 < c2)
976 return -1;
977
978 if (c1 > c2)
979 return 1;
980
981 if (c1 == 0 || c2 == 0)
982 break;
983
984 ++len;
985 }
986
987 return 0;
988
989}
990
991/** Compare two NULL terminated strings in case-insensitive manner.
992 *
993 * Do a char-by-char comparison of two NULL-terminated strings.
994 * The strings are considered equal iff their length is equal
995 * and both strings consist of the same sequence of characters
996 * when converted to lower case.
997 *
998 * A string S1 is less than another string S2 if it has a character with
999 * lower value at the first character position where the strings differ.
1000 * If the strings differ in length, the shorter one is treated as if
1001 * padded by characters with a value of zero.
1002 *
1003 * @param s1 First string to compare.
1004 * @param s2 Second string to compare.
1005 *
1006 * @return 0 if the strings are equal, -1 if the first is less than the second,
1007 * 1 if the second is less than the first.
1008 *
1009 */
1010int str_casecmp(const char *s1, const char *s2)
1011{
[28c39f3]1012 // FIXME: doesn't work for non-ASCII caseful characters
1013
[28a5ebd]1014 char32_t c1 = 0;
1015 char32_t c2 = 0;
[8227d63]1016
1017 size_t off1 = 0;
1018 size_t off2 = 0;
1019
1020 while (true) {
1021 c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
1022 c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
1023
1024 if (c1 < c2)
1025 return -1;
1026
1027 if (c1 > c2)
1028 return 1;
1029
1030 if (c1 == 0 || c2 == 0)
1031 break;
1032 }
1033
1034 return 0;
1035}
1036
1037/** Compare two NULL terminated strings with length limit in case-insensitive
1038 * manner.
1039 *
1040 * Do a char-by-char comparison of two NULL-terminated strings.
1041 * The strings are considered equal iff
1042 * min(str_length(s1), max_len) == min(str_length(s2), max_len)
1043 * and both strings consist of the same sequence of characters,
1044 * up to max_len characters.
1045 *
1046 * A string S1 is less than another string S2 if it has a character with
1047 * lower value at the first character position where the strings differ.
1048 * If the strings differ in length, the shorter one is treated as if
1049 * padded by characters with a value of zero. Only the first max_len
1050 * characters are considered.
1051 *
1052 * @param s1 First string to compare.
1053 * @param s2 Second string to compare.
1054 * @param max_len Maximum number of characters to consider.
1055 *
1056 * @return 0 if the strings are equal, -1 if the first is less than the second,
1057 * 1 if the second is less than the first.
1058 *
1059 */
1060int str_lcasecmp(const char *s1, const char *s2, size_t max_len)
1061{
[28c39f3]1062 // FIXME: doesn't work for non-ASCII caseful characters
1063
[28a5ebd]1064 char32_t c1 = 0;
1065 char32_t c2 = 0;
[a35b458]1066
[8227d63]1067 size_t off1 = 0;
1068 size_t off2 = 0;
[a35b458]1069
[8227d63]1070 size_t len = 0;
1071
1072 while (true) {
1073 if (len >= max_len)
1074 break;
1075
1076 c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
1077 c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
1078
[f2b8cdc]1079 if (c1 < c2)
1080 return -1;
1081
1082 if (c1 > c2)
1083 return 1;
1084
1085 if (c1 == 0 || c2 == 0)
1086 break;
1087
[1b20da0]1088 ++len;
[f2b8cdc]1089 }
1090
1091 return 0;
1092
1093}
1094
/** Byte-wise test whether @a p is a prefix of @a s.
 *
 * @param s String to look in.
 * @param p Candidate prefix.
 *
 * @return true iff every byte of @a p matches the byte of @a s at the same
 *         position (an empty @a p is a prefix of anything).
 */
static bool _test_prefix(const char *s, const char *p)
{
	for (; *p != 0; s++, p++) {
		/* Also covers s ending early: its terminator differs from *p. */
		if (*s != *p)
			return false;
	}

	return true;
}
1104
/** Test whether one string is a prefix of another.
 *
 * Public entry point; the comparison itself is the byte-wise check done
 * by _test_prefix().
 *
 * @param s The string in which to look.
 * @param p The string that may be a prefix of @a s.
 *
 * @return true if @a p is a prefix of @a s, false otherwise.
 *
 */
bool str_test_prefix(const char *s, const char *p)
{
	bool is_prefix = _test_prefix(s, p);
	return is_prefix;
}
1120
[086cab0]1121/** Get a string suffix.
1122 *
1123 * Return a string suffix defined by the prefix length.
1124 *
1125 * @param s The string to get the suffix from.
1126 * @param prefix_length Number of prefix characters to ignore.
1127 *
1128 * @return String suffix.
1129 *
1130 */
1131const char *str_suffix(const char *s, size_t prefix_length)
1132{
1133 size_t off = 0;
1134 size_t i = 0;
1135
1136 while (true) {
1137 str_decode(s, &off, STR_NO_LIMIT);
1138 i++;
1139
1140 if (i >= prefix_length)
1141 break;
1142 }
1143
1144 return s + off;
1145}
1146
/** Copy a null-terminated string byte by byte, terminator included.
 *
 * Bytes are copied front to back, one at a time. No size limit is applied
 * to @a dest.
 */
static void _str_cpy(char *dest, const char *src)
{
	char byte;

	do {
		byte = *src++;
		*dest++ = byte;
	} while (byte != 0);
}
1155
1156/** Copy string as a sequence of bytes. */
1157static void _str_cpyn(char *dest, size_t size, const char *src)
1158{
[0600976]1159 assert(dest && src && size);
1160
1161 if (!dest || !src || !size)
1162 return;
1163
1164 if (size == STR_NO_LIMIT)
1165 return _str_cpy(dest, src);
1166
[28c39f3]1167 char *dest_top = dest + size - 1;
[0600976]1168 assert(size == 1 || dest < dest_top);
[28c39f3]1169
1170 while (*src && dest < dest_top)
1171 *(dest++) = *(src++);
1172
1173 *dest = 0;
1174}
1175
[6eb2e96]1176/** Copy string.
[f2b8cdc]1177 *
[6eb2e96]1178 * Copy source string @a src to destination buffer @a dest.
1179 * No more than @a size bytes are written. If the size of the output buffer
1180 * is at least one byte, the output string will always be well-formed, i.e.
1181 * null-terminated and containing only complete characters.
[f2b8cdc]1182 *
[abf09311]1183 * @param dest Destination buffer.
[6700ee2]1184 * @param count Size of the destination buffer (must be > 0).
[6eb2e96]1185 * @param src Source string.
[8e893ae]1186 *
[f2b8cdc]1187 */
[6eb2e96]1188void str_cpy(char *dest, size_t size, const char *src)
[f2b8cdc]1189{
[6700ee2]1190 /* There must be space for a null terminator in the buffer. */
1191 assert(size > 0);
[d066259]1192 assert(src != NULL);
[28c39f3]1193 assert(dest != NULL);
[0600976]1194 assert(size == STR_NO_LIMIT || dest + size > dest);
[a35b458]1195
[28c39f3]1196 /* Copy data. */
1197 _str_cpyn(dest, size, src);
[a35b458]1198
[28c39f3]1199 /* In-place translate invalid bytes to U_SPECIAL. */
[b31323f]1200 _str_sanitize(dest, size, U_SPECIAL);
[6eb2e96]1201}
1202
1203/** Copy size-limited substring.
1204 *
[6700ee2]1205 * Copy prefix of string @a src of max. size @a size to destination buffer
1206 * @a dest. No more than @a size bytes are written. The output string will
1207 * always be well-formed, i.e. null-terminated and containing only complete
1208 * characters.
[6eb2e96]1209 *
1210 * No more than @a n bytes are read from the input string, so it does not
1211 * have to be null-terminated.
1212 *
[abf09311]1213 * @param dest Destination buffer.
[6700ee2]1214 * @param count Size of the destination buffer (must be > 0).
[6eb2e96]1215 * @param src Source string.
[abf09311]1216 * @param n Maximum number of bytes to read from @a src.
[8e893ae]1217 *
[6eb2e96]1218 */
1219void str_ncpy(char *dest, size_t size, const char *src, size_t n)
1220{
[6700ee2]1221 /* There must be space for a null terminator in the buffer. */
1222 assert(size > 0);
[28c39f3]1223 assert(src != NULL);
[a35b458]1224
[28c39f3]1225 /* Copy data. */
1226 _str_cpyn(dest, min(size, n + 1), src);
[a35b458]1227
[28c39f3]1228 /* In-place translate invalid bytes to U_SPECIAL. */
[b31323f]1229 _str_sanitize(dest, size, U_SPECIAL);
[f2b8cdc]1230}
1231
[4482bc7]1232/** Append one string to another.
1233 *
1234 * Append source string @a src to string in destination buffer @a dest.
1235 * Size of the destination buffer is @a dest. If the size of the output buffer
1236 * is at least one byte, the output string will always be well-formed, i.e.
1237 * null-terminated and containing only complete characters.
1238 *
[0f06dbc]1239 * @param dest Destination buffer.
[4482bc7]1240 * @param count Size of the destination buffer.
1241 * @param src Source string.
1242 */
1243void str_append(char *dest, size_t size, const char *src)
1244{
[28c39f3]1245 assert(src != NULL);
1246 assert(dest != NULL);
1247 assert(size > 0);
[0600976]1248 assert(size == STR_NO_LIMIT || dest + size > dest);
[a35b458]1249
[28c39f3]1250 size_t dstr_size = _str_nsize(dest, size);
[0600976]1251 if (dstr_size < size) {
1252 _str_cpyn(dest + dstr_size, size - dstr_size, src);
[b31323f]1253 _str_sanitize(dest + dstr_size, size - dstr_size, U_SPECIAL);
[0600976]1254 }
[4482bc7]1255}
1256
[dcb74c0a]1257/** Convert space-padded ASCII to string.
1258 *
1259 * Common legacy text encoding in hardware is 7-bit ASCII fitted into
[c3d19ac]1260 * a fixed-width byte buffer (bit 7 always zero), right-padded with spaces
[dcb74c0a]1261 * (ASCII 0x20). Convert space-padded ascii to string representation.
1262 *
1263 * If the text does not fit into the destination buffer, the function converts
1264 * as many characters as possible and returns EOVERFLOW.
1265 *
1266 * If the text contains non-ASCII bytes (with bit 7 set), the whole string is
1267 * converted anyway and invalid characters are replaced with question marks
1268 * (U_SPECIAL) and the function returns EIO.
1269 *
1270 * Regardless of return value upon return @a dest will always be well-formed.
1271 *
1272 * @param dest Destination buffer
1273 * @param size Size of destination buffer
1274 * @param src Space-padded ASCII.
1275 * @param n Size of the source buffer in bytes.
1276 *
1277 * @return EOK on success, EOVERFLOW if the text does not fit
1278 * destination buffer, EIO if the text contains
1279 * non-ASCII bytes.
1280 */
[b7fd2a0]1281errno_t spascii_to_str(char *dest, size_t size, const uint8_t *src, size_t n)
[dcb74c0a]1282{
[28c39f3]1283 size_t len = 0;
[dcb74c0a]1284
[28c39f3]1285 /* Determine the length of the source string. */
1286 for (size_t i = 0; i < n; i++) {
1287 if (src[i] == 0)
1288 break;
1289
1290 if (src[i] != ' ')
1291 len = i + 1;
1292 }
1293
1294 errno_t result = EOK;
1295 size_t out_len = min(len, size - 1);
1296
1297 /* Copy characters */
1298 for (size_t i = 0; i < out_len; i++) {
1299 dest[i] = src[i];
1300
1301 if (dest[i] < 0) {
1302 dest[i] = U_SPECIAL;
[dcb74c0a]1303 result = EIO;
1304 }
[28c39f3]1305 }
[dcb74c0a]1306
[28c39f3]1307 dest[out_len] = 0;
[dcb74c0a]1308
[28c39f3]1309 if (out_len < len)
1310 return EOVERFLOW;
[dcb74c0a]1311
1312 return result;
1313}
1314
[0f06dbc]1315/** Convert wide string to string.
[f2b8cdc]1316 *
[0f06dbc]1317 * Convert wide string @a src to string. The output is written to the buffer
1318 * specified by @a dest and @a size. @a size must be non-zero and the string
1319 * written will always be well-formed.
[f2b8cdc]1320 *
[0f06dbc]1321 * @param dest Destination buffer.
1322 * @param size Size of the destination buffer.
1323 * @param src Source wide string.
[f2b8cdc]1324 */
[28a5ebd]1325void wstr_to_str(char *dest, size_t size, const char32_t *src)
[f2b8cdc]1326{
[28a5ebd]1327 char32_t ch;
[0f06dbc]1328 size_t src_idx;
1329 size_t dest_off;
1330
1331 /* There must be space for a null terminator in the buffer. */
1332 assert(size > 0);
[a35b458]1333
[0f06dbc]1334 src_idx = 0;
1335 dest_off = 0;
1336
[f2b8cdc]1337 while ((ch = src[src_idx++]) != 0) {
[81e9cb3]1338 if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
[f2b8cdc]1339 break;
1340 }
[0f06dbc]1341
1342 dest[dest_off] = '\0';
[f2b8cdc]1343}
1344
[82374b2]1345/** Convert UTF16 string to string.
1346 *
1347 * Convert utf16 string @a src to string. The output is written to the buffer
1348 * specified by @a dest and @a size. @a size must be non-zero and the string
1349 * written will always be well-formed. Surrogate pairs also supported.
1350 *
1351 * @param dest Destination buffer.
1352 * @param size Size of the destination buffer.
1353 * @param src Source utf16 string.
1354 *
[cde999a]1355 * @return EOK, if success, an error code otherwise.
[82374b2]1356 */
[b7fd2a0]1357errno_t utf16_to_str(char *dest, size_t size, const uint16_t *src)
[82374b2]1358{
[abb7491c]1359 size_t idx = 0, dest_off = 0;
[28a5ebd]1360 char32_t ch;
[b7fd2a0]1361 errno_t rc = EOK;
[82374b2]1362
1363 /* There must be space for a null terminator in the buffer. */
1364 assert(size > 0);
1365
1366 while (src[idx]) {
1367 if ((src[idx] & 0xfc00) == 0xd800) {
[abb7491c]1368 if (src[idx + 1] && (src[idx + 1] & 0xfc00) == 0xdc00) {
[82374b2]1369 ch = 0x10000;
1370 ch += (src[idx] & 0x03FF) << 10;
[abb7491c]1371 ch += (src[idx + 1] & 0x03FF);
[82374b2]1372 idx += 2;
[1433ecda]1373 } else
[82374b2]1374 break;
1375 } else {
1376 ch = src[idx];
1377 idx++;
1378 }
[abb7491c]1379 rc = chr_encode(ch, dest, &dest_off, size - 1);
[82374b2]1380 if (rc != EOK)
1381 break;
1382 }
1383 dest[dest_off] = '\0';
1384 return rc;
1385}
1386
[b06414f]1387/** Convert string to UTF16 string.
1388 *
1389 * Convert string @a src to utf16 string. The output is written to the buffer
1390 * specified by @a dest and @a dlen. @a dlen must be non-zero and the string
1391 * written will always be well-formed. Surrogate pairs also supported.
1392 *
1393 * @param dest Destination buffer.
1394 * @param dlen Number of utf16 characters that fit in the destination buffer.
1395 * @param src Source string.
1396 *
[cde999a]1397 * @return EOK, if success, an error code otherwise.
[b06414f]1398 */
[b7fd2a0]1399errno_t str_to_utf16(uint16_t *dest, size_t dlen, const char *src)
[fc97128]1400{
[b7fd2a0]1401 errno_t rc = EOK;
[abb7491c]1402 size_t offset = 0;
1403 size_t idx = 0;
[28a5ebd]1404 char32_t c;
[fc97128]1405
[b06414f]1406 assert(dlen > 0);
[a35b458]1407
[fc97128]1408 while ((c = str_decode(src, &offset, STR_NO_LIMIT)) != 0) {
1409 if (c > 0x10000) {
[b06414f]1410 if (idx + 2 >= dlen - 1) {
[abb7491c]1411 rc = EOVERFLOW;
[fc97128]1412 break;
1413 }
1414 c = (c - 0x10000);
1415 dest[idx] = 0xD800 | (c >> 10);
[abb7491c]1416 dest[idx + 1] = 0xDC00 | (c & 0x3FF);
[fc97128]1417 idx++;
1418 } else {
[1433ecda]1419 dest[idx] = c;
[fc97128]1420 }
1421
1422 idx++;
[b06414f]1423 if (idx >= dlen - 1) {
[abb7491c]1424 rc = EOVERFLOW;
[fc97128]1425 break;
1426 }
1427 }
1428
1429 dest[idx] = '\0';
1430 return rc;
[f2b8cdc]1431}
1432
/** Get size of UTF-16 string.
 *
 * Count the 16-bit words of @a ustr that precede its zero terminator.
 * A surrogate pair therefore counts as two words.
 *
 * @param ustr UTF-16 string to consider.
 *
 * @return Number of words used by the UTF-16 string (terminator excluded).
 *
 */
size_t utf16_wsize(const uint16_t *ustr)
{
	const uint16_t *end = ustr;

	while (*end != 0)
		end++;

	return (size_t) (end - ustr);
}
1452
[b67c7d64]1453/** Convert wide string to new string.
1454 *
1455 * Convert wide string @a src to string. Space for the new string is allocated
1456 * on the heap.
1457 *
1458 * @param src Source wide string.
1459 * @return New string.
1460 */
[28a5ebd]1461char *wstr_to_astr(const char32_t *src)
[b67c7d64]1462{
1463 char dbuf[STR_BOUNDS(1)];
1464 char *str;
[28a5ebd]1465 char32_t ch;
[b67c7d64]1466
1467 size_t src_idx;
1468 size_t dest_off;
1469 size_t dest_size;
1470
1471 /* Compute size of encoded string. */
1472
1473 src_idx = 0;
1474 dest_size = 0;
1475
1476 while ((ch = src[src_idx++]) != 0) {
1477 dest_off = 0;
1478 if (chr_encode(ch, dbuf, &dest_off, STR_BOUNDS(1)) != EOK)
1479 break;
1480 dest_size += dest_off;
1481 }
1482
1483 str = malloc(dest_size + 1);
1484 if (str == NULL)
1485 return NULL;
1486
1487 /* Encode string. */
1488
1489 src_idx = 0;
1490 dest_off = 0;
1491
1492 while ((ch = src[src_idx++]) != 0) {
1493 if (chr_encode(ch, str, &dest_off, dest_size) != EOK)
1494 break;
1495 }
1496
1497 str[dest_size] = '\0';
1498 return str;
1499}
1500
[da2bd08]1501/** Convert string to wide string.
1502 *
1503 * Convert string @a src to wide string. The output is written to the
[0f06dbc]1504 * buffer specified by @a dest and @a dlen. @a dlen must be non-zero
1505 * and the wide string written will always be null-terminated.
[da2bd08]1506 *
1507 * @param dest Destination buffer.
1508 * @param dlen Length of destination buffer (number of wchars).
1509 * @param src Source string.
1510 */
[28a5ebd]1511void str_to_wstr(char32_t *dest, size_t dlen, const char *src)
[da2bd08]1512{
1513 size_t offset;
1514 size_t di;
[28a5ebd]1515 char32_t c;
[da2bd08]1516
1517 assert(dlen > 0);
1518
1519 offset = 0;
1520 di = 0;
1521
1522 do {
[81e9cb3]1523 if (di >= dlen - 1)
[da2bd08]1524 break;
1525
1526 c = str_decode(src, &offset, STR_NO_LIMIT);
1527 dest[di++] = c;
1528 } while (c != '\0');
1529
1530 dest[dlen - 1] = '\0';
1531}
1532
/** Convert string to newly allocated wide string.
 *
 * Allocate (with calloc()) a wide-character buffer of str_length(str) + 1
 * elements and convert @a str into it using str_to_wstr().
 *
 * @param str Source string.
 *
 * @return New null-terminated wide string, or NULL if the allocation fails.
 */
char32_t *str_to_awstr(const char *str)
{
	/* One slot per character plus one for the terminator. */
	size_t slots = str_length(str) + 1;

	char32_t *wide = calloc(slots, sizeof(char32_t));
	if (wide != NULL)
		str_to_wstr(wide, slots, str);

	return wide;
}
1551
/** Find the first byte equal to @a c in a null-terminated string.
 *
 * The terminator itself is part of the search, so looking for 0 yields
 * a pointer to the end of the string.
 *
 * @return Pointer to the first matching byte, or NULL if there is none.
 */
static char *_strchr(const char *str, char c)
{
	for (;; str++) {
		if (*str == c)
			return (char *) str;

		if (*str == 0)
			return NULL;
	}
}
1559
[f2b8cdc]1560/** Find first occurence of character in string.
1561 *
1562 * @param str String to search.
1563 * @param ch Character to look for.
1564 *
1565 * @return Pointer to character in @a str or NULL if not found.
1566 */
[28a5ebd]1567char *str_chr(const char *str, char32_t ch)
[f2b8cdc]1568{
[28c39f3]1569 /* Fast path for an ASCII character. */
1570 if (ascii_check(ch))
1571 return _strchr(str, ch);
[a35b458]1572
[28c39f3]1573 /* Convert character to UTF-8. */
1574 char utf8[STR_BOUNDS(1) + 1];
1575 size_t offset = 0;
1576
1577 if (chr_encode(ch, utf8, &offset, sizeof(utf8)) != EOK || offset == 0)
1578 return NULL;
1579
1580 utf8[offset] = '\0';
1581
1582 /* Find the first byte, then check if all of them are correct. */
1583 while (*str != 0) {
1584 str = _strchr(str, utf8[0]);
1585 if (!str)
1586 return NULL;
1587
1588 if (_test_prefix(str, utf8))
1589 return (char *) str;
1590
1591 str++;
[f2b8cdc]1592 }
[a35b458]1593
[f2b8cdc]1594 return NULL;
1595}
1596
/** Find first occurrence of substring in string.
 *
 * Every position of the haystack at which the needle could still fit is
 * tried in turn with a byte-wise prefix comparison.
 *
 * @param hs Haystack (string)
 * @param n  Needle (substring to look for)
 *
 * @return Pointer to the first occurrence in @a hs or @c NULL if not found.
 */
char *str_str(const char *hs, const char *n)
{
	size_t hay_size = _str_size(hs);
	size_t needle_size = _str_size(n);

	/* A needle longer than the haystack cannot occur in it. */
	if (needle_size > hay_size)
		return NULL;

	/* Last start position at which the needle still fits. */
	const char *last = hs + (hay_size - needle_size);

	for (const char *p = hs; p <= last; p++) {
		if (_test_prefix(p, n))
			return (char *) p;
	}

	return NULL;
}
1619
/** Remove all trailing bytes equal to @a c from a string, in place.
 *
 * The string is cut right behind its last byte that differs from @a c.
 * A string made up only of @a c bytes, or an empty string, ends up empty;
 * nothing is ever written past the original terminator.
 *
 * @param str String to trim.
 * @param c   Byte to remove from the end.
 */
static void _str_rtrim(char *str, char c)
{
	/* One past the last byte that must be kept. */
	char *end = str;

	for (char *p = str; *p != 0; p++) {
		if (*p != c)
			end = p + 1;
	}

	/* Truncate string. */
	*end = 0;
}
1634
[1737bfb]1635/** Removes specified trailing characters from a string.
1636 *
1637 * @param str String to remove from.
1638 * @param ch Character to remove.
1639 */
[28a5ebd]1640void str_rtrim(char *str, char32_t ch)
[1737bfb]1641{
[28c39f3]1642 /* Fast path for the ASCII case. */
1643 if (ascii_check(ch)) {
1644 _str_rtrim(str, ch);
1645 return;
1646 }
1647
[1737bfb]1648 size_t off = 0;
1649 size_t pos = 0;
[28a5ebd]1650 char32_t c;
[1737bfb]1651 bool update_last_chunk = true;
1652 char *last_chunk = NULL;
1653
1654 while ((c = str_decode(str, &off, STR_NO_LIMIT))) {
1655 if (c != ch) {
1656 update_last_chunk = true;
1657 last_chunk = NULL;
1658 } else if (update_last_chunk) {
1659 update_last_chunk = false;
1660 last_chunk = (str + pos);
1661 }
1662 pos = off;
1663 }
1664
1665 if (last_chunk)
1666 *last_chunk = '\0';
1667}
1668
/** Remove all leading bytes equal to @a c from a string, in place.
 *
 * The remainder of the string is moved to the front of the buffer.
 * The scan never goes past the terminator, even if @a c is 0.
 *
 * @param str String to trim.
 * @param c   Byte to remove from the beginning.
 */
static void _str_ltrim(char *str, char c)
{
	const char *keep = str;

	/* Explicit end check so that c == 0 cannot run off the string. */
	while (*keep != 0 && *keep == c)
		keep++;

	/* The source lies behind the destination, so a forward copy is safe. */
	if (keep != str)
		_str_cpy(str, keep);
}
1679
[1737bfb]1680/** Removes specified leading characters from a string.
1681 *
1682 * @param str String to remove from.
1683 * @param ch Character to remove.
1684 */
[28a5ebd]1685void str_ltrim(char *str, char32_t ch)
[1737bfb]1686{
[28c39f3]1687 /* Fast path for the ASCII case. */
1688 if (ascii_check(ch)) {
1689 _str_ltrim(str, ch);
1690 return;
1691 }
1692
[28a5ebd]1693 char32_t acc;
[1737bfb]1694 size_t off = 0;
1695 size_t pos = 0;
1696 size_t str_sz = str_size(str);
1697
1698 while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
1699 if (acc != ch)
1700 break;
1701 else
1702 pos = off;
1703 }
1704
1705 if (pos > 0) {
1706 memmove(str, &str[pos], str_sz - pos);
1707 pos = str_sz - pos;
[a18a8b9]1708 str[pos] = '\0';
[1737bfb]1709 }
1710}
1711
/** Find the last byte equal to @a c in a null-terminated string.
 *
 * Only the bytes before the terminator are examined.
 *
 * @return Pointer to the last matching byte, or NULL if there is none.
 */
static char *_str_rchr(const char *str, char c)
{
	const char *found = NULL;

	for (const char *p = str; *p != 0; p++) {
		if (*p == c)
			found = p;
	}

	return (char *) found;
}
1725
[7afb4a5]1726/** Find last occurence of character in string.
1727 *
1728 * @param str String to search.
1729 * @param ch Character to look for.
1730 *
1731 * @return Pointer to character in @a str or NULL if not found.
1732 */
[28a5ebd]1733char *str_rchr(const char *str, char32_t ch)
[7afb4a5]1734{
[28c39f3]1735 if (ascii_check(ch))
1736 return _str_rchr(str, ch);
1737
[28a5ebd]1738 char32_t acc;
[7afb4a5]1739 size_t off = 0;
[f2d2c7ba]1740 size_t last = 0;
[d4a3ee5]1741 const char *res = NULL;
[a35b458]1742
[7afb4a5]1743 while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
1744 if (acc == ch)
[f2d2c7ba]1745 res = (str + last);
1746 last = off;
[7afb4a5]1747 }
[a35b458]1748
[dd2cfa7]1749 return (char *) res;
[7afb4a5]1750}
1751
/** Insert a wide character into a wide string.
 *
 * Insert a wide character into a wide string at position
 * @a pos. The characters after the position (including the terminator)
 * are shifted by one.
 *
 * The insertion is refused if it would make the string longer than
 * @a max_pos characters.
 * NOTE(review): this assumes that @a max_pos is the maximum string length
 * not counting the terminator, as the original position check suggests -
 * confirm against callers.
 *
 * @param str     String to insert to.
 * @param ch      Character to insert to.
 * @param pos     Character index where to insert.
 * @param max_pos Characters in the buffer.
 *
 * @return True if the insertion was successful, false if the position
 *         is out of bounds or the string is already @a max_pos
 *         characters long.
 *
 */
bool wstr_linsert(char32_t *str, char32_t ch, size_t pos, size_t max_pos)
{
	size_t len = wstr_length(str);

	/* The position must lie within the string or right at its end. */
	if (pos > len)
		return false;

	/*
	 * The string grows to len + 1 characters. Since pos <= len, this
	 * check also rejects every pos that does not fit below max_pos.
	 */
	if (len >= max_pos)
		return false;

	/* Shift the tail, terminator included, going from the back. */
	for (size_t i = len + 1; i > pos; i--)
		str[i] = str[i - 1];

	str[pos] = ch;

	return true;
}
1781
/** Remove a wide character from a wide string.
 *
 * Remove the wide character at position @a pos. The characters behind it
 * (including the terminator) are moved one position towards the beginning.
 *
 * @param str String to remove from.
 * @param pos Character index to remove.
 *
 * @return True if the removal was successful, false if the position
 *         is out of bounds.
 *
 */
bool wstr_remove(char32_t *str, size_t pos)
{
	size_t len = wstr_length(str);

	if (pos >= len)
		return false;

	/* The last step copies the terminator from str[len]. */
	for (size_t i = pos; i < len; i++)
		str[i] = str[i + 1];

	return true;
}
1807
[abf09311]1808/** Duplicate string.
1809 *
1810 * Allocate a new string and copy characters from the source
1811 * string into it. The duplicate string is allocated via sleeping
1812 * malloc(), thus this function can sleep in no memory conditions.
1813 *
1814 * The allocation cannot fail and the return value is always
1815 * a valid pointer. The duplicate string is always a well-formed
1816 * null-terminated UTF-8 string, but it can differ from the source
1817 * string on the byte level.
1818 *
1819 * @param src Source string.
1820 *
1821 * @return Duplicate string.
1822 *
1823 */
[fc6dd18]1824char *str_dup(const char *src)
1825{
[28c39f3]1826 size_t size = _str_size(src) + 1;
[d066259]1827 char *dest = malloc(size);
1828 if (!dest)
1829 return NULL;
[a35b458]1830
[0600976]1831 memcpy(dest, src, size);
[b31323f]1832 _str_sanitize(dest, size, U_SPECIAL);
[abf09311]1833 return dest;
[fc6dd18]1834}
1835
[abf09311]1836/** Duplicate string with size limit.
1837 *
1838 * Allocate a new string and copy up to @max_size bytes from the source
1839 * string into it. The duplicate string is allocated via sleeping
1840 * malloc(), thus this function can sleep in no memory conditions.
1841 * No more than @max_size + 1 bytes is allocated, but if the size
1842 * occupied by the source string is smaller than @max_size + 1,
1843 * less is allocated.
1844 *
1845 * The allocation cannot fail and the return value is always
1846 * a valid pointer. The duplicate string is always a well-formed
1847 * null-terminated UTF-8 string, but it can differ from the source
1848 * string on the byte level.
1849 *
1850 * @param src Source string.
1851 * @param n Maximum number of bytes to duplicate.
1852 *
1853 * @return Duplicate string.
1854 *
1855 */
1856char *str_ndup(const char *src, size_t n)
[fc6dd18]1857{
[0600976]1858 size_t size = _str_nsize(src, n);
[a35b458]1859
[0600976]1860 char *dest = malloc(size + 1);
[d066259]1861 if (!dest)
1862 return NULL;
[a35b458]1863
[0600976]1864 memcpy(dest, src, size);
[b31323f]1865 _str_sanitize(dest, size, U_SPECIAL);
[0600976]1866 dest[size] = 0;
[fc6dd18]1867 return dest;
1868}
1869
/** Split string by delimiters.
 *
 * Leading delimiter characters are skipped, then the token extends up to
 * the next delimiter character or the end of the string. The delimiter
 * that ends the token is overwritten with a null terminator, i.e. @a s is
 * modified in place.
 *
 * @param s     String to be tokenized. If NULL, NULL is returned.
 * @param delim String with the delimiters.
 * @param next  Variable which will receive the pointer to the
 *              continuation of the string following the first
 *              occurrence of any of the delimiter characters
 *              (or to the end of the string if the token was the
 *              last one). May be NULL.
 *
 * @return Pointer to the token at the beginning of @a s (after skipping
 *         leading delimiters). NULL if there is no token.
 */
char *str_tok(char *s, const char *delim, char **next)
{
	if (s == NULL)
		return NULL;

	size_t limit = str_size(s);
	size_t probe = 0;	/* Offset just past the character decoded last. */
	size_t mark = 0;	/* Offset of the last accepted boundary. */
	char32_t ch;

	/* Skip over leading delimiters. */
	for (;;) {
		ch = str_decode(s, &probe, limit);
		if (ch == 0 || str_chr(delim, ch) == NULL)
			break;

		mark = probe;
	}

	char *token = s + mark;

	/* Skip over token characters. */
	probe = mark;
	for (;;) {
		ch = str_decode(s, &probe, limit);
		if (ch == 0 || str_chr(delim, ch) != NULL)
			break;

		mark = probe;
	}

	char *token_end = s + mark;

	if (next != NULL) {
		/*
		 * If a delimiter ended the token, continue right behind it;
		 * otherwise the string is exhausted and we point at its end.
		 */
		if (ch != 0)
			*next = s + probe;
		else
			*next = s + mark;
	}

	/* No more tokens. */
	if (token == token_end)
		return NULL;

	/* Overwrite delimiter with NULL terminator. */
	*token_end = '\0';
	return token;
}
1916
/** Scale a value to a decimal (SI) order of magnitude.
 *
 * A unit is selected once the value exceeds ten times that unit, so the
 * scaled result stays above 10; smaller values are passed through with
 * a blank suffix.
 *
 * @param val    Value to scale.
 * @param rv     Receives the scaled value (integer division).
 * @param suffix Receives the SI prefix character matching the divisor:
 *               'E' (10^18), 'P' (10^15), 'T' (10^12), 'G' (10^9),
 *               'M' (10^6), 'k' (10^3) or ' ' if the value is unscaled.
 */
void order_suffix(const uint64_t val, uint64_t *rv, char *suffix)
{
	if (val > UINT64_C(10000000000000000000)) {
		/* Divided by 10^18, i.e. expressed in exa-units. */
		*rv = val / UINT64_C(1000000000000000000);
		*suffix = 'E';
	} else if (val > UINT64_C(1000000000000000000)) {
		/* Divided by 10^15, i.e. expressed in peta-units. */
		*rv = val / UINT64_C(1000000000000000);
		*suffix = 'P';
	} else if (val > UINT64_C(1000000000000000)) {
		*rv = val / UINT64_C(1000000000000);
		*suffix = 'T';
	} else if (val > UINT64_C(1000000000000)) {
		*rv = val / UINT64_C(1000000000);
		*suffix = 'G';
	} else if (val > UINT64_C(1000000000)) {
		*rv = val / UINT64_C(1000000);
		*suffix = 'M';
	} else if (val > UINT64_C(1000000)) {
		*rv = val / UINT64_C(1000);
		*suffix = 'k';
	} else {
		*rv = val;
		*suffix = ' ';
	}
}
1942
/** Scale a byte count to a binary (IEC) order of magnitude.
 *
 * A unit is selected once the value exceeds 1024 times that unit; smaller
 * values are passed through as plain bytes.
 *
 * @param val    Value to scale.
 * @param rv     Receives the scaled value (integer division).
 * @param suffix Receives a pointer to a constant string naming the unit
 *               matching the divisor: "PiB" (2^50), "TiB" (2^40),
 *               "GiB" (2^30), "MiB" (2^20), "KiB" (2^10) or bytes.
 * @param fixed  If true, the byte suffix is padded with spaces ("B  ") to
 *               the width of the other suffixes; otherwise it is "B".
 */
void bin_order_suffix(const uint64_t val, uint64_t *rv, const char **suffix,
    bool fixed)
{
	if (val > UINT64_C(1152921504606846976)) {
		/* Divided by 2^50, i.e. expressed in pebibytes. */
		*rv = val / UINT64_C(1125899906842624);
		*suffix = "PiB";
	} else if (val > UINT64_C(1125899906842624)) {
		*rv = val / UINT64_C(1099511627776);
		*suffix = "TiB";
	} else if (val > UINT64_C(1099511627776)) {
		*rv = val / UINT64_C(1073741824);
		*suffix = "GiB";
	} else if (val > UINT64_C(1073741824)) {
		*rv = val / UINT64_C(1048576);
		*suffix = "MiB";
	} else if (val > UINT64_C(1048576)) {
		*rv = val / UINT64_C(1024);
		*suffix = "KiB";
	} else {
		*rv = val;
		if (fixed)
			*suffix = "B  ";
		else
			*suffix = "B";
	}
}
1969
[a46da63]1970/** @}
[b2951e2]1971 */
Note: See TracBrowser for help on using the repository browser.