source: mainline/common/str.c@ 45adeeb

Last change on this file since 45adeeb was 45adeeb, checked in by Jiří Zárevúcky <zarevucky.jiri@…>, 2 months ago

Expose restartable version of str_decode() as str_decode_r()

  • Property mode set to 100644
File size: 45.8 KB
Line 
1/*
2 * Copyright (c) 2001-2004 Jakub Jermar
3 * Copyright (c) 2005 Martin Decky
4 * Copyright (c) 2008 Jiri Svoboda
5 * Copyright (c) 2011 Martin Sucha
6 * Copyright (c) 2011 Oleg Romanenko
7 * Copyright (c) 2025 Jiří Zárevúcky
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * - Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * - Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * - The name of the author may not be used to endorse or promote products
20 * derived from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
23 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
24 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
25 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
27 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
31 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34/** @addtogroup libc
35 * @{
36 */
37
38/**
39 * @file
40 * @brief String functions.
41 *
42 * Strings and characters use the Universal Character Set (UCS). The standard
43 * strings, called just strings are encoded in UTF-8. Wide strings (encoded
44 * in UTF-32) are supported to a limited degree. A single character is
45 * represented as char32_t.@n
46 *
47 * Overview of the terminology:@n
48 *
49 * Term Meaning
50 * -------------------- ----------------------------------------------------
51 * byte 8 bits stored in uint8_t (unsigned 8 bit integer)
52 *
53 * character UTF-32 encoded Unicode character, stored in char32_t
54 * (unsigned 32 bit integer), code points 0 .. 1114111
55 * are valid
56 *
57 * Note that Unicode characters do not match
58 * one-to-one with displayed characters or glyphs on
59 * screen. For that level of precision, look up
60 * Grapheme Clusters.
61 *
62 * ASCII character 7 bit encoded ASCII character, stored in char
63 * (usually signed 8 bit integer), code points 0 .. 127
64 * are valid
65 *
66 * string UTF-8 encoded NULL-terminated Unicode string, char *
67 *
68 * wide string UTF-32 encoded NULL-terminated Unicode string,
69 * char32_t *
70 *
71 * [wide] string size number of BYTES in a [wide] string (excluding
72 * the NULL-terminator), size_t
73 *
74 * [wide] string length number of CHARACTERS in a [wide] string (excluding
75 * the NULL-terminator), size_t
76 *
77 * [wide] string width number of display cells on a monospace display taken
78 * by a [wide] string, size_t
79 *
80 * This is virtually impossible to determine exactly for
81 * all strings without knowing specifics of the display
82 * device, due to various factors affecting text output.
83 * If you have the option to query the terminal for
84 * position change caused by outputting the string,
85 * it is preferable to determine width that way.
86 *
87 *
88 * Overview of string metrics:@n
89 *
90 * Metric Abbrev. Type Meaning
91 * ------ ------ ------ -------------------------------------------------
92 * size n size_t number of BYTES in a string (excluding the
93 * NULL-terminator)
94 *
95 * length l size_t number of CHARACTERS in a string (excluding the
96 * null terminator)
97 *
98 * width w size_t number of display cells on a monospace display
99 * taken by a string
100 *
101 *
102 * Function naming prefixes:@n
103 *
104 * chr_ operate on characters
105 * ascii_ operate on ASCII characters
106 * str_ operate on strings
107 * wstr_ operate on wide strings
108 *
109 * [w]str_[n|l|w] operate on a prefix limited by size, length
110 * or width
111 *
112 *
113 * A specific character inside a [wide] string can be referred to by:@n
114 *
115 * pointer (char *, char32_t *)
116 * byte offset (size_t)
117 * character index (size_t)
118 *
119 */
120
121#include <str.h>
122
123#include <align.h>
124#include <assert.h>
125#include <ctype.h>
126#include <errno.h>
127#include <limits.h>
128#include <macros.h>
129#include <mem.h>
130#include <stdbool.h>
131#include <stddef.h>
132#include <stdint.h>
133#include <stdlib.h>
134#include <uchar.h>
135
136#if __STDC_HOSTED__
137#include <fibril.h>
138#endif
139
/**
 * Record an "illegal byte sequence" error in errno.
 *
 * Compiles to a no-op when no errno macro is defined.
 */
static void _set_ilseq(void)
{
#ifdef errno
	errno = EILSEQ;
#endif
}
146
/**
 * Byte mask consisting of the lowest @a n bits (out of 8).
 *
 * The shift is done on an unsigned operand so that it is well defined for
 * every @a n in 0 .. 8.
 */
#define LO_MASK_8(n)  ((uint8_t) ((1U << (n)) - 1))

/**
 * Mask consisting of the lowest @a n bits (out of 32).
 *
 * Valid for @a n in 0 .. 31; the shift is done on an unsigned 32-bit operand
 * so that n == 31 does not overflow a signed int.
 */
#define LO_MASK_32(n)  ((uint32_t) ((UINT32_C(1) << (n)) - 1))

/**
 * Byte mask consisting of the highest @a n bits (out of 8).
 *
 * The complement is cast back to uint8_t; without the cast, integer
 * promotion would leave all bits above the low byte set.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6

/* Data-bit masks for the initial byte of 2-, 3- and 4-byte sequences. */
#define UTF8_MASK_INITIAL2  0b00011111
#define UTF8_MASK_INITIAL3  0b00001111
#define UTF8_MASK_INITIAL4  0b00000111
/* Data-bit mask for a continuation byte. */
#define UTF8_MASK_CONT      0b00111111

/** Internal marker returned by the decoder for a malformed sequence. */
#define CHAR_INVALID  ((char32_t) UINT_MAX)
165
/** Tell whether byte @a b encodes a 7-bit ASCII character (top bit clear). */
static inline bool _is_ascii(uint8_t b)
{
	return (b & 0x80) == 0;
}
170
/** Tell whether byte @a b has the form 10xxxxxx (UTF-8 continuation byte). */
static inline bool _is_continuation(uint8_t b)
{
	return (b >> 6) == 0x2;
}
175
/** Tell whether byte @a c has the form 110xxxxx (start of a 2-byte sequence). */
static inline bool _is_2_byte(uint8_t c)
{
	return (c >> 5) == 0x6;
}
180
/** Tell whether byte @a c has the form 1110xxxx (start of a 3-byte sequence). */
static inline bool _is_3_byte(uint8_t c)
{
	return (c >> 4) == 0xE;
}
185
/** Tell whether byte @a c has the form 11110xxx (start of a 4-byte sequence). */
static inline bool _is_4_byte(uint8_t c)
{
	return (c >> 3) == 0x1E;
}
190
/**
 * Number of continuation bytes needed to encode character @a c.
 *
 * @param c Character code.
 *
 * @return 0 to 3 for codes that fit into 7, 11, 16 and 21 bits respectively,
 *         -1 for codes that need more than 21 bits.
 */
static inline int _char_continuation_bytes(char32_t c)
{
	/* Fits into 7 bits. */
	if (c < 0x80)
		return 0;

	/* Fits into 11 bits. */
	if (c < 0x800)
		return 1;

	/* Fits into 16 bits. */
	if (c < 0x10000)
		return 2;

	/* Fits into 21 bits. */
	if (c < 0x200000)
		return 3;

	/* Codes longer than 21 bits are not supported */
	return -1;
}
208
/**
 * Number of continuation bytes announced by the initial byte @a b.
 *
 * @param b First byte of a sequence.
 *
 * @return 0 to 3 for a valid initial byte, -1 if @a b is a continuation byte
 *         or has the form 11111xxx.
 */
static inline int _continuation_bytes(uint8_t b)
{
	/* 0xxxxxxx */
	if (b < 0x80)
		return 0;

	/* 10xxxxxx is a continuation byte, not an initial byte */
	if (b < 0xC0)
		return -1;

	/* 110xxxxx 10xxxxxx */
	if (b < 0xE0)
		return 1;

	/* 1110xxxx 10xxxxxx 10xxxxxx */
	if (b < 0xF0)
		return 2;

	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	if (b < 0xF8)
		return 3;

	/* 11111xxx */
	return -1;
}
229
230static bool _is_non_shortest(const mbstate_t *mb, uint8_t b)
231{
232 return (mb->state == 0b1111110000000000 && !(b & 0b00100000)) ||
233 (mb->state == 0b1111111111110000 && !(b & 0b00110000));
234}
235
236static bool _is_surrogate(const mbstate_t *mb, uint8_t b)
237{
238 return (mb->state == 0b1111110000001101 && b >= 0xa0);
239}
240
/*
 * Branch prediction hints. The condition is normalized with `!!` so that any
 * nonzero value matches the expected value handed to __builtin_expect().
 */
#define _likely(expr)    __builtin_expect(!!(expr), 1)
#define _unlikely(expr)  __builtin_expect(!!(expr), 0)

/* Nonzero enables the whole-sequence fast paths in _str_decode(). */
#define FAST_PATHS  1
245
246static char32_t _str_decode(const char *s, size_t *offset, size_t size, mbstate_t *mb)
247{
248 assert(s);
249 assert(offset);
250 assert(*offset <= size);
251 assert(size == STR_NO_LIMIT || s + size >= s);
252 assert(mb);
253
254 if (*offset == size)
255 return 0;
256
257 if (_likely(!mb->state)) {
258 /* Clean slate, read initial byte. */
259 uint8_t b = s[(*offset)++];
260
261 /* Fast exit for the most common case. */
262 if (_likely(_is_ascii(b)))
263 return b;
264
265 /* unexpected continuation byte */
266 if (_unlikely(_is_continuation(b)))
267 return CHAR_INVALID;
268
269 /*
270 * The value stored into `continuation` is designed to have
271 * just enough leading ones that after shifting in one less than
272 * the expected number of continuation bytes, the most significant
273 * bit becomes zero. (The field is 16b wide.)
274 */
275
276 if (_is_2_byte(b)) {
277 /* Reject non-shortest form. */
278 if (_unlikely(!(b & 0b00011110)))
279 return CHAR_INVALID;
280
281#if FAST_PATHS
282 /* We can usually take this exit. */
283 if (_likely(*offset < size && _is_continuation(s[*offset])))
284 return (b & UTF8_MASK_INITIAL2) << 6 |
285 (s[(*offset)++] & UTF8_MASK_CONT);
286#endif
287
288 /* 2 byte continuation 110xxxxx */
289 mb->state = b ^ 0b0000000011000000;
290
291 } else if (_is_3_byte(b)) {
292#if FAST_PATHS
293 /* We can usually take this exit. */
294 if (_likely(*offset + 1 < size && _is_continuation(s[*offset]) && _is_continuation(s[*offset + 1]))) {
295
296 char32_t ch = (b & UTF8_MASK_INITIAL3) << 12 |
297 (s[(*offset)] & UTF8_MASK_CONT) << 6 |
298 (s[(*offset) + 1] & UTF8_MASK_CONT);
299
300 *offset += 2;
301
302 /* Reject non-shortest form. */
303 if (_unlikely(!(ch & 0xFFFFF800)))
304 return CHAR_INVALID;
305
306 /* Reject surrogates */
307 if (_unlikely(ch >= 0xD800 && ch < 0xE000))
308 return CHAR_INVALID;
309
310 return ch;
311 }
312#endif
313
314 /* 3 byte continuation 1110xxxx */
315 mb->state = b ^ 0b1111110011100000;
316
317 } else if (_is_4_byte(b)) {
318#if FAST_PATHS
319 /* We can usually take this exit. */
320 if (_likely(*offset + 2 < size && _is_continuation(s[*offset]) &&
321 _is_continuation(s[*offset + 1]) && _is_continuation(s[*offset + 2]))) {
322
323 char32_t ch = (b & UTF8_MASK_INITIAL4) << 18 |
324 (s[(*offset)] & UTF8_MASK_CONT) << 12 |
325 (s[(*offset) + 1] & UTF8_MASK_CONT) << 6 |
326 (s[(*offset) + 2] & UTF8_MASK_CONT);
327
328 *offset += 3;
329
330 /* Reject non-shortest form. */
331 if (_unlikely(!(ch & 0xFFFF0000)))
332 return CHAR_INVALID;
333
334 /* Reject out-of-range characters. */
335 if (_unlikely(ch >= 0x110000))
336 return CHAR_INVALID;
337
338 return ch;
339 }
340#endif
341
342 /* 4 byte continuation 11110xxx */
343 mb->state = b ^ 0b1111111100000000;
344 } else {
345 return CHAR_INVALID;
346 }
347 }
348
349 /* Deal with the remaining edge and invalid cases. */
350 for (; *offset < size; (*offset)++) {
351 /* Read continuation bytes. */
352 uint8_t b = s[*offset];
353
354 if (!_is_continuation(b) || _is_non_shortest(mb, b) || _is_surrogate(mb, b)) {
355 mb->state = 0;
356 return CHAR_INVALID;
357 }
358
359 /* Top bit becomes zero when shifting in the second to last byte. */
360 if (!(mb->state & 0x8000)) {
361 char32_t c = ((char32_t) mb->state) << 6 | (b & UTF8_MASK_CONT);
362 mb->state = 0;
363 (*offset)++;
364 return c;
365 }
366
367 mb->state = mb->state << 6 | (b & UTF8_MASK_CONT);
368 }
369
370 /* Incomplete character. */
371 assert(mb->state);
372 return 0;
373}
374
375/** Standard <uchar.h> function since C11. */
376size_t mbrtoc32(char32_t *c, const char *s, size_t n, mbstate_t *mb)
377{
378#if __STDC_HOSTED__
379 static fibril_local mbstate_t global_state = { };
380
381 if (!mb)
382 mb = &global_state;
383#endif
384
385 if (!s) {
386 /* Equivalent to mbrtoc32(NULL, "", 1, mb); */
387 c = NULL;
388 s = "";
389 n = 1;
390 }
391
392 size_t offset = 0;
393 char32_t ret = _str_decode(s, &offset, n, mb);
394 if (ret == CHAR_INVALID) {
395 assert(!mb->state);
396 _set_ilseq();
397 return UCHAR_ILSEQ;
398 }
399 if (mb->state) {
400 assert(ret == 0);
401 return UCHAR_INCOMPLETE;
402 }
403
404 if (c)
405 *c = ret;
406 return ret ? offset : 0;
407}
408
409/** Decode a single character from a string.
410 *
411 * Decode a single character from a string of size @a size. Decoding starts
412 * at @a offset and this offset is moved to the beginning of the next
413 * character. In case of decoding error, offset generally advances at least
414 * by one. However, offset is never moved beyond size.
415 *
416 * @param str String (not necessarily NULL-terminated).
417 * @param offset Byte offset in string where to start decoding.
418 * @param size Size of the string (in bytes).
419 *
420 * @return Value of decoded character, U_SPECIAL on decoding error or
421 * NULL if attempt to decode beyond @a size.
422 *
423 */
424char32_t str_decode(const char *str, size_t *offset, size_t size)
425{
426 mbstate_t mb = { };
427 char32_t ch = _str_decode(str, offset, size, &mb);
428
429 if (ch == CHAR_INVALID || mb.state)
430 return U_SPECIAL;
431
432 return ch;
433}
434
435char32_t str_decode_r(const char *str, size_t *offset, size_t size,
436 char32_t replacement, mbstate_t *mb)
437{
438 char32_t ch = _str_decode(str, offset, size, mb);
439 return (ch == CHAR_INVALID) ? replacement : ch;
440}
441
442/** Decode a single character from a string to the left.
443 *
444 * Decode a single character from a string of size @a size. Decoding starts
445 * at @a offset and this offset is moved to the beginning of the previous
446 * character. In case of decoding error, offset generally decreases at least
447 * by one. However, offset is never moved before 0.
448 *
449 * @param str String (not necessarily NULL-terminated).
450 * @param offset Byte offset in string where to start decoding.
451 * @param size Size of the string (in bytes).
452 *
453 * @return Value of decoded character, U_SPECIAL on decoding error or
454 * NULL if attempt to decode beyond @a start of str.
455 *
456 */
457char32_t str_decode_reverse(const char *str, size_t *offset, size_t size)
458{
459 if (*offset == 0)
460 return 0;
461
462 int cbytes = 0;
463 /* Continue while continuation bytes found */
464 while (*offset > 0 && cbytes < 4) {
465 uint8_t b = (uint8_t) str[--(*offset)];
466
467 if (_is_continuation(b)) {
468 cbytes++;
469 continue;
470 }
471
472 /* Reject non-shortest form encoding. */
473 if (cbytes != _continuation_bytes(b))
474 return U_SPECIAL;
475
476 /* Start byte */
477 size_t start_offset = *offset;
478 return str_decode(str, &start_offset, size);
479 }
480
481 /* Too many continuation bytes */
482 return U_SPECIAL;
483}
484
485/** Encode a single character to string representation.
486 *
487 * Encode a single character to string representation (i.e. UTF-8) and store
488 * it into a buffer at @a offset. Encoding starts at @a offset and this offset
489 * is moved to the position where the next character can be written to.
490 *
491 * @param ch Input character.
492 * @param str Output buffer.
493 * @param offset Byte offset where to start writing.
494 * @param size Size of the output buffer (in bytes).
495 *
496 * @return EOK if the character was encoded successfully, EOVERFLOW if there
497 * was not enough space in the output buffer or EINVAL if the character
498 * code was invalid.
499 */
500errno_t chr_encode(char32_t ch, char *str, size_t *offset, size_t size)
501{
502 // TODO: merge with c32rtomb()
503
504 if (*offset >= size)
505 return EOVERFLOW;
506
507 /* Fast exit for the most common case. */
508 if (ch < 0x80) {
509 str[(*offset)++] = (char) ch;
510 return EOK;
511 }
512
513 /* Codes longer than 21 bits are not supported */
514 if (!chr_check(ch))
515 return EINVAL;
516
517 /* Determine how many continuation bytes are needed */
518
519 unsigned int cbytes = _char_continuation_bytes(ch);
520 unsigned int b0_bits = 6 - cbytes; /* Data bits in first byte */
521
522 /* Check for available space in buffer */
523 if (*offset + cbytes >= size)
524 return EOVERFLOW;
525
526 /* Encode continuation bytes */
527 unsigned int i;
528 for (i = cbytes; i > 0; i--) {
529 str[*offset + i] = 0x80 | (ch & LO_MASK_32(CONT_BITS));
530 ch >>= CONT_BITS;
531 }
532
533 /* Encode first byte */
534 str[*offset] = (ch & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);
535
536 /* Advance offset */
537 *offset += cbytes + 1;
538
539 return EOK;
540}
541
/**
 * Convert in place any bytes that don't form a valid character into
 * replacement.
 *
 * Scanning stops after @a n bytes or at the first zero byte, whichever comes
 * first. An offending initial byte is overwritten and scanning resumes at the
 * byte right after it.
 *
 * @param str         Buffer to sanitize.
 * @param n           Maximum number of bytes to examine.
 * @param replacement Byte written over each offending byte.
 *
 * @return Number of bytes replaced.
 */
static size_t _str_sanitize(char *str, size_t n, uint8_t replacement)
{
	uint8_t *p = (uint8_t *) str;
	size_t replaced = 0;

	while (n > 0 && p[0] != 0) {
		int cont = _continuation_bytes(p[0]);
		bool valid = true;

		if (cont != 0) {
			/* Known initial byte and the whole sequence fits in n. */
			valid = (cont > 0) && (n > (size_t) cont);

			/* Check continuation bytes. */
			for (int i = 1; valid && i <= cont; i++)
				valid = _is_continuation(p[i]);
		}

		if (valid) {
			/*
			 * Check for non-shortest form encoding.
			 * See https://www.unicode.org/versions/corrigendum1.html
			 */
			switch (cont) {
			case 1:
				/* 0b110!!!!x 0b10xxxxxx */
				if ((p[0] & 0b00011110) == 0)
					valid = false;
				break;
			case 2:
				/* 0b1110!!!! 0b10!xxxxx 0b10xxxxxx */
				if ((p[0] & 0b00001111) == 0 && (p[1] & 0b00100000) == 0)
					valid = false;
				/* Surrogate character encoding. */
				else if (p[0] == 0xED && p[1] >= 0xA0)
					valid = false;
				break;
			case 3:
				/* 0b11110!!! 0b10!!xxxx 0b10xxxxxx 0b10xxxxxx */
				if ((p[0] & 0b00000111) == 0 && (p[1] & 0b00110000) == 0)
					valid = false;
				/* Out-of-range code points. */
				else if (p[0] > 0xF4 || (p[0] == 0xF4 && p[1] >= 0x90))
					valid = false;
				break;
			default:
				break;
			}
		}

		if (valid) {
			/* Skip the continuation bytes of a good sequence. */
			p += cont;
			n -= cont;
		} else {
			p[0] = replacement;
			replaced++;
		}

		p++;
		n--;
	}

	return replaced;
}
620
/** Replace in place all bytes that don't form a valid character.
 *
 * @param str         Buffer to sanitize.
 * @param n           Maximum number of bytes to examine.
 * @param replacement Byte written over each offending byte.
 *
 * @return Number of bytes replaced.
 */
size_t str_sanitize(char *str, size_t n, uint8_t replacement)
{
	size_t replaced = _str_sanitize(str, n, replacement);

	return replaced;
}
625
/** Count the bytes preceding the null terminator of @a str. */
static size_t _str_size(const char *str)
{
	const char *end = str;

	while (*end != 0)
		end++;

	return (size_t) (end - str);
}
635
/** Get size of string.
 *
 * Get the number of bytes which are used by the string @a str (excluding the
 * null terminator).
 *
 * @param str String to consider.
 *
 * @return Number of bytes used by the string.
 *
 */
size_t str_size(const char *str)
{
	size_t bytes = _str_size(str);

	return bytes;
}
650
/** Get size of wide string.
 *
 * Get the number of bytes which are used by the wide string @a str
 * (excluding the null terminator).
 *
 * @param str Wide string to consider.
 *
 * @return Number of bytes used by the wide string.
 *
 */
size_t wstr_size(const char32_t *str)
{
	size_t chars = wstr_length(str);

	return chars * sizeof(char32_t);
}
665
666/** Get size of string with length limit.
667 *
668 * Get the number of bytes which are used by up to @a max_len first
669 * characters in the string @a str. If @a max_len is greater than
670 * the length of @a str, the entire string is measured (excluding the
671 * NULL-terminator).
672 *
673 * @param str String to consider.
674 * @param max_len Maximum number of characters to measure.
675 *
676 * @return Number of bytes used by the characters.
677 *
678 */
679size_t str_lsize(const char *str, size_t max_len)
680{
681 size_t len = 0;
682 size_t offset = 0;
683
684 while (len < max_len) {
685 if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
686 break;
687
688 len++;
689 }
690
691 return offset;
692}
693
/**
 * Count the bytes preceding the null terminator of @a str, looking at no
 * more than @a max_size bytes.
 *
 * The bound is tested before each byte is read, so the byte at index
 * @a max_size is never accessed.
 *
 * @param str      String to measure (need not be null-terminated within
 *                 @a max_size bytes).
 * @param max_size Maximum number of bytes to examine.
 *
 * @return Number of bytes before the terminator, at most @a max_size.
 */
static size_t _str_nsize(const char *str, size_t max_size)
{
	size_t size = 0;

	while (size < max_size && str[size] != 0)
		size++;

	return size;
}
703
/** Get size of string with size limit.
 *
 * Get the number of bytes which are used by the string @a str
 * (excluding the null terminator), but no more than @a max_size bytes.
 *
 * @param str      String to consider.
 * @param max_size Maximum number of bytes to measure.
 *
 * @return Number of bytes used by the string.
 *
 */
size_t str_nsize(const char *str, size_t max_size)
{
	size_t bytes = _str_nsize(str, max_size);

	return bytes;
}
719
/** Get size of wide string with size limit.
 *
 * Get the number of bytes which are used by the wide string @a str
 * (excluding the null terminator), but no more than @a max_size bytes.
 *
 * @param str      Wide string to consider.
 * @param max_size Maximum number of bytes to measure.
 *
 * @return Number of bytes used by the wide string.
 *
 */
size_t wstr_nsize(const char32_t *str, size_t max_size)
{
	size_t chars = wstr_nlength(str, max_size);

	return chars * sizeof(char32_t);
}
735
/** Get size of wide string with length limit.
 *
 * Get the number of bytes which are used by up to @a max_len first
 * wide characters in the wide string @a str. If @a max_len is greater than
 * the length of @a str, the entire wide string is measured (excluding the
 * null terminator).
 *
 * @param str     Wide string to consider.
 * @param max_len Maximum number of wide characters to measure.
 *
 * @return Number of bytes used by the wide characters.
 *
 */
size_t wstr_lsize(const char32_t *str, size_t max_len)
{
	size_t len = 0;

	/*
	 * Count characters directly instead of converting max_len to a byte
	 * limit: that multiplication can wrap around for large max_len.
	 */
	while (len < max_len && str[len] != 0)
		len++;

	return len * sizeof(char32_t);
}
753
754/** Get number of characters in a string.
755 *
756 * @param str NULL-terminated string.
757 *
758 * @return Number of characters in string.
759 *
760 */
761size_t str_length(const char *str)
762{
763 size_t len = 0;
764 size_t offset = 0;
765
766 while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
767 len++;
768
769 return len;
770}
771
/** Get number of characters in a wide string.
 *
 * @param wstr Null-terminated wide string.
 *
 * @return Number of characters in @a wstr.
 *
 */
size_t wstr_length(const char32_t *wstr)
{
	const char32_t *end = wstr;

	while (*end != 0)
		end++;

	return (size_t) (end - wstr);
}
788
/** Get number of characters in a string with size limit.
 *
 * Counting stops at the null terminator or after @a size bytes, whichever
 * comes first.
 *
 * @param str  Null-terminated string.
 * @param size Maximum number of bytes to consider.
 *
 * @return Number of characters in string.
 *
 */
size_t str_nlength(const char *str, size_t size)
{
	size_t offset = 0;
	size_t len;

	for (len = 0; str_decode(str, &offset, size) != 0; len++)
		;

	return len;
}
807
/** Get number of characters in a wide string with size limit.
 *
 * Only whole characters that fit into @a size bytes are considered.
 *
 * @param str  Null-terminated wide string.
 * @param size Maximum number of bytes to consider.
 *
 * @return Number of characters in wide string.
 *
 */
size_t wstr_nlength(const char32_t *str, size_t size)
{
	/* Number of whole characters that fit into the byte limit. */
	size_t max_chars = size / sizeof(char32_t);
	size_t len = 0;

	while (len < max_chars && str[len] != 0)
		len++;

	return len;
}
829
/** Get character display width on a character cell display.
 *
 * The current implementation reports one cell for every character.
 *
 * @param ch Character
 * @return Width of character in cells.
 */
size_t chr_width(char32_t ch)
{
	(void) ch;

	return 1;
}
839
840/** Get string display width on a character cell display.
841 *
842 * @param str String
843 * @return Width of string in cells.
844 */
845size_t str_width(const char *str)
846{
847 size_t width = 0;
848 size_t offset = 0;
849 char32_t ch;
850
851 while ((ch = str_decode(str, &offset, STR_NO_LIMIT)) != 0)
852 width += chr_width(ch);
853
854 return width;
855}
856
/** Check whether character is plain ASCII.
 *
 * @param ch Character to check.
 *
 * @return True if character is plain ASCII (code point 0 .. 127).
 *
 */
bool ascii_check(char32_t ch)
{
	return ch < 0x80;
}
869
/** Check whether character is valid
 *
 * @param ch Character to check.
 *
 * @return True if character lies in the Unicode code point range
 *         (0 .. 0x10FFFF).
 *
 */
bool chr_check(char32_t ch)
{
	return ch <= 0x10FFFF;
}
882
/** Compare two null-terminated strings.
 *
 * Do a char-by-char comparison of two null-terminated strings.
 * The strings are considered equal iff their length is equal
 * and both strings consist of the same sequence of characters.
 *
 * A string S1 is less than another string S2 if it has a character with
 * lower value at the first character position where the strings differ.
 * If the strings differ in length, the shorter one is treated as if
 * padded by characters with a value of zero.
 *
 * @param s1 First string to compare.
 * @param s2 Second string to compare.
 *
 * @return 0 if the strings are equal, -1 if the first is less than the second,
 *         1 if the second is less than the first.
 *
 */
int str_cmp(const char *s1, const char *s2)
{
	/*
	 * UTF-8 has the nice property that lexicographic ordering on bytes is
	 * the same as the lexicographic ordering of the character sequences.
	 *
	 * That only holds for bytes compared as unsigned values; with a signed
	 * char every non-ASCII byte would sort before the ASCII ones.
	 */
	const unsigned char *a = (const unsigned char *) s1;
	const unsigned char *b = (const unsigned char *) s2;

	while (*a == *b && *a != 0) {
		a++;
		b++;
	}

	if (*a == *b)
		return 0;

	return (*a < *b) ? -1 : 1;
}
917
918/** Compare two NULL terminated strings with length limit.
919 *
920 * Do a char-by-char comparison of two NULL-terminated strings.
921 * The strings are considered equal iff
922 * min(str_length(s1), max_len) == min(str_length(s2), max_len)
923 * and both strings consist of the same sequence of characters,
924 * up to max_len characters.
925 *
926 * A string S1 is less than another string S2 if it has a character with
927 * lower value at the first character position where the strings differ.
928 * If the strings differ in length, the shorter one is treated as if
929 * padded by characters with a value of zero. Only the first max_len
930 * characters are considered.
931 *
932 * @param s1 First string to compare.
933 * @param s2 Second string to compare.
934 * @param max_len Maximum number of characters to consider.
935 *
936 * @return 0 if the strings are equal, -1 if the first is less than the second,
937 * 1 if the second is less than the first.
938 *
939 */
940int str_lcmp(const char *s1, const char *s2, size_t max_len)
941{
942 char32_t c1 = 0;
943 char32_t c2 = 0;
944
945 size_t off1 = 0;
946 size_t off2 = 0;
947
948 size_t len = 0;
949
950 while (true) {
951 if (len >= max_len)
952 break;
953
954 c1 = str_decode(s1, &off1, STR_NO_LIMIT);
955 c2 = str_decode(s2, &off2, STR_NO_LIMIT);
956
957 if (c1 < c2)
958 return -1;
959
960 if (c1 > c2)
961 return 1;
962
963 if (c1 == 0 || c2 == 0)
964 break;
965
966 ++len;
967 }
968
969 return 0;
970
971}
972
973/** Compare two NULL terminated strings in case-insensitive manner.
974 *
975 * Do a char-by-char comparison of two NULL-terminated strings.
976 * The strings are considered equal iff their length is equal
977 * and both strings consist of the same sequence of characters
978 * when converted to lower case.
979 *
980 * A string S1 is less than another string S2 if it has a character with
981 * lower value at the first character position where the strings differ.
982 * If the strings differ in length, the shorter one is treated as if
983 * padded by characters with a value of zero.
984 *
985 * @param s1 First string to compare.
986 * @param s2 Second string to compare.
987 *
988 * @return 0 if the strings are equal, -1 if the first is less than the second,
989 * 1 if the second is less than the first.
990 *
991 */
992int str_casecmp(const char *s1, const char *s2)
993{
994 // FIXME: doesn't work for non-ASCII caseful characters
995
996 char32_t c1 = 0;
997 char32_t c2 = 0;
998
999 size_t off1 = 0;
1000 size_t off2 = 0;
1001
1002 while (true) {
1003 c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
1004 c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
1005
1006 if (c1 < c2)
1007 return -1;
1008
1009 if (c1 > c2)
1010 return 1;
1011
1012 if (c1 == 0 || c2 == 0)
1013 break;
1014 }
1015
1016 return 0;
1017}
1018
1019/** Compare two NULL terminated strings with length limit in case-insensitive
1020 * manner.
1021 *
1022 * Do a char-by-char comparison of two NULL-terminated strings.
1023 * The strings are considered equal iff
1024 * min(str_length(s1), max_len) == min(str_length(s2), max_len)
1025 * and both strings consist of the same sequence of characters,
1026 * up to max_len characters.
1027 *
1028 * A string S1 is less than another string S2 if it has a character with
1029 * lower value at the first character position where the strings differ.
1030 * If the strings differ in length, the shorter one is treated as if
1031 * padded by characters with a value of zero. Only the first max_len
1032 * characters are considered.
1033 *
1034 * @param s1 First string to compare.
1035 * @param s2 Second string to compare.
1036 * @param max_len Maximum number of characters to consider.
1037 *
1038 * @return 0 if the strings are equal, -1 if the first is less than the second,
1039 * 1 if the second is less than the first.
1040 *
1041 */
1042int str_lcasecmp(const char *s1, const char *s2, size_t max_len)
1043{
1044 // FIXME: doesn't work for non-ASCII caseful characters
1045
1046 char32_t c1 = 0;
1047 char32_t c2 = 0;
1048
1049 size_t off1 = 0;
1050 size_t off2 = 0;
1051
1052 size_t len = 0;
1053
1054 while (true) {
1055 if (len >= max_len)
1056 break;
1057
1058 c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
1059 c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
1060
1061 if (c1 < c2)
1062 return -1;
1063
1064 if (c1 > c2)
1065 return 1;
1066
1067 if (c1 == 0 || c2 == 0)
1068 break;
1069
1070 ++len;
1071 }
1072
1073 return 0;
1074
1075}
1076
/** Byte-wise test whether @a p is a prefix of @a s. */
static bool _test_prefix(const char *s, const char *p)
{
	size_t i = 0;

	/* Walk both strings while the prefix lasts and the bytes agree. */
	while (p[i] != 0 && s[i] == p[i])
		i++;

	/* The whole prefix was matched iff we stopped at its terminator. */
	return p[i] == 0;
}
1086
/** Test whether p is a prefix of s.
 *
 * Do a char-by-char comparison of two null-terminated strings
 * and determine if p is a prefix of s.
 *
 * @param s The string in which to look
 * @param p The string to check if it is a prefix of s
 *
 * @return true iff p is prefix of s else false
 *
 */
bool str_test_prefix(const char *s, const char *p)
{
	bool is_prefix = _test_prefix(s, p);

	return is_prefix;
}
1102
1103/** Get a string suffix.
1104 *
1105 * Return a string suffix defined by the prefix length.
1106 *
1107 * @param s The string to get the suffix from.
1108 * @param prefix_length Number of prefix characters to ignore.
1109 *
1110 * @return String suffix.
1111 *
1112 */
1113const char *str_suffix(const char *s, size_t prefix_length)
1114{
1115 size_t off = 0;
1116 size_t i = 0;
1117
1118 while (true) {
1119 str_decode(s, &off, STR_NO_LIMIT);
1120 i++;
1121
1122 if (i >= prefix_length)
1123 break;
1124 }
1125
1126 return s + off;
1127}
1128
/**
 * Copy string as a sequence of bytes, including the null terminator.
 *
 * No bound is applied to the destination.
 */
static void _str_cpy(char *dest, const char *src)
{
	size_t i;

	for (i = 0; src[i] != 0; i++)
		dest[i] = src[i];

	dest[i] = 0;
}
1137
1138/** Copy string as a sequence of bytes. */
1139static void _str_cpyn(char *dest, size_t size, const char *src)
1140{
1141 assert(dest && src && size);
1142
1143 if (!dest || !src || !size)
1144 return;
1145
1146 if (size == STR_NO_LIMIT)
1147 return _str_cpy(dest, src);
1148
1149 char *dest_top = dest + size - 1;
1150 assert(size == 1 || dest < dest_top);
1151
1152 while (*src && dest < dest_top)
1153 *(dest++) = *(src++);
1154
1155 *dest = 0;
1156}
1157
1158/** Copy string.
1159 *
1160 * Copy source string @a src to destination buffer @a dest.
1161 * No more than @a size bytes are written. If the size of the output buffer
1162 * is at least one byte, the output string will always be well-formed, i.e.
1163 * null-terminated and containing only complete characters.
1164 *
1165 * @param dest Destination buffer.
1166 * @param count Size of the destination buffer (must be > 0).
1167 * @param src Source string.
1168 *
1169 */
1170void str_cpy(char *dest, size_t size, const char *src)
1171{
1172 /* There must be space for a null terminator in the buffer. */
1173 assert(size > 0);
1174 assert(src != NULL);
1175 assert(dest != NULL);
1176 assert(size == STR_NO_LIMIT || dest + size > dest);
1177
1178 /* Copy data. */
1179 _str_cpyn(dest, size, src);
1180
1181 /* In-place translate invalid bytes to U_SPECIAL. */
1182 _str_sanitize(dest, size, U_SPECIAL);
1183}
1184
1185/** Copy size-limited substring.
1186 *
1187 * Copy prefix of string @a src of max. size @a size to destination buffer
1188 * @a dest. No more than @a size bytes are written. The output string will
1189 * always be well-formed, i.e. null-terminated and containing only complete
1190 * characters.
1191 *
1192 * No more than @a n bytes are read from the input string, so it does not
1193 * have to be null-terminated.
1194 *
1195 * @param dest Destination buffer.
1196 * @param count Size of the destination buffer (must be > 0).
1197 * @param src Source string.
1198 * @param n Maximum number of bytes to read from @a src.
1199 *
1200 */
1201void str_ncpy(char *dest, size_t size, const char *src, size_t n)
1202{
1203 /* There must be space for a null terminator in the buffer. */
1204 assert(size > 0);
1205 assert(src != NULL);
1206
1207 /* Copy data. */
1208 _str_cpyn(dest, min(size, n + 1), src);
1209
1210 /* In-place translate invalid bytes to U_SPECIAL. */
1211 _str_sanitize(dest, size, U_SPECIAL);
1212}
1213
1214/** Append one string to another.
1215 *
1216 * Append source string @a src to string in destination buffer @a dest.
1217 * Size of the destination buffer is @a dest. If the size of the output buffer
1218 * is at least one byte, the output string will always be well-formed, i.e.
1219 * null-terminated and containing only complete characters.
1220 *
1221 * @param dest Destination buffer.
1222 * @param count Size of the destination buffer.
1223 * @param src Source string.
1224 */
1225void str_append(char *dest, size_t size, const char *src)
1226{
1227 assert(src != NULL);
1228 assert(dest != NULL);
1229 assert(size > 0);
1230 assert(size == STR_NO_LIMIT || dest + size > dest);
1231
1232 size_t dstr_size = _str_nsize(dest, size);
1233 if (dstr_size < size) {
1234 _str_cpyn(dest + dstr_size, size - dstr_size, src);
1235 _str_sanitize(dest + dstr_size, size - dstr_size, U_SPECIAL);
1236 }
1237}
1238
1239/** Convert space-padded ASCII to string.
1240 *
1241 * Common legacy text encoding in hardware is 7-bit ASCII fitted into
1242 * a fixed-width byte buffer (bit 7 always zero), right-padded with spaces
1243 * (ASCII 0x20). Convert space-padded ascii to string representation.
1244 *
1245 * If the text does not fit into the destination buffer, the function converts
1246 * as many characters as possible and returns EOVERFLOW.
1247 *
1248 * If the text contains non-ASCII bytes (with bit 7 set), the whole string is
1249 * converted anyway and invalid characters are replaced with question marks
1250 * (U_SPECIAL) and the function returns EIO.
1251 *
1252 * Regardless of return value upon return @a dest will always be well-formed.
1253 *
1254 * @param dest Destination buffer
1255 * @param size Size of destination buffer
1256 * @param src Space-padded ASCII.
1257 * @param n Size of the source buffer in bytes.
1258 *
1259 * @return EOK on success, EOVERFLOW if the text does not fit
1260 * destination buffer, EIO if the text contains
1261 * non-ASCII bytes.
1262 */
1263errno_t spascii_to_str(char *dest, size_t size, const uint8_t *src, size_t n)
1264{
1265 size_t len = 0;
1266
1267 /* Determine the length of the source string. */
1268 for (size_t i = 0; i < n; i++) {
1269 if (src[i] == 0)
1270 break;
1271
1272 if (src[i] != ' ')
1273 len = i + 1;
1274 }
1275
1276 errno_t result = EOK;
1277 size_t out_len = min(len, size - 1);
1278
1279 /* Copy characters */
1280 for (size_t i = 0; i < out_len; i++) {
1281 dest[i] = src[i];
1282
1283 if (dest[i] < 0) {
1284 dest[i] = U_SPECIAL;
1285 result = EIO;
1286 }
1287 }
1288
1289 dest[out_len] = 0;
1290
1291 if (out_len < len)
1292 return EOVERFLOW;
1293
1294 return result;
1295}
1296
1297/** Convert wide string to string.
1298 *
1299 * Convert wide string @a src to string. The output is written to the buffer
1300 * specified by @a dest and @a size. @a size must be non-zero and the string
1301 * written will always be well-formed.
1302 *
1303 * @param dest Destination buffer.
1304 * @param size Size of the destination buffer.
1305 * @param src Source wide string.
1306 */
1307void wstr_to_str(char *dest, size_t size, const char32_t *src)
1308{
1309 char32_t ch;
1310 size_t src_idx;
1311 size_t dest_off;
1312
1313 /* There must be space for a null terminator in the buffer. */
1314 assert(size > 0);
1315
1316 src_idx = 0;
1317 dest_off = 0;
1318
1319 while ((ch = src[src_idx++]) != 0) {
1320 if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
1321 break;
1322 }
1323
1324 dest[dest_off] = '\0';
1325}
1326
1327/** Convert UTF16 string to string.
1328 *
1329 * Convert utf16 string @a src to string. The output is written to the buffer
1330 * specified by @a dest and @a size. @a size must be non-zero and the string
1331 * written will always be well-formed. Surrogate pairs also supported.
1332 *
1333 * @param dest Destination buffer.
1334 * @param size Size of the destination buffer.
1335 * @param src Source utf16 string.
1336 *
1337 * @return EOK, if success, an error code otherwise.
1338 */
1339errno_t utf16_to_str(char *dest, size_t size, const uint16_t *src)
1340{
1341 size_t idx = 0, dest_off = 0;
1342 char32_t ch;
1343 errno_t rc = EOK;
1344
1345 /* There must be space for a null terminator in the buffer. */
1346 assert(size > 0);
1347
1348 while (src[idx]) {
1349 if ((src[idx] & 0xfc00) == 0xd800) {
1350 if (src[idx + 1] && (src[idx + 1] & 0xfc00) == 0xdc00) {
1351 ch = 0x10000;
1352 ch += (src[idx] & 0x03FF) << 10;
1353 ch += (src[idx + 1] & 0x03FF);
1354 idx += 2;
1355 } else
1356 break;
1357 } else {
1358 ch = src[idx];
1359 idx++;
1360 }
1361 rc = chr_encode(ch, dest, &dest_off, size - 1);
1362 if (rc != EOK)
1363 break;
1364 }
1365 dest[dest_off] = '\0';
1366 return rc;
1367}
1368
1369/** Convert string to UTF16 string.
1370 *
1371 * Convert string @a src to utf16 string. The output is written to the buffer
1372 * specified by @a dest and @a dlen. @a dlen must be non-zero and the string
1373 * written will always be well-formed. Surrogate pairs also supported.
1374 *
1375 * @param dest Destination buffer.
1376 * @param dlen Number of utf16 characters that fit in the destination buffer.
1377 * @param src Source string.
1378 *
1379 * @return EOK, if success, an error code otherwise.
1380 */
1381errno_t str_to_utf16(uint16_t *dest, size_t dlen, const char *src)
1382{
1383 errno_t rc = EOK;
1384 size_t offset = 0;
1385 size_t idx = 0;
1386 char32_t c;
1387
1388 assert(dlen > 0);
1389
1390 while ((c = str_decode(src, &offset, STR_NO_LIMIT)) != 0) {
1391 if (c > 0x10000) {
1392 if (idx + 2 >= dlen - 1) {
1393 rc = EOVERFLOW;
1394 break;
1395 }
1396 c = (c - 0x10000);
1397 dest[idx] = 0xD800 | (c >> 10);
1398 dest[idx + 1] = 0xDC00 | (c & 0x3FF);
1399 idx++;
1400 } else {
1401 dest[idx] = c;
1402 }
1403
1404 idx++;
1405 if (idx >= dlen - 1) {
1406 rc = EOVERFLOW;
1407 break;
1408 }
1409 }
1410
1411 dest[idx] = '\0';
1412 return rc;
1413}
1414
/** Get size of UTF-16 string.
 *
 * Count the 16-bit words that precede the terminating zero word of the
 * UTF-16 string @a ustr.
 *
 * @param ustr  UTF-16 string to consider.
 *
 * @return Number of words used by the UTF-16 string, terminator excluded.
 *
 */
size_t utf16_wsize(const uint16_t *ustr)
{
	const uint16_t *end = ustr;

	while (*end != 0)
		end++;

	return (size_t) (end - ustr);
}
1434
1435/** Convert wide string to new string.
1436 *
1437 * Convert wide string @a src to string. Space for the new string is allocated
1438 * on the heap.
1439 *
1440 * @param src Source wide string.
1441 * @return New string.
1442 */
1443char *wstr_to_astr(const char32_t *src)
1444{
1445 char dbuf[STR_BOUNDS(1)];
1446 char *str;
1447 char32_t ch;
1448
1449 size_t src_idx;
1450 size_t dest_off;
1451 size_t dest_size;
1452
1453 /* Compute size of encoded string. */
1454
1455 src_idx = 0;
1456 dest_size = 0;
1457
1458 while ((ch = src[src_idx++]) != 0) {
1459 dest_off = 0;
1460 if (chr_encode(ch, dbuf, &dest_off, STR_BOUNDS(1)) != EOK)
1461 break;
1462 dest_size += dest_off;
1463 }
1464
1465 str = malloc(dest_size + 1);
1466 if (str == NULL)
1467 return NULL;
1468
1469 /* Encode string. */
1470
1471 src_idx = 0;
1472 dest_off = 0;
1473
1474 while ((ch = src[src_idx++]) != 0) {
1475 if (chr_encode(ch, str, &dest_off, dest_size) != EOK)
1476 break;
1477 }
1478
1479 str[dest_size] = '\0';
1480 return str;
1481}
1482
1483/** Convert string to wide string.
1484 *
1485 * Convert string @a src to wide string. The output is written to the
1486 * buffer specified by @a dest and @a dlen. @a dlen must be non-zero
1487 * and the wide string written will always be null-terminated.
1488 *
1489 * @param dest Destination buffer.
1490 * @param dlen Length of destination buffer (number of wchars).
1491 * @param src Source string.
1492 */
1493void str_to_wstr(char32_t *dest, size_t dlen, const char *src)
1494{
1495 size_t offset;
1496 size_t di;
1497 char32_t c;
1498
1499 assert(dlen > 0);
1500
1501 offset = 0;
1502 di = 0;
1503
1504 do {
1505 if (di >= dlen - 1)
1506 break;
1507
1508 c = str_decode(src, &offset, STR_NO_LIMIT);
1509 dest[di++] = c;
1510 } while (c != '\0');
1511
1512 dest[dlen - 1] = '\0';
1513}
1514
/** Convert string to newly allocated wide string.
 *
 * Decode the string @a str into a zero-terminated wide string allocated
 * with calloc(). The caller owns the returned buffer.
 *
 * @param str  Source string.
 *
 * @return New wide string, or NULL if the allocation failed.
 */
char32_t *str_to_awstr(const char *str)
{
	/* One slot per character plus one for the terminator. */
	const size_t slots = str_length(str) + 1;

	char32_t *wide = calloc(slots, sizeof(char32_t));
	if (wide != NULL)
		str_to_wstr(wide, slots, str);

	return wide;
}
1533
/** Find the first byte equal to @a c in a null-terminated byte string.
 *
 * The terminator takes part in the search, so looking for 0 yields
 * a pointer to the end of the string.
 */
static char *_strchr(const char *str, char c)
{
	for (;; str++) {
		if (*str == c)
			return (char *) str;

		if (*str == 0)
			return NULL;
	}
}
1541
1542/** Find first occurence of character in string.
1543 *
1544 * @param str String to search.
1545 * @param ch Character to look for.
1546 *
1547 * @return Pointer to character in @a str or NULL if not found.
1548 */
1549char *str_chr(const char *str, char32_t ch)
1550{
1551 /* Fast path for an ASCII character. */
1552 if (ascii_check(ch))
1553 return _strchr(str, ch);
1554
1555 /* Convert character to UTF-8. */
1556 char utf8[STR_BOUNDS(1) + 1];
1557 size_t offset = 0;
1558
1559 if (chr_encode(ch, utf8, &offset, sizeof(utf8)) != EOK || offset == 0)
1560 return NULL;
1561
1562 utf8[offset] = '\0';
1563
1564 /* Find the first byte, then check if all of them are correct. */
1565 while (*str != 0) {
1566 str = _strchr(str, utf8[0]);
1567 if (!str)
1568 return NULL;
1569
1570 if (_test_prefix(str, utf8))
1571 return (char *) str;
1572
1573 str++;
1574 }
1575
1576 return NULL;
1577}
1578
/** Find first occurrence of substring in string.
 *
 * @param hs  Haystack (string)
 * @param n   Needle (substring to look for)
 *
 * @return Pointer to the start of the match in @a hs or @c NULL if not
 *         found.
 */
char *str_str(const char *hs, const char *n)
{
	const size_t needle_size = _str_size(n);
	size_t remaining = _str_size(hs);

	/* Stop once the rest of the haystack is shorter than the needle. */
	for (const char *p = hs; remaining >= needle_size; p++, remaining--) {
		if (_test_prefix(p, n))
			return (char *) p;
	}

	return NULL;
}
1601
/** Remove all trailing bytes equal to @a c from a byte string, in place.
 *
 * The string is cut right after its last byte that differs from @a c.
 * A string made only of @a c becomes empty and an empty string is left
 * untouched; no byte beyond the original terminator is ever written.
 */
static void _str_rtrim(char *str, char c)
{
	/* Position right after the last byte that has to be kept. */
	char *end = str;

	for (char *p = str; *p != 0; p++) {
		if (*p != c)
			end = p + 1;
	}

	/* Truncate string. */
	*end = 0;
}
1616
1617/** Removes specified trailing characters from a string.
1618 *
1619 * @param str String to remove from.
1620 * @param ch Character to remove.
1621 */
1622void str_rtrim(char *str, char32_t ch)
1623{
1624 /* Fast path for the ASCII case. */
1625 if (ascii_check(ch)) {
1626 _str_rtrim(str, ch);
1627 return;
1628 }
1629
1630 size_t off = 0;
1631 size_t pos = 0;
1632 char32_t c;
1633 bool update_last_chunk = true;
1634 char *last_chunk = NULL;
1635
1636 while ((c = str_decode(str, &off, STR_NO_LIMIT))) {
1637 if (c != ch) {
1638 update_last_chunk = true;
1639 last_chunk = NULL;
1640 } else if (update_last_chunk) {
1641 update_last_chunk = false;
1642 last_chunk = (str + pos);
1643 }
1644 pos = off;
1645 }
1646
1647 if (last_chunk)
1648 *last_chunk = '\0';
1649}
1650
/** Remove all leading bytes equal to @a c from a byte string, in place.
 *
 * The rest of the string, terminator included, is moved to the start of
 * the buffer.
 */
static void _str_ltrim(char *str, char c)
{
	const char *src = str;

	/* Never step over the terminator, even when asked to trim NUL. */
	while (*src != 0 && *src == c)
		src++;

	if (src == str)
		return;

	/*
	 * Shift the remainder down. The regions overlap, but the destination
	 * lies before the source, so a forward byte copy is safe.
	 */
	char *dst = str;

	do {
		*dst = *src++;
	} while (*dst++ != 0);
}
1661
1662/** Removes specified leading characters from a string.
1663 *
1664 * @param str String to remove from.
1665 * @param ch Character to remove.
1666 */
1667void str_ltrim(char *str, char32_t ch)
1668{
1669 /* Fast path for the ASCII case. */
1670 if (ascii_check(ch)) {
1671 _str_ltrim(str, ch);
1672 return;
1673 }
1674
1675 char32_t acc;
1676 size_t off = 0;
1677 size_t pos = 0;
1678 size_t str_sz = str_size(str);
1679
1680 while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
1681 if (acc != ch)
1682 break;
1683 else
1684 pos = off;
1685 }
1686
1687 if (pos > 0) {
1688 memmove(str, &str[pos], str_sz - pos);
1689 pos = str_sz - pos;
1690 str[pos] = '\0';
1691 }
1692}
1693
/** Find the last byte equal to @a c in a null-terminated byte string.
 *
 * The terminator itself is not examined, so searching for 0 yields NULL.
 */
static char *_str_rchr(const char *str, char c)
{
	const char *hit = NULL;

	for (const char *p = str; *p != 0; p++) {
		if (*p == c)
			hit = p;
	}

	return (char *) hit;
}
1707
1708/** Find last occurence of character in string.
1709 *
1710 * @param str String to search.
1711 * @param ch Character to look for.
1712 *
1713 * @return Pointer to character in @a str or NULL if not found.
1714 */
1715char *str_rchr(const char *str, char32_t ch)
1716{
1717 if (ascii_check(ch))
1718 return _str_rchr(str, ch);
1719
1720 char32_t acc;
1721 size_t off = 0;
1722 size_t last = 0;
1723 const char *res = NULL;
1724
1725 while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
1726 if (acc == ch)
1727 res = (str + last);
1728 last = off;
1729 }
1730
1731 return (char *) res;
1732}
1733
/** Insert a wide character into a wide string.
 *
 * Insert a wide character into a wide string at position
 * @a pos. The characters after the position are shifted.
 *
 * @param str      String to insert to.
 * @param ch       Character to insert.
 * @param pos      Character index where to insert.
 * @param max_pos  Maximum number of characters the string may hold after
 *                 the insertion (the terminator is stored in addition).
 *
 * @return True if the insertion was successful, false if the position
 *         is out of bounds or the string is already @a max_pos
 *         characters long.
 *
 */
bool wstr_linsert(char32_t *str, char32_t ch, size_t pos, size_t max_pos)
{
	size_t len = wstr_length(str);

	/*
	 * The string grows to len + 1 characters wherever the insertion
	 * happens, so the limit applies to the length, not to the position.
	 * Since pos <= len, this also rejects every pos >= max_pos.
	 */
	if (pos > len || len >= max_pos)
		return false;

	/* Shift the tail, terminator included, one slot to the right. */
	for (size_t i = len + 1; i > pos; i--)
		str[i] = str[i - 1];

	str[pos] = ch;

	return true;
}
1763
/** Remove a wide character from a wide string.
 *
 * Remove the wide character at position @a pos. The characters after
 * the position, terminator included, are shifted one slot to the left.
 *
 * @param str  String to remove from.
 * @param pos  Character index to remove.
 *
 * @return True if the removal was successful, false if the position
 *         is out of bounds.
 *
 */
bool wstr_remove(char32_t *str, size_t pos)
{
	const size_t len = wstr_length(str);

	if (pos >= len)
		return false;

	/* The final iteration moves the terminator from str[len]. */
	for (size_t i = pos; i < len; i++)
		str[i] = str[i + 1];

	return true;
}
1789
1790/** Duplicate string.
1791 *
1792 * Allocate a new string and copy characters from the source
1793 * string into it. The duplicate string is allocated via sleeping
1794 * malloc(), thus this function can sleep in no memory conditions.
1795 *
1796 * The allocation cannot fail and the return value is always
1797 * a valid pointer. The duplicate string is always a well-formed
1798 * null-terminated UTF-8 string, but it can differ from the source
1799 * string on the byte level.
1800 *
1801 * @param src Source string.
1802 *
1803 * @return Duplicate string.
1804 *
1805 */
1806char *str_dup(const char *src)
1807{
1808 size_t size = _str_size(src) + 1;
1809 char *dest = malloc(size);
1810 if (!dest)
1811 return NULL;
1812
1813 memcpy(dest, src, size);
1814 _str_sanitize(dest, size, U_SPECIAL);
1815 return dest;
1816}
1817
1818/** Duplicate string with size limit.
1819 *
1820 * Allocate a new string and copy up to @max_size bytes from the source
1821 * string into it. The duplicate string is allocated via sleeping
1822 * malloc(), thus this function can sleep in no memory conditions.
1823 * No more than @max_size + 1 bytes is allocated, but if the size
1824 * occupied by the source string is smaller than @max_size + 1,
1825 * less is allocated.
1826 *
1827 * The allocation cannot fail and the return value is always
1828 * a valid pointer. The duplicate string is always a well-formed
1829 * null-terminated UTF-8 string, but it can differ from the source
1830 * string on the byte level.
1831 *
1832 * @param src Source string.
1833 * @param n Maximum number of bytes to duplicate.
1834 *
1835 * @return Duplicate string.
1836 *
1837 */
1838char *str_ndup(const char *src, size_t n)
1839{
1840 size_t size = _str_nsize(src, n);
1841
1842 char *dest = malloc(size + 1);
1843 if (!dest)
1844 return NULL;
1845
1846 memcpy(dest, src, size);
1847 _str_sanitize(dest, size, U_SPECIAL);
1848 dest[size] = 0;
1849 return dest;
1850}
1851
/** Split string by delimiters.
 *
 * The first token of @a s is cut out in place: the delimiter that
 * follows it is overwritten with a null terminator.
 *
 * @param s      String to be tokenized. NULL yields NULL.
 * @param delim  String with the delimiters.
 * @param next   Variable which will receive the pointer to the
 *               continuation of the string following the first
 *               delimiter after the token, or to the end of the
 *               string if the token is not followed by a delimiter.
 *               May be NULL.
 * @return       Pointer to the token, i.e. the part of @a s between
 *               the leading delimiters and the next delimiter.
 *               NULL if no such token exists.
 */
char *str_tok(char *s, const char *delim, char **next)
{
	if (s == NULL)
		return NULL;

	const size_t size = str_size(s);
	char32_t ch;

	/* Skip over leading delimiters. */
	size_t token_off = 0;
	size_t probe = 0;

	for (;;) {
		ch = str_decode(s, &probe, size);
		if (ch == 0 || str_chr(delim, ch) == NULL)
			break;

		token_off = probe;
	}

	/* Skip over token characters. */
	size_t end_off = token_off;
	probe = token_off;

	for (;;) {
		ch = str_decode(s, &probe, size);
		if (ch == 0 || str_chr(delim, ch) != NULL)
			break;

		end_off = probe;
	}

	/*
	 * A non-zero ch is the delimiter that ended the token and probe
	 * points right after it; otherwise the string ended with the token.
	 */
	if (next != NULL)
		*next = (ch != 0) ? &s[probe] : &s[end_off];

	if (end_off == token_off)
		return NULL; /* No more tokens. */

	/* Overwrite delimiter with null terminator. */
	s[end_off] = '\0';
	return &s[token_off];
}
1898
/** Scale a value down to a decimal (SI) order of magnitude.
 *
 * The value is divided by a power of 1000 chosen according to its
 * magnitude and the matching SI prefix is reported: 'k' (10^3),
 * 'M' (10^6), 'G' (10^9), 'T' (10^12), 'P' (10^15) or 'E' (10^18).
 * Values up to 10^6 are returned unchanged with a space as the suffix.
 *
 * @param val     Value to scale.
 * @param rv      Receives the scaled value (rounded down).
 * @param suffix  Receives the prefix character.
 */
void order_suffix(const uint64_t val, uint64_t *rv, char *suffix)
{
	if (val > UINT64_C(10000000000000000000)) {
		/* Divided by 10^18: exa. */
		*rv = val / UINT64_C(1000000000000000000);
		*suffix = 'E';
	} else if (val > UINT64_C(1000000000000000000)) {
		/* Divided by 10^15: peta. */
		*rv = val / UINT64_C(1000000000000000);
		*suffix = 'P';
	} else if (val > UINT64_C(1000000000000000)) {
		/* Divided by 10^12: tera. */
		*rv = val / UINT64_C(1000000000000);
		*suffix = 'T';
	} else if (val > UINT64_C(1000000000000)) {
		/* Divided by 10^9: giga. */
		*rv = val / UINT64_C(1000000000);
		*suffix = 'G';
	} else if (val > UINT64_C(1000000000)) {
		/* Divided by 10^6: mega. */
		*rv = val / UINT64_C(1000000);
		*suffix = 'M';
	} else if (val > UINT64_C(1000000)) {
		/* Divided by 10^3: kilo. */
		*rv = val / UINT64_C(1000);
		*suffix = 'k';
	} else {
		*rv = val;
		*suffix = ' ';
	}
}
1924
/** Scale a byte count down to a binary (IEC) order of magnitude.
 *
 * The value is divided by a power of 1024 chosen according to its
 * magnitude and the matching unit is reported: "KiB" (2^10),
 * "MiB" (2^20), "GiB" (2^30), "TiB" (2^40) or "PiB" (2^50).
 * Values up to 2^20 are returned unchanged with the unit "B".
 *
 * @param val     Value to scale.
 * @param rv      Receives the scaled value (rounded down).
 * @param suffix  Receives a pointer to a constant unit string.
 * @param fixed   If true, the plain byte unit is padded with spaces to
 *                the width of the other units.
 */
void bin_order_suffix(const uint64_t val, uint64_t *rv, const char **suffix,
    bool fixed)
{
	if (val > (UINT64_C(1) << 60)) {
		/* Divided by 2^50: pebibytes. */
		*rv = val / (UINT64_C(1) << 50);
		*suffix = "PiB";
	} else if (val > (UINT64_C(1) << 50)) {
		*rv = val / (UINT64_C(1) << 40);
		*suffix = "TiB";
	} else if (val > (UINT64_C(1) << 40)) {
		*rv = val / (UINT64_C(1) << 30);
		*suffix = "GiB";
	} else if (val > (UINT64_C(1) << 30)) {
		*rv = val / (UINT64_C(1) << 20);
		*suffix = "MiB";
	} else if (val > (UINT64_C(1) << 20)) {
		*rv = val / (UINT64_C(1) << 10);
		*suffix = "KiB";
	} else {
		*rv = val;
		/* Three columns wide, like every other unit. */
		*suffix = fixed ? "B  " : "B";
	}
}
1951
1952/** @}
1953 */
Note: See TracBrowser for help on using the repository browser.