source: mainline/common/str.c@ ae787807

Last change on this file since ae787807 was b31323f, checked in by Jiří Zárevúcky <zarevucky.jiri@…>, 2 months ago

Test, fix and extend string sanitization

  • Property mode set to 100644
File size: 45.6 KB
Line 
1/*
2 * Copyright (c) 2001-2004 Jakub Jermar
3 * Copyright (c) 2005 Martin Decky
4 * Copyright (c) 2008 Jiri Svoboda
5 * Copyright (c) 2011 Martin Sucha
6 * Copyright (c) 2011 Oleg Romanenko
7 * Copyright (c) 2025 Jiří Zárevúcky
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * - Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * - Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * - The name of the author may not be used to endorse or promote products
20 * derived from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
23 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
24 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
25 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
27 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
31 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34/** @addtogroup libc
35 * @{
36 */
37
38/**
39 * @file
40 * @brief String functions.
41 *
42 * Strings and characters use the Universal Character Set (UCS). The standard
43 * strings, called just strings, are encoded in UTF-8. Wide strings (encoded
44 * in UTF-32) are supported to a limited degree. A single character is
45 * represented as char32_t.@n
46 *
47 * Overview of the terminology:@n
48 *
49 * Term Meaning
50 * -------------------- ----------------------------------------------------
51 * byte 8 bits stored in uint8_t (unsigned 8 bit integer)
52 *
53 * character UTF-32 encoded Unicode character, stored in char32_t
54 * (unsigned 32 bit integer), code points 0 .. 1114111
55 * are valid
56 *
57 * Note that Unicode characters do not match
58 * one-to-one with displayed characters or glyphs on
59 * screen. For that level of precision, look up
60 * Grapheme Clusters.
61 *
62 * ASCII character 7 bit encoded ASCII character, stored in char
63 * (usually signed 8 bit integer), code points 0 .. 127
64 * are valid
65 *
66 * string UTF-8 encoded NULL-terminated Unicode string, char *
67 *
68 * wide string UTF-32 encoded NULL-terminated Unicode string,
69 * char32_t *
70 *
71 * [wide] string size number of BYTES in a [wide] string (excluding
72 * the NULL-terminator), size_t
73 *
74 * [wide] string length number of CHARACTERS in a [wide] string (excluding
75 * the NULL-terminator), size_t
76 *
77 * [wide] string width number of display cells on a monospace display taken
78 * by a [wide] string, size_t
79 *
80 * This is virtually impossible to determine exactly for
81 * all strings without knowing specifics of the display
82 * device, due to various factors affecting text output.
83 * If you have the option to query the terminal for
84 * position change caused by outputting the string,
85 * it is preferable to determine width that way.
86 *
87 *
88 * Overview of string metrics:@n
89 *
90 * Metric Abbrev. Type Meaning
91 * ------ ------ ------ -------------------------------------------------
92 * size n size_t number of BYTES in a string (excluding the
93 * NULL-terminator)
94 *
95 * length l size_t number of CHARACTERS in a string (excluding the
96 * null terminator)
97 *
98 * width w size_t number of display cells on a monospace display
99 * taken by a string
100 *
101 *
102 * Function naming prefixes:@n
103 *
104 * chr_ operate on characters
105 * ascii_ operate on ASCII characters
106 * str_ operate on strings
107 * wstr_ operate on wide strings
108 *
109 * [w]str_[n|l|w] operate on a prefix limited by size, length
110 * or width
111 *
112 *
113 * A specific character inside a [wide] string can be referred to by:@n
114 *
115 * pointer (char *, char32_t *)
116 * byte offset (size_t)
117 * character index (size_t)
118 *
119 */
120
121#include <str.h>
122
123#include <align.h>
124#include <assert.h>
125#include <ctype.h>
126#include <errno.h>
127#include <limits.h>
128#include <macros.h>
129#include <mem.h>
130#include <stdbool.h>
131#include <stddef.h>
132#include <stdint.h>
133#include <stdlib.h>
134#include <uchar.h>
135
136#if __STDC_HOSTED__
137#include <fibril.h>
138#endif
139
/** Report an illegal byte sequence through errno.
 *
 * The assignment is compiled in only when errno is defined as a macro;
 * otherwise the function does nothing.
 */
static void _set_ilseq(void)
{
#ifdef errno
	errno = EILSEQ;
#endif
}
146
/** Byte mask consisting of the lowest @a n bits (out of 8), 0 <= n <= 8 */
#define LO_MASK_8(n)  ((uint8_t) ((1U << (n)) - 1))

/**
 * Mask consisting of the lowest @a n bits (out of 32), 0 <= n <= 31.
 * The shift is done on an unsigned operand so that no signed overflow
 * can occur for large @a n.
 */
#define LO_MASK_32(n)  ((uint32_t) ((UINT32_C(1) << (n)) - 1))

/**
 * Byte mask consisting of the highest @a n bits (out of 8).
 * The complement is computed on a promoted int, so the result is cast back
 * to uint8_t to really be a byte mask.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6

/* Payload bits of the initial byte of a 2-, 3- and 4-byte sequence. */
#define UTF8_MASK_INITIAL2  0b00011111
#define UTF8_MASK_INITIAL3  0b00001111
#define UTF8_MASK_INITIAL4  0b00000111
/* Payload bits of a continuation byte. */
#define UTF8_MASK_CONT      0b00111111

/** Internal marker returned by the decoder for malformed input. */
#define CHAR_INVALID  ((char32_t) UINT_MAX)
165
/** True if @a b is a single-byte (ASCII) unit, i.e. has the form 0xxxxxxx. */
static inline bool _is_ascii(uint8_t b)
{
	return (b & 0x80) == 0;
}
170
/** True if @a b is a UTF-8 continuation byte, i.e. has the form 10xxxxxx. */
static inline bool _is_continuation(uint8_t b)
{
	return (b >> 6) == 0x2;
}
175
/** True if @a c has the form 110xxxxx (initial byte of a 2-byte sequence). */
static inline bool _is_2_byte(uint8_t c)
{
	return (c >> 5) == 0x6;
}
180
/** True if @a c has the form 1110xxxx (initial byte of a 3-byte sequence). */
static inline bool _is_3_byte(uint8_t c)
{
	return (c >> 4) == 0xE;
}
185
/** True if @a c has the form 11110xxx (initial byte of a 4-byte sequence). */
static inline bool _is_4_byte(uint8_t c)
{
	return (c >> 3) == 0x1E;
}
190
/** Number of continuation bytes needed to encode character @a c in UTF-8.
 *
 * @return 0 to 3 for characters that fit into 7, 11, 16 or 21 bits
 *         respectively, -1 for anything wider (not supported).
 */
static inline int _char_continuation_bytes(char32_t c)
{
	/* Payload capacity (in bits) of a sequence with 0..3 continuation bytes. */
	static const unsigned int capacity[] = { 7, 11, 16, 21 };

	for (int cont = 0; cont < 4; cont++) {
		if ((c & ~LO_MASK_32(capacity[cont])) == 0)
			return cont;
	}

	/* Codes longer than 21 bits are not supported */
	return -1;
}
208
/** Number of continuation bytes announced by the initial byte @a b.
 *
 *   0xxxxxxx                             -> 0
 *   110xxxxx 10xxxxxx                    -> 1
 *   1110xxxx 10xxxxxx 10xxxxxx           -> 2
 *   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  -> 3
 *
 * @return The count above, or -1 if @a b cannot start a sequence
 *         (this includes continuation bytes themselves).
 */
static inline int _continuation_bytes(uint8_t b)
{
	return _is_ascii(b) ? 0 :
	    _is_2_byte(b) ? 1 :
	    _is_3_byte(b) ? 2 :
	    _is_4_byte(b) ? 3 :
	    -1;
}
229
230static bool _is_non_shortest(const mbstate_t *mb, uint8_t b)
231{
232 return (mb->state == 0b1111110000000000 && !(b & 0b00100000)) ||
233 (mb->state == 0b1111111111110000 && !(b & 0b00110000));
234}
235
236static bool _is_surrogate(const mbstate_t *mb, uint8_t b)
237{
238 return (mb->state == 0b1111110000001101 && b >= 0xa0);
239}
240
/* Branch prediction hints; both evaluate to the value of (expr). */
#define _likely(expr)    __builtin_expect((expr), 1)
#define _unlikely(expr)  __builtin_expect((expr), 0)

/* Non-zero enables the whole-sequence shortcuts in _str_decode(). */
#define FAST_PATHS  1
245
246static char32_t _str_decode(const char *s, size_t *offset, size_t size, mbstate_t *mb)
247{
248 assert(s);
249 assert(offset);
250 assert(*offset <= size);
251 assert(size == STR_NO_LIMIT || s + size >= s);
252 assert(mb);
253
254 if (*offset == size)
255 return 0;
256
257 if (_likely(!mb->state)) {
258 /* Clean slate, read initial byte. */
259 uint8_t b = s[(*offset)++];
260
261 /* Fast exit for the most common case. */
262 if (_likely(_is_ascii(b)))
263 return b;
264
265 /* unexpected continuation byte */
266 if (_unlikely(_is_continuation(b)))
267 return CHAR_INVALID;
268
269 /*
270 * The value stored into `continuation` is designed to have
271 * just enough leading ones that after shifting in one less than
272 * the expected number of continuation bytes, the most significant
273 * bit becomes zero. (The field is 16b wide.)
274 */
275
276 if (_is_2_byte(b)) {
277 /* Reject non-shortest form. */
278 if (_unlikely(!(b & 0b00011110)))
279 return CHAR_INVALID;
280
281#if FAST_PATHS
282 /* We can usually take this exit. */
283 if (_likely(*offset < size && _is_continuation(s[*offset])))
284 return (b & UTF8_MASK_INITIAL2) << 6 |
285 (s[(*offset)++] & UTF8_MASK_CONT);
286#endif
287
288 /* 2 byte continuation 110xxxxx */
289 mb->state = b ^ 0b0000000011000000;
290
291 } else if (_is_3_byte(b)) {
292#if FAST_PATHS
293 /* We can usually take this exit. */
294 if (_likely(*offset + 1 < size && _is_continuation(s[*offset]) && _is_continuation(s[*offset + 1]))) {
295
296 char32_t ch = (b & UTF8_MASK_INITIAL3) << 12 |
297 (s[(*offset)] & UTF8_MASK_CONT) << 6 |
298 (s[(*offset) + 1] & UTF8_MASK_CONT);
299
300 *offset += 2;
301
302 /* Reject non-shortest form. */
303 if (_unlikely(!(ch & 0xFFFFF800)))
304 return CHAR_INVALID;
305
306 /* Reject surrogates */
307 if (_unlikely(ch >= 0xD800 && ch < 0xE000))
308 return CHAR_INVALID;
309
310 return ch;
311 }
312#endif
313
314 /* 3 byte continuation 1110xxxx */
315 mb->state = b ^ 0b1111110011100000;
316
317 } else if (_is_4_byte(b)) {
318#if FAST_PATHS
319 /* We can usually take this exit. */
320 if (_likely(*offset + 2 < size && _is_continuation(s[*offset]) &&
321 _is_continuation(s[*offset + 1]) && _is_continuation(s[*offset + 2]))) {
322
323 char32_t ch = (b & UTF8_MASK_INITIAL4) << 18 |
324 (s[(*offset)] & UTF8_MASK_CONT) << 12 |
325 (s[(*offset) + 1] & UTF8_MASK_CONT) << 6 |
326 (s[(*offset) + 2] & UTF8_MASK_CONT);
327
328 *offset += 3;
329
330 /* Reject non-shortest form. */
331 if (_unlikely(!(ch & 0xFFFF0000)))
332 return CHAR_INVALID;
333
334 /* Reject out-of-range characters. */
335 if (_unlikely(ch >= 0x110000))
336 return CHAR_INVALID;
337
338 return ch;
339 }
340#endif
341
342 /* 4 byte continuation 11110xxx */
343 mb->state = b ^ 0b1111111100000000;
344 } else {
345 return CHAR_INVALID;
346 }
347 }
348
349 /* Deal with the remaining edge and invalid cases. */
350 for (; *offset < size; (*offset)++) {
351 /* Read continuation bytes. */
352 uint8_t b = s[*offset];
353
354 if (!_is_continuation(b) || _is_non_shortest(mb, b) || _is_surrogate(mb, b)) {
355 mb->state = 0;
356 return CHAR_INVALID;
357 }
358
359 /* Top bit becomes zero when shifting in the second to last byte. */
360 if (!(mb->state & 0x8000)) {
361 char32_t c = ((char32_t) mb->state) << 6 | (b & UTF8_MASK_CONT);
362 mb->state = 0;
363 (*offset)++;
364 return c;
365 }
366
367 mb->state = mb->state << 6 | (b & UTF8_MASK_CONT);
368 }
369
370 /* Incomplete character. */
371 assert(mb->state);
372 return 0;
373}
374
375/** Standard <uchar.h> function since C11. */
376size_t mbrtoc32(char32_t *c, const char *s, size_t n, mbstate_t *mb)
377{
378#if __STDC_HOSTED__
379 static fibril_local mbstate_t global_state = { };
380
381 if (!mb)
382 mb = &global_state;
383#endif
384
385 if (!s) {
386 /* Equivalent to mbrtoc32(NULL, "", 1, mb); */
387 c = NULL;
388 s = "";
389 n = 1;
390 }
391
392 size_t offset = 0;
393 char32_t ret = _str_decode(s, &offset, n, mb);
394 if (ret == CHAR_INVALID) {
395 assert(!mb->state);
396 _set_ilseq();
397 return UCHAR_ILSEQ;
398 }
399 if (mb->state) {
400 assert(ret == 0);
401 return UCHAR_INCOMPLETE;
402 }
403
404 if (c)
405 *c = ret;
406 return ret ? offset : 0;
407}
408
409/** Decode a single character from a string.
410 *
411 * Decode a single character from a string of size @a size. Decoding starts
412 * at @a offset and this offset is moved to the beginning of the next
413 * character. In case of decoding error, offset generally advances at least
414 * by one. However, offset is never moved beyond size.
415 *
416 * @param str String (not necessarily NULL-terminated).
417 * @param offset Byte offset in string where to start decoding.
418 * @param size Size of the string (in bytes).
419 *
420 * @return Value of decoded character, U_SPECIAL on decoding error or
421 * NULL if attempt to decode beyond @a size.
422 *
423 */
424char32_t str_decode(const char *str, size_t *offset, size_t size)
425{
426 mbstate_t mb = { };
427 char32_t ch = _str_decode(str, offset, size, &mb);
428
429 if (ch == CHAR_INVALID)
430 return U_SPECIAL;
431
432 if (mb.state)
433 return U_SPECIAL;
434
435 return ch;
436}
437
438/** Decode a single character from a string to the left.
439 *
440 * Decode a single character from a string of size @a size. Decoding starts
441 * at @a offset and this offset is moved to the beginning of the previous
442 * character. In case of decoding error, offset generally decreases at least
443 * by one. However, offset is never moved before 0.
444 *
445 * @param str String (not necessarily NULL-terminated).
446 * @param offset Byte offset in string where to start decoding.
447 * @param size Size of the string (in bytes).
448 *
449 * @return Value of decoded character, U_SPECIAL on decoding error or
450 * NULL if attempt to decode beyond @a start of str.
451 *
452 */
453char32_t str_decode_reverse(const char *str, size_t *offset, size_t size)
454{
455 if (*offset == 0)
456 return 0;
457
458 int cbytes = 0;
459 /* Continue while continuation bytes found */
460 while (*offset > 0 && cbytes < 4) {
461 uint8_t b = (uint8_t) str[--(*offset)];
462
463 if (_is_continuation(b)) {
464 cbytes++;
465 continue;
466 }
467
468 /* Reject non-shortest form encoding. */
469 if (cbytes != _continuation_bytes(b))
470 return U_SPECIAL;
471
472 /* Start byte */
473 size_t start_offset = *offset;
474 return str_decode(str, &start_offset, size);
475 }
476
477 /* Too many continuation bytes */
478 return U_SPECIAL;
479}
480
481/** Encode a single character to string representation.
482 *
483 * Encode a single character to string representation (i.e. UTF-8) and store
484 * it into a buffer at @a offset. Encoding starts at @a offset and this offset
485 * is moved to the position where the next character can be written to.
486 *
487 * @param ch Input character.
488 * @param str Output buffer.
489 * @param offset Byte offset where to start writing.
490 * @param size Size of the output buffer (in bytes).
491 *
492 * @return EOK if the character was encoded successfully, EOVERFLOW if there
493 * was not enough space in the output buffer or EINVAL if the character
494 * code was invalid.
495 */
496errno_t chr_encode(char32_t ch, char *str, size_t *offset, size_t size)
497{
498 // TODO: merge with c32rtomb()
499
500 if (*offset >= size)
501 return EOVERFLOW;
502
503 /* Fast exit for the most common case. */
504 if (ch < 0x80) {
505 str[(*offset)++] = (char) ch;
506 return EOK;
507 }
508
509 /* Codes longer than 21 bits are not supported */
510 if (!chr_check(ch))
511 return EINVAL;
512
513 /* Determine how many continuation bytes are needed */
514
515 unsigned int cbytes = _char_continuation_bytes(ch);
516 unsigned int b0_bits = 6 - cbytes; /* Data bits in first byte */
517
518 /* Check for available space in buffer */
519 if (*offset + cbytes >= size)
520 return EOVERFLOW;
521
522 /* Encode continuation bytes */
523 unsigned int i;
524 for (i = cbytes; i > 0; i--) {
525 str[*offset + i] = 0x80 | (ch & LO_MASK_32(CONT_BITS));
526 ch >>= CONT_BITS;
527 }
528
529 /* Encode first byte */
530 str[*offset] = (ch & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);
531
532 /* Advance offset */
533 *offset += cbytes + 1;
534
535 return EOK;
536}
537
/**
 * Check a multibyte sequence whose initial byte seq[0] announces @a cont
 * (1 to 3) continuation bytes. The caller guarantees seq[0..cont] is readable.
 */
static bool _seq_is_well_formed(const uint8_t *seq, int cont)
{
	for (int i = 1; i <= cont; i++) {
		if (!_is_continuation(seq[i]))
			return false;
	}

	/*
	 * Non-shortest form checks follow
	 * https://www.unicode.org/versions/corrigendum1.html
	 */
	switch (cont) {
	case 1:
		/* 0b110!!!!x 0b10xxxxxx */
		return (seq[0] & 0b00011110) != 0;
	case 2:
		/* 0b1110!!!! 0b10!xxxxx 0b10xxxxxx */
		if (!(seq[0] & 0b00001111) && !(seq[1] & 0b00100000))
			return false;
		/* Surrogate character encoding. */
		if (seq[0] == 0xED && seq[1] >= 0xA0)
			return false;
		return true;
	case 3:
		/* 0b11110!!! 0b10!!xxxx 0b10xxxxxx 0b10xxxxxx */
		if (!(seq[0] & 0b00000111) && !(seq[1] & 0b00110000))
			return false;
		/* Out-of-range code points. */
		if (seq[0] > 0xF4 || (seq[0] == 0xF4 && seq[1] >= 0x90))
			return false;
		return true;
	default:
		return true;
	}
}

/**
 * Convert in place any bytes that don't form a valid character into
 * @a replacement. Processing stops at the first zero byte or after @a n
 * bytes, whichever comes first.
 *
 * @return Number of bytes that were replaced.
 */
static size_t _str_sanitize(char *str, size_t n, uint8_t replacement)
{
	uint8_t *cur = (uint8_t *) str;
	size_t replaced = 0;

	while (n > 0 && *cur != 0) {
		int cont = _continuation_bytes(*cur);

		/*
		 * A well-formed multibyte character that fits into the remaining
		 * n bytes is skipped as a whole.
		 */
		if (cont > 0 && n > (size_t) cont && _seq_is_well_formed(cur, cont)) {
			cur += cont + 1;
			n -= cont + 1;
			continue;
		}

		/*
		 * Anything else that is not ASCII is replaced one byte at a time;
		 * orphaned continuation bytes get their turn in later iterations.
		 */
		if (cont != 0) {
			*cur = replacement;
			replaced++;
		}

		cur++;
		n--;
	}

	return replaced;
}
616
/** Replace in place every byte of @a str that is not part of a valid
 * character with @a replacement, looking at no more than @a n bytes.
 *
 * @return Number of bytes that were replaced.
 */
size_t str_sanitize(char *str, size_t n, uint8_t replacement)
{
	size_t replaced = _str_sanitize(str, n, replacement);

	return replaced;
}
621
/** Count the bytes of @a str that precede its null terminator. */
static size_t _str_size(const char *str)
{
	const char *end = str;

	while (*end != 0)
		end++;

	return (size_t) (end - str);
}
631
/** Get size of string.
 *
 * @param str Null-terminated string to measure.
 *
 * @return Number of bytes used by the string, not counting
 *         the null terminator.
 *
 */
size_t str_size(const char *str)
{
	size_t size = _str_size(str);

	return size;
}
646
/** Get size of wide string.
 *
 * @param str Null-terminated wide string to measure.
 *
 * @return Number of bytes used by the wide string, not counting
 *         the null terminator.
 *
 */
size_t wstr_size(const char32_t *str)
{
	size_t chars = wstr_length(str);

	return chars * sizeof(char32_t);
}
661
662/** Get size of string with length limit.
663 *
664 * Get the number of bytes which are used by up to @a max_len first
665 * characters in the string @a str. If @a max_len is greater than
666 * the length of @a str, the entire string is measured (excluding the
667 * NULL-terminator).
668 *
669 * @param str String to consider.
670 * @param max_len Maximum number of characters to measure.
671 *
672 * @return Number of bytes used by the characters.
673 *
674 */
675size_t str_lsize(const char *str, size_t max_len)
676{
677 size_t len = 0;
678 size_t offset = 0;
679
680 while (len < max_len) {
681 if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
682 break;
683
684 len++;
685 }
686
687 return offset;
688}
689
/** Count the bytes of @a str before its null terminator, reading no more
 * than @a max_size bytes.
 *
 * The limit is tested before each byte is read, so a buffer of exactly
 * @a max_size bytes without a terminator is never read past its end.
 */
static size_t _str_nsize(const char *str, size_t max_size)
{
	size_t size = 0;

	while (size < max_size && str[size] != 0)
		size++;

	return size;
}
699
/** Get size of string with size limit.
 *
 * @param str      String to consider.
 * @param max_size Maximum number of bytes to measure.
 *
 * @return Number of bytes used by the string (excluding the null
 *         terminator), but no more than @a max_size.
 *
 */
size_t str_nsize(const char *str, size_t max_size)
{
	size_t size = _str_nsize(str, max_size);

	return size;
}
715
/** Get size of wide string with size limit.
 *
 * @param str      Wide string to consider.
 * @param max_size Maximum number of bytes to measure.
 *
 * @return Number of bytes used by the wide string (excluding the null
 *         terminator), considering no more than @a max_size bytes of it.
 *
 */
size_t wstr_nsize(const char32_t *str, size_t max_size)
{
	size_t chars = wstr_nlength(str, max_size);

	return chars * sizeof(char32_t);
}
731
/** Get size of wide string with length limit.
 *
 * Get the number of bytes which are used by up to @a max_len first
 * wide characters in the wide string @a str. If @a max_len is greater than
 * the length of @a str, the entire wide string is measured (excluding the
 * null terminator).
 *
 * @param str     Wide string to consider.
 * @param max_len Maximum number of wide characters to measure.
 *
 * @return Number of bytes used by the wide characters.
 *
 */
size_t wstr_lsize(const char32_t *str, size_t max_len)
{
	/*
	 * Convert the character limit to a byte limit, saturating instead of
	 * wrapping around when the product does not fit into size_t.
	 */
	size_t max_size = (max_len > SIZE_MAX / sizeof(char32_t)) ?
	    SIZE_MAX : max_len * sizeof(char32_t);

	return wstr_nlength(str, max_size) * sizeof(char32_t);
}
749
750/** Get number of characters in a string.
751 *
752 * @param str NULL-terminated string.
753 *
754 * @return Number of characters in string.
755 *
756 */
757size_t str_length(const char *str)
758{
759 size_t len = 0;
760 size_t offset = 0;
761
762 while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
763 len++;
764
765 return len;
766}
767
/** Get number of characters in a wide string.
 *
 * @param wstr Null-terminated wide string.
 *
 * @return Number of characters preceding the null terminator.
 *
 */
size_t wstr_length(const char32_t *wstr)
{
	const char32_t *end = wstr;

	while (*end != 0)
		end++;

	return (size_t) (end - wstr);
}
784
/** Get number of characters in a string with size limit.
 *
 * Counting stops at the null terminator or after @a size bytes,
 * whichever comes first.
 *
 * @param str  Null-terminated string.
 * @param size Maximum number of bytes to consider.
 *
 * @return Number of characters in string.
 *
 */
size_t str_nlength(const char *str, size_t size)
{
	size_t pos = 0;
	size_t count;

	for (count = 0; str_decode(str, &pos, size) != 0; count++)
		;

	return count;
}
803
/** Get number of characters in a wide string with size limit.
 *
 * @param str  Null-terminated wide string.
 * @param size Maximum number of bytes to consider; a trailing partial
 *             character is ignored.
 *
 * @return Number of characters in the wide string.
 *
 */
size_t wstr_nlength(const char32_t *str, size_t size)
{
	/* Only whole characters within the byte limit are looked at. */
	size_t limit = ALIGN_DOWN(size, sizeof(char32_t));
	size_t len = 0;

	for (size_t used = 0; used < limit; used += sizeof(char32_t)) {
		if (str[len] == 0)
			break;

		len++;
	}

	return len;
}
825
/** Get character display width on a character cell display.
 *
 * The current implementation reports one cell for every character.
 *
 * @param ch Character (not inspected).
 * @return Width of character in cells.
 */
size_t chr_width(char32_t ch)
{
	const size_t cells = 1;

	return cells;
}
835
836/** Get string display width on a character cell display.
837 *
838 * @param str String
839 * @return Width of string in cells.
840 */
841size_t str_width(const char *str)
842{
843 size_t width = 0;
844 size_t offset = 0;
845 char32_t ch;
846
847 while ((ch = str_decode(str, &offset, STR_NO_LIMIT)) != 0)
848 width += chr_width(ch);
849
850 return width;
851}
852
/** Check whether character is plain ASCII.
 *
 * @return True if @a ch is in the range 0 .. 127.
 *
 */
bool ascii_check(char32_t ch)
{
	return ch < 0x80;
}
865
/** Check whether character is valid
 *
 * @return True if @a ch does not exceed the highest Unicode code point
 *         (0x10FFFF, i.e. 1114111).
 *
 */
bool chr_check(char32_t ch)
{
	return ch <= 0x10FFFF;
}
878
/** Compare two null-terminated strings.
 *
 * The strings are considered equal iff their length is equal
 * and both strings consist of the same sequence of characters.
 *
 * A string S1 is less than another string S2 if it has a character with
 * lower value at the first character position where the strings differ.
 * If the strings differ in length, the shorter one is treated as if
 * padded by characters with a value of zero.
 *
 * @param s1 First string to compare.
 * @param s2 Second string to compare.
 *
 * @return 0 if the strings are equal, -1 if the first is less than the second,
 *         1 if the second is less than the first.
 *
 */
int str_cmp(const char *s1, const char *s2)
{
	/*
	 * UTF-8 has the nice property that lexicographic ordering on bytes is
	 * the same as the lexicographic ordering of the character sequences.
	 * That only holds for bytes taken as unsigned values, though; plain
	 * char may be signed, which would sort all non-ASCII bytes below ASCII.
	 */
	const unsigned char *a = (const unsigned char *) s1;
	const unsigned char *b = (const unsigned char *) s2;

	while (*a == *b && *a != 0) {
		a++;
		b++;
	}

	if (*a == *b)
		return 0;

	return (*a < *b) ? -1 : 1;
}
913
914/** Compare two NULL terminated strings with length limit.
915 *
916 * Do a char-by-char comparison of two NULL-terminated strings.
917 * The strings are considered equal iff
918 * min(str_length(s1), max_len) == min(str_length(s2), max_len)
919 * and both strings consist of the same sequence of characters,
920 * up to max_len characters.
921 *
922 * A string S1 is less than another string S2 if it has a character with
923 * lower value at the first character position where the strings differ.
924 * If the strings differ in length, the shorter one is treated as if
925 * padded by characters with a value of zero. Only the first max_len
926 * characters are considered.
927 *
928 * @param s1 First string to compare.
929 * @param s2 Second string to compare.
930 * @param max_len Maximum number of characters to consider.
931 *
932 * @return 0 if the strings are equal, -1 if the first is less than the second,
933 * 1 if the second is less than the first.
934 *
935 */
936int str_lcmp(const char *s1, const char *s2, size_t max_len)
937{
938 char32_t c1 = 0;
939 char32_t c2 = 0;
940
941 size_t off1 = 0;
942 size_t off2 = 0;
943
944 size_t len = 0;
945
946 while (true) {
947 if (len >= max_len)
948 break;
949
950 c1 = str_decode(s1, &off1, STR_NO_LIMIT);
951 c2 = str_decode(s2, &off2, STR_NO_LIMIT);
952
953 if (c1 < c2)
954 return -1;
955
956 if (c1 > c2)
957 return 1;
958
959 if (c1 == 0 || c2 == 0)
960 break;
961
962 ++len;
963 }
964
965 return 0;
966
967}
968
969/** Compare two NULL terminated strings in case-insensitive manner.
970 *
971 * Do a char-by-char comparison of two NULL-terminated strings.
972 * The strings are considered equal iff their length is equal
973 * and both strings consist of the same sequence of characters
974 * when converted to lower case.
975 *
976 * A string S1 is less than another string S2 if it has a character with
977 * lower value at the first character position where the strings differ.
978 * If the strings differ in length, the shorter one is treated as if
979 * padded by characters with a value of zero.
980 *
981 * @param s1 First string to compare.
982 * @param s2 Second string to compare.
983 *
984 * @return 0 if the strings are equal, -1 if the first is less than the second,
985 * 1 if the second is less than the first.
986 *
987 */
988int str_casecmp(const char *s1, const char *s2)
989{
990 // FIXME: doesn't work for non-ASCII caseful characters
991
992 char32_t c1 = 0;
993 char32_t c2 = 0;
994
995 size_t off1 = 0;
996 size_t off2 = 0;
997
998 while (true) {
999 c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
1000 c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
1001
1002 if (c1 < c2)
1003 return -1;
1004
1005 if (c1 > c2)
1006 return 1;
1007
1008 if (c1 == 0 || c2 == 0)
1009 break;
1010 }
1011
1012 return 0;
1013}
1014
1015/** Compare two NULL terminated strings with length limit in case-insensitive
1016 * manner.
1017 *
1018 * Do a char-by-char comparison of two NULL-terminated strings.
1019 * The strings are considered equal iff
1020 * min(str_length(s1), max_len) == min(str_length(s2), max_len)
1021 * and both strings consist of the same sequence of characters,
1022 * up to max_len characters.
1023 *
1024 * A string S1 is less than another string S2 if it has a character with
1025 * lower value at the first character position where the strings differ.
1026 * If the strings differ in length, the shorter one is treated as if
1027 * padded by characters with a value of zero. Only the first max_len
1028 * characters are considered.
1029 *
1030 * @param s1 First string to compare.
1031 * @param s2 Second string to compare.
1032 * @param max_len Maximum number of characters to consider.
1033 *
1034 * @return 0 if the strings are equal, -1 if the first is less than the second,
1035 * 1 if the second is less than the first.
1036 *
1037 */
1038int str_lcasecmp(const char *s1, const char *s2, size_t max_len)
1039{
1040 // FIXME: doesn't work for non-ASCII caseful characters
1041
1042 char32_t c1 = 0;
1043 char32_t c2 = 0;
1044
1045 size_t off1 = 0;
1046 size_t off2 = 0;
1047
1048 size_t len = 0;
1049
1050 while (true) {
1051 if (len >= max_len)
1052 break;
1053
1054 c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
1055 c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
1056
1057 if (c1 < c2)
1058 return -1;
1059
1060 if (c1 > c2)
1061 return 1;
1062
1063 if (c1 == 0 || c2 == 0)
1064 break;
1065
1066 ++len;
1067 }
1068
1069 return 0;
1070
1071}
1072
/** Byte-wise test whether @a p is a prefix of @a s. */
static bool _test_prefix(const char *s, const char *p)
{
	for (; *p != 0; s++, p++) {
		/* Also taken when s ends first, as its terminator differs from *p. */
		if (*s != *p)
			return false;
	}

	return true;
}
1082
/** Test whether p is a prefix of s.
 *
 * @param s The string in which to look
 * @param p The string to check if it is a prefix of s
 *
 * @return true iff p is prefix of s else false
 *
 */
bool str_test_prefix(const char *s, const char *p)
{
	bool is_prefix = _test_prefix(s, p);

	return is_prefix;
}
1098
1099/** Get a string suffix.
1100 *
1101 * Return a string suffix defined by the prefix length.
1102 *
1103 * @param s The string to get the suffix from.
1104 * @param prefix_length Number of prefix characters to ignore.
1105 *
1106 * @return String suffix.
1107 *
1108 */
1109const char *str_suffix(const char *s, size_t prefix_length)
1110{
1111 size_t off = 0;
1112 size_t i = 0;
1113
1114 while (true) {
1115 str_decode(s, &off, STR_NO_LIMIT);
1116 i++;
1117
1118 if (i >= prefix_length)
1119 break;
1120 }
1121
1122 return s + off;
1123}
1124
/** Copy string as a sequence of bytes, terminator included.
 * The destination size is not checked.
 */
static void _str_cpy(char *dest, const char *src)
{
	char byte;

	do {
		byte = *src++;
		*dest++ = byte;
	} while (byte != 0);
}
1133
1134/** Copy string as a sequence of bytes. */
1135static void _str_cpyn(char *dest, size_t size, const char *src)
1136{
1137 assert(dest && src && size);
1138
1139 if (!dest || !src || !size)
1140 return;
1141
1142 if (size == STR_NO_LIMIT)
1143 return _str_cpy(dest, src);
1144
1145 char *dest_top = dest + size - 1;
1146 assert(size == 1 || dest < dest_top);
1147
1148 while (*src && dest < dest_top)
1149 *(dest++) = *(src++);
1150
1151 *dest = 0;
1152}
1153
1154/** Copy string.
1155 *
1156 * Copy source string @a src to destination buffer @a dest.
1157 * No more than @a size bytes are written. If the size of the output buffer
1158 * is at least one byte, the output string will always be well-formed, i.e.
1159 * null-terminated and containing only complete characters.
1160 *
1161 * @param dest Destination buffer.
1162 * @param count Size of the destination buffer (must be > 0).
1163 * @param src Source string.
1164 *
1165 */
1166void str_cpy(char *dest, size_t size, const char *src)
1167{
1168 /* There must be space for a null terminator in the buffer. */
1169 assert(size > 0);
1170 assert(src != NULL);
1171 assert(dest != NULL);
1172 assert(size == STR_NO_LIMIT || dest + size > dest);
1173
1174 /* Copy data. */
1175 _str_cpyn(dest, size, src);
1176
1177 /* In-place translate invalid bytes to U_SPECIAL. */
1178 _str_sanitize(dest, size, U_SPECIAL);
1179}
1180
1181/** Copy size-limited substring.
1182 *
1183 * Copy prefix of string @a src of max. size @a size to destination buffer
1184 * @a dest. No more than @a size bytes are written. The output string will
1185 * always be well-formed, i.e. null-terminated and containing only complete
1186 * characters.
1187 *
1188 * No more than @a n bytes are read from the input string, so it does not
1189 * have to be null-terminated.
1190 *
1191 * @param dest Destination buffer.
1192 * @param count Size of the destination buffer (must be > 0).
1193 * @param src Source string.
1194 * @param n Maximum number of bytes to read from @a src.
1195 *
1196 */
1197void str_ncpy(char *dest, size_t size, const char *src, size_t n)
1198{
1199 /* There must be space for a null terminator in the buffer. */
1200 assert(size > 0);
1201 assert(src != NULL);
1202
1203 /* Copy data. */
1204 _str_cpyn(dest, min(size, n + 1), src);
1205
1206 /* In-place translate invalid bytes to U_SPECIAL. */
1207 _str_sanitize(dest, size, U_SPECIAL);
1208}
1209
1210/** Append one string to another.
1211 *
1212 * Append source string @a src to string in destination buffer @a dest.
1213 * Size of the destination buffer is @a dest. If the size of the output buffer
1214 * is at least one byte, the output string will always be well-formed, i.e.
1215 * null-terminated and containing only complete characters.
1216 *
1217 * @param dest Destination buffer.
1218 * @param count Size of the destination buffer.
1219 * @param src Source string.
1220 */
1221void str_append(char *dest, size_t size, const char *src)
1222{
1223 assert(src != NULL);
1224 assert(dest != NULL);
1225 assert(size > 0);
1226 assert(size == STR_NO_LIMIT || dest + size > dest);
1227
1228 size_t dstr_size = _str_nsize(dest, size);
1229 if (dstr_size < size) {
1230 _str_cpyn(dest + dstr_size, size - dstr_size, src);
1231 _str_sanitize(dest + dstr_size, size - dstr_size, U_SPECIAL);
1232 }
1233}
1234
1235/** Convert space-padded ASCII to string.
1236 *
1237 * Common legacy text encoding in hardware is 7-bit ASCII fitted into
1238 * a fixed-width byte buffer (bit 7 always zero), right-padded with spaces
1239 * (ASCII 0x20). Convert space-padded ascii to string representation.
1240 *
1241 * If the text does not fit into the destination buffer, the function converts
1242 * as many characters as possible and returns EOVERFLOW.
1243 *
1244 * If the text contains non-ASCII bytes (with bit 7 set), the whole string is
1245 * converted anyway and invalid characters are replaced with question marks
1246 * (U_SPECIAL) and the function returns EIO.
1247 *
1248 * Regardless of return value upon return @a dest will always be well-formed.
1249 *
1250 * @param dest Destination buffer
1251 * @param size Size of destination buffer
1252 * @param src Space-padded ASCII.
1253 * @param n Size of the source buffer in bytes.
1254 *
1255 * @return EOK on success, EOVERFLOW if the text does not fit
1256 * destination buffer, EIO if the text contains
1257 * non-ASCII bytes.
1258 */
1259errno_t spascii_to_str(char *dest, size_t size, const uint8_t *src, size_t n)
1260{
1261 size_t len = 0;
1262
1263 /* Determine the length of the source string. */
1264 for (size_t i = 0; i < n; i++) {
1265 if (src[i] == 0)
1266 break;
1267
1268 if (src[i] != ' ')
1269 len = i + 1;
1270 }
1271
1272 errno_t result = EOK;
1273 size_t out_len = min(len, size - 1);
1274
1275 /* Copy characters */
1276 for (size_t i = 0; i < out_len; i++) {
1277 dest[i] = src[i];
1278
1279 if (dest[i] < 0) {
1280 dest[i] = U_SPECIAL;
1281 result = EIO;
1282 }
1283 }
1284
1285 dest[out_len] = 0;
1286
1287 if (out_len < len)
1288 return EOVERFLOW;
1289
1290 return result;
1291}
1292
1293/** Convert wide string to string.
1294 *
1295 * Convert wide string @a src to string. The output is written to the buffer
1296 * specified by @a dest and @a size. @a size must be non-zero and the string
1297 * written will always be well-formed.
1298 *
1299 * @param dest Destination buffer.
1300 * @param size Size of the destination buffer.
1301 * @param src Source wide string.
1302 */
1303void wstr_to_str(char *dest, size_t size, const char32_t *src)
1304{
1305 char32_t ch;
1306 size_t src_idx;
1307 size_t dest_off;
1308
1309 /* There must be space for a null terminator in the buffer. */
1310 assert(size > 0);
1311
1312 src_idx = 0;
1313 dest_off = 0;
1314
1315 while ((ch = src[src_idx++]) != 0) {
1316 if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
1317 break;
1318 }
1319
1320 dest[dest_off] = '\0';
1321}
1322
1323/** Convert UTF16 string to string.
1324 *
1325 * Convert utf16 string @a src to string. The output is written to the buffer
1326 * specified by @a dest and @a size. @a size must be non-zero and the string
1327 * written will always be well-formed. Surrogate pairs also supported.
1328 *
1329 * @param dest Destination buffer.
1330 * @param size Size of the destination buffer.
1331 * @param src Source utf16 string.
1332 *
1333 * @return EOK, if success, an error code otherwise.
1334 */
1335errno_t utf16_to_str(char *dest, size_t size, const uint16_t *src)
1336{
1337 size_t idx = 0, dest_off = 0;
1338 char32_t ch;
1339 errno_t rc = EOK;
1340
1341 /* There must be space for a null terminator in the buffer. */
1342 assert(size > 0);
1343
1344 while (src[idx]) {
1345 if ((src[idx] & 0xfc00) == 0xd800) {
1346 if (src[idx + 1] && (src[idx + 1] & 0xfc00) == 0xdc00) {
1347 ch = 0x10000;
1348 ch += (src[idx] & 0x03FF) << 10;
1349 ch += (src[idx + 1] & 0x03FF);
1350 idx += 2;
1351 } else
1352 break;
1353 } else {
1354 ch = src[idx];
1355 idx++;
1356 }
1357 rc = chr_encode(ch, dest, &dest_off, size - 1);
1358 if (rc != EOK)
1359 break;
1360 }
1361 dest[dest_off] = '\0';
1362 return rc;
1363}
1364
1365/** Convert string to UTF16 string.
1366 *
1367 * Convert string @a src to utf16 string. The output is written to the buffer
1368 * specified by @a dest and @a dlen. @a dlen must be non-zero and the string
1369 * written will always be well-formed. Surrogate pairs also supported.
1370 *
1371 * @param dest Destination buffer.
1372 * @param dlen Number of utf16 characters that fit in the destination buffer.
1373 * @param src Source string.
1374 *
1375 * @return EOK, if success, an error code otherwise.
1376 */
1377errno_t str_to_utf16(uint16_t *dest, size_t dlen, const char *src)
1378{
1379 errno_t rc = EOK;
1380 size_t offset = 0;
1381 size_t idx = 0;
1382 char32_t c;
1383
1384 assert(dlen > 0);
1385
1386 while ((c = str_decode(src, &offset, STR_NO_LIMIT)) != 0) {
1387 if (c > 0x10000) {
1388 if (idx + 2 >= dlen - 1) {
1389 rc = EOVERFLOW;
1390 break;
1391 }
1392 c = (c - 0x10000);
1393 dest[idx] = 0xD800 | (c >> 10);
1394 dest[idx + 1] = 0xDC00 | (c & 0x3FF);
1395 idx++;
1396 } else {
1397 dest[idx] = c;
1398 }
1399
1400 idx++;
1401 if (idx >= dlen - 1) {
1402 rc = EOVERFLOW;
1403 break;
1404 }
1405 }
1406
1407 dest[idx] = '\0';
1408 return rc;
1409}
1410
/** Get size of UTF-16 string.
 *
 * Count the 16-bit words of the UTF-16 string @a ustr that precede its
 * zero terminator. A surrogate pair counts as two words.
 *
 * @param ustr UTF-16 string to consider.
 *
 * @return Number of words used by the UTF-16 string, terminator excluded.
 *
 */
size_t utf16_wsize(const uint16_t *ustr)
{
	size_t count = 0;

	for (const uint16_t *cur = ustr; *cur != 0; cur++)
		count++;

	return count;
}
1430
1431/** Convert wide string to new string.
1432 *
1433 * Convert wide string @a src to string. Space for the new string is allocated
1434 * on the heap.
1435 *
1436 * @param src Source wide string.
1437 * @return New string.
1438 */
1439char *wstr_to_astr(const char32_t *src)
1440{
1441 char dbuf[STR_BOUNDS(1)];
1442 char *str;
1443 char32_t ch;
1444
1445 size_t src_idx;
1446 size_t dest_off;
1447 size_t dest_size;
1448
1449 /* Compute size of encoded string. */
1450
1451 src_idx = 0;
1452 dest_size = 0;
1453
1454 while ((ch = src[src_idx++]) != 0) {
1455 dest_off = 0;
1456 if (chr_encode(ch, dbuf, &dest_off, STR_BOUNDS(1)) != EOK)
1457 break;
1458 dest_size += dest_off;
1459 }
1460
1461 str = malloc(dest_size + 1);
1462 if (str == NULL)
1463 return NULL;
1464
1465 /* Encode string. */
1466
1467 src_idx = 0;
1468 dest_off = 0;
1469
1470 while ((ch = src[src_idx++]) != 0) {
1471 if (chr_encode(ch, str, &dest_off, dest_size) != EOK)
1472 break;
1473 }
1474
1475 str[dest_size] = '\0';
1476 return str;
1477}
1478
1479/** Convert string to wide string.
1480 *
1481 * Convert string @a src to wide string. The output is written to the
1482 * buffer specified by @a dest and @a dlen. @a dlen must be non-zero
1483 * and the wide string written will always be null-terminated.
1484 *
1485 * @param dest Destination buffer.
1486 * @param dlen Length of destination buffer (number of wchars).
1487 * @param src Source string.
1488 */
1489void str_to_wstr(char32_t *dest, size_t dlen, const char *src)
1490{
1491 size_t offset;
1492 size_t di;
1493 char32_t c;
1494
1495 assert(dlen > 0);
1496
1497 offset = 0;
1498 di = 0;
1499
1500 do {
1501 if (di >= dlen - 1)
1502 break;
1503
1504 c = str_decode(src, &offset, STR_NO_LIMIT);
1505 dest[di++] = c;
1506 } while (c != '\0');
1507
1508 dest[dlen - 1] = '\0';
1509}
1510
/** Convert string to newly allocated wide string.
 *
 * Convert string @a str to a wide string. The wide null-terminated string
 * is allocated on the heap, sized by the character count of @a str.
 *
 * @param str Source string.
 *
 * @return New wide string, or NULL if the allocation failed.
 */
char32_t *str_to_awstr(const char *str)
{
	/* Characters in the source plus one slot for the terminator. */
	size_t slots = str_length(str) + 1;

	char32_t *wstr = calloc(slots, sizeof(char32_t));
	if (wstr != NULL)
		str_to_wstr(wstr, slots, str);

	return wstr;
}
1529
/** Find the first byte equal to @a c; the terminator itself can be matched. */
static char *_strchr(const char *str, char c)
{
	for (;; str++) {
		/* Checked first so that searching for 0 finds the terminator. */
		if (*str == c)
			return (char *) str;

		if (*str == 0)
			return NULL;
	}
}
1537
1538/** Find first occurence of character in string.
1539 *
1540 * @param str String to search.
1541 * @param ch Character to look for.
1542 *
1543 * @return Pointer to character in @a str or NULL if not found.
1544 */
1545char *str_chr(const char *str, char32_t ch)
1546{
1547 /* Fast path for an ASCII character. */
1548 if (ascii_check(ch))
1549 return _strchr(str, ch);
1550
1551 /* Convert character to UTF-8. */
1552 char utf8[STR_BOUNDS(1) + 1];
1553 size_t offset = 0;
1554
1555 if (chr_encode(ch, utf8, &offset, sizeof(utf8)) != EOK || offset == 0)
1556 return NULL;
1557
1558 utf8[offset] = '\0';
1559
1560 /* Find the first byte, then check if all of them are correct. */
1561 while (*str != 0) {
1562 str = _strchr(str, utf8[0]);
1563 if (!str)
1564 return NULL;
1565
1566 if (_test_prefix(str, utf8))
1567 return (char *) str;
1568
1569 str++;
1570 }
1571
1572 return NULL;
1573}
1574
/** Find first occurrence of substring in string.
 *
 * Every position of the haystack that still has at least as many bytes
 * left as the needle is tested with _test_prefix().
 *
 * @param hs Haystack (string)
 * @param n  Needle (substring to look for)
 *
 * @return Pointer to character in @a hs or @c NULL if not found.
 */
char *str_str(const char *hs, const char *n)
{
	size_t remaining = _str_size(hs);
	size_t needle_size = _str_size(n);

	for (; remaining >= needle_size; hs++, remaining--) {
		if (_test_prefix(hs, n))
			return (char *) hs;
	}

	return NULL;
}
1597
/** Remove all trailing bytes equal to @a c from a string, in place.
 *
 * A string made up only of @a c becomes empty; an empty string is left
 * untouched.
 */
static void _str_rtrim(char *str, char c)
{
	/* Position just past the last byte that has to be kept. */
	char *end = str;

	for (; *str != 0; str++) {
		if (*str != c)
			end = str + 1;
	}

	/*
	 * Truncate string. end never points past the original terminator, so
	 * this cannot write outside of the string.
	 */
	*end = 0;
}
1612
1613/** Removes specified trailing characters from a string.
1614 *
1615 * @param str String to remove from.
1616 * @param ch Character to remove.
1617 */
1618void str_rtrim(char *str, char32_t ch)
1619{
1620 /* Fast path for the ASCII case. */
1621 if (ascii_check(ch)) {
1622 _str_rtrim(str, ch);
1623 return;
1624 }
1625
1626 size_t off = 0;
1627 size_t pos = 0;
1628 char32_t c;
1629 bool update_last_chunk = true;
1630 char *last_chunk = NULL;
1631
1632 while ((c = str_decode(str, &off, STR_NO_LIMIT))) {
1633 if (c != ch) {
1634 update_last_chunk = true;
1635 last_chunk = NULL;
1636 } else if (update_last_chunk) {
1637 update_last_chunk = false;
1638 last_chunk = (str + pos);
1639 }
1640 pos = off;
1641 }
1642
1643 if (last_chunk)
1644 *last_chunk = '\0';
1645}
1646
/** Remove all leading bytes equal to @a c from a string, in place. */
static void _str_ltrim(char *str, char c)
{
	const char *rest = str;

	/*
	 * The terminator check keeps the scan inside the string even when
	 * c itself is zero.
	 */
	while (*rest != 0 && *rest == c)
		rest++;

	if (rest == str)
		return;

	/*
	 * Move the remainder, terminator included, to the front. The regions
	 * overlap, but the destination is always behind the source, so a
	 * forward byte copy is safe.
	 */
	char *out = str;

	do {
		*out = *rest++;
	} while (*out++ != 0);
}
1657
1658/** Removes specified leading characters from a string.
1659 *
1660 * @param str String to remove from.
1661 * @param ch Character to remove.
1662 */
1663void str_ltrim(char *str, char32_t ch)
1664{
1665 /* Fast path for the ASCII case. */
1666 if (ascii_check(ch)) {
1667 _str_ltrim(str, ch);
1668 return;
1669 }
1670
1671 char32_t acc;
1672 size_t off = 0;
1673 size_t pos = 0;
1674 size_t str_sz = str_size(str);
1675
1676 while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
1677 if (acc != ch)
1678 break;
1679 else
1680 pos = off;
1681 }
1682
1683 if (pos > 0) {
1684 memmove(str, &str[pos], str_sz - pos);
1685 pos = str_sz - pos;
1686 str[pos] = '\0';
1687 }
1688}
1689
/** Find the last byte equal to @a c; the terminator is never matched. */
static char *_str_rchr(const char *str, char c)
{
	const char *hit = NULL;

	for (const char *cur = str; *cur != 0; cur++) {
		if (*cur == c)
			hit = cur;
	}

	return (char *) hit;
}
1703
1704/** Find last occurence of character in string.
1705 *
1706 * @param str String to search.
1707 * @param ch Character to look for.
1708 *
1709 * @return Pointer to character in @a str or NULL if not found.
1710 */
1711char *str_rchr(const char *str, char32_t ch)
1712{
1713 if (ascii_check(ch))
1714 return _str_rchr(str, ch);
1715
1716 char32_t acc;
1717 size_t off = 0;
1718 size_t last = 0;
1719 const char *res = NULL;
1720
1721 while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
1722 if (acc == ch)
1723 res = (str + last);
1724 last = off;
1725 }
1726
1727 return (char *) res;
1728}
1729
/** Insert a wide character into a wide string.
 *
 * Insert a wide character into a wide string at position
 * @a pos. The characters after the position are shifted.
 *
 * @param str     String to insert to.
 * @param ch      Character to insert to.
 * @param pos     Character index where to insert.
 * @param max_pos Characters in the buffer (terminator included).
 *
 * @return True if the insertion was successful, false if the position
 *         is out of bounds or the buffer has no room for one more
 *         character.
 *
 */
bool wstr_linsert(char32_t *str, char32_t ch, size_t pos, size_t max_pos)
{
	size_t len = wstr_length(str);

	if (pos > len)
		return false;

	/*
	 * The result occupies len + 1 characters plus the terminator. The
	 * limit has to be checked against the string length, not against pos:
	 * otherwise inserting at the beginning of a full buffer would push
	 * its tail out of bounds.
	 */
	if (len + 2 > max_pos)
		return false;

	/* Shift the tail, terminator included, starting from its end. */
	for (size_t i = len + 1; i > pos; i--)
		str[i] = str[i - 1];

	str[pos] = ch;

	return true;
}
1759
/** Remove a wide character from a wide string.
 *
 * Remove the wide character at position @a pos. The characters after the
 * position, including the terminator, are shifted one place down.
 *
 * @param str String to remove from.
 * @param pos Character index to remove.
 *
 * @return True if the removal was successful, false if the position
 *         is out of bounds.
 *
 */
bool wstr_remove(char32_t *str, size_t pos)
{
	size_t len = wstr_length(str);

	if (len <= pos)
		return false;

	/* Each slot from pos up to the last character takes its successor. */
	char32_t *slot = &str[pos];
	char32_t *terminator = &str[len];

	while (slot < terminator) {
		slot[0] = slot[1];
		slot++;
	}

	return true;
}
1785
1786/** Duplicate string.
1787 *
1788 * Allocate a new string and copy characters from the source
1789 * string into it. The duplicate string is allocated via sleeping
1790 * malloc(), thus this function can sleep in no memory conditions.
1791 *
1792 * The allocation cannot fail and the return value is always
1793 * a valid pointer. The duplicate string is always a well-formed
1794 * null-terminated UTF-8 string, but it can differ from the source
1795 * string on the byte level.
1796 *
1797 * @param src Source string.
1798 *
1799 * @return Duplicate string.
1800 *
1801 */
1802char *str_dup(const char *src)
1803{
1804 size_t size = _str_size(src) + 1;
1805 char *dest = malloc(size);
1806 if (!dest)
1807 return NULL;
1808
1809 memcpy(dest, src, size);
1810 _str_sanitize(dest, size, U_SPECIAL);
1811 return dest;
1812}
1813
1814/** Duplicate string with size limit.
1815 *
1816 * Allocate a new string and copy up to @max_size bytes from the source
1817 * string into it. The duplicate string is allocated via sleeping
1818 * malloc(), thus this function can sleep in no memory conditions.
1819 * No more than @max_size + 1 bytes is allocated, but if the size
1820 * occupied by the source string is smaller than @max_size + 1,
1821 * less is allocated.
1822 *
1823 * The allocation cannot fail and the return value is always
1824 * a valid pointer. The duplicate string is always a well-formed
1825 * null-terminated UTF-8 string, but it can differ from the source
1826 * string on the byte level.
1827 *
1828 * @param src Source string.
1829 * @param n Maximum number of bytes to duplicate.
1830 *
1831 * @return Duplicate string.
1832 *
1833 */
1834char *str_ndup(const char *src, size_t n)
1835{
1836 size_t size = _str_nsize(src, n);
1837
1838 char *dest = malloc(size + 1);
1839 if (!dest)
1840 return NULL;
1841
1842 memcpy(dest, src, size);
1843 _str_sanitize(dest, size, U_SPECIAL);
1844 dest[size] = 0;
1845 return dest;
1846}
1847
/** Split string by delimiters.
 *
 * Leading delimiter characters are skipped, then the token runs up to the
 * next delimiter character or the end of the string. The token is
 * terminated in place, i.e. @a s is modified.
 *
 * @param s     String to be tokenized. For NULL the function returns NULL
 *              right away and leaves @a next untouched.
 * @param delim String with the delimiters.
 * @param next  Variable which will receive the pointer to the
 *              continuation of the string: just past the delimiter that
 *              ended the token, or the end of the string if there was
 *              none. May be NULL.
 * @return      Pointer to the token, or NULL if there are no more tokens.
 */
char *str_tok(char *s, const char *delim, char **next)
{
	if (s == NULL)
		return NULL;

	size_t limit = str_size(s);
	size_t tok_off = 0;
	size_t probe = 0;
	char32_t ch;

	/* Skip over leading delimiters; tok_off ends up at the token start. */
	for (;;) {
		ch = str_decode(s, &probe, limit);
		if (ch == 0 || !str_chr(delim, ch))
			break;

		tok_off = probe;
	}

	/* Skip over token characters; end_off ends up just past the token. */
	size_t end_off = tok_off;
	probe = tok_off;

	for (;;) {
		ch = str_decode(s, &probe, limit);
		if (ch == 0 || str_chr(delim, ch))
			break;

		end_off = probe;
	}

	/* probe is past the delimiter, if one was decoded last. */
	if (next != NULL)
		*next = (ch != 0) ? &s[probe] : &s[end_off];

	if (tok_off == end_off)
		return NULL;  /* No more tokens. */

	/* Overwrite delimiter with null terminator. */
	s[end_off] = '\0';
	return &s[tok_off];
}
1894
/** Scale a value down to a decimal (SI) order of magnitude.
 *
 * The value is divided by the largest power of 1000 that still leaves
 * more than 1000 units, and the SI prefix matching that divisor is stored
 * in @a suffix (a space when no scaling takes place).
 *
 * @param val    Value to scale.
 * @param rv     Receives the scaled value.
 * @param suffix Receives the prefix character: 'E', 'P', 'T', 'G', 'M',
 *               'k' or ' '.
 */
void order_suffix(const uint64_t val, uint64_t *rv, char *suffix)
{
	if (val > UINT64_C(10000000000000000000)) {
		/* Units of 10^18: exa. */
		*rv = val / UINT64_C(1000000000000000000);
		*suffix = 'E';
	} else if (val > UINT64_C(1000000000000000000)) {
		/* Units of 10^15: peta. */
		*rv = val / UINT64_C(1000000000000000);
		*suffix = 'P';
	} else if (val > UINT64_C(1000000000000000)) {
		/* Units of 10^12: tera. */
		*rv = val / UINT64_C(1000000000000);
		*suffix = 'T';
	} else if (val > UINT64_C(1000000000000)) {
		/* Units of 10^9: giga. */
		*rv = val / UINT64_C(1000000000);
		*suffix = 'G';
	} else if (val > UINT64_C(1000000000)) {
		/* Units of 10^6: mega. */
		*rv = val / UINT64_C(1000000);
		*suffix = 'M';
	} else if (val > UINT64_C(1000000)) {
		/* Units of 10^3: kilo. */
		*rv = val / UINT64_C(1000);
		*suffix = 'k';
	} else {
		*rv = val;
		*suffix = ' ';
	}
}
1920
/** Scale a byte count down to a binary (IEC) order of magnitude.
 *
 * The value is divided by the largest power of 1024 that still leaves
 * more than 1024 units, and the unit matching that divisor is stored in
 * @a suffix.
 *
 * @param val    Value to scale.
 * @param rv     Receives the scaled value.
 * @param suffix Receives a pointer to a constant unit string: "PiB",
 *               "TiB", "GiB", "MiB", "KiB", or the byte unit.
 * @param fixed  Selects the space-padded form of the byte unit instead of
 *               the plain "B".
 */
void bin_order_suffix(const uint64_t val, uint64_t *rv, const char **suffix,
    bool fixed)
{
	if (val > UINT64_C(1152921504606846976)) {
		/* Units of 2^50: pebibytes. */
		*rv = val / UINT64_C(1125899906842624);
		*suffix = "PiB";
	} else if (val > UINT64_C(1125899906842624)) {
		/* Units of 2^40: tebibytes. */
		*rv = val / UINT64_C(1099511627776);
		*suffix = "TiB";
	} else if (val > UINT64_C(1099511627776)) {
		/* Units of 2^30: gibibytes. */
		*rv = val / UINT64_C(1073741824);
		*suffix = "GiB";
	} else if (val > UINT64_C(1073741824)) {
		/* Units of 2^20: mebibytes. */
		*rv = val / UINT64_C(1048576);
		*suffix = "MiB";
	} else if (val > UINT64_C(1048576)) {
		/* Units of 2^10: kibibytes. */
		*rv = val / UINT64_C(1024);
		*suffix = "KiB";
	} else {
		*rv = val;
		if (fixed)
			*suffix = "B ";
		else
			*suffix = "B";
	}
}
1947
1948/** @}
1949 */
Note: See TracBrowser for help on using the repository browser.