source: mainline/common/str.c@ 65bf084

Last change on this file since 65bf084 was 65bf084, checked in by Jiří Zárevúcky <zarevucky.jiri@…>, 2 months ago

Implement both str_decode() and mbrtoc32() using one function

  • Property mode set to 100644
File size: 44.6 KB
Line 
1/*
2 * Copyright (c) 2001-2004 Jakub Jermar
3 * Copyright (c) 2005 Martin Decky
4 * Copyright (c) 2008 Jiri Svoboda
5 * Copyright (c) 2011 Martin Sucha
6 * Copyright (c) 2011 Oleg Romanenko
7 * Copyright (c) 2025 Jiří Zárevúcky
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * - Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * - Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * - The name of the author may not be used to endorse or promote products
20 * derived from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
23 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
24 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
25 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
27 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
31 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34/** @addtogroup libc
35 * @{
36 */
37
38/**
39 * @file
40 * @brief String functions.
41 *
42 * Strings and characters use the Universal Character Set (UCS). The standard
43 * strings, called just strings are encoded in UTF-8. Wide strings (encoded
44 * in UTF-32) are supported to a limited degree. A single character is
45 * represented as char32_t.@n
46 *
47 * Overview of the terminology:@n
48 *
49 * Term Meaning
50 * -------------------- ----------------------------------------------------
51 * byte 8 bits stored in uint8_t (unsigned 8 bit integer)
52 *
53 * character UTF-32 encoded Unicode character, stored in char32_t
54 * (unsigned 32 bit integer), code points 0 .. 1114111
55 * are valid
56 *
57 * Note that Unicode characters do not match
58 * one-to-one with displayed characters or glyphs on
59 * screen. For that level of precision, look up
60 * Grapheme Clusters.
61 *
62 * ASCII character 7 bit encoded ASCII character, stored in char
63 * (usually signed 8 bit integer), code points 0 .. 127
64 * are valid
65 *
66 * string UTF-8 encoded NULL-terminated Unicode string, char *
67 *
68 * wide string UTF-32 encoded NULL-terminated Unicode string,
69 * char32_t *
70 *
71 * [wide] string size number of BYTES in a [wide] string (excluding
72 * the NULL-terminator), size_t
73 *
74 * [wide] string length number of CHARACTERS in a [wide] string (excluding
75 * the NULL-terminator), size_t
76 *
77 * [wide] string width number of display cells on a monospace display taken
78 * by a [wide] string, size_t
79 *
80 * This is virtually impossible to determine exactly for
81 * all strings without knowing specifics of the display
82 * device, due to various factors affecting text output.
83 * If you have the option to query the terminal for
84 * position change caused by outputting the string,
85 * it is preferable to determine width that way.
86 *
87 *
88 * Overview of string metrics:@n
89 *
90 * Metric Abbrev. Type Meaning
91 * ------ ------ ------ -------------------------------------------------
92 * size n size_t number of BYTES in a string (excluding the
93 * NULL-terminator)
94 *
95 * length l size_t number of CHARACTERS in a string (excluding the
96 * null terminator)
97 *
98 * width w size_t number of display cells on a monospace display
99 * taken by a string
100 *
101 *
102 * Function naming prefixes:@n
103 *
104 * chr_ operate on characters
105 * ascii_ operate on ASCII characters
106 * str_ operate on strings
107 * wstr_ operate on wide strings
108 *
109 * [w]str_[n|l|w] operate on a prefix limited by size, length
110 * or width
111 *
112 *
113 * A specific character inside a [wide] string can be referred to by:@n
114 *
115 * pointer (char *, char32_t *)
116 * byte offset (size_t)
117 * character index (size_t)
118 *
119 */
120
121#include <str.h>
122
123#include <align.h>
124#include <assert.h>
125#include <ctype.h>
126#include <errno.h>
127#include <limits.h>
128#include <macros.h>
129#include <mem.h>
130#include <stdbool.h>
131#include <stddef.h>
132#include <stdint.h>
133#include <stdlib.h>
134#include <uchar.h>
135
136#if __STDC_HOSTED__
137#include <fibril.h>
138#endif
139
/** Report an illegal byte sequence through errno, where errno is available. */
static void _set_ilseq(void)
{
#if defined(errno)
	/* Only builds whose headers define the errno macro get the assignment. */
	errno = EILSEQ;
#endif
}
146
/** Byte mask consisting of lowest @n bits (out of 8) */
#define LO_MASK_8(n)  ((uint8_t) ((1 << (n)) - 1))

/**
 * Mask consisting of lowest @n bits (out of 32), valid for n in 0 .. 31.
 *
 * The shift is done on an unsigned 32-bit one so that n == 31 does not
 * overflow a signed int.
 */
#define LO_MASK_32(n)  ((uint32_t) ((UINT32_C(1) << (n)) - 1))

/** Byte mask consisting of highest @n bits (out of 8) */
#define HI_MASK_8(n)  (~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6

/** Data bits carried by the lead byte of a 2-byte sequence (110xxxxx). */
#define UTF8_MASK_INITIAL2  0b00011111
/** Data bits carried by the lead byte of a 3-byte sequence (1110xxxx). */
#define UTF8_MASK_INITIAL3  0b00001111
/** Data bits carried by the lead byte of a 4-byte sequence (11110xxx). */
#define UTF8_MASK_INITIAL4  0b00000111
/** Data bits carried by a continuation byte (10xxxxxx). */
#define UTF8_MASK_CONT      0b00111111

/** Internal marker returned by the decoder for a malformed sequence. */
#define CHAR_INVALID  ((char32_t) UINT_MAX)
165
/** True for a single-byte (ASCII) code unit, i.e. the top bit is clear. */
static inline bool _is_ascii(uint8_t b)
{
	return (b >> 7) == 0;
}
170
/** True for a UTF-8 continuation byte (bit pattern 10xxxxxx). */
static inline bool _is_continuation(uint8_t b)
{
	return (b >> 6) == 0x2;
}
175
/** True for the lead byte of a 2-byte sequence (bit pattern 110xxxxx). */
static inline bool _is_2_byte(uint8_t c)
{
	return (c >> 5) == 0x6;
}
180
/** True for the lead byte of a 3-byte sequence (bit pattern 1110xxxx). */
static inline bool _is_3_byte(uint8_t c)
{
	return (c >> 4) == 0xE;
}
185
/** True for the lead byte of a 4-byte sequence (bit pattern 11110xxx). */
static inline bool _is_4_byte(uint8_t c)
{
	return (c >> 3) == 0x1E;
}
190
/**
 * Number of continuation bytes needed to encode character @a c in UTF-8.
 *
 * @return 0 to 3, or -1 if the value does not fit into 21 bits.
 */
static inline int _char_continuation_bytes(char32_t c)
{
	/* Payload width (in bits) available with 0, 1, 2 and 3 continuation bytes. */
	static const int payload_bits[] = { 7, 11, 16, 21 };

	for (int cont = 0; cont < 4; cont++) {
		if ((c & ~LO_MASK_32(payload_bits[cont])) == 0)
			return cont;
	}

	/* Codes longer than 21 bits are not supported */
	return -1;
}
208
/**
 * Number of continuation bytes announced by lead byte @a b.
 *
 * @return 0 to 3 for a recognised lead byte, -1 for anything else
 *         (including continuation bytes themselves).
 */
static inline int _continuation_bytes(uint8_t b)
{
	int cont = -1;

	if (_is_ascii(b)) {
		/* 0xxxxxxx */
		cont = 0;
	} else if (_is_2_byte(b)) {
		/* 110xxxxx 10xxxxxx */
		cont = 1;
	} else if (_is_3_byte(b)) {
		/* 1110xxxx 10xxxxxx 10xxxxxx */
		cont = 2;
	} else if (_is_4_byte(b)) {
		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		cont = 3;
	}

	return cont;
}
229
230static bool _is_non_shortest(const mbstate_t *mb, uint8_t b)
231{
232 return (mb->state == 0b1111110000000000 && !(b & 0b00100000)) ||
233 (mb->state == 0b1111111111110000 && !(b & 0b00110000));
234}
235
/** Branch prediction hint: @a expr is expected to be true. */
#define _likely(expr)    __builtin_expect((expr), 1)
/** Branch prediction hint: @a expr is expected to be false. */
#define _unlikely(expr)  __builtin_expect((expr), 0)

/** When nonzero, the decoder uses its shortcuts for complete sequences. */
#define FAST_PATHS  1
240
241static char32_t _str_decode(const char *s, size_t *offset, size_t size, mbstate_t *mb)
242{
243 assert(s);
244 assert(offset);
245 assert(*offset <= size);
246 assert(size == STR_NO_LIMIT || s + size >= s);
247 assert(mb);
248
249 if (*offset == size)
250 return 0;
251
252 if (_likely(!mb->state)) {
253 /* Clean slate, read initial byte. */
254 uint8_t b = s[(*offset)++];
255
256 /* Fast exit for the most common case. */
257 if (_likely(_is_ascii(b)))
258 return b;
259
260 /* unexpected continuation byte */
261 if (_unlikely(_is_continuation(b)))
262 return CHAR_INVALID;
263
264 /*
265 * The value stored into `continuation` is designed to have
266 * just enough leading ones that after shifting in one less than
267 * the expected number of continuation bytes, the most significant
268 * bit becomes zero. (The field is 16b wide.)
269 */
270
271 if (_is_2_byte(b)) {
272 /* Reject non-shortest form. */
273 if (_unlikely(!(b & 0b00011110)))
274 return CHAR_INVALID;
275
276#if FAST_PATHS
277 /* We can usually take this exit. */
278 if (_likely(*offset < size && _is_continuation(s[*offset])))
279 return (b & UTF8_MASK_INITIAL2) << 6 |
280 (s[(*offset)++] & UTF8_MASK_CONT);
281#endif
282
283 /* 2 byte continuation 110xxxxx */
284 mb->state = b ^ 0b0000000011000000;
285
286 } else if (_is_3_byte(b)) {
287#if FAST_PATHS
288 /* We can usually take this exit. */
289 if (_likely(*offset + 1 < size && _is_continuation(s[*offset]) && _is_continuation(s[*offset + 1]))) {
290
291 char32_t ch = (b & UTF8_MASK_INITIAL3) << 12 |
292 (s[(*offset)] & UTF8_MASK_CONT) << 6 |
293 (s[(*offset) + 1] & UTF8_MASK_CONT);
294
295 *offset += 2;
296
297 /* Reject non-shortest form. */
298 if (_unlikely(!(ch & 0xFFFFF800)))
299 return CHAR_INVALID;
300
301 return ch;
302 }
303#endif
304
305 /* 3 byte continuation 1110xxxx */
306 mb->state = b ^ 0b1111110011100000;
307
308 } else if (_is_4_byte(b)) {
309#if FAST_PATHS
310 /* We can usually take this exit. */
311 if (_likely(*offset + 2 < size && _is_continuation(s[*offset]) &&
312 _is_continuation(s[*offset + 1]) && _is_continuation(s[*offset + 2]))) {
313
314 char32_t ch = (b & UTF8_MASK_INITIAL4) << 18 |
315 (s[(*offset)] & UTF8_MASK_CONT) << 12 |
316 (s[(*offset) + 1] & UTF8_MASK_CONT) << 6 |
317 (s[(*offset) + 2] & UTF8_MASK_CONT);
318
319 *offset += 3;
320
321 /* Reject non-shortest form. */
322 if (_unlikely(!(ch & 0xFFFF0000)))
323 return CHAR_INVALID;
324
325 return ch;
326 }
327#endif
328
329 /* 4 byte continuation 11110xxx */
330 mb->state = b ^ 0b1111111100000000;
331 } else {
332 return CHAR_INVALID;
333 }
334 }
335
336 /* Deal with the remaining edge and invalid cases. */
337 for (; *offset < size; (*offset)++) {
338 /* Read continuation bytes. */
339 uint8_t b = s[*offset];
340
341 if (!_is_continuation(b) || _is_non_shortest(mb, b)) {
342 mb->state = 0;
343 return CHAR_INVALID;
344 }
345
346 /* Top bit becomes zero when shifting in the second to last byte. */
347 if (!(mb->state & 0x8000)) {
348 char32_t c = ((char32_t) mb->state) << 6 | (b & UTF8_MASK_CONT);
349 mb->state = 0;
350 (*offset)++;
351 return c;
352 }
353
354 mb->state = mb->state << 6 | (b & UTF8_MASK_CONT);
355 }
356
357 /* Incomplete character. */
358 assert(mb->state);
359 return 0;
360}
361
362/** Standard <uchar.h> function since C11. */
363size_t mbrtoc32(char32_t *c, const char *s, size_t n, mbstate_t *mb)
364{
365#if __STDC_HOSTED__
366 static fibril_local mbstate_t global_state = { };
367
368 if (!mb)
369 mb = &global_state;
370#endif
371
372 if (!s) {
373 /* Equivalent to mbrtoc32(NULL, "", 1, mb); */
374 c = NULL;
375 s = "";
376 n = 1;
377 }
378
379 size_t offset = 0;
380 char32_t ret = _str_decode(s, &offset, n, mb);
381 if (ret == CHAR_INVALID) {
382 assert(!mb->state);
383 _set_ilseq();
384 return UCHAR_ILSEQ;
385 }
386 if (mb->state) {
387 assert(ret == 0);
388 return UCHAR_INCOMPLETE;
389 }
390
391 if (c)
392 *c = ret;
393 return ret ? offset : 0;
394}
395
396/** Decode a single character from a string.
397 *
398 * Decode a single character from a string of size @a size. Decoding starts
399 * at @a offset and this offset is moved to the beginning of the next
400 * character. In case of decoding error, offset generally advances at least
401 * by one. However, offset is never moved beyond size.
402 *
403 * @param str String (not necessarily NULL-terminated).
404 * @param offset Byte offset in string where to start decoding.
405 * @param size Size of the string (in bytes).
406 *
407 * @return Value of decoded character, U_SPECIAL on decoding error or
408 * NULL if attempt to decode beyond @a size.
409 *
410 */
411char32_t str_decode(const char *str, size_t *offset, size_t size)
412{
413 mbstate_t mb = { };
414 char32_t ch = _str_decode(str, offset, size, &mb);
415
416 if (ch == CHAR_INVALID)
417 return U_SPECIAL;
418
419 if (mb.state)
420 return U_SPECIAL;
421
422 return ch;
423}
424
425/** Decode a single character from a string to the left.
426 *
427 * Decode a single character from a string of size @a size. Decoding starts
428 * at @a offset and this offset is moved to the beginning of the previous
429 * character. In case of decoding error, offset generally decreases at least
430 * by one. However, offset is never moved before 0.
431 *
432 * @param str String (not necessarily NULL-terminated).
433 * @param offset Byte offset in string where to start decoding.
434 * @param size Size of the string (in bytes).
435 *
436 * @return Value of decoded character, U_SPECIAL on decoding error or
437 * NULL if attempt to decode beyond @a start of str.
438 *
439 */
440char32_t str_decode_reverse(const char *str, size_t *offset, size_t size)
441{
442 if (*offset == 0)
443 return 0;
444
445 int cbytes = 0;
446 /* Continue while continuation bytes found */
447 while (*offset > 0 && cbytes < 4) {
448 uint8_t b = (uint8_t) str[--(*offset)];
449
450 if (_is_continuation(b)) {
451 cbytes++;
452 continue;
453 }
454
455 /* Reject non-shortest form encoding. */
456 if (cbytes != _continuation_bytes(b))
457 return U_SPECIAL;
458
459 /* Start byte */
460 size_t start_offset = *offset;
461 return str_decode(str, &start_offset, size);
462 }
463
464 /* Too many continuation bytes */
465 return U_SPECIAL;
466}
467
468/** Encode a single character to string representation.
469 *
470 * Encode a single character to string representation (i.e. UTF-8) and store
471 * it into a buffer at @a offset. Encoding starts at @a offset and this offset
472 * is moved to the position where the next character can be written to.
473 *
474 * @param ch Input character.
475 * @param str Output buffer.
476 * @param offset Byte offset where to start writing.
477 * @param size Size of the output buffer (in bytes).
478 *
479 * @return EOK if the character was encoded successfully, EOVERFLOW if there
480 * was not enough space in the output buffer or EINVAL if the character
481 * code was invalid.
482 */
483errno_t chr_encode(char32_t ch, char *str, size_t *offset, size_t size)
484{
485 // TODO: merge with c32rtomb()
486
487 if (*offset >= size)
488 return EOVERFLOW;
489
490 /* Fast exit for the most common case. */
491 if (ch < 0x80) {
492 str[(*offset)++] = (char) ch;
493 return EOK;
494 }
495
496 /* Codes longer than 21 bits are not supported */
497 if (!chr_check(ch))
498 return EINVAL;
499
500 /* Determine how many continuation bytes are needed */
501
502 unsigned int cbytes = _char_continuation_bytes(ch);
503 unsigned int b0_bits = 6 - cbytes; /* Data bits in first byte */
504
505 /* Check for available space in buffer */
506 if (*offset + cbytes >= size)
507 return EOVERFLOW;
508
509 /* Encode continuation bytes */
510 unsigned int i;
511 for (i = cbytes; i > 0; i--) {
512 str[*offset + i] = 0x80 | (ch & LO_MASK_32(CONT_BITS));
513 ch >>= CONT_BITS;
514 }
515
516 /* Encode first byte */
517 str[*offset] = (ch & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);
518
519 /* Advance offset */
520 *offset += cbytes + 1;
521
522 return EOK;
523}
524
525/* Convert in place any bytes that don't form a valid character into U_SPECIAL. */
526static void _sanitize_string(char *str, size_t n)
527{
528 uint8_t *b = (uint8_t *) str;
529
530 for (; *b && n > 0; b++, n--) {
531 int cont = _continuation_bytes(b[0]);
532 if (__builtin_expect(cont, 0) == 0)
533 continue;
534
535 if (cont < 0 || n <= (size_t) cont) {
536 b[0] = U_SPECIAL;
537 continue;
538 }
539
540 /* Check continuation bytes. */
541 for (int i = 1; i <= cont; i++) {
542 if (!_is_continuation(b[i])) {
543 b[0] = U_SPECIAL;
544 continue;
545 }
546 }
547
548 /*
549 * Check for non-shortest form encoding.
550 * See https://www.unicode.org/versions/corrigendum1.html
551 */
552
553 switch (cont) {
554 case 1:
555 /* 0b110!!!!x 0b10xxxxxx */
556 if (!(b[0] & 0b00011110))
557 b[0] = U_SPECIAL;
558
559 continue;
560 case 2:
561 /* 0b1110!!!! 0b10!xxxxx 0b10xxxxxx */
562 if (!(b[0] & 0b00001111) && !(b[1] & 0b00100000))
563 b[0] = U_SPECIAL;
564
565 continue;
566 case 3:
567 /* 0b11110!!! 0b10!!xxxx 0b10xxxxxx 0b10xxxxxx */
568 if (!(b[0] & 0b00000111) && !(b[1] & 0b00110000))
569 b[0] = U_SPECIAL;
570
571 continue;
572 }
573 }
574}
575
/** Count the bytes of @a str up to, but not including, the null terminator. */
static size_t _str_size(const char *str)
{
	const char *end = str;

	while (*end != 0)
		end++;

	return (size_t) (end - str);
}
585
/** Get size of string.
 *
 * Get the number of bytes which are used by the string @a str (excluding the
 * null terminator).
 *
 * @param str String to consider.
 *
 * @return Number of bytes used by the string
 *
 */
size_t str_size(const char *str)
{
	size_t bytes = _str_size(str);

	return bytes;
}
600
/** Get size of wide string.
 *
 * Get the number of bytes which are used by the wide string @a str
 * (excluding the null terminator).
 *
 * @param str Wide string to consider.
 *
 * @return Number of bytes used by the wide string
 *
 */
size_t wstr_size(const char32_t *str)
{
	size_t chars = wstr_length(str);

	return chars * sizeof(char32_t);
}
615
616/** Get size of string with length limit.
617 *
618 * Get the number of bytes which are used by up to @a max_len first
619 * characters in the string @a str. If @a max_len is greater than
620 * the length of @a str, the entire string is measured (excluding the
621 * NULL-terminator).
622 *
623 * @param str String to consider.
624 * @param max_len Maximum number of characters to measure.
625 *
626 * @return Number of bytes used by the characters.
627 *
628 */
629size_t str_lsize(const char *str, size_t max_len)
630{
631 size_t len = 0;
632 size_t offset = 0;
633
634 while (len < max_len) {
635 if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
636 break;
637
638 len++;
639 }
640
641 return offset;
642}
643
/**
 * Count the bytes of @a str before the null terminator, examining at most
 * @a max_size bytes.
 *
 * The limit is tested before each byte is read, so no byte at or beyond
 * @a str + @a max_size is ever accessed.
 */
static size_t _str_nsize(const char *str, size_t max_size)
{
	size_t size = 0;

	while (size < max_size && str[size] != 0)
		size++;

	return size;
}
653
/** Get size of string with size limit.
 *
 * Get the number of bytes which are used by the string @a str
 * (excluding the null terminator), but no more than @a max_size bytes.
 *
 * @param str      String to consider.
 * @param max_size Maximum number of bytes to measure.
 *
 * @return Number of bytes used by the string
 *
 */
size_t str_nsize(const char *str, size_t max_size)
{
	size_t bytes = _str_nsize(str, max_size);

	return bytes;
}
669
/** Get size of wide string with size limit.
 *
 * Get the number of bytes which are used by the wide string @a str
 * (excluding the null terminator), but no more than @a max_size bytes.
 *
 * @param str      Wide string to consider.
 * @param max_size Maximum number of bytes to measure.
 *
 * @return Number of bytes used by the wide string
 *
 */
size_t wstr_nsize(const char32_t *str, size_t max_size)
{
	size_t chars = wstr_nlength(str, max_size);

	return chars * sizeof(char32_t);
}
685
/** Get size of wide string with length limit.
 *
 * Get the number of bytes which are used by up to @a max_len first
 * wide characters in the wide string @a str. If @a max_len is greater than
 * the length of @a str, the entire wide string is measured (excluding the
 * null terminator).
 *
 * @param str     Wide string to consider.
 * @param max_len Maximum number of wide characters to measure.
 *
 * @return Number of bytes used by the wide characters.
 *
 */
size_t wstr_lsize(const char32_t *str, size_t max_len)
{
	/*
	 * Count characters directly instead of converting the limit to
	 * bytes first; that multiplication could wrap for a large max_len
	 * and cut the measurement short.
	 */
	size_t len = 0;

	while (len < max_len && str[len] != 0)
		len++;

	return len * sizeof(char32_t);
}
703
704/** Get number of characters in a string.
705 *
706 * @param str NULL-terminated string.
707 *
708 * @return Number of characters in string.
709 *
710 */
711size_t str_length(const char *str)
712{
713 size_t len = 0;
714 size_t offset = 0;
715
716 while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
717 len++;
718
719 return len;
720}
721
/** Get number of characters in a wide string.
 *
 * @param wstr Null-terminated wide string.
 *
 * @return Number of characters in @a wstr.
 *
 */
size_t wstr_length(const char32_t *wstr)
{
	const char32_t *end = wstr;

	while (*end != 0)
		end++;

	return (size_t) (end - wstr);
}
738
/** Get number of characters in a string with size limit.
 *
 * @param str  Null-terminated string.
 * @param size Maximum number of bytes to consider.
 *
 * @return Number of characters in string.
 *
 */
size_t str_nlength(const char *str, size_t size)
{
	size_t pos = 0;
	size_t count;

	/* Decoding yields 0 at the terminator or once @a size bytes are used up. */
	for (count = 0; str_decode(str, &pos, size) != 0; count++)
		;

	return count;
}
757
/** Get number of characters in a wide string with size limit.
 *
 * @param str  Null-terminated wide string.
 * @param size Maximum number of bytes to consider.
 *
 * @return Number of characters in the wide string.
 *
 */
size_t wstr_nlength(const char32_t *str, size_t size)
{
	/* Only whole characters count towards the byte limit. */
	size_t limit = ALIGN_DOWN(size, sizeof(char32_t));
	size_t len = 0;

	for (size_t used = 0; used < limit && str[len] != 0; used += sizeof(char32_t))
		len++;

	return len;
}
779
/** Get character display width on a character cell display.
 *
 * Every character is currently reported as occupying a single cell.
 *
 * @param ch Character
 * @return Width of character in cells.
 */
size_t chr_width(char32_t ch)
{
	const size_t cells = 1;

	return cells;
}
789
790/** Get string display width on a character cell display.
791 *
792 * @param str String
793 * @return Width of string in cells.
794 */
795size_t str_width(const char *str)
796{
797 size_t width = 0;
798 size_t offset = 0;
799 char32_t ch;
800
801 while ((ch = str_decode(str, &offset, STR_NO_LIMIT)) != 0)
802 width += chr_width(ch);
803
804 return width;
805}
806
/** Check whether character is plain ASCII.
 *
 * @param ch Character to test.
 *
 * @return True if character is plain ASCII (code point 0 .. 127).
 *
 */
bool ascii_check(char32_t ch)
{
	return ch <= 127;
}
819
/** Check whether character is valid
 *
 * @param ch Character to test.
 *
 * @return True if the value lies in the code point range 0 .. 0x10FFFF.
 *
 */
bool chr_check(char32_t ch)
{
	return ch <= 0x10FFFF;
}
832
/** Compare two null-terminated strings.
 *
 * Do a char-by-char comparison of two null-terminated strings.
 * The strings are considered equal iff their length is equal
 * and both strings consist of the same sequence of characters.
 *
 * A string S1 is less than another string S2 if it has a character with
 * lower value at the first character position where the strings differ.
 * If the strings differ in length, the shorter one is treated as if
 * padded by characters with a value of zero.
 *
 * @param s1 First string to compare.
 * @param s2 Second string to compare.
 *
 * @return 0 if the strings are equal, -1 if the first is less than the second,
 *         1 if the second is less than the first.
 *
 */
int str_cmp(const char *s1, const char *s2)
{
	/*
	 * UTF-8 has the nice property that lexicographic ordering on bytes is
	 * the same as the lexicographic ordering of the character sequences.
	 * That only holds for bytes taken as unsigned values: with a signed
	 * char, every non-ASCII byte would sort below ASCII.
	 */
	const uint8_t *a = (const uint8_t *) s1;
	const uint8_t *b = (const uint8_t *) s2;

	while (*a == *b && *a != 0) {
		a++;
		b++;
	}

	if (*a == *b)
		return 0;

	return (*a < *b) ? -1 : 1;
}
867
868/** Compare two NULL terminated strings with length limit.
869 *
870 * Do a char-by-char comparison of two NULL-terminated strings.
871 * The strings are considered equal iff
872 * min(str_length(s1), max_len) == min(str_length(s2), max_len)
873 * and both strings consist of the same sequence of characters,
874 * up to max_len characters.
875 *
876 * A string S1 is less than another string S2 if it has a character with
877 * lower value at the first character position where the strings differ.
878 * If the strings differ in length, the shorter one is treated as if
879 * padded by characters with a value of zero. Only the first max_len
880 * characters are considered.
881 *
882 * @param s1 First string to compare.
883 * @param s2 Second string to compare.
884 * @param max_len Maximum number of characters to consider.
885 *
886 * @return 0 if the strings are equal, -1 if the first is less than the second,
887 * 1 if the second is less than the first.
888 *
889 */
890int str_lcmp(const char *s1, const char *s2, size_t max_len)
891{
892 char32_t c1 = 0;
893 char32_t c2 = 0;
894
895 size_t off1 = 0;
896 size_t off2 = 0;
897
898 size_t len = 0;
899
900 while (true) {
901 if (len >= max_len)
902 break;
903
904 c1 = str_decode(s1, &off1, STR_NO_LIMIT);
905 c2 = str_decode(s2, &off2, STR_NO_LIMIT);
906
907 if (c1 < c2)
908 return -1;
909
910 if (c1 > c2)
911 return 1;
912
913 if (c1 == 0 || c2 == 0)
914 break;
915
916 ++len;
917 }
918
919 return 0;
920
921}
922
923/** Compare two NULL terminated strings in case-insensitive manner.
924 *
925 * Do a char-by-char comparison of two NULL-terminated strings.
926 * The strings are considered equal iff their length is equal
927 * and both strings consist of the same sequence of characters
928 * when converted to lower case.
929 *
930 * A string S1 is less than another string S2 if it has a character with
931 * lower value at the first character position where the strings differ.
932 * If the strings differ in length, the shorter one is treated as if
933 * padded by characters with a value of zero.
934 *
935 * @param s1 First string to compare.
936 * @param s2 Second string to compare.
937 *
938 * @return 0 if the strings are equal, -1 if the first is less than the second,
939 * 1 if the second is less than the first.
940 *
941 */
942int str_casecmp(const char *s1, const char *s2)
943{
944 // FIXME: doesn't work for non-ASCII caseful characters
945
946 char32_t c1 = 0;
947 char32_t c2 = 0;
948
949 size_t off1 = 0;
950 size_t off2 = 0;
951
952 while (true) {
953 c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
954 c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
955
956 if (c1 < c2)
957 return -1;
958
959 if (c1 > c2)
960 return 1;
961
962 if (c1 == 0 || c2 == 0)
963 break;
964 }
965
966 return 0;
967}
968
969/** Compare two NULL terminated strings with length limit in case-insensitive
970 * manner.
971 *
972 * Do a char-by-char comparison of two NULL-terminated strings.
973 * The strings are considered equal iff
974 * min(str_length(s1), max_len) == min(str_length(s2), max_len)
975 * and both strings consist of the same sequence of characters,
976 * up to max_len characters.
977 *
978 * A string S1 is less than another string S2 if it has a character with
979 * lower value at the first character position where the strings differ.
980 * If the strings differ in length, the shorter one is treated as if
981 * padded by characters with a value of zero. Only the first max_len
982 * characters are considered.
983 *
984 * @param s1 First string to compare.
985 * @param s2 Second string to compare.
986 * @param max_len Maximum number of characters to consider.
987 *
988 * @return 0 if the strings are equal, -1 if the first is less than the second,
989 * 1 if the second is less than the first.
990 *
991 */
992int str_lcasecmp(const char *s1, const char *s2, size_t max_len)
993{
994 // FIXME: doesn't work for non-ASCII caseful characters
995
996 char32_t c1 = 0;
997 char32_t c2 = 0;
998
999 size_t off1 = 0;
1000 size_t off2 = 0;
1001
1002 size_t len = 0;
1003
1004 while (true) {
1005 if (len >= max_len)
1006 break;
1007
1008 c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
1009 c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
1010
1011 if (c1 < c2)
1012 return -1;
1013
1014 if (c1 > c2)
1015 return 1;
1016
1017 if (c1 == 0 || c2 == 0)
1018 break;
1019
1020 ++len;
1021 }
1022
1023 return 0;
1024
1025}
1026
/** Byte-wise check whether @a p is a prefix of @a s. */
static bool _test_prefix(const char *s, const char *p)
{
	/* Advance while both strings agree and @a s has not ended. */
	for (; *s != 0 && *s == *p; s++, p++)
		;

	/* The prefix matched iff it was consumed completely. */
	return *p == 0;
}
1036
/** Test whether p is a prefix of s.
 *
 * Do a char-by-char comparison of two null-terminated strings
 * and determine if p is a prefix of s.
 *
 * @param s The string in which to look
 * @param p The string to check if it is a prefix of s
 *
 * @return true iff p is prefix of s else false
 *
 */
bool str_test_prefix(const char *s, const char *p)
{
	bool is_prefix = _test_prefix(s, p);

	return is_prefix;
}
1052
1053/** Get a string suffix.
1054 *
1055 * Return a string suffix defined by the prefix length.
1056 *
1057 * @param s The string to get the suffix from.
1058 * @param prefix_length Number of prefix characters to ignore.
1059 *
1060 * @return String suffix.
1061 *
1062 */
1063const char *str_suffix(const char *s, size_t prefix_length)
1064{
1065 size_t off = 0;
1066 size_t i = 0;
1067
1068 while (true) {
1069 str_decode(s, &off, STR_NO_LIMIT);
1070 i++;
1071
1072 if (i >= prefix_length)
1073 break;
1074 }
1075
1076 return s + off;
1077}
1078
/** Copy string as a sequence of bytes, including the null terminator. */
static void _str_cpy(char *dest, const char *src)
{
	size_t i = 0;

	for (; src[i] != 0; i++)
		dest[i] = src[i];

	dest[i] = 0;
}
1087
1088/** Copy string as a sequence of bytes. */
1089static void _str_cpyn(char *dest, size_t size, const char *src)
1090{
1091 assert(dest && src && size);
1092
1093 if (!dest || !src || !size)
1094 return;
1095
1096 if (size == STR_NO_LIMIT)
1097 return _str_cpy(dest, src);
1098
1099 char *dest_top = dest + size - 1;
1100 assert(size == 1 || dest < dest_top);
1101
1102 while (*src && dest < dest_top)
1103 *(dest++) = *(src++);
1104
1105 *dest = 0;
1106}
1107
1108/** Copy string.
1109 *
1110 * Copy source string @a src to destination buffer @a dest.
1111 * No more than @a size bytes are written. If the size of the output buffer
1112 * is at least one byte, the output string will always be well-formed, i.e.
1113 * null-terminated and containing only complete characters.
1114 *
1115 * @param dest Destination buffer.
1116 * @param count Size of the destination buffer (must be > 0).
1117 * @param src Source string.
1118 *
1119 */
1120void str_cpy(char *dest, size_t size, const char *src)
1121{
1122 /* There must be space for a null terminator in the buffer. */
1123 assert(size > 0);
1124 assert(src != NULL);
1125 assert(dest != NULL);
1126 assert(size == STR_NO_LIMIT || dest + size > dest);
1127
1128 /* Copy data. */
1129 _str_cpyn(dest, size, src);
1130
1131 /* In-place translate invalid bytes to U_SPECIAL. */
1132 _sanitize_string(dest, size);
1133}
1134
/** Copy size-limited substring.
 *
 * Copy prefix of string @a src of max. size @a n to destination buffer
 * @a dest. No more than @a size bytes are written. The output string will
 * always be well-formed, i.e. null-terminated and containing only complete
 * characters.
 *
 * No more than @a n bytes are read from the input string, so it does not
 * have to be null-terminated.
 *
 * @param dest Destination buffer.
 * @param size Size of the destination buffer (must be > 0).
 * @param src  Source string.
 * @param n    Maximum number of bytes to read from @a src.
 *
 */
void str_ncpy(char *dest, size_t size, const char *src, size_t n)
{
	/* There must be space for a null terminator in the buffer. */
	assert(size > 0);
	assert(src != NULL);
	assert(dest != NULL);

	/*
	 * Room for up to n bytes plus the terminator, capped by the buffer.
	 * Written without computing n + 1 unconditionally, which would wrap
	 * to zero for n == SIZE_MAX and leave the buffer untouched.
	 */
	size_t limit = (n < size) ? n + 1 : size;

	/* Copy data. */
	_str_cpyn(dest, limit, src);

	/* In-place translate invalid bytes to U_SPECIAL. */
	_sanitize_string(dest, size);
}
1163
1164/** Append one string to another.
1165 *
1166 * Append source string @a src to string in destination buffer @a dest.
1167 * Size of the destination buffer is @a dest. If the size of the output buffer
1168 * is at least one byte, the output string will always be well-formed, i.e.
1169 * null-terminated and containing only complete characters.
1170 *
1171 * @param dest Destination buffer.
1172 * @param count Size of the destination buffer.
1173 * @param src Source string.
1174 */
1175void str_append(char *dest, size_t size, const char *src)
1176{
1177 assert(src != NULL);
1178 assert(dest != NULL);
1179 assert(size > 0);
1180 assert(size == STR_NO_LIMIT || dest + size > dest);
1181
1182 size_t dstr_size = _str_nsize(dest, size);
1183 if (dstr_size < size) {
1184 _str_cpyn(dest + dstr_size, size - dstr_size, src);
1185 _sanitize_string(dest + dstr_size, size - dstr_size);
1186 }
1187}
1188
1189/** Convert space-padded ASCII to string.
1190 *
1191 * Common legacy text encoding in hardware is 7-bit ASCII fitted into
1192 * a fixed-width byte buffer (bit 7 always zero), right-padded with spaces
1193 * (ASCII 0x20). Convert space-padded ascii to string representation.
1194 *
1195 * If the text does not fit into the destination buffer, the function converts
1196 * as many characters as possible and returns EOVERFLOW.
1197 *
1198 * If the text contains non-ASCII bytes (with bit 7 set), the whole string is
1199 * converted anyway and invalid characters are replaced with question marks
1200 * (U_SPECIAL) and the function returns EIO.
1201 *
1202 * Regardless of return value upon return @a dest will always be well-formed.
1203 *
1204 * @param dest Destination buffer
1205 * @param size Size of destination buffer
1206 * @param src Space-padded ASCII.
1207 * @param n Size of the source buffer in bytes.
1208 *
1209 * @return EOK on success, EOVERFLOW if the text does not fit
1210 * destination buffer, EIO if the text contains
1211 * non-ASCII bytes.
1212 */
1213errno_t spascii_to_str(char *dest, size_t size, const uint8_t *src, size_t n)
1214{
1215 size_t len = 0;
1216
1217 /* Determine the length of the source string. */
1218 for (size_t i = 0; i < n; i++) {
1219 if (src[i] == 0)
1220 break;
1221
1222 if (src[i] != ' ')
1223 len = i + 1;
1224 }
1225
1226 errno_t result = EOK;
1227 size_t out_len = min(len, size - 1);
1228
1229 /* Copy characters */
1230 for (size_t i = 0; i < out_len; i++) {
1231 dest[i] = src[i];
1232
1233 if (dest[i] < 0) {
1234 dest[i] = U_SPECIAL;
1235 result = EIO;
1236 }
1237 }
1238
1239 dest[out_len] = 0;
1240
1241 if (out_len < len)
1242 return EOVERFLOW;
1243
1244 return result;
1245}
1246
1247/** Convert wide string to string.
1248 *
1249 * Convert wide string @a src to string. The output is written to the buffer
1250 * specified by @a dest and @a size. @a size must be non-zero and the string
1251 * written will always be well-formed.
1252 *
1253 * @param dest Destination buffer.
1254 * @param size Size of the destination buffer.
1255 * @param src Source wide string.
1256 */
1257void wstr_to_str(char *dest, size_t size, const char32_t *src)
1258{
1259 char32_t ch;
1260 size_t src_idx;
1261 size_t dest_off;
1262
1263 /* There must be space for a null terminator in the buffer. */
1264 assert(size > 0);
1265
1266 src_idx = 0;
1267 dest_off = 0;
1268
1269 while ((ch = src[src_idx++]) != 0) {
1270 if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
1271 break;
1272 }
1273
1274 dest[dest_off] = '\0';
1275}
1276
1277/** Convert UTF16 string to string.
1278 *
1279 * Convert utf16 string @a src to string. The output is written to the buffer
1280 * specified by @a dest and @a size. @a size must be non-zero and the string
1281 * written will always be well-formed. Surrogate pairs also supported.
1282 *
1283 * @param dest Destination buffer.
1284 * @param size Size of the destination buffer.
1285 * @param src Source utf16 string.
1286 *
1287 * @return EOK, if success, an error code otherwise.
1288 */
1289errno_t utf16_to_str(char *dest, size_t size, const uint16_t *src)
1290{
1291 size_t idx = 0, dest_off = 0;
1292 char32_t ch;
1293 errno_t rc = EOK;
1294
1295 /* There must be space for a null terminator in the buffer. */
1296 assert(size > 0);
1297
1298 while (src[idx]) {
1299 if ((src[idx] & 0xfc00) == 0xd800) {
1300 if (src[idx + 1] && (src[idx + 1] & 0xfc00) == 0xdc00) {
1301 ch = 0x10000;
1302 ch += (src[idx] & 0x03FF) << 10;
1303 ch += (src[idx + 1] & 0x03FF);
1304 idx += 2;
1305 } else
1306 break;
1307 } else {
1308 ch = src[idx];
1309 idx++;
1310 }
1311 rc = chr_encode(ch, dest, &dest_off, size - 1);
1312 if (rc != EOK)
1313 break;
1314 }
1315 dest[dest_off] = '\0';
1316 return rc;
1317}
1318
1319/** Convert string to UTF16 string.
1320 *
1321 * Convert string @a src to utf16 string. The output is written to the buffer
1322 * specified by @a dest and @a dlen. @a dlen must be non-zero and the string
1323 * written will always be well-formed. Surrogate pairs also supported.
1324 *
1325 * @param dest Destination buffer.
1326 * @param dlen Number of utf16 characters that fit in the destination buffer.
1327 * @param src Source string.
1328 *
1329 * @return EOK, if success, an error code otherwise.
1330 */
1331errno_t str_to_utf16(uint16_t *dest, size_t dlen, const char *src)
1332{
1333 errno_t rc = EOK;
1334 size_t offset = 0;
1335 size_t idx = 0;
1336 char32_t c;
1337
1338 assert(dlen > 0);
1339
1340 while ((c = str_decode(src, &offset, STR_NO_LIMIT)) != 0) {
1341 if (c > 0x10000) {
1342 if (idx + 2 >= dlen - 1) {
1343 rc = EOVERFLOW;
1344 break;
1345 }
1346 c = (c - 0x10000);
1347 dest[idx] = 0xD800 | (c >> 10);
1348 dest[idx + 1] = 0xDC00 | (c & 0x3FF);
1349 idx++;
1350 } else {
1351 dest[idx] = c;
1352 }
1353
1354 idx++;
1355 if (idx >= dlen - 1) {
1356 rc = EOVERFLOW;
1357 break;
1358 }
1359 }
1360
1361 dest[idx] = '\0';
1362 return rc;
1363}
1364
/** Get size of UTF-16 string.
 *
 * Count the 16-bit words that precede the terminating zero word of the
 * UTF-16 string @a ustr. A surrogate pair therefore counts as two words.
 *
 * @param ustr  UTF-16 string to consider.
 *
 * @return Number of words used by the UTF-16 string, terminator excluded.
 *
 */
size_t utf16_wsize(const uint16_t *ustr)
{
	const uint16_t *end = ustr;

	/* Walk up to the terminating zero word. */
	while (*end != 0)
		end++;

	return (size_t) (end - ustr);
}
1384
1385/** Convert wide string to new string.
1386 *
1387 * Convert wide string @a src to string. Space for the new string is allocated
1388 * on the heap.
1389 *
1390 * @param src Source wide string.
1391 * @return New string.
1392 */
1393char *wstr_to_astr(const char32_t *src)
1394{
1395 char dbuf[STR_BOUNDS(1)];
1396 char *str;
1397 char32_t ch;
1398
1399 size_t src_idx;
1400 size_t dest_off;
1401 size_t dest_size;
1402
1403 /* Compute size of encoded string. */
1404
1405 src_idx = 0;
1406 dest_size = 0;
1407
1408 while ((ch = src[src_idx++]) != 0) {
1409 dest_off = 0;
1410 if (chr_encode(ch, dbuf, &dest_off, STR_BOUNDS(1)) != EOK)
1411 break;
1412 dest_size += dest_off;
1413 }
1414
1415 str = malloc(dest_size + 1);
1416 if (str == NULL)
1417 return NULL;
1418
1419 /* Encode string. */
1420
1421 src_idx = 0;
1422 dest_off = 0;
1423
1424 while ((ch = src[src_idx++]) != 0) {
1425 if (chr_encode(ch, str, &dest_off, dest_size) != EOK)
1426 break;
1427 }
1428
1429 str[dest_size] = '\0';
1430 return str;
1431}
1432
1433/** Convert string to wide string.
1434 *
1435 * Convert string @a src to wide string. The output is written to the
1436 * buffer specified by @a dest and @a dlen. @a dlen must be non-zero
1437 * and the wide string written will always be null-terminated.
1438 *
1439 * @param dest Destination buffer.
1440 * @param dlen Length of destination buffer (number of wchars).
1441 * @param src Source string.
1442 */
1443void str_to_wstr(char32_t *dest, size_t dlen, const char *src)
1444{
1445 size_t offset;
1446 size_t di;
1447 char32_t c;
1448
1449 assert(dlen > 0);
1450
1451 offset = 0;
1452 di = 0;
1453
1454 do {
1455 if (di >= dlen - 1)
1456 break;
1457
1458 c = str_decode(src, &offset, STR_NO_LIMIT);
1459 dest[di++] = c;
1460 } while (c != '\0');
1461
1462 dest[dlen - 1] = '\0';
1463}
1464
/** Convert string to newly allocated wide string.
 *
 * Convert string @a str to a wide string. The buffer for the
 * null-terminated result is obtained from calloc() and sized according
 * to str_length() of @a str plus one element for the terminator.
 *
 * @param str  Source string.
 *
 * @return New wide string, or NULL if the allocation failed.
 */
char32_t *str_to_awstr(const char *str)
{
	/* One element per character plus one for the terminator. */
	size_t slots = str_length(str) + 1;

	char32_t *wstr = calloc(slots, sizeof(char32_t));
	if (wstr != NULL)
		str_to_wstr(wstr, slots, str);

	return wstr;
}
1483
/*
 * Byte-wise search for c in str. Returns a pointer to the first matching
 * byte, or NULL if the terminator is reached first. Searching for the zero
 * byte yields a pointer to the terminator itself.
 */
static char *_strchr(const char *str, char c)
{
	for (;; str++) {
		/* Checked before the terminator test so that c == 0 matches. */
		if (*str == c)
			return (char *) str;

		if (*str == 0)
			return NULL;
	}
}
1491
1492/** Find first occurence of character in string.
1493 *
1494 * @param str String to search.
1495 * @param ch Character to look for.
1496 *
1497 * @return Pointer to character in @a str or NULL if not found.
1498 */
1499char *str_chr(const char *str, char32_t ch)
1500{
1501 /* Fast path for an ASCII character. */
1502 if (ascii_check(ch))
1503 return _strchr(str, ch);
1504
1505 /* Convert character to UTF-8. */
1506 char utf8[STR_BOUNDS(1) + 1];
1507 size_t offset = 0;
1508
1509 if (chr_encode(ch, utf8, &offset, sizeof(utf8)) != EOK || offset == 0)
1510 return NULL;
1511
1512 utf8[offset] = '\0';
1513
1514 /* Find the first byte, then check if all of them are correct. */
1515 while (*str != 0) {
1516 str = _strchr(str, utf8[0]);
1517 if (!str)
1518 return NULL;
1519
1520 if (_test_prefix(str, utf8))
1521 return (char *) str;
1522
1523 str++;
1524 }
1525
1526 return NULL;
1527}
1528
/** Find first occurrence of substring in string.
 *
 * Every position of @a hs that still has at least as many bytes left as
 * the needle is long is tested with _test_prefix().
 *
 * @param hs  Haystack (string)
 * @param n   Needle (substring to look for)
 *
 * @return Pointer to the start of the match in @a hs or @c NULL if
 *         not found.
 */
char *str_str(const char *hs, const char *n)
{
	size_t remaining = _str_size(hs);
	size_t needle_size = _str_size(n);

	/* Stop as soon as the rest of the haystack is shorter than n. */
	for (; remaining >= needle_size; hs++, remaining--) {
		if (_test_prefix(hs, n))
			return (char *) hs;
	}

	return NULL;
}
1551
/*
 * Remove every trailing occurrence of byte c from str, in place.
 *
 * The new end of the string is tracked as "one past the last byte that
 * differs from c". It starts out at the beginning of the string, so an empty
 * string is left alone (nothing is written behind its terminator) and
 * a string consisting solely of c becomes empty.
 */
static void _str_rtrim(char *str, char c)
{
	char *end = str;

	for (char *p = str; *p != 0; p++) {
		if (*p != c)
			end = p + 1;
	}

	/* Truncate string. */
	*end = 0;
}
1566
1567/** Removes specified trailing characters from a string.
1568 *
1569 * @param str String to remove from.
1570 * @param ch Character to remove.
1571 */
1572void str_rtrim(char *str, char32_t ch)
1573{
1574 /* Fast path for the ASCII case. */
1575 if (ascii_check(ch)) {
1576 _str_rtrim(str, ch);
1577 return;
1578 }
1579
1580 size_t off = 0;
1581 size_t pos = 0;
1582 char32_t c;
1583 bool update_last_chunk = true;
1584 char *last_chunk = NULL;
1585
1586 while ((c = str_decode(str, &off, STR_NO_LIMIT))) {
1587 if (c != ch) {
1588 update_last_chunk = true;
1589 last_chunk = NULL;
1590 } else if (update_last_chunk) {
1591 update_last_chunk = false;
1592 last_chunk = (str + pos);
1593 }
1594 pos = off;
1595 }
1596
1597 if (last_chunk)
1598 *last_chunk = '\0';
1599}
1600
/*
 * Remove every leading occurrence of byte c from str, in place, by copying
 * the remainder of the string to its beginning with _str_cpy().
 */
static void _str_ltrim(char *str, char c)
{
	char *rest = str;

	/* Find the first byte that differs from c. */
	while (*rest == c)
		rest++;

	/* Nothing to strip. */
	if (rest == str)
		return;

	_str_cpy(str, rest);
}
1611
1612/** Removes specified leading characters from a string.
1613 *
1614 * @param str String to remove from.
1615 * @param ch Character to remove.
1616 */
1617void str_ltrim(char *str, char32_t ch)
1618{
1619 /* Fast path for the ASCII case. */
1620 if (ascii_check(ch)) {
1621 _str_ltrim(str, ch);
1622 return;
1623 }
1624
1625 char32_t acc;
1626 size_t off = 0;
1627 size_t pos = 0;
1628 size_t str_sz = str_size(str);
1629
1630 while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
1631 if (acc != ch)
1632 break;
1633 else
1634 pos = off;
1635 }
1636
1637 if (pos > 0) {
1638 memmove(str, &str[pos], str_sz - pos);
1639 pos = str_sz - pos;
1640 str[pos] = '\0';
1641 }
1642}
1643
/*
 * Byte-wise search for the last occurrence of c in str. Returns a pointer
 * to it, or NULL if c does not occur before the terminator. The terminator
 * itself is never matched.
 */
static char *_str_rchr(const char *str, char c)
{
	const char *found = NULL;

	for (const char *p = str; *p != 0; p++) {
		/* A later match replaces an earlier one. */
		if (*p == c)
			found = p;
	}

	return (char *) found;
}
1657
1658/** Find last occurence of character in string.
1659 *
1660 * @param str String to search.
1661 * @param ch Character to look for.
1662 *
1663 * @return Pointer to character in @a str or NULL if not found.
1664 */
1665char *str_rchr(const char *str, char32_t ch)
1666{
1667 if (ascii_check(ch))
1668 return _str_rchr(str, ch);
1669
1670 char32_t acc;
1671 size_t off = 0;
1672 size_t last = 0;
1673 const char *res = NULL;
1674
1675 while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
1676 if (acc == ch)
1677 res = (str + last);
1678 last = off;
1679 }
1680
1681 return (char *) res;
1682}
1683
/** Insert a wide character into a wide string.
 *
 * Insert a wide character into a wide string at position
 * @a pos. The characters after the position (and the terminator)
 * are shifted by one element towards the end of the buffer.
 *
 * The insertion is refused when the string already holds @a max_pos
 * characters, because the shifted tail would otherwise be written beyond
 * the limit given by the caller.
 *
 * @param str      String to insert to.
 * @param ch       Character to insert to.
 * @param pos      Character index where to insert.
 * @param max_pos  Characters in the buffer.
 *
 * @return True if the insertion was successful, false if the position
 *         is out of bounds or the string cannot grow any further.
 *
 */
bool wstr_linsert(char32_t *str, char32_t ch, size_t pos, size_t max_pos)
{
	size_t len = wstr_length(str);

	/*
	 * pos <= len < max_pos also implies pos + 1 <= max_pos, so the
	 * insertion position itself needs no separate test.
	 */
	if ((pos > len) || (len >= max_pos))
		return false;

	/* Shift the tail including the terminator, starting from the end. */
	for (size_t i = len + 1; i > pos; i--)
		str[i] = str[i - 1];

	str[pos] = ch;

	return true;
}
1713
/** Remove a wide character from a wide string.
 *
 * Remove the wide character at position @a pos. The characters behind it,
 * together with the terminator, are shifted by one element towards the
 * beginning of the string.
 *
 * @param str  String to remove from.
 * @param pos  Character index to remove.
 *
 * @return True if the removal was successful, false if the position
 *         is out of bounds.
 *
 */
bool wstr_remove(char32_t *str, size_t pos)
{
	size_t len = wstr_length(str);

	/* Only existing characters can be removed, not the terminator. */
	if (pos >= len)
		return false;

	/* The last iteration pulls the terminator from str[len]. */
	for (size_t i = pos; i < len; i++)
		str[i] = str[i + 1];

	return true;
}
1739
/** Duplicate string.
 *
 * Allocate a new buffer with malloc() and copy the source string,
 * including its terminator, into it. The copy is then passed through
 * _sanitize_string(), so it can differ from the source string on the
 * byte level.
 *
 * @param src  Source string.
 *
 * @return Duplicate string, or NULL if the allocation failed.
 *
 */
char *str_dup(const char *src)
{
	/* Size of the string in bytes, terminator included. */
	size_t size = _str_size(src) + 1;

	char *dest = malloc(size);
	if (dest != NULL) {
		memcpy(dest, src, size);
		_sanitize_string(dest, size);
	}

	return dest;
}
1767
/** Duplicate string with size limit.
 *
 * Allocate a new buffer with malloc() and copy up to @a n bytes from
 * the source string into it. The number of bytes copied is determined by
 * _str_nsize(); one more byte is allocated for the terminator, so no more
 * than @a n + 1 bytes are allocated. The copy is passed through
 * _sanitize_string(), so it can differ from the source string on the
 * byte level.
 *
 * @param src  Source string.
 * @param n    Maximum number of bytes to duplicate.
 *
 * @return Duplicate string, or NULL if the allocation failed.
 *
 */
char *str_ndup(const char *src, size_t n)
{
	size_t len = _str_nsize(src, n);

	char *copy = malloc(len + 1);
	if (copy == NULL)
		return NULL;

	memcpy(copy, src, len);
	_sanitize_string(copy, len);

	/* The copied prefix carries no terminator of its own. */
	copy[len] = 0;
	return copy;
}
1801
/** Split string by delimiters.
 *
 * Skip the delimiter characters at the beginning of @a s, then find the end
 * of the token that follows and terminate the token in place by overwriting
 * the byte behind it with a null byte.
 *
 * @param s      String to be tokenized. If NULL, the function returns NULL
 *               right away and @a next is not written.
 * @param delim  String with the delimiters.
 * @param next   Variable which will receive the pointer to the
 *               continuation of the string following the first
 *               occurrence of any of the delimiter characters
 *               (or to the end of the string if the token was the
 *               last one). May be NULL.
 * @return       Pointer to the prefix of @a s before the first
 *               delimiter character. NULL if no such prefix
 *               exists.
 */
char *str_tok(char *s, const char *delim, char **next)
{
	if (s == NULL)
		return NULL;

	size_t size = str_size(s);
	/* Decoding position; runs one character ahead of mark. */
	size_t probe = 0;
	/* Offset behind the last character accepted by the current scan. */
	size_t mark = 0;
	char32_t c;

	/* Skip over leading delimiters. */
	for (;;) {
		c = str_decode(s, &probe, size);
		if (c == 0 || str_chr(delim, c) == NULL)
			break;

		mark = probe;
	}

	char *token = s + mark;

	/* Skip over token characters, restarting at the token's first one. */
	probe = mark;
	for (;;) {
		c = str_decode(s, &probe, size);
		if (c == 0 || str_chr(delim, c) != NULL)
			break;

		mark = probe;
	}

	char *token_end = s + mark;

	/*
	 * If a delimiter stopped the scan, continue behind it; otherwise
	 * the end of the string was reached.
	 */
	if (next != NULL)
		*next = (c != 0) ? s + probe : s + mark;

	/* No more tokens. */
	if (token == token_end)
		return NULL;

	/* Overwrite delimiter with null terminator. */
	*token_end = '\0';
	return token;
}
1848
/** Scale a value down for display with a decimal (SI) unit prefix.
 *
 * The value is divided by the largest power of 1000 that still leaves
 * more than 1000 units (more than 10 units for the exa range, since
 * 10^21 does not fit into 64 bits).
 *
 * @param val     Value to scale.
 * @param rv      Place to store the scaled value.
 * @param suffix  Place to store the prefix character: 'E' (10^18),
 *                'P' (10^15), 'T' (10^12), 'G' (10^9), 'M' (10^6),
 *                'k' (10^3) or ' ' if the value was not scaled.
 */
void order_suffix(const uint64_t val, uint64_t *rv, char *suffix)
{
	if (val > UINT64_C(10000000000000000000)) {
		/* Units of 10^18 are exa, not zetta. */
		*rv = val / UINT64_C(1000000000000000000);
		*suffix = 'E';
	} else if (val > UINT64_C(1000000000000000000)) {
		/* Units of 10^15 are peta, not exa. */
		*rv = val / UINT64_C(1000000000000000);
		*suffix = 'P';
	} else if (val > UINT64_C(1000000000000000)) {
		*rv = val / UINT64_C(1000000000000);
		*suffix = 'T';
	} else if (val > UINT64_C(1000000000000)) {
		*rv = val / UINT64_C(1000000000);
		*suffix = 'G';
	} else if (val > UINT64_C(1000000000)) {
		*rv = val / UINT64_C(1000000);
		*suffix = 'M';
	} else if (val > UINT64_C(1000000)) {
		*rv = val / UINT64_C(1000);
		*suffix = 'k';
	} else {
		*rv = val;
		*suffix = ' ';
	}
}
1874
/** Scale a byte count down for display with a binary (IEC) unit.
 *
 * The value is divided by the largest power of 1024 that still leaves
 * more than 1024 units.
 *
 * @param val     Value to scale.
 * @param rv      Place to store the scaled value.
 * @param suffix  Place to store a pointer to the unit string: "PiB" (2^50),
 *                "TiB" (2^40), "GiB" (2^30), "MiB" (2^20), "KiB" (2^10),
 *                or the plain byte unit if the value was not scaled.
 *                The strings are literals and must not be modified.
 * @param fixed   Selects the padded variant of the plain byte unit.
 */
void bin_order_suffix(const uint64_t val, uint64_t *rv, const char **suffix,
    bool fixed)
{
	if (val > UINT64_C(1152921504606846976)) {
		/* The divisor is 2^50, i.e. the result is in pebibytes. */
		*rv = val / UINT64_C(1125899906842624);
		*suffix = "PiB";
	} else if (val > UINT64_C(1125899906842624)) {
		*rv = val / UINT64_C(1099511627776);
		*suffix = "TiB";
	} else if (val > UINT64_C(1099511627776)) {
		*rv = val / UINT64_C(1073741824);
		*suffix = "GiB";
	} else if (val > UINT64_C(1073741824)) {
		*rv = val / UINT64_C(1048576);
		*suffix = "MiB";
	} else if (val > UINT64_C(1048576)) {
		*rv = val / UINT64_C(1024);
		*suffix = "KiB";
	} else {
		*rv = val;
		/*
		 * NOTE(review): the padding presumably aligns "B" with the
		 * three-character units - confirm the intended width.
		 */
		if (fixed)
			*suffix = "B ";
		else
			*suffix = "B";
	}
}
1901
1902/** @}
1903 */
Note: See TracBrowser for help on using the repository browser.