source: mainline/common/str.c@ e2b417f

Last change on this file since e2b417f was fdfb24e, checked in by Jiří Zárevúcky <zarevucky.jiri@…>, 20 months ago

Deduplicate string related functions

  • Property mode set to 100644
File size: 37.2 KB
Line 
1/*
2 * Copyright (c) 2001-2004 Jakub Jermar
3 * Copyright (c) 2005 Martin Decky
4 * Copyright (c) 2008 Jiri Svoboda
5 * Copyright (c) 2011 Martin Sucha
6 * Copyright (c) 2011 Oleg Romanenko
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * - Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * - Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * - The name of the author may not be used to endorse or promote products
19 * derived from this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33/** @addtogroup libc
34 * @{
35 */
36
37/**
38 * @file
39 * @brief String functions.
40 *
 * Strings and characters use the Universal Character Set (UCS). Standard
 * strings (called simply "strings") are encoded in UTF-8. Wide strings
 * (encoded in UTF-32) are supported to a limited degree. A single character
 * is represented as char32_t.@n
45 *
46 * Overview of the terminology:@n
47 *
48 * Term Meaning
49 * -------------------- ----------------------------------------------------
50 * byte 8 bits stored in uint8_t (unsigned 8 bit integer)
51 *
52 * character UTF-32 encoded Unicode character, stored in char32_t
53 * (unsigned 32 bit integer), code points 0 .. 1114111
54 * are valid
55 *
56 * ASCII character 7 bit encoded ASCII character, stored in char
57 * (usually signed 8 bit integer), code points 0 .. 127
58 * are valid
59 *
60 * string UTF-8 encoded NULL-terminated Unicode string, char *
61 *
62 * wide string UTF-32 encoded NULL-terminated Unicode string,
63 * char32_t *
64 *
65 * [wide] string size number of BYTES in a [wide] string (excluding
66 * the NULL-terminator), size_t
67 *
68 * [wide] string length number of CHARACTERS in a [wide] string (excluding
69 * the NULL-terminator), size_t
70 *
71 * [wide] string width number of display cells on a monospace display taken
72 * by a [wide] string, size_t
73 *
74 *
75 * Overview of string metrics:@n
76 *
77 * Metric Abbrev. Type Meaning
78 * ------ ------ ------ -------------------------------------------------
79 * size n size_t number of BYTES in a string (excluding the
80 * NULL-terminator)
81 *
82 * length l size_t number of CHARACTERS in a string (excluding the
83 * null terminator)
84 *
85 * width w size_t number of display cells on a monospace display
86 * taken by a string
87 *
88 *
89 * Function naming prefixes:@n
90 *
91 * chr_ operate on characters
92 * ascii_ operate on ASCII characters
93 * str_ operate on strings
94 * wstr_ operate on wide strings
95 *
96 * [w]str_[n|l|w] operate on a prefix limited by size, length
97 * or width
98 *
99 *
100 * A specific character inside a [wide] string can be referred to by:@n
101 *
102 * pointer (char *, char32_t *)
103 * byte offset (size_t)
104 * character index (size_t)
105 *
106 */
107
108#include <str.h>
109
110#include <assert.h>
111#include <ctype.h>
112#include <errno.h>
113#include <stdbool.h>
114#include <stddef.h>
115#include <stdint.h>
116#include <stdlib.h>
117
118#include <align.h>
119#include <mem.h>
120
/*
 * Bit-mask helpers used by the UTF-8 encoder/decoder.
 *
 * The shifts are done on unsigned operands so that no signed overflow can
 * occur, and HI_MASK_8 is narrowed back to uint8_t because the ~ operator
 * promotes its operand to int (which would otherwise give a negative value).
 */

/** Mask consisting of the lowest @a n bits of a byte (0 <= n <= 8) */
#define LO_MASK_8(n)  ((uint8_t) ((1U << (n)) - 1))

/** Mask consisting of the lowest @a n bits of a 32-bit word (0 <= n <= 31) */
#define LO_MASK_32(n)  ((uint32_t) ((UINT32_C(1) << (n)) - 1))

/** Mask consisting of the highest @a n bits of a byte (0 <= n <= 8) */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6
132
133/** Decode a single character from a string.
134 *
135 * Decode a single character from a string of size @a size. Decoding starts
136 * at @a offset and this offset is moved to the beginning of the next
137 * character. In case of decoding error, offset generally advances at least
138 * by one. However, offset is never moved beyond size.
139 *
140 * @param str String (not necessarily NULL-terminated).
141 * @param offset Byte offset in string where to start decoding.
142 * @param size Size of the string (in bytes).
143 *
144 * @return Value of decoded character, U_SPECIAL on decoding error or
145 * NULL if attempt to decode beyond @a size.
146 *
147 */
148char32_t str_decode(const char *str, size_t *offset, size_t size)
149{
150 if (*offset + 1 > size)
151 return 0;
152
153 /* First byte read from string */
154 uint8_t b0 = (uint8_t) str[(*offset)++];
155
156 /* Determine code length */
157
158 unsigned int b0_bits; /* Data bits in first byte */
159 unsigned int cbytes; /* Number of continuation bytes */
160
161 if ((b0 & 0x80) == 0) {
162 /* 0xxxxxxx (Plain ASCII) */
163 b0_bits = 7;
164 cbytes = 0;
165 } else if ((b0 & 0xe0) == 0xc0) {
166 /* 110xxxxx 10xxxxxx */
167 b0_bits = 5;
168 cbytes = 1;
169 } else if ((b0 & 0xf0) == 0xe0) {
170 /* 1110xxxx 10xxxxxx 10xxxxxx */
171 b0_bits = 4;
172 cbytes = 2;
173 } else if ((b0 & 0xf8) == 0xf0) {
174 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
175 b0_bits = 3;
176 cbytes = 3;
177 } else {
178 /* 10xxxxxx -- unexpected continuation byte */
179 return U_SPECIAL;
180 }
181
182 if (*offset + cbytes > size)
183 return U_SPECIAL;
184
185 char32_t ch = b0 & LO_MASK_8(b0_bits);
186
187 /* Decode continuation bytes */
188 while (cbytes > 0) {
189 uint8_t b = (uint8_t) str[(*offset)++];
190
191 /* Must be 10xxxxxx */
192 if ((b & 0xc0) != 0x80)
193 return U_SPECIAL;
194
195 /* Shift data bits to ch */
196 ch = (ch << CONT_BITS) | (char32_t) (b & LO_MASK_8(CONT_BITS));
197 cbytes--;
198 }
199
200 return ch;
201}
202
203/** Decode a single character from a string to the left.
204 *
205 * Decode a single character from a string of size @a size. Decoding starts
206 * at @a offset and this offset is moved to the beginning of the previous
207 * character. In case of decoding error, offset generally decreases at least
208 * by one. However, offset is never moved before 0.
209 *
210 * @param str String (not necessarily NULL-terminated).
211 * @param offset Byte offset in string where to start decoding.
212 * @param size Size of the string (in bytes).
213 *
214 * @return Value of decoded character, U_SPECIAL on decoding error or
215 * NULL if attempt to decode beyond @a start of str.
216 *
217 */
218char32_t str_decode_reverse(const char *str, size_t *offset, size_t size)
219{
220 if (*offset == 0)
221 return 0;
222
223 size_t processed = 0;
224 /* Continue while continuation bytes found */
225 while (*offset > 0 && processed < 4) {
226 uint8_t b = (uint8_t) str[--(*offset)];
227
228 if (processed == 0 && (b & 0x80) == 0) {
229 /* 0xxxxxxx (Plain ASCII) */
230 return b & 0x7f;
231 } else if ((b & 0xe0) == 0xc0 || (b & 0xf0) == 0xe0 ||
232 (b & 0xf8) == 0xf0) {
233 /* Start byte */
234 size_t start_offset = *offset;
235 return str_decode(str, &start_offset, size);
236 } else if ((b & 0xc0) != 0x80) {
237 /* Not a continuation byte */
238 return U_SPECIAL;
239 }
240 processed++;
241 }
242 /* Too many continuation bytes */
243 return U_SPECIAL;
244}
245
246/** Encode a single character to string representation.
247 *
248 * Encode a single character to string representation (i.e. UTF-8) and store
249 * it into a buffer at @a offset. Encoding starts at @a offset and this offset
250 * is moved to the position where the next character can be written to.
251 *
252 * @param ch Input character.
253 * @param str Output buffer.
254 * @param offset Byte offset where to start writing.
255 * @param size Size of the output buffer (in bytes).
256 *
257 * @return EOK if the character was encoded successfully, EOVERFLOW if there
258 * was not enough space in the output buffer or EINVAL if the character
259 * code was invalid.
260 */
261errno_t chr_encode(const char32_t ch, char *str, size_t *offset, size_t size)
262{
263 if (*offset >= size)
264 return EOVERFLOW;
265
266 if (!chr_check(ch))
267 return EINVAL;
268
269 /*
270 * Unsigned version of ch (bit operations should only be done
271 * on unsigned types).
272 */
273 uint32_t cc = (uint32_t) ch;
274
275 /* Determine how many continuation bytes are needed */
276
277 unsigned int b0_bits; /* Data bits in first byte */
278 unsigned int cbytes; /* Number of continuation bytes */
279
280 if ((cc & ~LO_MASK_32(7)) == 0) {
281 b0_bits = 7;
282 cbytes = 0;
283 } else if ((cc & ~LO_MASK_32(11)) == 0) {
284 b0_bits = 5;
285 cbytes = 1;
286 } else if ((cc & ~LO_MASK_32(16)) == 0) {
287 b0_bits = 4;
288 cbytes = 2;
289 } else if ((cc & ~LO_MASK_32(21)) == 0) {
290 b0_bits = 3;
291 cbytes = 3;
292 } else {
293 /* Codes longer than 21 bits are not supported */
294 return EINVAL;
295 }
296
297 /* Check for available space in buffer */
298 if (*offset + cbytes >= size)
299 return EOVERFLOW;
300
301 /* Encode continuation bytes */
302 unsigned int i;
303 for (i = cbytes; i > 0; i--) {
304 str[*offset + i] = 0x80 | (cc & LO_MASK_32(CONT_BITS));
305 cc = cc >> CONT_BITS;
306 }
307
308 /* Encode first byte */
309 str[*offset] = (cc & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);
310
311 /* Advance offset */
312 *offset += cbytes + 1;
313
314 return EOK;
315}
316
/** Get size of string.
 *
 * Count the bytes occupied by the string @a str, not counting the
 * null terminator.
 *
 * @param str Null-terminated string to measure.
 *
 * @return Number of bytes used by the string.
 *
 */
size_t str_size(const char *str)
{
	const char *end = str;

	while (*end != '\0')
		end++;

	return (size_t) (end - str);
}
336
/** Get size of wide string.
 *
 * Compute the number of bytes occupied by the wide string @a str, not
 * counting the null terminator.
 *
 * @param str Null-terminated wide string to measure.
 *
 * @return Number of bytes used by the wide string.
 *
 */
size_t wstr_size(const char32_t *str)
{
	size_t chars = wstr_length(str);

	return chars * sizeof(char32_t);
}
351
352/** Get size of string with length limit.
353 *
354 * Get the number of bytes which are used by up to @a max_len first
355 * characters in the string @a str. If @a max_len is greater than
356 * the length of @a str, the entire string is measured (excluding the
357 * NULL-terminator).
358 *
359 * @param str String to consider.
360 * @param max_len Maximum number of characters to measure.
361 *
362 * @return Number of bytes used by the characters.
363 *
364 */
365size_t str_lsize(const char *str, size_t max_len)
366{
367 size_t len = 0;
368 size_t offset = 0;
369
370 while (len < max_len) {
371 if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
372 break;
373
374 len++;
375 }
376
377 return offset;
378}
379
/** Get size of string with size limit.
 *
 * Get the number of bytes which are used by the string @a str
 * (excluding the null terminator), but no more than @a max_size bytes.
 *
 * At most @a max_size bytes of @a str are read, so the string does not
 * have to be null-terminated if it is at least that long.
 *
 * @param str      String to consider.
 * @param max_size Maximum number of bytes to measure.
 *
 * @return Number of bytes used by the string.
 *
 */
size_t str_nsize(const char *str, size_t max_size)
{
	size_t size = 0;

	/* Test the bound first so that str[max_size] is never touched. */
	while ((size < max_size) && (str[size] != '\0'))
		size++;

	return size;
}
400
/** Get size of wide string with size limit.
 *
 * Compute the number of bytes occupied by the wide string @a str (not
 * counting the null terminator), considering no more than @a max_size
 * bytes of it.
 *
 * @param str      Wide string to consider.
 * @param max_size Maximum number of bytes to measure.
 *
 * @return Number of bytes used by the wide string.
 *
 */
size_t wstr_nsize(const char32_t *str, size_t max_size)
{
	size_t chars = wstr_nlength(str, max_size);

	return chars * sizeof(char32_t);
}
416
/** Get size of wide string with length limit.
 *
 * Get the number of bytes which are used by up to @a max_len first
 * wide characters in the wide string @a str. If @a max_len is greater than
 * the length of @a str, the entire wide string is measured (excluding the
 * null terminator).
 *
 * The characters are counted directly rather than converting @a max_len
 * to a byte limit, because that multiplication could wrap around for very
 * large limits and silently shorten the measured prefix.
 *
 * @param str     Wide string to consider.
 * @param max_len Maximum number of wide characters to measure.
 *
 * @return Number of bytes used by the wide characters.
 *
 */
size_t wstr_lsize(const char32_t *str, size_t max_len)
{
	size_t len = 0;

	while ((len < max_len) && (str[len] != 0))
		len++;

	return len * sizeof(char32_t);
}
434
435/** Get number of characters in a string.
436 *
437 * @param str NULL-terminated string.
438 *
439 * @return Number of characters in string.
440 *
441 */
442size_t str_length(const char *str)
443{
444 size_t len = 0;
445 size_t offset = 0;
446
447 while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
448 len++;
449
450 return len;
451}
452
/** Get number of characters in a wide string.
 *
 * @param wstr Null-terminated wide string.
 *
 * @return Number of characters in @a wstr (excluding the terminator).
 *
 */
size_t wstr_length(const char32_t *wstr)
{
	size_t idx;

	for (idx = 0; wstr[idx] != 0; idx++) {
		/* Nothing to do, just looking for the terminator */
	}

	return idx;
}
469
/** Get number of characters in a string with size limit.
 *
 * Counting stops at the null terminator or once @a size bytes have been
 * consumed, whichever comes first.
 *
 * @param str  String to measure.
 * @param size Maximum number of bytes to consider.
 *
 * @return Number of characters in string.
 *
 */
size_t str_nlength(const char *str, size_t size)
{
	size_t pos = 0;
	size_t count = 0;

	for (;;) {
		if (str_decode(str, &pos, size) == 0)
			break;

		count++;
	}

	return count;
}
488
/** Get number of characters in a wide string with size limit.
 *
 * Counting stops at the null terminator or once @a size bytes (rounded
 * down to a whole number of wide characters) have been examined,
 * whichever comes first.
 *
 * @param str  Wide string to measure.
 * @param size Maximum number of bytes to consider.
 *
 * @return Number of characters in the wide string.
 *
 */
size_t wstr_nlength(const char32_t *str, size_t size)
{
	/* Ignore a trailing partial character. */
	const size_t limit = ALIGN_DOWN(size, sizeof(char32_t));
	size_t len = 0;

	for (size_t off = 0; off < limit; off += sizeof(char32_t)) {
		if (str[len] == 0)
			break;

		len++;
	}

	return len;
}
510
/** Get character display width on a character cell display.
 *
 * Every character is currently reported as occupying exactly one cell;
 * the argument does not influence the result.
 *
 * @param ch Character
 * @return Width of character in cells.
 */
size_t chr_width(char32_t ch)
{
	(void) ch;

	return (size_t) 1;
}
520
521/** Get string display width on a character cell display.
522 *
523 * @param str String
524 * @return Width of string in cells.
525 */
526size_t str_width(const char *str)
527{
528 size_t width = 0;
529 size_t offset = 0;
530 char32_t ch;
531
532 while ((ch = str_decode(str, &offset, STR_NO_LIMIT)) != 0)
533 width += chr_width(ch);
534
535 return width;
536}
537
/** Check whether character is plain ASCII.
 *
 * @param ch Character to test.
 *
 * @return True if @a ch lies in the 7-bit range 0 .. 127.
 *
 */
bool ascii_check(char32_t ch)
{
	return ch <= 0x7f;
}
550
/** Check whether character is valid.
 *
 * Only the upper bound is tested; every value from 0 up to and including
 * 0x10FFFF (1114111) is accepted.
 *
 * @param ch Character to test.
 *
 * @return True if @a ch does not exceed the highest Unicode code point.
 *
 */
bool chr_check(char32_t ch)
{
	return ch <= 0x10FFFF;
}
563
564/** Compare two NULL terminated strings.
565 *
566 * Do a char-by-char comparison of two NULL-terminated strings.
567 * The strings are considered equal iff their length is equal
568 * and both strings consist of the same sequence of characters.
569 *
570 * A string S1 is less than another string S2 if it has a character with
571 * lower value at the first character position where the strings differ.
572 * If the strings differ in length, the shorter one is treated as if
573 * padded by characters with a value of zero.
574 *
575 * @param s1 First string to compare.
576 * @param s2 Second string to compare.
577 *
578 * @return 0 if the strings are equal, -1 if the first is less than the second,
579 * 1 if the second is less than the first.
580 *
581 */
582int str_cmp(const char *s1, const char *s2)
583{
584 char32_t c1 = 0;
585 char32_t c2 = 0;
586
587 size_t off1 = 0;
588 size_t off2 = 0;
589
590 while (true) {
591 c1 = str_decode(s1, &off1, STR_NO_LIMIT);
592 c2 = str_decode(s2, &off2, STR_NO_LIMIT);
593
594 if (c1 < c2)
595 return -1;
596
597 if (c1 > c2)
598 return 1;
599
600 if (c1 == 0 || c2 == 0)
601 break;
602 }
603
604 return 0;
605}
606
607/** Compare two NULL terminated strings with length limit.
608 *
609 * Do a char-by-char comparison of two NULL-terminated strings.
610 * The strings are considered equal iff
611 * min(str_length(s1), max_len) == min(str_length(s2), max_len)
612 * and both strings consist of the same sequence of characters,
613 * up to max_len characters.
614 *
615 * A string S1 is less than another string S2 if it has a character with
616 * lower value at the first character position where the strings differ.
617 * If the strings differ in length, the shorter one is treated as if
618 * padded by characters with a value of zero. Only the first max_len
619 * characters are considered.
620 *
621 * @param s1 First string to compare.
622 * @param s2 Second string to compare.
623 * @param max_len Maximum number of characters to consider.
624 *
625 * @return 0 if the strings are equal, -1 if the first is less than the second,
626 * 1 if the second is less than the first.
627 *
628 */
629int str_lcmp(const char *s1, const char *s2, size_t max_len)
630{
631 char32_t c1 = 0;
632 char32_t c2 = 0;
633
634 size_t off1 = 0;
635 size_t off2 = 0;
636
637 size_t len = 0;
638
639 while (true) {
640 if (len >= max_len)
641 break;
642
643 c1 = str_decode(s1, &off1, STR_NO_LIMIT);
644 c2 = str_decode(s2, &off2, STR_NO_LIMIT);
645
646 if (c1 < c2)
647 return -1;
648
649 if (c1 > c2)
650 return 1;
651
652 if (c1 == 0 || c2 == 0)
653 break;
654
655 ++len;
656 }
657
658 return 0;
659
660}
661
662/** Compare two NULL terminated strings in case-insensitive manner.
663 *
664 * Do a char-by-char comparison of two NULL-terminated strings.
665 * The strings are considered equal iff their length is equal
666 * and both strings consist of the same sequence of characters
667 * when converted to lower case.
668 *
669 * A string S1 is less than another string S2 if it has a character with
670 * lower value at the first character position where the strings differ.
671 * If the strings differ in length, the shorter one is treated as if
672 * padded by characters with a value of zero.
673 *
674 * @param s1 First string to compare.
675 * @param s2 Second string to compare.
676 *
677 * @return 0 if the strings are equal, -1 if the first is less than the second,
678 * 1 if the second is less than the first.
679 *
680 */
681int str_casecmp(const char *s1, const char *s2)
682{
683 char32_t c1 = 0;
684 char32_t c2 = 0;
685
686 size_t off1 = 0;
687 size_t off2 = 0;
688
689 while (true) {
690 c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
691 c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
692
693 if (c1 < c2)
694 return -1;
695
696 if (c1 > c2)
697 return 1;
698
699 if (c1 == 0 || c2 == 0)
700 break;
701 }
702
703 return 0;
704}
705
706/** Compare two NULL terminated strings with length limit in case-insensitive
707 * manner.
708 *
709 * Do a char-by-char comparison of two NULL-terminated strings.
710 * The strings are considered equal iff
711 * min(str_length(s1), max_len) == min(str_length(s2), max_len)
712 * and both strings consist of the same sequence of characters,
713 * up to max_len characters.
714 *
715 * A string S1 is less than another string S2 if it has a character with
716 * lower value at the first character position where the strings differ.
717 * If the strings differ in length, the shorter one is treated as if
718 * padded by characters with a value of zero. Only the first max_len
719 * characters are considered.
720 *
721 * @param s1 First string to compare.
722 * @param s2 Second string to compare.
723 * @param max_len Maximum number of characters to consider.
724 *
725 * @return 0 if the strings are equal, -1 if the first is less than the second,
726 * 1 if the second is less than the first.
727 *
728 */
729int str_lcasecmp(const char *s1, const char *s2, size_t max_len)
730{
731 char32_t c1 = 0;
732 char32_t c2 = 0;
733
734 size_t off1 = 0;
735 size_t off2 = 0;
736
737 size_t len = 0;
738
739 while (true) {
740 if (len >= max_len)
741 break;
742
743 c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
744 c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
745
746 if (c1 < c2)
747 return -1;
748
749 if (c1 > c2)
750 return 1;
751
752 if (c1 == 0 || c2 == 0)
753 break;
754
755 ++len;
756 }
757
758 return 0;
759
760}
761
762/** Test whether p is a prefix of s.
763 *
764 * Do a char-by-char comparison of two NULL-terminated strings
765 * and determine if p is a prefix of s.
766 *
767 * @param s The string in which to look
768 * @param p The string to check if it is a prefix of s
769 *
770 * @return true iff p is prefix of s else false
771 *
772 */
773bool str_test_prefix(const char *s, const char *p)
774{
775 char32_t c1 = 0;
776 char32_t c2 = 0;
777
778 size_t off1 = 0;
779 size_t off2 = 0;
780
781 while (true) {
782 c1 = str_decode(s, &off1, STR_NO_LIMIT);
783 c2 = str_decode(p, &off2, STR_NO_LIMIT);
784
785 if (c2 == 0)
786 return true;
787
788 if (c1 != c2)
789 return false;
790
791 if (c1 == 0)
792 break;
793 }
794
795 return false;
796}
797
798/** Get a string suffix.
799 *
800 * Return a string suffix defined by the prefix length.
801 *
802 * @param s The string to get the suffix from.
803 * @param prefix_length Number of prefix characters to ignore.
804 *
805 * @return String suffix.
806 *
807 */
808const char *str_suffix(const char *s, size_t prefix_length)
809{
810 size_t off = 0;
811 size_t i = 0;
812
813 while (true) {
814 str_decode(s, &off, STR_NO_LIMIT);
815 i++;
816
817 if (i >= prefix_length)
818 break;
819 }
820
821 return s + off;
822}
823
824/** Copy string.
825 *
826 * Copy source string @a src to destination buffer @a dest.
827 * No more than @a size bytes are written. If the size of the output buffer
828 * is at least one byte, the output string will always be well-formed, i.e.
829 * null-terminated and containing only complete characters.
830 *
831 * @param dest Destination buffer.
832 * @param count Size of the destination buffer (must be > 0).
833 * @param src Source string.
834 *
835 */
836void str_cpy(char *dest, size_t size, const char *src)
837{
838 /* There must be space for a null terminator in the buffer. */
839 assert(size > 0);
840 assert(src != NULL);
841
842 size_t src_off = 0;
843 size_t dest_off = 0;
844
845 char32_t ch;
846 while ((ch = str_decode(src, &src_off, STR_NO_LIMIT)) != 0) {
847 if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
848 break;
849 }
850
851 dest[dest_off] = '\0';
852}
853
854/** Copy size-limited substring.
855 *
856 * Copy prefix of string @a src of max. size @a size to destination buffer
857 * @a dest. No more than @a size bytes are written. The output string will
858 * always be well-formed, i.e. null-terminated and containing only complete
859 * characters.
860 *
861 * No more than @a n bytes are read from the input string, so it does not
862 * have to be null-terminated.
863 *
864 * @param dest Destination buffer.
865 * @param count Size of the destination buffer (must be > 0).
866 * @param src Source string.
867 * @param n Maximum number of bytes to read from @a src.
868 *
869 */
870void str_ncpy(char *dest, size_t size, const char *src, size_t n)
871{
872 /* There must be space for a null terminator in the buffer. */
873 assert(size > 0);
874
875 size_t src_off = 0;
876 size_t dest_off = 0;
877
878 char32_t ch;
879 while ((ch = str_decode(src, &src_off, n)) != 0) {
880 if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
881 break;
882 }
883
884 dest[dest_off] = '\0';
885}
886
/** Append one string to another.
 *
 * Append source string @a src to the string already held in destination
 * buffer @a dest. The total size of the destination buffer is @a size.
 * The copy is delegated to str_cpy(), which keeps the result
 * null-terminated. If the existing string already fills the buffer
 * (its size is @a size or more), nothing is done.
 *
 * @param dest Destination buffer holding a null-terminated string.
 * @param size Size of the destination buffer.
 * @param src  Source string.
 */
void str_append(char *dest, size_t size, const char *src)
{
	size_t used = str_size(dest);

	if (used < size)
		str_cpy(dest + used, size - used, src);
}
908
909/** Convert space-padded ASCII to string.
910 *
911 * Common legacy text encoding in hardware is 7-bit ASCII fitted into
912 * a fixed-width byte buffer (bit 7 always zero), right-padded with spaces
913 * (ASCII 0x20). Convert space-padded ascii to string representation.
914 *
915 * If the text does not fit into the destination buffer, the function converts
916 * as many characters as possible and returns EOVERFLOW.
917 *
918 * If the text contains non-ASCII bytes (with bit 7 set), the whole string is
919 * converted anyway and invalid characters are replaced with question marks
920 * (U_SPECIAL) and the function returns EIO.
921 *
922 * Regardless of return value upon return @a dest will always be well-formed.
923 *
924 * @param dest Destination buffer
925 * @param size Size of destination buffer
926 * @param src Space-padded ASCII.
927 * @param n Size of the source buffer in bytes.
928 *
929 * @return EOK on success, EOVERFLOW if the text does not fit
930 * destination buffer, EIO if the text contains
931 * non-ASCII bytes.
932 */
933errno_t spascii_to_str(char *dest, size_t size, const uint8_t *src, size_t n)
934{
935 size_t sidx;
936 size_t didx;
937 size_t dlast;
938 uint8_t byte;
939 errno_t rc;
940 errno_t result;
941
942 /* There must be space for a null terminator in the buffer. */
943 assert(size > 0);
944 result = EOK;
945
946 didx = 0;
947 dlast = 0;
948 for (sidx = 0; sidx < n; ++sidx) {
949 byte = src[sidx];
950 if (!ascii_check(byte)) {
951 byte = U_SPECIAL;
952 result = EIO;
953 }
954
955 rc = chr_encode(byte, dest, &didx, size - 1);
956 if (rc != EOK) {
957 assert(rc == EOVERFLOW);
958 dest[didx] = '\0';
959 return rc;
960 }
961
962 /* Remember dest index after last non-empty character */
963 if (byte != 0x20)
964 dlast = didx;
965 }
966
967 /* Terminate string after last non-empty character */
968 dest[dlast] = '\0';
969 return result;
970}
971
972/** Convert wide string to string.
973 *
974 * Convert wide string @a src to string. The output is written to the buffer
975 * specified by @a dest and @a size. @a size must be non-zero and the string
976 * written will always be well-formed.
977 *
978 * @param dest Destination buffer.
979 * @param size Size of the destination buffer.
980 * @param src Source wide string.
981 */
982void wstr_to_str(char *dest, size_t size, const char32_t *src)
983{
984 char32_t ch;
985 size_t src_idx;
986 size_t dest_off;
987
988 /* There must be space for a null terminator in the buffer. */
989 assert(size > 0);
990
991 src_idx = 0;
992 dest_off = 0;
993
994 while ((ch = src[src_idx++]) != 0) {
995 if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
996 break;
997 }
998
999 dest[dest_off] = '\0';
1000}
1001
1002/** Convert UTF16 string to string.
1003 *
1004 * Convert utf16 string @a src to string. The output is written to the buffer
1005 * specified by @a dest and @a size. @a size must be non-zero and the string
1006 * written will always be well-formed. Surrogate pairs also supported.
1007 *
1008 * @param dest Destination buffer.
1009 * @param size Size of the destination buffer.
1010 * @param src Source utf16 string.
1011 *
1012 * @return EOK, if success, an error code otherwise.
1013 */
1014errno_t utf16_to_str(char *dest, size_t size, const uint16_t *src)
1015{
1016 size_t idx = 0, dest_off = 0;
1017 char32_t ch;
1018 errno_t rc = EOK;
1019
1020 /* There must be space for a null terminator in the buffer. */
1021 assert(size > 0);
1022
1023 while (src[idx]) {
1024 if ((src[idx] & 0xfc00) == 0xd800) {
1025 if (src[idx + 1] && (src[idx + 1] & 0xfc00) == 0xdc00) {
1026 ch = 0x10000;
1027 ch += (src[idx] & 0x03FF) << 10;
1028 ch += (src[idx + 1] & 0x03FF);
1029 idx += 2;
1030 } else
1031 break;
1032 } else {
1033 ch = src[idx];
1034 idx++;
1035 }
1036 rc = chr_encode(ch, dest, &dest_off, size - 1);
1037 if (rc != EOK)
1038 break;
1039 }
1040 dest[dest_off] = '\0';
1041 return rc;
1042}
1043
1044/** Convert string to UTF16 string.
1045 *
1046 * Convert string @a src to utf16 string. The output is written to the buffer
1047 * specified by @a dest and @a dlen. @a dlen must be non-zero and the string
1048 * written will always be well-formed. Surrogate pairs also supported.
1049 *
1050 * @param dest Destination buffer.
1051 * @param dlen Number of utf16 characters that fit in the destination buffer.
1052 * @param src Source string.
1053 *
1054 * @return EOK, if success, an error code otherwise.
1055 */
1056errno_t str_to_utf16(uint16_t *dest, size_t dlen, const char *src)
1057{
1058 errno_t rc = EOK;
1059 size_t offset = 0;
1060 size_t idx = 0;
1061 char32_t c;
1062
1063 assert(dlen > 0);
1064
1065 while ((c = str_decode(src, &offset, STR_NO_LIMIT)) != 0) {
1066 if (c > 0x10000) {
1067 if (idx + 2 >= dlen - 1) {
1068 rc = EOVERFLOW;
1069 break;
1070 }
1071 c = (c - 0x10000);
1072 dest[idx] = 0xD800 | (c >> 10);
1073 dest[idx + 1] = 0xDC00 | (c & 0x3FF);
1074 idx++;
1075 } else {
1076 dest[idx] = c;
1077 }
1078
1079 idx++;
1080 if (idx >= dlen - 1) {
1081 rc = EOVERFLOW;
1082 break;
1083 }
1084 }
1085
1086 dest[idx] = '\0';
1087 return rc;
1088}
1089
/** Get size of UTF-16 string.
 *
 * Count the 16-bit words occupied by the UTF-16 string @a ustr, not
 * counting the null terminator. A surrogate pair counts as two words.
 *
 * @param ustr Null-terminated UTF-16 string to measure.
 *
 * @return Number of words used by the UTF-16 string.
 *
 */
size_t utf16_wsize(const uint16_t *ustr)
{
	const uint16_t *end = ustr;

	while (*end != 0)
		end++;

	return (size_t) (end - ustr);
}
1109
1110/** Convert wide string to new string.
1111 *
1112 * Convert wide string @a src to string. Space for the new string is allocated
1113 * on the heap.
1114 *
1115 * @param src Source wide string.
1116 * @return New string.
1117 */
1118char *wstr_to_astr(const char32_t *src)
1119{
1120 char dbuf[STR_BOUNDS(1)];
1121 char *str;
1122 char32_t ch;
1123
1124 size_t src_idx;
1125 size_t dest_off;
1126 size_t dest_size;
1127
1128 /* Compute size of encoded string. */
1129
1130 src_idx = 0;
1131 dest_size = 0;
1132
1133 while ((ch = src[src_idx++]) != 0) {
1134 dest_off = 0;
1135 if (chr_encode(ch, dbuf, &dest_off, STR_BOUNDS(1)) != EOK)
1136 break;
1137 dest_size += dest_off;
1138 }
1139
1140 str = malloc(dest_size + 1);
1141 if (str == NULL)
1142 return NULL;
1143
1144 /* Encode string. */
1145
1146 src_idx = 0;
1147 dest_off = 0;
1148
1149 while ((ch = src[src_idx++]) != 0) {
1150 if (chr_encode(ch, str, &dest_off, dest_size) != EOK)
1151 break;
1152 }
1153
1154 str[dest_size] = '\0';
1155 return str;
1156}
1157
1158/** Convert string to wide string.
1159 *
1160 * Convert string @a src to wide string. The output is written to the
1161 * buffer specified by @a dest and @a dlen. @a dlen must be non-zero
1162 * and the wide string written will always be null-terminated.
1163 *
1164 * @param dest Destination buffer.
1165 * @param dlen Length of destination buffer (number of wchars).
1166 * @param src Source string.
1167 */
1168void str_to_wstr(char32_t *dest, size_t dlen, const char *src)
1169{
1170 size_t offset;
1171 size_t di;
1172 char32_t c;
1173
1174 assert(dlen > 0);
1175
1176 offset = 0;
1177 di = 0;
1178
1179 do {
1180 if (di >= dlen - 1)
1181 break;
1182
1183 c = str_decode(src, &offset, STR_NO_LIMIT);
1184 dest[di++] = c;
1185 } while (c != '\0');
1186
1187 dest[dlen - 1] = '\0';
1188}
1189
/** Convert a string into a newly allocated wide string.
 *
 * Allocates a zero-filled buffer with one slot per character of @a str
 * (as counted by str_length()) plus one for the terminator, and fills it
 * using str_to_wstr().
 *
 * @param str Source string.
 *
 * @return Newly allocated null-terminated wide string, or NULL if the
 *         allocation failed. The caller owns the returned buffer.
 */
char32_t *str_to_awstr(const char *str)
{
	size_t slots = str_length(str) + 1;
	char32_t *wide = calloc(slots, sizeof(char32_t));

	if (wide != NULL)
		str_to_wstr(wide, slots, str);

	return wide;
}
1208
1209/** Find first occurence of character in string.
1210 *
1211 * @param str String to search.
1212 * @param ch Character to look for.
1213 *
1214 * @return Pointer to character in @a str or NULL if not found.
1215 */
1216char *str_chr(const char *str, char32_t ch)
1217{
1218 char32_t acc;
1219 size_t off = 0;
1220 size_t last = 0;
1221
1222 while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
1223 if (acc == ch)
1224 return (char *) (str + last);
1225 last = off;
1226 }
1227
1228 return NULL;
1229}
1230
1231/** Find first occurence of substring in string.
1232 *
1233 * @param hs Haystack (string)
1234 * @param n Needle (substring to look for)
1235 *
1236 * @return Pointer to character in @a hs or @c NULL if not found.
1237 */
1238char *str_str(const char *hs, const char *n)
1239{
1240 size_t off = 0;
1241
1242 if (str_lcmp(hs, n, str_length(n)) == 0)
1243 return (char *)hs;
1244
1245 while (str_decode(hs, &off, STR_NO_LIMIT) != 0) {
1246 if (str_lcmp(hs + off, n, str_length(n)) == 0)
1247 return (char *)(hs + off);
1248 }
1249
1250 return NULL;
1251}
1252
1253/** Removes specified trailing characters from a string.
1254 *
1255 * @param str String to remove from.
1256 * @param ch Character to remove.
1257 */
1258void str_rtrim(char *str, char32_t ch)
1259{
1260 size_t off = 0;
1261 size_t pos = 0;
1262 char32_t c;
1263 bool update_last_chunk = true;
1264 char *last_chunk = NULL;
1265
1266 while ((c = str_decode(str, &off, STR_NO_LIMIT))) {
1267 if (c != ch) {
1268 update_last_chunk = true;
1269 last_chunk = NULL;
1270 } else if (update_last_chunk) {
1271 update_last_chunk = false;
1272 last_chunk = (str + pos);
1273 }
1274 pos = off;
1275 }
1276
1277 if (last_chunk)
1278 *last_chunk = '\0';
1279}
1280
1281/** Removes specified leading characters from a string.
1282 *
1283 * @param str String to remove from.
1284 * @param ch Character to remove.
1285 */
1286void str_ltrim(char *str, char32_t ch)
1287{
1288 char32_t acc;
1289 size_t off = 0;
1290 size_t pos = 0;
1291 size_t str_sz = str_size(str);
1292
1293 while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
1294 if (acc != ch)
1295 break;
1296 else
1297 pos = off;
1298 }
1299
1300 if (pos > 0) {
1301 memmove(str, &str[pos], str_sz - pos);
1302 pos = str_sz - pos;
1303 str[pos] = '\0';
1304 }
1305}
1306
1307/** Find last occurence of character in string.
1308 *
1309 * @param str String to search.
1310 * @param ch Character to look for.
1311 *
1312 * @return Pointer to character in @a str or NULL if not found.
1313 */
1314char *str_rchr(const char *str, char32_t ch)
1315{
1316 char32_t acc;
1317 size_t off = 0;
1318 size_t last = 0;
1319 const char *res = NULL;
1320
1321 while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
1322 if (acc == ch)
1323 res = (str + last);
1324 last = off;
1325 }
1326
1327 return (char *) res;
1328}
1329
/** Insert a wide character into a wide string.
 *
 * Insert a wide character into a wide string at position @a pos. The
 * characters from that position on, including the terminator, are
 * shifted one slot towards the end.
 *
 * A successful insertion grows the string to len + 1 characters and
 * moves the terminator to index len + 1. The insertion is therefore
 * refused when the string already holds @a max_pos characters, no matter
 * where the new character would go; checking only @a pos against
 * @a max_pos would let an insertion into the middle of a full string
 * write past the end of the buffer.
 *
 * @param str     String to insert to.
 * @param ch      Character to insert.
 * @param pos     Character index where to insert (0 .. current length).
 * @param max_pos Maximum number of characters the string may hold, not
 *                counting the terminator; the buffer has to provide
 *                at least @a max_pos + 1 elements.
 *
 * @return True if the insertion was successful, false if the position
 *         is out of bounds or the string is already full.
 *
 */
bool wstr_linsert(char32_t *str, char32_t ch, size_t pos, size_t max_pos)
{
	size_t len = wstr_length(str);

	if (pos > len)
		return false;

	/* No room for one more character plus the terminator. */
	if (len >= max_pos)
		return false;

	/* Shift the tail (terminator included) one slot to the right. */
	for (size_t i = len + 1; i > pos; i--)
		str[i] = str[i - 1];

	str[pos] = ch;

	return true;
}
1359
/** Remove a wide character from a wide string.
 *
 * Deletes the character at index @a pos by moving every following
 * element, the terminator included, one slot towards the beginning.
 *
 * @param str String to remove from.
 * @param pos Index of the character to remove.
 *
 * @return True if the character was removed, false if @a pos does not
 *         address a character of the string.
 *
 */
bool wstr_remove(char32_t *str, size_t pos)
{
	size_t len = wstr_length(str);

	if (pos >= len)
		return false;

	/* Pull each successor (up to and including the terminator) down. */
	for (char32_t *slot = str + pos; slot < str + len; slot++)
		slot[0] = slot[1];

	return true;
}
1385
/** Duplicate a string.
 *
 * Allocates str_size(src) + 1 bytes with malloc() and copies the source
 * string into them using str_cpy().
 *
 * NOTE(review): because the copy goes through str_cpy(), the duplicate
 * is expected to be a well-formed null-terminated UTF-8 string that may
 * differ from an ill-formed source on the byte level -- confirm against
 * str_cpy().
 *
 * @param src Source string.
 *
 * @return Newly allocated duplicate, or NULL if the allocation failed.
 *         The caller owns the returned buffer.
 *
 */
char *str_dup(const char *src)
{
	size_t bytes = str_size(src) + 1;
	char *copy = malloc(bytes);

	if (copy != NULL)
		str_cpy(copy, bytes, src);

	return copy;
}
1412
/** Duplicate a string with a size limit.
 *
 * Allocates a new buffer with malloc() and copies at most @a n bytes of
 * the source string into it using str_ncpy(). The buffer is
 * min(str_size(src), n) + 1 bytes large, so never more than @a n + 1
 * bytes are allocated and less is used when the source is shorter.
 *
 * NOTE(review): because the copy goes through str_ncpy(), the duplicate
 * is expected to be a well-formed null-terminated UTF-8 string that may
 * differ from the source on the byte level -- confirm against
 * str_ncpy().
 *
 * @param src Source string.
 * @param n   Maximum number of bytes to duplicate.
 *
 * @return Newly allocated duplicate, or NULL if the allocation failed.
 *         The caller owns the returned buffer.
 *
 */
char *str_ndup(const char *src, size_t n)
{
	size_t bytes = str_size(src);

	if (n < bytes)
		bytes = n;

	char *copy = malloc(bytes + 1);

	if (copy != NULL)
		str_ncpy(copy, bytes + 1, src, bytes);

	return copy;
}
1446
/** Split a string by delimiters.
 *
 * Skips any delimiter characters at the beginning of @a s, then collects
 * characters up to the next delimiter or the end of the string. The
 * delimiter that ends the token is overwritten with a null terminator,
 * i.e. @a s is modified in place.
 *
 * @param s     String to be tokenized. If NULL, NULL is returned and
 *              @a next is left untouched.
 * @param delim String with the delimiter characters.
 * @param next  Receives the position at which tokenizing should
 *              continue: just past the delimiter that ended the token,
 *              or the end of the string if no delimiter followed.
 *              May be NULL.
 * @return      Pointer to the token inside @a s, or NULL if no token
 *              was found.
 */
char *str_tok(char *s, const char *delim, char **next)
{
	if (s == NULL)
		return NULL;

	size_t size = str_size(s);
	size_t probe = 0;
	size_t mark = 0;
	char32_t c;

	/* Step over leading delimiters; 'mark' ends up at the token start. */
	for (;;) {
		c = str_decode(s, &probe, size);
		if (c == 0 || str_chr(delim, c) == NULL)
			break;
		mark = probe;
	}
	char *token = s + mark;

	/* Step over token characters; 'mark' ends up just past the token. */
	probe = mark;
	for (;;) {
		c = str_decode(s, &probe, size);
		if (c == 0 || str_chr(delim, c) != NULL)
			break;
		mark = probe;
	}
	char *token_end = s + mark;

	/*
	 * When a delimiter stopped the scan, 'probe' already points behind
	 * it; otherwise the string is exhausted and we stay at its end.
	 */
	if (next != NULL)
		*next = (c != 0) ? (s + probe) : token_end;

	if (token == token_end)
		return NULL; /* No more tokens. */

	/* Overwrite the delimiter with a null terminator. */
	*token_end = '\0';
	return token;
}
1493
/** Scale a value to a decimal (SI) order of magnitude.
 *
 * Picks the largest power-of-1000 divisor whose threshold @a val exceeds
 * and stores the scaled value together with the matching SI prefix
 * letter. Each threshold is 1000 times its divisor, so a scaled result
 * is always above 1000; values up to 1000000 are returned unscaled with
 * a blank suffix.
 *
 * The prefix letter always corresponds to the divisor actually applied:
 * 10^3 'k', 10^6 'M', 10^9 'G', 10^12 'T', 10^15 'P', 10^18 'E'.
 *
 * @param val    Value to scale.
 * @param rv     Receives the scaled value (integer division).
 * @param suffix Receives the prefix character, or ' ' if unscaled.
 */
void order_suffix(const uint64_t val, uint64_t *rv, char *suffix)
{
	if (val > UINT64_C(10000000000000000000)) {
		/* Divided by 10^18: exa. */
		*rv = val / UINT64_C(1000000000000000000);
		*suffix = 'E';
	} else if (val > UINT64_C(1000000000000000000)) {
		/* Divided by 10^15: peta. */
		*rv = val / UINT64_C(1000000000000000);
		*suffix = 'P';
	} else if (val > UINT64_C(1000000000000000)) {
		/* Divided by 10^12: tera. */
		*rv = val / UINT64_C(1000000000000);
		*suffix = 'T';
	} else if (val > UINT64_C(1000000000000)) {
		/* Divided by 10^9: giga. */
		*rv = val / UINT64_C(1000000000);
		*suffix = 'G';
	} else if (val > UINT64_C(1000000000)) {
		/* Divided by 10^6: mega. */
		*rv = val / UINT64_C(1000000);
		*suffix = 'M';
	} else if (val > UINT64_C(1000000)) {
		/* Divided by 10^3: kilo. */
		*rv = val / UINT64_C(1000);
		*suffix = 'k';
	} else {
		*rv = val;
		*suffix = ' ';
	}
}
1519
/** Scale a byte count to a binary (IEC) order of magnitude.
 *
 * Picks the largest power-of-1024 divisor whose threshold @a val exceeds
 * and stores the scaled value together with the matching IEC unit name.
 * Each threshold is 1024 times its divisor; values up to 2^20 are
 * returned unscaled in plain bytes.
 *
 * The unit always corresponds to the divisor actually applied:
 * 2^10 "KiB", 2^20 "MiB", 2^30 "GiB", 2^40 "TiB", 2^50 "PiB".
 *
 * @param val    Value to scale.
 * @param rv     Receives the scaled value (integer division).
 * @param suffix Receives a pointer to a static unit string.
 * @param fixed  Selects the padded form of the plain byte unit instead
 *               of the bare "B"; has no effect on the scaled units.
 */
void bin_order_suffix(const uint64_t val, uint64_t *rv, const char **suffix,
    bool fixed)
{
	if (val > UINT64_C(1152921504606846976)) {
		/* Above 2^60, divided by 2^50: pebibytes. */
		*rv = val / UINT64_C(1125899906842624);
		*suffix = "PiB";
	} else if (val > UINT64_C(1125899906842624)) {
		/* Above 2^50, divided by 2^40: tebibytes. */
		*rv = val / UINT64_C(1099511627776);
		*suffix = "TiB";
	} else if (val > UINT64_C(1099511627776)) {
		/* Above 2^40, divided by 2^30: gibibytes. */
		*rv = val / UINT64_C(1073741824);
		*suffix = "GiB";
	} else if (val > UINT64_C(1073741824)) {
		/* Above 2^30, divided by 2^20: mebibytes. */
		*rv = val / UINT64_C(1048576);
		*suffix = "MiB";
	} else if (val > UINT64_C(1048576)) {
		/* Above 2^20, divided by 2^10: kibibytes. */
		*rv = val / UINT64_C(1024);
		*suffix = "KiB";
	} else {
		*rv = val;
		/*
		 * NOTE(review): the padded form presumably exists to line up
		 * with the three-letter units -- confirm its intended width.
		 */
		if (fixed)
			*suffix = "B ";
		else
			*suffix = "B";
	}
}
1546
1547/** @}
1548 */
Note: See TracBrowser for help on using the repository browser.