Context Navigation

Reverse Diff

Changes in common/str.c [1db4e2ae:fdfb24e] in mainline

File:

: 1 edited

common/str.c (modified) (34 diffs)

Legend:

: Unmodified
: Added
: Removed

common/str.c

-              r1db4e2ae
+              rfdfb24e
  * Copyright (c) 2011 Martin Sucha
  * Copyright (c) 2011 Oleg Romanenko
- * Copyright (c) 2025 Jiří Zárevúcky
  * All rights reserved.
+ *
 …
  *                        are valid
+ *
- *                        Note that Unicode characters do not match
- *                        one-to-one with displayed characters or glyphs on
- *                        screen. For that level of precision, look up
- *                        Grapheme Clusters.
+ *
  *  ASCII character       7 bit encoded ASCII character, stored in char
  *                        (usually signed 8 bit integer), code points 0 .. 127
 …
  *  [wide] string width   number of display cells on a monospace display taken
  *                        by a [wide] string, size_t
+ *
- *                        This is virtually impossible to determine exactly for
- *                        all strings without knowing specifics of the display
- *                        device, due to various factors affecting text output.
- *                        If you have the option to query the terminal for
- *                        position change caused by outputting the string,
- *                        it is preferable to determine width that way.
+ *
+ *
 …
 #include <str.h>
-#include <align.h>
 #include <assert.h>
 #include <ctype.h>
 #include <errno.h>
-#include <limits.h>
-#include <macros.h>
-#include <mem.h>
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <uchar.h>
+#if __STDC_HOSTED__
+#include <fibril.h>
+#endif
+static void _set_ilseq()
+{
+#ifdef errno
+        errno = EILSEQ;
+#endif
+}
+#include <align.h>
+#include <mem.h>
 /** Byte mask consisting of lowest @n bits (out of 8) */
 …
 /** Number of data bits in a UTF-8 continuation byte */
 #define CONT_BITS  6
-#define UTF8_MASK_INITIAL2  0b00011111
-#define UTF8_MASK_INITIAL3  0b00001111
-#define UTF8_MASK_INITIAL4  0b00000111
-#define UTF8_MASK_CONT      0b00111111
-#define CHAR_INVALID ((char32_t) UINT_MAX)
-static inline bool _is_ascii(uint8_t b)
+{
-        return b < 0x80;
+}
-static inline bool _is_continuation(uint8_t b)
+{
-        return (b & 0xC0) == 0x80;
+}
-static inline bool _is_2_byte(uint8_t c)
+{
-        return (c & 0xE0) == 0xC0;
+}
-static inline bool _is_3_byte(uint8_t c)
+{
-        return (c & 0xF0) == 0xE0;
+}
-static inline bool _is_4_byte(uint8_t c)
+{
-        return (c & 0xF8) == 0xF0;
+}
-static inline int _char_continuation_bytes(char32_t c)
+{
-        if ((c & ~LO_MASK_32(7)) == 0)
-                return 0;
-        if ((c & ~LO_MASK_32(11)) == 0)
-                return 1;
-        if ((c & ~LO_MASK_32(16)) == 0)
-                return 2;
-        if ((c & ~LO_MASK_32(21)) == 0)
-                return 3;
-        /* Codes longer than 21 bits are not supported */
-        return -1;
+}
-static inline int _continuation_bytes(uint8_t b)
+{
-        /* 0xxxxxxx */
-        if (_is_ascii(b))
-                return 0;
-        /* 110xxxxx 10xxxxxx */
-        if (_is_2_byte(b))
-                return 1;
-        /* 1110xxxx 10xxxxxx 10xxxxxx */
-        if (_is_3_byte(b))
-                return 2;
-        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
-        if (_is_4_byte(b))
-                return 3;
-        return -1;
+}
-static bool _is_non_shortest(const mbstate_t *mb, uint8_t b)
+{
-        return (mb->state == 0b1111110000000000 && !(b & 0b00100000)) ||
-            (mb->state == 0b1111111111110000 && !(b & 0b00110000));
+}
-static bool _is_surrogate(const mbstate_t *mb, uint8_t b)
+{
-        return (mb->state == 0b1111110000001101 && b >= 0xa0);
+}
-#define _likely(expr) __builtin_expect((expr), true)
-#define _unlikely(expr) __builtin_expect((expr), false)
-#define FAST_PATHS 1
-static char32_t _str_decode(const char *s, size_t *offset, size_t size, mbstate_t *mb)
+{
-        assert(s);
-        assert(offset);
-        assert(*offset <= size);
-        assert(size == STR_NO_LIMIT || s + size >= s);
-        assert(mb);
-        if (*offset == size)
-                return 0;
-        if (_likely(!mb->state)) {
-                /* Clean slate, read initial byte. */
-                uint8_t b = s[(*offset)++];
-                /* Fast exit for the most common case. */
-                if (_likely(_is_ascii(b)))
-                        return b;
-                /* unexpected continuation byte */
-                if (_unlikely(_is_continuation(b)))
-                        return CHAR_INVALID;
-                /*
-                 * The value stored into `continuation` is designed to have
-                 * just enough leading ones that after shifting in one less than
-                 * the expected number of continuation bytes, the most significant
-                 * bit becomes zero. (The field is 16b wide.)
-                 */
-                if (_is_2_byte(b)) {
-                        /* Reject non-shortest form. */
-                        if (_unlikely(!(b & 0b00011110)))
-                                return CHAR_INVALID;
-#if FAST_PATHS
-                        /* We can usually take this exit. */
-                        if (_likely(*offset < size && _is_continuation(s[*offset])))
-                                return (b & UTF8_MASK_INITIAL2) << 6 |
-                                    (s[(*offset)++] & UTF8_MASK_CONT);
-#endif
-                        /* 2 byte continuation    110xxxxx */
-                        mb->state = b ^ 0b0000000011000000;
-                } else if (_is_3_byte(b)) {
-#if FAST_PATHS
-                        /* We can usually take this exit. */
-                        if (_likely(*offset + 1 < size && _is_continuation(s[*offset]) && _is_continuation(s[*offset + 1]))) {
-                                char32_t ch = (b & UTF8_MASK_INITIAL3) << 12 |
-                                    (s[(*offset)] & UTF8_MASK_CONT) << 6 |
-                                    (s[(*offset) + 1] & UTF8_MASK_CONT);
-                                *offset += 2;
-                                /* Reject non-shortest form. */
-                                if (_unlikely(!(ch & 0xFFFFF800)))
-                                        return CHAR_INVALID;
-                                /* Reject surrogates */
-                                if (_unlikely(ch >= 0xD800 && ch < 0xE000))
-                                        return CHAR_INVALID;
-                                return ch;
+                        }
-#endif
-                        /* 3 byte continuation    1110xxxx */
-                        mb->state = b ^ 0b1111110011100000;
-                } else if (_is_4_byte(b)) {
-#if FAST_PATHS
-                        /* We can usually take this exit. */
-                        if (_likely(*offset + 2 < size && _is_continuation(s[*offset]) &&
-                            _is_continuation(s[*offset + 1]) && _is_continuation(s[*offset + 2]))) {
-                                char32_t ch = (b & UTF8_MASK_INITIAL4) << 18 |
-                                    (s[(*offset)] & UTF8_MASK_CONT) << 12 |
-                                    (s[(*offset) + 1] & UTF8_MASK_CONT) << 6 |
-                                    (s[(*offset) + 2] & UTF8_MASK_CONT);
-                                *offset += 3;
-                                /* Reject non-shortest form. */
-                                if (_unlikely(!(ch & 0xFFFF0000)))
-                                        return CHAR_INVALID;
-                                /* Reject out-of-range characters. */
-                                if (_unlikely(ch >= 0x110000))
-                                        return CHAR_INVALID;
-                                return ch;
+                        }
-#endif
-                        /* 4 byte continuation    11110xxx */
-                        mb->state = b ^ 0b1111111100000000;
-                } else {
-                        return CHAR_INVALID;
+                }
+        }
-        /* Deal with the remaining edge and invalid cases. */
-        for (; *offset < size; (*offset)++) {
-                /* Read continuation bytes. */
-                uint8_t b = s[*offset];
-                if (!_is_continuation(b) || _is_non_shortest(mb, b) || _is_surrogate(mb, b)) {
-                        mb->state = 0;
-                        return CHAR_INVALID;
+                }
-                /* Top bit becomes zero when shifting in the second to last byte. */
-                if (!(mb->state & 0x8000)) {
-                        char32_t c = ((char32_t) mb->state) << 6 | (b & UTF8_MASK_CONT);
-                        mb->state = 0;
-                        (*offset)++;
-                        return c;
+                }
-                mb->state = mb->state << 6 | (b & UTF8_MASK_CONT);
+        }
-        /* Incomplete character. */
-        assert(mb->state);
-        return 0;
+}
-/** Standard <uchar.h> function since C11. */
-size_t mbrtoc32(char32_t *c, const char *s, size_t n, mbstate_t *mb)
+{
-#if __STDC_HOSTED__
-        static fibril_local mbstate_t global_state = { };
-        if (!mb)
-                mb = &global_state;
-#endif
-        if (!s) {
-                /* Equivalent to mbrtoc32(NULL, "", 1, mb); */
-                c = NULL;
-                s = "";
-                n = 1;
+        }
-        size_t offset = 0;
-        char32_t ret = _str_decode(s, &offset, n, mb);
-        if (ret == CHAR_INVALID) {
-                assert(!mb->state);
-                _set_ilseq();
-                return UCHAR_ILSEQ;
+        }
-        if (mb->state) {
-                assert(ret == 0);
-                return UCHAR_INCOMPLETE;
+        }
-        if (c)
-                *c = ret;
-        return ret ? offset : 0;
+}
 /** Decode a single character from a string.
 …
 char32_t str_decode(const char *str, size_t *offset, size_t size)
+{
+        mbstate_t mb = { };
+        char32_t ch = _str_decode(str, offset, size, &mb);
+        if (ch == CHAR_INVALID || mb.state)
+        if (*offset + 1 > size)
+                return 0;
+        /* First byte read from string */
+        uint8_t b0 = (uint8_t) str[(*offset)++];
+        /* Determine code length */
+        unsigned int b0_bits;  /* Data bits in first byte */
+        unsigned int cbytes;   /* Number of continuation bytes */
+        if ((b0 & 0x80) == 0) {
+                /* 0xxxxxxx (Plain ASCII) */
+                b0_bits = 7;
+                cbytes = 0;
+        } else if ((b0 & 0xe0) == 0xc0) {
+                /* 110xxxxx 10xxxxxx */
+                b0_bits = 5;
+                cbytes = 1;
+        } else if ((b0 & 0xf0) == 0xe0) {
+                /* 1110xxxx 10xxxxxx 10xxxxxx */
+                b0_bits = 4;
+                cbytes = 2;
+        } else if ((b0 & 0xf8) == 0xf0) {
+                /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+                b0_bits = 3;
+                cbytes = 3;
+        } else {
+                /* 10xxxxxx -- unexpected continuation byte */
                 return U_SPECIAL;
+        }
+        if (*offset + cbytes > size)
+                return U_SPECIAL;
+        char32_t ch = b0 & LO_MASK_8(b0_bits);
+        /* Decode continuation bytes */
+        while (cbytes > 0) {
+                uint8_t b = (uint8_t) str[(*offset)++];
+                /* Must be 10xxxxxx */
+                if ((b & 0xc0) != 0x80)
+                        return U_SPECIAL;
+                /* Shift data bits to ch */
+                ch = (ch << CONT_BITS) | (char32_t) (b & LO_MASK_8(CONT_BITS));
+                cbytes--;
+        }
         return ch;
+}
-char32_t str_decode_r(const char *str, size_t *offset, size_t size,
-        char32_t replacement, mbstate_t *mb)
+{
-        char32_t ch = _str_decode(str, offset, size, mb);
-        return (ch == CHAR_INVALID) ? replacement : ch;
+}
 …
                 return 0;
         int cbytes = 0;
+        size_t processed = 0;
         /* Continue while continuation bytes found */
         while (*offset > 0 && cbytes < 4) {
+        while (*offset > 0 && processed < 4) {
                 uint8_t b = (uint8_t) str[--(*offset)];
+                if (_is_continuation(b)) {
+                        cbytes++;
+                        continue;
+                if (processed == 0 && (b & 0x80) == 0) {
+                        /* 0xxxxxxx (Plain ASCII) */
+                        return b & 0x7f;
+                } else if ((b & 0xe0) == 0xc0 || (b & 0xf0) == 0xe0 ||
+                    (b & 0xf8) == 0xf0) {
+                        /* Start byte */
+                        size_t start_offset = *offset;
+                        return str_decode(str, &start_offset, size);
+                } else if ((b & 0xc0) != 0x80) {
+                        /* Not a continuation byte */
+                        return U_SPECIAL;
+                }
+                /* Reject non-shortest form encoding. */
+                if (cbytes != _continuation_bytes(b))
+                        return U_SPECIAL;
+                /* Start byte */
+                size_t start_offset = *offset;
+                return str_decode(str, &start_offset, size);
+        }
+                processed++;
+        }
         /* Too many continuation bytes */
         return U_SPECIAL;
 …
  *         code was invalid.
  */
+errno_t chr_encode(char32_t ch, char *str, size_t *offset, size_t size)
+{
+        // TODO: merge with c32rtomb()
+errno_t chr_encode(const char32_t ch, char *str, size_t *offset, size_t size)
+{
         if (*offset >= size)
                 return EOVERFLOW;
-        /* Fast exit for the most common case. */
-        if (ch < 0x80) {
-                str[(*offset)++] = (char) ch;
-                return EOK;
+        }
-        /* Codes longer than 21 bits are not supported */
         if (!chr_check(ch))
                 return EINVAL;
+        /*
+         * Unsigned version of ch (bit operations should only be done
+         * on unsigned types).
+         */
+        uint32_t cc = (uint32_t) ch;
         /* Determine how many continuation bytes are needed */
+        unsigned int cbytes = _char_continuation_bytes(ch);
+        unsigned int b0_bits = 6 - cbytes;  /* Data bits in first byte */
+        unsigned int b0_bits;  /* Data bits in first byte */
+        unsigned int cbytes;   /* Number of continuation bytes */
+        if ((cc & ~LO_MASK_32(7)) == 0) {
+                b0_bits = 7;
+                cbytes = 0;
+        } else if ((cc & ~LO_MASK_32(11)) == 0) {
+                b0_bits = 5;
+                cbytes = 1;
+        } else if ((cc & ~LO_MASK_32(16)) == 0) {
+                b0_bits = 4;
+                cbytes = 2;
+        } else if ((cc & ~LO_MASK_32(21)) == 0) {
+                b0_bits = 3;
+                cbytes = 3;
+        } else {
+                /* Codes longer than 21 bits are not supported */
+                return EINVAL;
+        }
         /* Check for available space in buffer */
 …
         unsigned int i;
         for (i = cbytes; i > 0; i--) {
                 str[*offset + i] = 0x80 | (ch & LO_MASK_32(CONT_BITS));
                 ch >>= CONT_BITS;
+                str[*offset + i] = 0x80 | (cc & LO_MASK_32(CONT_BITS));
+                cc = cc >> CONT_BITS;
+        }
         /* Encode first byte */
         str[*offset] = (ch & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);
+        str[*offset] = (cc & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);
         /* Advance offset */
 …
+}
+/* Convert in place any bytes that don't form a valid character into replacement. */
+static size_t _str_sanitize(char *str, size_t n, uint8_t replacement)
+{
+        uint8_t *b = (uint8_t *) str;
+        size_t count = 0;
+        for (; n > 0 && b[0]; b++, n--) {
+                if (b[0] < ' ') {
+                        /* C0 control codes */
+                        b[0] = replacement;
+                        count++;
+                        continue;
+                }
+                int cont = _continuation_bytes(b[0]);
+                if (__builtin_expect(cont, 0) == 0)
+                        continue;
+                if (cont < 0 || n <= (size_t) cont) {
+                        b[0] = replacement;
+                        count++;
+                        continue;
+                }
+                /* Check continuation bytes. */
+                bool valid = true;
+                for (int i = 1; i <= cont; i++) {
+                        if (!_is_continuation(b[i])) {
+                                valid = false;
+                                break;
+                        }
+                }
+                if (!valid) {
+                        b[0] = replacement;
+                        count++;
+                        continue;
+                }
+                /*
+                 * Check for non-shortest form encoding.
+                 * See https://www.unicode.org/versions/corrigendum1.html
+                 */
+                /* 0b110!!!!x 0b10xxxxxx */
+                if (cont == 1 && !(b[0] & 0b00011110)) {
+                        b[0] = replacement;
+                        count++;
+                        continue;
+                }
+                bool c1_control = (b[0] == 0b11000010 && b[1] < 0b10100000);
+                if (cont == 1 && c1_control) {
+                        b[0] = replacement;
+                        count++;
+                        continue;
+                }
+                /* 0b1110!!!! 0b10!xxxxx 0b10xxxxxx */
+                if (cont == 2 && !(b[0] & 0b00001111) && !(b[1] & 0b00100000)) {
+                        b[0] = replacement;
+                        count++;
+                        continue;
+                }
+                /* 0b11110!!! 0b10!!xxxx 0b10xxxxxx 0b10xxxxxx */
+                if (cont == 3 && !(b[0] & 0b00000111) && !(b[1] & 0b00110000)) {
+                        b[0] = replacement;
+                        count++;
+                        continue;
+                }
+                /* Check for surrogate character encoding. */
+                if (cont == 2 && b[0] == 0xED && b[1] >= 0xA0) {
+                        b[0] = replacement;
+                        count++;
+                        continue;
+                }
+                /* Check for out-of-range code points. */
+                if (cont == 3 && (b[0] > 0xF4 || (b[0] == 0xF4 && b[1] >= 0x90))) {
+                        b[0] = replacement;
+                        count++;
+                        continue;
+                }
+                b += cont;
+                n -= cont;
+        }
+        return count;
+}
+/** Replaces any byte that's not part of a complete valid UTF-8 character
+ * encoding with a replacement byte.
+ * Also replaces C0 and C1 control codes.
+ */
+size_t str_sanitize(char *str, size_t n, uint8_t replacement)
+{
+        return _str_sanitize(str, n, replacement);
+}
+static size_t _str_size(const char *str)
+/** Get size of string.
+ *
+ * Get the number of bytes which are used by the string @a str (excluding the
+ * NULL-terminator).
+ *
+ * @param str String to consider.
+ *
+ * @return Number of bytes used by the string
+ *
+ */
+size_t str_size(const char *str)
+{
         size_t size = 0;
 …
         return size;
+}
-/** Get size of string.
+ *
- * Get the number of bytes which are used by the string @a str (excluding the
- * NULL-terminator).
+ *
- * @param str String to consider.
+ *
- * @return Number of bytes used by the string
+ *
- */
-size_t str_size(const char *str)
+{
-        return _str_size(str);
+}
 …
+}
+static size_t _str_nsize(const char *str, size_t max_size)
+/** Get size of string with size limit.
+ *
+ * Get the number of bytes which are used by the string @a str
+ * (excluding the NULL-terminator), but no more than @max_size bytes.
+ *
+ * @param str      String to consider.
+ * @param max_size Maximum number of bytes to measure.
+ *
+ * @return Number of bytes used by the string
+ *
+ */
+size_t str_nsize(const char *str, size_t max_size)
+{
         size_t size = 0;
 …
         return size;
+}
-/** Get size of string with size limit.
+ *
- * Get the number of bytes which are used by the string @a str
- * (excluding the NULL-terminator), but no more than @max_size bytes.
+ *
- * @param str      String to consider.
- * @param max_size Maximum number of bytes to measure.
+ *
- * @return Number of bytes used by the string
+ *
- */
-size_t str_nsize(const char *str, size_t max_size)
+{
-        return _str_nsize(str, max_size);
+}
 …
 int str_cmp(const char *s1, const char *s2)
+{
+        /*
+         * UTF-8 has the nice property that lexicographic ordering on bytes is
+         * the same as the lexicographic ordering of the character sequences.
+         */
+        while (*s1 == *s2 && *s1 != 0) {
+                s1++;
+                s2++;
+        }
+        if (*s1 == *s2)
+                return 0;
+        return (*s1 < *s2) ? -1 : 1;
+        char32_t c1 = 0;
+        char32_t c2 = 0;
+        size_t off1 = 0;
+        size_t off2 = 0;
+        while (true) {
+                c1 = str_decode(s1, &off1, STR_NO_LIMIT);
+                c2 = str_decode(s2, &off2, STR_NO_LIMIT);
+                if (c1 < c2)
+                        return -1;
+                if (c1 > c2)
+                        return 1;
+                if (c1 == 0 || c2 == 0)
+                        break;
+        }
+        return 0;
+}
 …
 int str_casecmp(const char *s1, const char *s2)
+{
-        // FIXME: doesn't work for non-ASCII caseful characters
         char32_t c1 = 0;
         char32_t c2 = 0;
 …
 int str_lcasecmp(const char *s1, const char *s2, size_t max_len)
+{
-        // FIXME: doesn't work for non-ASCII caseful characters
         char32_t c1 = 0;
         char32_t c2 = 0;
 …
+}
-static bool _test_prefix(const char *s, const char *p)
+{
-        while (*s == *p && *s != 0) {
-                s++;
-                p++;
+        }
-        return *p == 0;
+}
 /** Test whether p is a prefix of s.
+ *
 …
 bool str_test_prefix(const char *s, const char *p)
+{
+        return _test_prefix(s, p);
+        char32_t c1 = 0;
+        char32_t c2 = 0;
+        size_t off1 = 0;
+        size_t off2 = 0;
+        while (true) {
+                c1 = str_decode(s, &off1, STR_NO_LIMIT);
+                c2 = str_decode(p, &off2, STR_NO_LIMIT);
+                if (c2 == 0)
+                        return true;
+                if (c1 != c2)
+                        return false;
+                if (c1 == 0)
+                        break;
+        }
+        return false;
+}
 …
         return s + off;
+}
-/** Copy string as a sequence of bytes. */
-static void _str_cpy(char *dest, const char *src)
+{
-        while (*src)
-                *(dest++) = *(src++);
-        *dest = 0;
+}
-/** Copy string as a sequence of bytes. */
-static void _str_cpyn(char *dest, size_t size, const char *src)
+{
-        assert(dest && src && size);
-        if (!dest || !src || !size)
-                return;
-        if (size == STR_NO_LIMIT)
-                return _str_cpy(dest, src);
-        char *dest_top = dest + size - 1;
-        assert(size == 1 || dest < dest_top);
-        while (*src && dest < dest_top)
-                *(dest++) = *(src++);
-        *dest = 0;
+}
 …
         assert(size > 0);
         assert(src != NULL);
+        assert(dest != NULL);
+        assert(size == STR_NO_LIMIT || dest + size > dest);
+        /* Copy data. */
+        _str_cpyn(dest, size, src);
+        /* In-place translate invalid bytes to U_SPECIAL. */
+        _str_sanitize(dest, size, U_SPECIAL);
+        size_t src_off = 0;
+        size_t dest_off = 0;
+        char32_t ch;
+        while ((ch = str_decode(src, &src_off, STR_NO_LIMIT)) != 0) {
+                if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
+                        break;
+        }
+        dest[dest_off] = '\0';
+}
 …
         /* There must be space for a null terminator in the buffer. */
         assert(size > 0);
+        assert(src != NULL);
+        /* Copy data. */
+        _str_cpyn(dest, min(size, n + 1), src);
+        /* In-place translate invalid bytes to U_SPECIAL. */
+        _str_sanitize(dest, size, U_SPECIAL);
+        size_t src_off = 0;
+        size_t dest_off = 0;
+        char32_t ch;
+        while ((ch = str_decode(src, &src_off, n)) != 0) {
+                if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
+                        break;
+        }
+        dest[dest_off] = '\0';
+}
 …
 void str_append(char *dest, size_t size, const char *src)
+{
+        assert(src != NULL);
+        assert(dest != NULL);
+        assert(size > 0);
+        assert(size == STR_NO_LIMIT || dest + size > dest);
+        size_t dstr_size = _str_nsize(dest, size);
+        if (dstr_size < size) {
+                _str_cpyn(dest + dstr_size, size - dstr_size, src);
+                _str_sanitize(dest + dstr_size, size - dstr_size, U_SPECIAL);
+        }
+        size_t dstr_size;
+        dstr_size = str_size(dest);
+        if (dstr_size >= size)
+                return;
+        str_cpy(dest + dstr_size, size - dstr_size, src);
+}
 …
 errno_t spascii_to_str(char *dest, size_t size, const uint8_t *src, size_t n)
+{
+        size_t len = 0;
+        /* Determine the length of the source string. */
+        for (size_t i = 0; i < n; i++) {
+                if (src[i] == 0)
+                        break;
+                if (src[i] != ' ')
+                        len = i + 1;
+        }
+        errno_t result = EOK;
+        size_t out_len = min(len, size - 1);
+        /* Copy characters */
+        for (size_t i = 0; i < out_len; i++) {
+                dest[i] = src[i];
+                if (dest[i] < 0) {
+                        dest[i] = U_SPECIAL;
+        size_t sidx;
+        size_t didx;
+        size_t dlast;
+        uint8_t byte;
+        errno_t rc;
+        errno_t result;
+        /* There must be space for a null terminator in the buffer. */
+        assert(size > 0);
+        result = EOK;
+        didx = 0;
+        dlast = 0;
+        for (sidx = 0; sidx < n; ++sidx) {
+                byte = src[sidx];
+                if (!ascii_check(byte)) {
+                        byte = U_SPECIAL;
                         result = EIO;
+                }
+        }
+        dest[out_len] = 0;
+        if (out_len < len)
+                return EOVERFLOW;
+                rc = chr_encode(byte, dest, &didx, size - 1);
+                if (rc != EOK) {
+                        assert(rc == EOVERFLOW);
+                        dest[didx] = '\0';
+                        return rc;
+                }
+                /* Remember dest index after last non-empty character */
+                if (byte != 0x20)
+                        dlast = didx;
+        }
+        /* Terminate string after last non-empty character */
+        dest[dlast] = '\0';
         return result;
+}
 …
+}
-static char *_strchr(const char *str, char c)
+{
-        while (*str != 0 && *str != c)
-                str++;
-        return (*str == c) ? (char *) str : NULL;
+}
 /** Find first occurrence of character in string.
+ *
 …
 char *str_chr(const char *str, char32_t ch)
+{
+        /* Fast path for an ASCII character. */
+        if (ascii_check(ch))
+                return _strchr(str, ch);
+        /* Convert character to UTF-8. */
+        char utf8[STR_BOUNDS(1) + 1];
+        size_t offset = 0;
+        if (chr_encode(ch, utf8, &offset, sizeof(utf8)) != EOK || offset == 0)
+                return NULL;
+        utf8[offset] = '\0';
+        /* Find the first byte, then check if all of them are correct. */
+        while (*str != 0) {
+                str = _strchr(str, utf8[0]);
+                if (!str)
+                        return NULL;
+                if (_test_prefix(str, utf8))
+                        return (char *) str;
+                str++;
+        char32_t acc;
+        size_t off = 0;
+        size_t last = 0;
+        while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
+                if (acc == ch)
+                        return (char *) (str + last);
+                last = off;
+        }
 …
 char *str_str(const char *hs, const char *n)
+{
+        size_t hsize = _str_size(hs);
+        size_t nsize = _str_size(n);
+        while (hsize >= nsize) {
+                if (_test_prefix(hs, n))
+                        return (char *) hs;
+                hs++;
+                hsize--;
+        size_t off = 0;
+        if (str_lcmp(hs, n, str_length(n)) == 0)
+                return (char *)hs;
+        while (str_decode(hs, &off, STR_NO_LIMIT) != 0) {
+                if (str_lcmp(hs + off, n, str_length(n)) == 0)
+                        return (char *)(hs + off);
+        }
         return NULL;
+}
-static void _str_rtrim(char *str, char c)
+{
-        char *last = str;
-        while (*str) {
-                if (*str != c)
-                        last = str;
-                str++;
+        }
-        /* Truncate string. */
-        last[1] = 0;
+}
 …
 void str_rtrim(char *str, char32_t ch)
+{
-        /* Fast path for the ASCII case. */
-        if (ascii_check(ch)) {
-                _str_rtrim(str, ch);
-                return;
+        }
         size_t off = 0;
         size_t pos = 0;
 …
+}
-static void _str_ltrim(char *str, char c)
+{
-        char *p = str;
-        while (*p == c)
-                p++;
-        if (str != p)
-                _str_cpy(str, p);
+}
 /** Removes specified leading characters from a string.
+ *
 …
 void str_ltrim(char *str, char32_t ch)
+{
-        /* Fast path for the ASCII case. */
-        if (ascii_check(ch)) {
-                _str_ltrim(str, ch);
-                return;
+        }
         char32_t acc;
         size_t off = 0;
 …
+}
-static char *_str_rchr(const char *str, char c)
+{
-        const char *last = NULL;
-        while (*str) {
-                if (*str == c)
-                        last = str;
-                str++;
+        }
-        return (char *) last;
+}
 /** Find last occurrence of character in string.
+ *
 …
 char *str_rchr(const char *str, char32_t ch)
+{
-        if (ascii_check(ch))
-                return _str_rchr(str, ch);
         char32_t acc;
         size_t off = 0;
 …
 char *str_dup(const char *src)
+{
         size_t size = _str_size(src) + 1;
+        size_t size = str_size(src) + 1;
         char *dest = malloc(size);
         if (!dest)
                 return NULL;
+        memcpy(dest, src, size);
+        _str_sanitize(dest, size, U_SPECIAL);
+        str_cpy(dest, size, src);
         return dest;
+}
 …
 char *str_ndup(const char *src, size_t n)
+{
+        size_t size = _str_nsize(src, n);
+        size_t size = str_size(src);
+        if (size > n)
+                size = n;
         char *dest = malloc(size + 1);
 …
                 return NULL;
+        memcpy(dest, src, size);
+        _str_sanitize(dest, size, U_SPECIAL);
+        dest[size] = 0;
+        str_ncpy(dest, size + 1, src, size);
         return dest;
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changes in common/str.c [1db4e2ae:fdfb24e] in mainline

Legend:

common/str.c

Download in other formats: