Context Navigation

Reverse Diff

Changes in common/str.c [fdfb24e:1db4e2ae] in mainline

File:

: 1 edited

common/str.c (modified) (34 diffs)

Legend:

: Unmodified
: Added
: Removed

common/str.c

-              rfdfb24e
+              r1db4e2ae
  * Copyright (c) 2011 Martin Sucha
  * Copyright (c) 2011 Oleg Romanenko
+ * Copyright (c) 2025 Jiří Zárevúcky
  * All rights reserved.
+ *
 …
  *                        are valid
+ *
+ *                        Note that Unicode characters do not match
+ *                        one-to-one with displayed characters or glyphs on
+ *                        screen. For that level of precision, look up
+ *                        Grapheme Clusters.
+ *
  *  ASCII character       7 bit encoded ASCII character, stored in char
  *                        (usually signed 8 bit integer), code points 0 .. 127
 …
  *  [wide] string width   number of display cells on a monospace display taken
  *                        by a [wide] string, size_t
+ *
+ *                        This is virtually impossible to determine exactly for
+ *                        all strings without knowing specifics of the display
+ *                        device, due to various factors affecting text output.
+ *                        If you have the option to query the terminal for
+ *                        position change caused by outputting the string,
+ *                        it is preferable to determine width that way.
+ *
+ *
 …
 #include <str.h>
+#include <align.h>
 #include <assert.h>
 #include <ctype.h>
 #include <errno.h>
+#include <limits.h>
+#include <macros.h>
+#include <mem.h>
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <align.h>
+#include <mem.h>
+#include <uchar.h>
+#if __STDC_HOSTED__
+#include <fibril.h>
+#endif
+/**
+ * Report an illegal byte sequence through errno.
+ *
+ * The assignment is compiled in only when errno is defined as a macro,
+ * so the function is a no-op in builds that do not provide it.
+ */
+static void _set_ilseq(void)
+{
+#ifdef errno
+        errno = EILSEQ;
+#endif
+}
 /** Byte mask consisting of lowest @n bits (out of 8) */
 …
 /** Number of data bits in a UTF-8 continuation byte */
 #define CONT_BITS  6
+#define UTF8_MASK_INITIAL2  0b00011111
+#define UTF8_MASK_INITIAL3  0b00001111
+#define UTF8_MASK_INITIAL4  0b00000111
+#define UTF8_MASK_CONT      0b00111111
+#define CHAR_INVALID ((char32_t) UINT_MAX)
+/** Tell whether @a b is a single-byte code unit of the form 0xxxxxxx. */
+static inline bool _is_ascii(uint8_t b)
+{
+        /* The top bit is clear exactly for values 0 .. 127. */
+        return (b >> 7) == 0;
+}
+/** Tell whether @a b is a UTF-8 continuation byte of the form 10xxxxxx. */
+static inline bool _is_continuation(uint8_t b)
+{
+        /* Only the two most significant bits identify the byte class. */
+        return (b >> 6) == 0x2;
+}
+/** Tell whether @a c has the form 110xxxxx (lead byte of a 2-byte sequence). */
+static inline bool _is_2_byte(uint8_t c)
+{
+        return (c >> 5) == 0x6;
+}
+/** Tell whether @a c has the form 1110xxxx (lead byte of a 3-byte sequence). */
+static inline bool _is_3_byte(uint8_t c)
+{
+        return (c >> 4) == 0xE;
+}
+/** Tell whether @a c has the form 11110xxx (lead byte of a 4-byte sequence). */
+static inline bool _is_4_byte(uint8_t c)
+{
+        return (c >> 3) == 0x1E;
+}
+/**
+ * Determine how many continuation bytes the UTF-8 encoding of @a c needs.
+ *
+ * The code is tested against successively wider masks (7, 11, 16 and 21
+ * bits); presumably LO_MASK_32(n) selects the lowest n bits, as the
+ * companion 8-bit mask is documented to do.
+ *
+ * @param c  Character code to classify.
+ *
+ * @return 0, 1, 2 or 3 for the first mask width the code fits in,
+ *         -1 if it fits in none of them.
+ */
+static inline int _char_continuation_bytes(char32_t c)
+{
+        int extra;
+        if (!(c & ~LO_MASK_32(7)))
+                extra = 0;
+        else if (!(c & ~LO_MASK_32(11)))
+                extra = 1;
+        else if (!(c & ~LO_MASK_32(16)))
+                extra = 2;
+        else if (!(c & ~LO_MASK_32(21)))
+                extra = 3;
+        else
+                extra = -1;  /* Codes longer than 21 bits are not supported */
+        return extra;
+}
+/**
+ * Determine how many continuation bytes are announced by the lead byte @a b.
+ *
+ * @param b  First byte of an encoded character.
+ *
+ * @return 0 for 0xxxxxxx, 1 for 110xxxxx, 2 for 1110xxxx, 3 for 11110xxx,
+ *         -1 for any other byte (continuation bytes included).
+ */
+static inline int _continuation_bytes(uint8_t b)
+{
+        int count = -1;
+        if (_is_ascii(b))
+                count = 0;      /* 0xxxxxxx */
+        else if (_is_2_byte(b))
+                count = 1;      /* 110xxxxx 10xxxxxx */
+        else if (_is_3_byte(b))
+                count = 2;      /* 1110xxxx 10xxxxxx 10xxxxxx */
+        else if (_is_4_byte(b))
+                count = 3;      /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+        return count;
+}
+/**
+ * Detect a non-shortest-form encoding from the first continuation byte.
+ *
+ * _str_decode() leaves 0xFC00 in the state after a 3-byte lead byte whose
+ * payload bits are all zero, and 0xFFF0 after such a 4-byte lead byte.
+ * In those two states the continuation byte @a b must carry at least one
+ * of its upper payload bits, otherwise a shorter encoding exists.
+ *
+ * @param mb  Decoder state holding the partially decoded character.
+ * @param b   Continuation byte about to be shifted in.
+ *
+ * @return True if accepting @a b would produce an overlong encoding.
+ */
+static bool _is_non_shortest(const mbstate_t *mb, uint8_t b)
+{
+        /* 1110 0000  10!xxxxx */
+        if (mb->state == 0xFC00 && (b & 0x20) == 0)
+                return true;
+        /* 11110 000  10!!xxxx */
+        if (mb->state == 0xFFF0 && (b & 0x30) == 0)
+                return true;
+        return false;
+}
+/**
+ * Detect an encoded UTF-16 surrogate from the first continuation byte.
+ *
+ * State 0xFC0D is what _str_decode() stores after the lead byte 0xED;
+ * a following byte of 0xA0 or more selects code points from 0xD800 up,
+ * which the 3-byte fast path rejects as surrogates as well.
+ *
+ * @param mb  Decoder state holding the partially decoded character.
+ * @param b   Continuation byte about to be shifted in.
+ *
+ * @return True if accepting @a b would produce a surrogate code point.
+ */
+static bool _is_surrogate(const mbstate_t *mb, uint8_t b)
+{
+        if (mb->state != 0xFC0D)
+                return false;
+        return b >= 0xA0;
+}
+#define _likely(expr) __builtin_expect((expr), true)
+#define _unlikely(expr) __builtin_expect((expr), false)
+#define FAST_PATHS 1
+/**
+ * Decode one UTF-8 character, resumable across calls.
+ *
+ * Bytes are read from @a s starting at @a *offset, which is advanced past
+ * everything consumed. When the input ends in the middle of a character,
+ * the bits decoded so far are kept in @a mb->state and decoding resumes
+ * on the next call.
+ *
+ * @param s       Input bytes.
+ * @param offset  In/out byte offset into @a s; must not exceed @a size.
+ * @param size    Number of bytes available in @a s, or STR_NO_LIMIT.
+ * @param mb      Decoder state; zero when no character is in progress.
+ *
+ * @return The decoded character. 0 is returned for a NUL byte, when
+ *         @a *offset equals @a size on entry, and when the input ran out
+ *         before the character was complete (then @a mb->state is
+ *         non-zero). CHAR_INVALID is returned for an invalid sequence,
+ *         with @a mb->state reset to zero.
+ */
+static char32_t _str_decode(const char *s, size_t *offset, size_t size, mbstate_t *mb)
+{
+        assert(s);
+        assert(offset);
+        assert(*offset <= size);
+        assert(size == STR_NO_LIMIT || s + size >= s);
+        assert(mb);
+        if (*offset == size)
+                return 0;
+        if (_likely(!mb->state)) {
+                /* Clean slate, read initial byte. */
+                uint8_t b = s[(*offset)++];
+                /* Fast exit for the most common case. */
+                if (_likely(_is_ascii(b)))
+                        return b;
+                /* unexpected continuation byte */
+                if (_unlikely(_is_continuation(b)))
+                        return CHAR_INVALID;
+                /*
+                 * The value stored into `mb->state` is designed to have
+                 * just enough leading ones that after shifting in one less than
+                 * the expected number of continuation bytes, the most significant
+                 * bit becomes zero. (The field is 16b wide.)
+                 */
+                if (_is_2_byte(b)) {
+                        /* Reject non-shortest form. */
+                        if (_unlikely(!(b & 0b00011110)))
+                                return CHAR_INVALID;
+#if FAST_PATHS
+                        /* We can usually take this exit. */
+                        if (_likely(*offset < size && _is_continuation(s[*offset])))
+                                return (b & UTF8_MASK_INITIAL2) << 6 |
+                                    (s[(*offset)++] & UTF8_MASK_CONT);
+#endif
+                        /* 2 byte continuation    110xxxxx */
+                        mb->state = b ^ 0b0000000011000000;
+                } else if (_is_3_byte(b)) {
+#if FAST_PATHS
+                        /* We can usually take this exit. */
+                        if (_likely(*offset + 1 < size && _is_continuation(s[*offset]) && _is_continuation(s[*offset + 1]))) {
+                                char32_t ch = (b & UTF8_MASK_INITIAL3) << 12 |
+                                    (s[(*offset)] & UTF8_MASK_CONT) << 6 |
+                                    (s[(*offset) + 1] & UTF8_MASK_CONT);
+                                *offset += 2;
+                                /* Reject non-shortest form. */
+                                if (_unlikely(!(ch & 0xFFFFF800)))
+                                        return CHAR_INVALID;
+                                /* Reject surrogates */
+                                if (_unlikely(ch >= 0xD800 && ch < 0xE000))
+                                        return CHAR_INVALID;
+                                return ch;
+                        }
+#endif
+                        /* 3 byte continuation    1110xxxx */
+                        mb->state = b ^ 0b1111110011100000;
+                } else if (_is_4_byte(b)) {
+#if FAST_PATHS
+                        /* We can usually take this exit. */
+                        if (_likely(*offset + 2 < size && _is_continuation(s[*offset]) &&
+                            _is_continuation(s[*offset + 1]) && _is_continuation(s[*offset + 2]))) {
+                                char32_t ch = (b & UTF8_MASK_INITIAL4) << 18 |
+                                    (s[(*offset)] & UTF8_MASK_CONT) << 12 |
+                                    (s[(*offset) + 1] & UTF8_MASK_CONT) << 6 |
+                                    (s[(*offset) + 2] & UTF8_MASK_CONT);
+                                *offset += 3;
+                                /* Reject non-shortest form. */
+                                if (_unlikely(!(ch & 0xFFFF0000)))
+                                        return CHAR_INVALID;
+                                /* Reject out-of-range characters. */
+                                if (_unlikely(ch >= 0x110000))
+                                        return CHAR_INVALID;
+                                return ch;
+                        }
+#endif
+                        /* 4 byte continuation    11110xxx */
+                        mb->state = b ^ 0b1111111100000000;
+                } else {
+                        return CHAR_INVALID;
+                }
+        }
+        /* Deal with the remaining edge and invalid cases. */
+        for (; *offset < size; (*offset)++) {
+                /* Read continuation bytes. */
+                uint8_t b = s[*offset];
+                if (!_is_continuation(b) || _is_non_shortest(mb, b) || _is_surrogate(mb, b)) {
+                        mb->state = 0;
+                        return CHAR_INVALID;
+                }
+                /* Top bit becomes zero when shifting in the second to last byte. */
+                if (!(mb->state & 0x8000)) {
+                        char32_t c = ((char32_t) mb->state) << 6 | (b & UTF8_MASK_CONT);
+                        mb->state = 0;
+                        (*offset)++;
+                        /*
+                         * Reject out-of-range characters. The fast path above
+                         * does this for 4-byte sequences; without the same test
+                         * here a sequence fed in pieces (lead byte 0xF4 followed
+                         * by 0x90 or more, or lead bytes 0xF5 .. 0xF7) would
+                         * yield a value above 0x10FFFF. As in the fast path, the
+                         * whole sequence is consumed.
+                         */
+                        if (_unlikely(c >= 0x110000))
+                                return CHAR_INVALID;
+                        return c;
+                }
+                mb->state = mb->state << 6 | (b & UTF8_MASK_CONT);
+        }
+        /* Incomplete character. */
+        assert(mb->state);
+        return 0;
+}
+/**
+ * Standard <uchar.h> function since C11.
+ *
+ * Decodes at most @a n bytes of @a s into one character.
+ *
+ * @param c   Where to store the decoded character; may be NULL.
+ * @param s   Input bytes; NULL is treated as the call
+ *            mbrtoc32(NULL, "", 1, mb).
+ * @param n   Number of bytes available in @a s.
+ * @param mb  Conversion state; in hosted builds NULL selects an internal
+ *            fibril-local state.
+ *
+ * @return Number of bytes consumed for a non-null character, 0 for the
+ *         null character, UCHAR_INCOMPLETE if the bytes do not complete a
+ *         character (nothing is stored), UCHAR_ILSEQ for an invalid
+ *         sequence (nothing is stored, _set_ilseq() is called).
+ */
+size_t mbrtoc32(char32_t *c, const char *s, size_t n, mbstate_t *mb)
+{
+#if __STDC_HOSTED__
+        static fibril_local mbstate_t global_state = { };
+        if (!mb)
+                mb = &global_state;
+#endif
+        if (!s) {
+                /* Equivalent to mbrtoc32(NULL, "", 1, mb); */
+                c = NULL;
+                s = "";
+                n = 1;
+        }
+        /*
+         * Zero bytes cannot complete any character, the null character
+         * included. Without this test the decoder's "no input" result of 0
+         * would be stored and reported as a decoded null character.
+         */
+        if (n == 0)
+                return UCHAR_INCOMPLETE;
+        size_t offset = 0;
+        char32_t ret = _str_decode(s, &offset, n, mb);
+        if (ret == CHAR_INVALID) {
+                assert(!mb->state);
+                _set_ilseq();
+                return UCHAR_ILSEQ;
+        }
+        if (mb->state) {
+                assert(ret == 0);
+                return UCHAR_INCOMPLETE;
+        }
+        if (c)
+                *c = ret;
+        return ret ? offset : 0;
+}
 /** Decode a single character from a string.
 …
 char32_t str_decode(const char *str, size_t *offset, size_t size)
+{
+        if (*offset + 1 > size)
+                return 0;
+        /* First byte read from string */
+        uint8_t b0 = (uint8_t) str[(*offset)++];
+        /* Determine code length */
+        unsigned int b0_bits;  /* Data bits in first byte */
+        unsigned int cbytes;   /* Number of continuation bytes */
+        if ((b0 & 0x80) == 0) {
+                /* 0xxxxxxx (Plain ASCII) */
+                b0_bits = 7;
+                cbytes = 0;
+        } else if ((b0 & 0xe0) == 0xc0) {
+                /* 110xxxxx 10xxxxxx */
+                b0_bits = 5;
+                cbytes = 1;
+        } else if ((b0 & 0xf0) == 0xe0) {
+                /* 1110xxxx 10xxxxxx 10xxxxxx */
+                b0_bits = 4;
+                cbytes = 2;
+        } else if ((b0 & 0xf8) == 0xf0) {
+                /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+                b0_bits = 3;
+                cbytes = 3;
+        } else {
+                /* 10xxxxxx -- unexpected continuation byte */
+        mbstate_t mb = { };
+        char32_t ch = _str_decode(str, offset, size, &mb);
+        if (ch == CHAR_INVALID || mb.state)
                 return U_SPECIAL;
+        }
-        if (*offset + cbytes > size)
-                return U_SPECIAL;
-        char32_t ch = b0 & LO_MASK_8(b0_bits);
-        /* Decode continuation bytes */
-        while (cbytes > 0) {
-                uint8_t b = (uint8_t) str[(*offset)++];
-                /* Must be 10xxxxxx */
-                if ((b & 0xc0) != 0x80)
-                        return U_SPECIAL;
-                /* Shift data bits to ch */
-                ch = (ch << CONT_BITS) | (char32_t) (b & LO_MASK_8(CONT_BITS));
-                cbytes--;
+        }
         return ch;
+}
+/**
+ * Decode one character, substituting a caller-chosen value for invalid input.
+ *
+ * Thin wrapper over _str_decode(); every result other than CHAR_INVALID
+ * is passed through unchanged.
+ *
+ * @param str          Input bytes.
+ * @param offset       In/out byte offset into @a str.
+ * @param size         Number of bytes available in @a str.
+ * @param replacement  Value returned in place of CHAR_INVALID.
+ * @param mb           Decoder state carried between calls.
+ *
+ * @return Decoded character, or @a replacement for an invalid sequence.
+ */
+char32_t str_decode_r(const char *str, size_t *offset, size_t size,
+        char32_t replacement, mbstate_t *mb)
+{
+        char32_t decoded = _str_decode(str, offset, size, mb);
+        if (decoded == CHAR_INVALID)
+                return replacement;
+        return decoded;
+}
 …
                 return 0;
         size_t processed = 0;
+        int cbytes = 0;
         /* Continue while continuation bytes found */
         while (*offset > 0 && processed < 4) {
+        while (*offset > 0 && cbytes < 4) {
                 uint8_t b = (uint8_t) str[--(*offset)];
+                if (processed == 0 && (b & 0x80) == 0) {
+                        /* 0xxxxxxx (Plain ASCII) */
+                        return b & 0x7f;
+                } else if ((b & 0xe0) == 0xc0 || (b & 0xf0) == 0xe0 ||
+                    (b & 0xf8) == 0xf0) {
+                        /* Start byte */
+                        size_t start_offset = *offset;
+                        return str_decode(str, &start_offset, size);
+                } else if ((b & 0xc0) != 0x80) {
+                        /* Not a continuation byte */
+                if (_is_continuation(b)) {
+                        cbytes++;
+                        continue;
+                }
+                /* Reject non-shortest form encoding. */
+                if (cbytes != _continuation_bytes(b))
                         return U_SPECIAL;
+                }
+                processed++;
+        }
+                /* Start byte */
+                size_t start_offset = *offset;
+                return str_decode(str, &start_offset, size);
+        }
         /* Too many continuation bytes */
         return U_SPECIAL;
 …
  *         code was invalid.
  */
+errno_t chr_encode(const char32_t ch, char *str, size_t *offset, size_t size)
+{
+errno_t chr_encode(char32_t ch, char *str, size_t *offset, size_t size)
+{
+        // TODO: merge with c32rtomb()
         if (*offset >= size)
                 return EOVERFLOW;
+        /* Fast exit for the most common case. */
+        if (ch < 0x80) {
+                str[(*offset)++] = (char) ch;
+                return EOK;
+        }
+        /* Codes longer than 21 bits are not supported */
         if (!chr_check(ch))
                 return EINVAL;
-        /*
-         * Unsigned version of ch (bit operations should only be done
-         * on unsigned types).
-         */
-        uint32_t cc = (uint32_t) ch;
         /* Determine how many continuation bytes are needed */
+        unsigned int b0_bits;  /* Data bits in first byte */
+        unsigned int cbytes;   /* Number of continuation bytes */
+        if ((cc & ~LO_MASK_32(7)) == 0) {
+                b0_bits = 7;
+                cbytes = 0;
+        } else if ((cc & ~LO_MASK_32(11)) == 0) {
+                b0_bits = 5;
+                cbytes = 1;
+        } else if ((cc & ~LO_MASK_32(16)) == 0) {
+                b0_bits = 4;
+                cbytes = 2;
+        } else if ((cc & ~LO_MASK_32(21)) == 0) {
+                b0_bits = 3;
+                cbytes = 3;
+        } else {
+                /* Codes longer than 21 bits are not supported */
+                return EINVAL;
+        }
+        unsigned int cbytes = _char_continuation_bytes(ch);
+        unsigned int b0_bits = 6 - cbytes;  /* Data bits in first byte */
         /* Check for available space in buffer */
 …
         unsigned int i;
         for (i = cbytes; i > 0; i--) {
                 str[*offset + i] = 0x80 | (cc & LO_MASK_32(CONT_BITS));
                 cc = cc >> CONT_BITS;
+                str[*offset + i] = 0x80 | (ch & LO_MASK_32(CONT_BITS));
+                ch >>= CONT_BITS;
+        }
         /* Encode first byte */
         str[*offset] = (cc & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);
+        str[*offset] = (ch & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);
         /* Advance offset */
 …
+}
+/**
+ * Validate a multi-byte sequence.
+ *
+ * The caller guarantees that b[0] announces @a cont (1 .. 3) continuation
+ * bytes and that b[1] .. b[cont] may be read.
+ */
+static bool _str_seq_is_valid(const uint8_t *b, int cont)
+{
+        /* Every announced continuation byte must be 10xxxxxx. */
+        for (int i = 1; i <= cont; i++) {
+                if (!_is_continuation(b[i]))
+                        return false;
+        }
+        /*
+         * Non-shortest form checks follow
+         * https://www.unicode.org/versions/corrigendum1.html
+         */
+        switch (cont) {
+        case 1:
+                /* 0b110!!!!x 0b10xxxxxx */
+                if ((b[0] & 0x1E) == 0)
+                        return false;
+                /* C1 control codes. */
+                return !(b[0] == 0xC2 && b[1] < 0xA0);
+        case 2:
+                /* 0b1110!!!! 0b10!xxxxx 0b10xxxxxx */
+                if ((b[0] & 0x0F) == 0 && (b[1] & 0x20) == 0)
+                        return false;
+                /* Surrogate character encoding. */
+                return !(b[0] == 0xED && b[1] >= 0xA0);
+        case 3:
+                /* 0b11110!!! 0b10!!xxxx 0b10xxxxxx 0b10xxxxxx */
+                if ((b[0] & 0x07) == 0 && (b[1] & 0x30) == 0)
+                        return false;
+                /* Out-of-range code points. */
+                return !(b[0] > 0xF4 || (b[0] == 0xF4 && b[1] >= 0x90));
+        default:
+                return true;
+        }
+}
+/**
+ * Convert in place any bytes that don't form a valid character into
+ * replacement.
+ *
+ * Scanning stops at the first NUL byte or after @a n bytes. Bytes below
+ * ' ' (C0 control codes) are replaced too. Of an invalid multi-byte
+ * sequence only the lead byte is replaced in one step; the bytes after it
+ * are then examined on their own.
+ *
+ * @param str          Buffer to sanitize.
+ * @param n            Maximum number of bytes to examine.
+ * @param replacement  Byte written over each offending byte.
+ *
+ * @return Number of bytes replaced.
+ */
+static size_t _str_sanitize(char *str, size_t n, uint8_t replacement)
+{
+        uint8_t *cur = (uint8_t *) str;
+        size_t replaced = 0;
+        while (n > 0 && *cur != 0) {
+                size_t step = 1;
+                bool bad;
+                if (*cur < ' ') {
+                        /* C0 control codes */
+                        bad = true;
+                } else {
+                        int cont = _continuation_bytes(*cur);
+                        if (cont == 0)
+                                bad = false;
+                        else if (cont < 0 || n <= (size_t) cont)
+                                bad = true;  /* not a lead byte, or truncated */
+                        else
+                                bad = !_str_seq_is_valid(cur, cont);
+                        /* A valid character is skipped as a whole. */
+                        if (!bad)
+                                step += (size_t) cont;
+                }
+                if (bad) {
+                        *cur = replacement;
+                        replaced++;
+                }
+                cur += step;
+                n -= step;
+        }
+        return replaced;
+}
+/** Replaces any byte that's not part of a complete valid UTF-8 character
+ * encoding with a replacement byte.
+ * Also replaces C0 and C1 control codes.
+ *
+ * Public entry point; all work is done by _str_sanitize(), which stops at
+ * the first NUL byte or after @a n bytes, whichever comes first.
+ *
+ * @param str          Buffer to sanitize in place.
+ * @param n            Maximum number of bytes to examine.
+ * @param replacement  Byte written over each offending byte.
+ *
+ * @return Number of bytes that were replaced.
+ */
+size_t str_sanitize(char *str, size_t n, uint8_t replacement)
+{
+        return _str_sanitize(str, n, replacement);
+}
+/**
+ * Count the bytes of a NUL-terminated string, terminator excluded.
+ *
+ * @param str  String to measure.
+ *
+ * @return Number of bytes before the first NUL byte.
+ */
+static size_t _str_size(const char *str)
+{
+        const char *end = str;
+        while (*end != 0)
+                end++;
+        return (size_t) (end - str);
+}
 /** Get size of string.
+ *
 …
 size_t str_size(const char *str)
+{
+        size_t size = 0;
+        while (*str++ != 0)
+                size++;
+        return size;
+        return _str_size(str);
+}
 …
+}
+/**
+ * Count the bytes of a string, examining at most @a max_size bytes.
+ *
+ * The limit is tested before each byte is read, so a buffer of exactly
+ * @a max_size bytes without a terminator is never read past its end.
+ *
+ * @param str       String to measure.
+ * @param max_size  Maximum number of bytes to examine.
+ *
+ * @return Number of bytes before the first NUL byte, or @a max_size if no
+ *         NUL byte occurs within the first @a max_size bytes.
+ */
+static size_t _str_nsize(const char *str, size_t max_size)
+{
+        size_t size = 0;
+        while (size < max_size && str[size] != 0)
+                size++;
+        return size;
+}
 /** Get size of string with size limit.
+ *
 …
 size_t str_nsize(const char *str, size_t max_size)
+{
+        size_t size = 0;
+        while ((*str++ != 0) && (size < max_size))
+                size++;
+        return size;
+        return _str_nsize(str, max_size);
+}
 …
 int str_cmp(const char *s1, const char *s2)
+{
+        char32_t c1 = 0;
+        char32_t c2 = 0;
+        size_t off1 = 0;
+        size_t off2 = 0;
+        while (true) {
+                c1 = str_decode(s1, &off1, STR_NO_LIMIT);
+                c2 = str_decode(s2, &off2, STR_NO_LIMIT);
+                if (c1 < c2)
+                        return -1;
+                if (c1 > c2)
+                        return 1;
+                if (c1 == 0 || c2 == 0)
+                        break;
+        }
+        return 0;
+        /*
+         * UTF-8 has the nice property that lexicographic ordering on bytes is
+         * the same as the lexicographic ordering of the character sequences.
+         */
+        while (*s1 == *s2 && *s1 != 0) {
+                s1++;
+                s2++;
+        }
+        if (*s1 == *s2)
+                return 0;
+        return (*s1 < *s2) ? -1 : 1;
+}
 …
 int str_casecmp(const char *s1, const char *s2)
+{
+        // FIXME: doesn't work for non-ASCII caseful characters
         char32_t c1 = 0;
         char32_t c2 = 0;
 …
 int str_lcasecmp(const char *s1, const char *s2, size_t max_len)
+{
+        // FIXME: doesn't work for non-ASCII caseful characters
         char32_t c1 = 0;
         char32_t c2 = 0;
 …
+}
+/**
+ * Test byte-wise whether @a p is a prefix of @a s.
+ *
+ * @param s  String to examine.
+ * @param p  Candidate prefix.
+ *
+ * @return True if every byte of @a p matches the byte of @a s at the same
+ *         position; an empty @a p is a prefix of any string.
+ */
+static bool _test_prefix(const char *s, const char *p)
+{
+        for (size_t i = 0; ; i++) {
+                /* Prefix exhausted without a mismatch. */
+                if (p[i] == 0)
+                        return true;
+                /* Also taken when s ends first, since p[i] is not NUL here. */
+                if (s[i] != p[i])
+                        return false;
+        }
+}
 /** Test whether p is a prefix of s.
+ *
 …
 bool str_test_prefix(const char *s, const char *p)
+{
+        char32_t c1 = 0;
+        char32_t c2 = 0;
+        size_t off1 = 0;
+        size_t off2 = 0;
+        while (true) {
+                c1 = str_decode(s, &off1, STR_NO_LIMIT);
+                c2 = str_decode(p, &off2, STR_NO_LIMIT);
+                if (c2 == 0)
+                        return true;
+                if (c1 != c2)
+                        return false;
+                if (c1 == 0)
+                        break;
+        }
+        return false;
+        return _test_prefix(s, p);
+}
 …
         return s + off;
+}
+/**
+ * Copy string as a sequence of bytes, terminator included.
+ *
+ * Bytes are copied front to back, one at a time. No size limit applies,
+ * so @a dest must be large enough for the whole of @a src.
+ *
+ * @param dest  Destination buffer.
+ * @param src   NUL-terminated source string.
+ */
+static void _str_cpy(char *dest, const char *src)
+{
+        char ch;
+        do {
+                ch = *src++;
+                *dest++ = ch;
+        } while (ch != 0);
+}
+/**
+ * Copy string as a sequence of bytes into a buffer of limited size.
+ *
+ * At most @a size - 1 bytes are copied and the result is always
+ * NUL-terminated. With @a size equal to STR_NO_LIMIT the copy is
+ * unbounded. NULL pointers and a zero @a size trip the assertion; when
+ * assertions are compiled out the call does nothing.
+ *
+ * @param dest  Destination buffer.
+ * @param size  Size of the destination buffer in bytes.
+ * @param src   NUL-terminated source string.
+ */
+static void _str_cpyn(char *dest, size_t size, const char *src)
+{
+        assert(dest && src && size);
+        if (!dest || !src || !size)
+                return;
+        if (size == STR_NO_LIMIT) {
+                /*
+                 * A void function must not return an expression, so the
+                 * call and the return are separate statements.
+                 */
+                _str_cpy(dest, src);
+                return;
+        }
+        /* Last byte of the buffer, reserved for the terminator. */
+        char *dest_top = dest + size - 1;
+        assert(size == 1 || dest < dest_top);
+        while (*src && dest < dest_top)
+                *(dest++) = *(src++);
+        *dest = 0;
+}
 …
         assert(size > 0);
         assert(src != NULL);
+        size_t src_off = 0;
+        size_t dest_off = 0;
+        char32_t ch;
+        while ((ch = str_decode(src, &src_off, STR_NO_LIMIT)) != 0) {
+                if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
+                        break;
+        }
+        dest[dest_off] = '\0';
+        assert(dest != NULL);
+        assert(size == STR_NO_LIMIT || dest + size > dest);
+        /* Copy data. */
+        _str_cpyn(dest, size, src);
+        /* In-place translate invalid bytes to U_SPECIAL. */
+        _str_sanitize(dest, size, U_SPECIAL);
+}
 …
         /* There must be space for a null terminator in the buffer. */
         assert(size > 0);
+        size_t src_off = 0;
+        size_t dest_off = 0;
+        char32_t ch;
+        while ((ch = str_decode(src, &src_off, n)) != 0) {
+                if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
+                        break;
+        }
+        dest[dest_off] = '\0';
+        assert(src != NULL);
+        /* Copy data. */
+        _str_cpyn(dest, min(size, n + 1), src);
+        /* In-place translate invalid bytes to U_SPECIAL. */
+        _str_sanitize(dest, size, U_SPECIAL);
+}
 …
 void str_append(char *dest, size_t size, const char *src)
+{
+        size_t dstr_size;
+        dstr_size = str_size(dest);
+        if (dstr_size >= size)
+                return;
+        str_cpy(dest + dstr_size, size - dstr_size, src);
+        assert(src != NULL);
+        assert(dest != NULL);
+        assert(size > 0);
+        assert(size == STR_NO_LIMIT || dest + size > dest);
+        size_t dstr_size = _str_nsize(dest, size);
+        if (dstr_size < size) {
+                _str_cpyn(dest + dstr_size, size - dstr_size, src);
+                _str_sanitize(dest + dstr_size, size - dstr_size, U_SPECIAL);
+        }
+}
 …
 errno_t spascii_to_str(char *dest, size_t size, const uint8_t *src, size_t n)
+{
+        size_t sidx;
+        size_t didx;
+        size_t dlast;
+        uint8_t byte;
+        errno_t rc;
+        errno_t result;
+        /* There must be space for a null terminator in the buffer. */
+        assert(size > 0);
+        result = EOK;
+        didx = 0;
+        dlast = 0;
+        for (sidx = 0; sidx < n; ++sidx) {
+                byte = src[sidx];
+                if (!ascii_check(byte)) {
+                        byte = U_SPECIAL;
+        size_t len = 0;
+        /* Determine the length of the source string. */
+        for (size_t i = 0; i < n; i++) {
+                if (src[i] == 0)
+                        break;
+                if (src[i] != ' ')
+                        len = i + 1;
+        }
+        errno_t result = EOK;
+        size_t out_len = min(len, size - 1);
+        /* Copy characters */
+        for (size_t i = 0; i < out_len; i++) {
+                dest[i] = src[i];
+                if (dest[i] < 0) {
+                        dest[i] = U_SPECIAL;
                         result = EIO;
+                }
+                rc = chr_encode(byte, dest, &didx, size - 1);
+                if (rc != EOK) {
+                        assert(rc == EOVERFLOW);
+                        dest[didx] = '\0';
+                        return rc;
+                }
+                /* Remember dest index after last non-empty character */
+                if (byte != 0x20)
+                        dlast = didx;
+        }
+        /* Terminate string after last non-empty character */
+        dest[dlast] = '\0';
+        }
+        dest[out_len] = 0;
+        if (out_len < len)
+                return EOVERFLOW;
         return result;
+}
 …
+}
+/**
+ * Find the first occurrence of byte @a c in a string.
+ *
+ * @param str  String to search.
+ * @param c    Byte to look for; 0 matches the terminator.
+ *
+ * @return Pointer to the first matching byte, or NULL if the terminator
+ *         is reached without a match.
+ */
+static char *_strchr(const char *str, char c)
+{
+        for (;; str++) {
+                /* Tested first so that c == 0 finds the terminator. */
+                if (*str == c)
+                        return (char *) str;
+                if (*str == 0)
+                        return NULL;
+        }
+}
 /** Find first occurrence of character in string.
+ *
 …
 char *str_chr(const char *str, char32_t ch)
+{
+        char32_t acc;
+        size_t off = 0;
+        size_t last = 0;
+        while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
+                if (acc == ch)
+                        return (char *) (str + last);
+                last = off;
+        /* Fast path for an ASCII character. */
+        if (ascii_check(ch))
+                return _strchr(str, ch);
+        /* Convert character to UTF-8. */
+        char utf8[STR_BOUNDS(1) + 1];
+        size_t offset = 0;
+        if (chr_encode(ch, utf8, &offset, sizeof(utf8)) != EOK || offset == 0)
+                return NULL;
+        utf8[offset] = '\0';
+        /* Find the first byte, then check if all of them are correct. */
+        while (*str != 0) {
+                str = _strchr(str, utf8[0]);
+                if (!str)
+                        return NULL;
+                if (_test_prefix(str, utf8))
+                        return (char *) str;
+                str++;
+        }
 …
 char *str_str(const char *hs, const char *n)
+{
+        size_t off = 0;
+        if (str_lcmp(hs, n, str_length(n)) == 0)
+                return (char *)hs;
+        while (str_decode(hs, &off, STR_NO_LIMIT) != 0) {
+                if (str_lcmp(hs + off, n, str_length(n)) == 0)
+                        return (char *)(hs + off);
+        size_t hsize = _str_size(hs);
+        size_t nsize = _str_size(n);
+        while (hsize >= nsize) {
+                if (_test_prefix(hs, n))
+                        return (char *) hs;
+                hs++;
+                hsize--;
+        }
         return NULL;
+}
+/**
+ * Remove trailing bytes equal to @a c from a string, in place.
+ *
+ * The cut position is the byte just past the last byte that differs from
+ * @a c. It starts at the beginning of the string, so an empty string is
+ * left untouched and a string made only of @a c becomes empty.
+ *
+ * @param str  String to truncate.
+ * @param c    Byte to strip from the end.
+ */
+static void _str_rtrim(char *str, char c)
+{
+        char *end = str;
+        for (; *str != 0; str++) {
+                if (*str != c)
+                        end = str + 1;
+        }
+        /* Truncate string. */
+        *end = 0;
+}
 …
 void str_rtrim(char *str, char32_t ch)
+{
+        /* Fast path for the ASCII case. */
+        if (ascii_check(ch)) {
+                _str_rtrim(str, ch);
+                return;
+        }
         size_t off = 0;
         size_t pos = 0;
 …
+}
+/**
+ * Remove leading bytes equal to @a c from a string, in place.
+ *
+ * The scan stops at the terminator even when @a c is 0, so it never
+ * runs past the end of the string. The remainder is moved to the front
+ * with a front-to-back byte copy.
+ *
+ * @param str  String to modify.
+ * @param c    Byte to strip from the beginning.
+ */
+static void _str_ltrim(char *str, char c)
+{
+        char *p = str;
+        while (*p != 0 && *p == c)
+                p++;
+        if (str != p)
+                _str_cpy(str, p);
+}
 /** Removes specified leading characters from a string.
+ *
 …
 void str_ltrim(char *str, char32_t ch)
+{
+        /* Fast path for the ASCII case. */
+        if (ascii_check(ch)) {
+                _str_ltrim(str, ch);
+                return;
+        }
         char32_t acc;
         size_t off = 0;
 …
+}
+/**
+ * Find the last occurrence of byte @a c in a string.
+ *
+ * The terminator is never matched, so searching for 0 yields NULL.
+ *
+ * @param str  String to search.
+ * @param c    Byte to look for.
+ *
+ * @return Pointer to the last matching byte, or NULL if there is none.
+ */
+static char *_str_rchr(const char *str, char c)
+{
+        /* Locate the terminator, then walk back towards the start. */
+        const char *p = str;
+        while (*p != 0)
+                p++;
+        while (p != str) {
+                p--;
+                if (*p == c)
+                        return (char *) p;
+        }
+        return NULL;
+}
 /** Find last occurrence of character in string.
+ *
 …
 char *str_rchr(const char *str, char32_t ch)
+{
+        if (ascii_check(ch))
+                return _str_rchr(str, ch);
         char32_t acc;
         size_t off = 0;
 …
 char *str_dup(const char *src)
+{
         size_t size = str_size(src) + 1;
+        size_t size = _str_size(src) + 1;
         char *dest = malloc(size);
         if (!dest)
                 return NULL;
+        str_cpy(dest, size, src);
+        memcpy(dest, src, size);
+        _str_sanitize(dest, size, U_SPECIAL);
         return dest;
+}
 …
 char *str_ndup(const char *src, size_t n)
+{
+        size_t size = str_size(src);
+        if (size > n)
+                size = n;
+        size_t size = _str_nsize(src, n);
         char *dest = malloc(size + 1);
 …
                 return NULL;
+        str_ncpy(dest, size + 1, src, size);
+        memcpy(dest, src, size);
+        _str_sanitize(dest, size, U_SPECIAL);
+        dest[size] = 0;
         return dest;
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changes in common/str.c [fdfb24e:1db4e2ae] in mainline

Legend:

common/str.c

Download in other formats: