Changeset 28a5ebd in mainline for boot/generic/src/str.c


Ignore:
Timestamp:
2020-06-18T15:39:50Z (4 years ago)
Author:
Martin Decky <martin@…>
Branches:
lfn, master, serial, ticket/834-toolchain-update, topic/msim-upgrade, topic/simplify-dev-export
Children:
ce52c333
Parents:
4f663f3e
Message:

Use char32_t instead of wchar_t to represent UTF-32 strings

The intention of the native HelenOS string API has always been to
support Unicode in the UTF-8 and UTF-32 encodings as the sole character
representations and ignore the obsolete mess of older single-byte and
multibyte character encodings. Before C11, the wchar_t type has been
slightly misused for the purpose of the UTF-32 strings. The newer
char32_t type is obviously a much more suitable option. The standard
defines char32_t as uint_least32_t, thus we can take the liberty to fix
it to uint32_t.

To maintain compatibility with the C Standard, the putwchar(wchar_t)
functions have been replaced by our custom putuchar(char32_t) functions
where appropriate.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • boot/generic/src/str.c

    r4f663f3e r28a5ebd  
    3838 * strings, called just strings are encoded in UTF-8. Wide strings (encoded
    3939 * in UTF-32) are supported to a limited degree. A single character is
    40  * represented as wchar_t.@n
     40 * represented as char32_t.@n
    4141 *
    4242 * Overview of the terminology:@n
     
    4646 *  byte                  8 bits stored in uint8_t (unsigned 8 bit integer)
    4747 *
    48  *  character             UTF-32 encoded Unicode character, stored in wchar_t
    49  *                        (signed 32 bit integer), code points 0 .. 1114111
     48 *  character             UTF-32 encoded Unicode character, stored in char32_t
     49 *                        (unsigned 32 bit integer), code points 0 .. 1114111
    5050 *                        are valid
    5151 *
     
    5757 *
    5858 *  wide string           UTF-32 encoded NULL-terminated Unicode string,
    59  *                        wchar_t *
     59 *                        char32_t *
    6060 *
    6161 *  [wide] string size    number of BYTES in a [wide] string (excluding
     
    9696 * A specific character inside a [wide] string can be referred to by:@n
    9797 *
    98  *  pointer (char *, wchar_t *)
     98 *  pointer (char *, char32_t *)
    9999 *  byte offset (size_t)
    100100 *  character index (size_t)
     
    109109#include <stdint.h>
    110110
    111 /** Check the condition if wchar_t is signed */
    112 #ifdef __WCHAR_UNSIGNED__
    113 #define WCHAR_SIGNED_CHECK(cond)  (true)
    114 #else
    115 #define WCHAR_SIGNED_CHECK(cond)  (cond)
    116 #endif
    117 
    118111/** Byte mask consisting of lowest @n bits (out of 8) */
    119112#define LO_MASK_8(n)  ((uint8_t) ((1 << (n)) - 1))
     
    143136 *
    144137 */
    145 wchar_t str_decode(const char *str, size_t *offset, size_t size)
     138char32_t str_decode(const char *str, size_t *offset, size_t size)
    146139{
    147140        if (*offset + 1 > size)
     
    180173                return U_SPECIAL;
    181174
    182         wchar_t ch = b0 & LO_MASK_8(b0_bits);
     175        char32_t ch = b0 & LO_MASK_8(b0_bits);
    183176
    184177        /* Decode continuation bytes */
     
    191184
    192185                /* Shift data bits to ch */
    193                 ch = (ch << CONT_BITS) | (wchar_t) (b & LO_MASK_8(CONT_BITS));
     186                ch = (ch << CONT_BITS) | (char32_t) (b & LO_MASK_8(CONT_BITS));
    194187                cbytes--;
    195188        }
     
    213206 *         code was invalid.
    214207 */
    215 errno_t chr_encode(const wchar_t ch, char *str, size_t *offset, size_t size)
     208errno_t chr_encode(const char32_t ch, char *str, size_t *offset, size_t size)
    216209{
    217210        if (*offset >= size)
     
    340333 *
    341334 */
    342 bool ascii_check(wchar_t ch)
    343 {
    344         if (WCHAR_SIGNED_CHECK(ch >= 0) && (ch <= 127))
     335bool ascii_check(char32_t ch)
     336{
     337        if (ch <= 127)
    345338                return true;
    346339
     
    353346 *
    354347 */
    355 bool chr_check(wchar_t ch)
    356 {
    357         if (WCHAR_SIGNED_CHECK(ch >= 0) && (ch <= 1114111))
     348bool chr_check(char32_t ch)
     349{
     350        if (ch <= 1114111)
    358351                return true;
    359352
     
    381374int str_cmp(const char *s1, const char *s2)
    382375{
    383         wchar_t c1 = 0;
    384         wchar_t c2 = 0;
     376        char32_t c1 = 0;
     377        char32_t c2 = 0;
    385378
    386379        size_t off1 = 0;
     
    421414        size_t dest_off = 0;
    422415
    423         wchar_t ch;
     416        char32_t ch;
    424417        while ((ch = str_decode(src, &src_off, STR_NO_LIMIT)) != 0) {
    425418                if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
Note: See TracChangeset for help on using the changeset viewer.