Changeset 28a5ebd in mainline for kernel/generic/src/lib/str.c


Ignore:
Timestamp:
2020-06-18T15:39:50Z (4 years ago)
Author:
Martin Decky <martin@…>
Branches:
lfn, master, serial, ticket/834-toolchain-update, topic/msim-upgrade, topic/simplify-dev-export
Children:
ce52c333
Parents:
4f663f3e
Message:

Use char32_t instead of wchar_t to represent UTF-32 strings

The intention of the native HelenOS string API has always been to
support Unicode in the UTF-8 and UTF-32 encodings as the sole character
representations and ignore the obsolete mess of older single-byte and
multibyte character encodings. Before C11, the wchar_t type has been
slightly misused for the purpose of the UTF-32 strings. The newer
char32_t type is obviously a much more suitable option. The standard
defines char32_t as uint_least32_t, thus we can take the liberty to fix
it to uint32_t.

To maintain compatibility with the C Standard, the putwchar(wchar_t)
functions have been replaced by our custom putuchar(char32_t) functions
where appropriate.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • kernel/generic/src/lib/str.c

    r4f663f3e r28a5ebd  
    4242 * strings, called just strings are encoded in UTF-8. Wide strings (encoded
    4343 * in UTF-32) are supported to a limited degree. A single character is
    44  * represented as wchar_t.@n
     44 * represented as char32_t.@n
    4545 *
    4646 * Overview of the terminology:@n
     
    5050 *  byte                  8 bits stored in uint8_t (unsigned 8 bit integer)
    5151 *
    52  *  character             UTF-32 encoded Unicode character, stored in wchar_t
    53  *                        (signed 32 bit integer), code points 0 .. 1114111
     52 *  character             UTF-32 encoded Unicode character, stored in char32_t
     53 *                        (unsigned 32 bit integer), code points 0 .. 1114111
    5454 *                        are valid
    5555 *
     
    6161 *
    6262 *  wide string           UTF-32 encoded NULL-terminated Unicode string,
    63  *                        wchar_t *
     63 *                        char32_t *
    6464 *
    6565 *  [wide] string size    number of BYTES in a [wide] string (excluding
     
    100100 * A specific character inside a [wide] string can be referred to by:@n
    101101 *
    102  *  pointer (char *, wchar_t *)
     102 *  pointer (char *, char32_t *)
    103103 *  byte offset (size_t)
    104104 *  character index (size_t)
     
    118118#include <macros.h>
    119119
    120 /** Check the condition if wchar_t is signed */
    121 #ifdef __WCHAR_UNSIGNED__
    122 #define WCHAR_SIGNED_CHECK(cond)  (true)
    123 #else
    124 #define WCHAR_SIGNED_CHECK(cond)  (cond)
    125 #endif
    126 
    127120/** Byte mask consisting of lowest @n bits (out of 8) */
    128121#define LO_MASK_8(n)  ((uint8_t) ((1 << (n)) - 1))
     
    152145 *
    153146 */
    154 wchar_t str_decode(const char *str, size_t *offset, size_t size)
     147char32_t str_decode(const char *str, size_t *offset, size_t size)
    155148{
    156149        if (*offset + 1 > size)
     
    189182                return U_SPECIAL;
    190183
    191         wchar_t ch = b0 & LO_MASK_8(b0_bits);
     184        char32_t ch = b0 & LO_MASK_8(b0_bits);
    192185
    193186        /* Decode continuation bytes */
     
    200193
    201194                /* Shift data bits to ch */
    202                 ch = (ch << CONT_BITS) | (wchar_t) (b & LO_MASK_8(CONT_BITS));
     195                ch = (ch << CONT_BITS) | (char32_t) (b & LO_MASK_8(CONT_BITS));
    203196                cbytes--;
    204197        }
     
    222215 *         code was invalid.
    223216 */
    224 errno_t chr_encode(const wchar_t ch, char *str, size_t *offset, size_t size)
     217errno_t chr_encode(const char32_t ch, char *str, size_t *offset, size_t size)
    225218{
    226219        if (*offset >= size)
     
    308301 *
    309302 */
    310 size_t wstr_size(const wchar_t *str)
    311 {
    312         return (wstr_length(str) * sizeof(wchar_t));
     303size_t wstr_size(const char32_t *str)
     304{
     305        return (wstr_length(str) * sizeof(char32_t));
    313306}
    314307
     
    354347 *
    355348 */
    356 size_t wstr_lsize(const wchar_t *str, size_t max_len)
    357 {
    358         return (wstr_nlength(str, max_len * sizeof(wchar_t)) * sizeof(wchar_t));
     349size_t wstr_lsize(const char32_t *str, size_t max_len)
     350{
     351        return (wstr_nlength(str, max_len * sizeof(char32_t)) * sizeof(char32_t));
    359352}
    360353
     
    384377 *
    385378 */
    386 size_t wstr_length(const wchar_t *wstr)
     379size_t wstr_length(const char32_t *wstr)
    387380{
    388381        size_t len = 0;
     
    421414 *
    422415 */
    423 size_t wstr_nlength(const wchar_t *str, size_t size)
     416size_t wstr_nlength(const char32_t *str, size_t size)
    424417{
    425418        size_t len = 0;
    426         size_t limit = ALIGN_DOWN(size, sizeof(wchar_t));
     419        size_t limit = ALIGN_DOWN(size, sizeof(char32_t));
    427420        size_t offset = 0;
    428421
    429422        while ((offset < limit) && (*str++ != 0)) {
    430423                len++;
    431                 offset += sizeof(wchar_t);
     424                offset += sizeof(char32_t);
    432425        }
    433426
     
    440433 *
    441434 */
    442 bool ascii_check(wchar_t ch)
    443 {
    444         if (WCHAR_SIGNED_CHECK(ch >= 0) && (ch <= 127))
     435bool ascii_check(char32_t ch)
     436{
     437        if (ch <= 127)
    445438                return true;
    446439
     
    453446 *
    454447 */
    455 bool chr_check(wchar_t ch)
    456 {
    457         if (WCHAR_SIGNED_CHECK(ch >= 0) && (ch <= 1114111))
     448bool chr_check(char32_t ch)
     449{
     450        if (ch <= 1114111)
    458451                return true;
    459452
     
    481474int str_cmp(const char *s1, const char *s2)
    482475{
    483         wchar_t c1 = 0;
    484         wchar_t c2 = 0;
     476        char32_t c1 = 0;
     477        char32_t c2 = 0;
    485478
    486479        size_t off1 = 0;
     
    528521int str_lcmp(const char *s1, const char *s2, size_t max_len)
    529522{
    530         wchar_t c1 = 0;
    531         wchar_t c2 = 0;
     523        char32_t c1 = 0;
     524        char32_t c2 = 0;
    532525
    533526        size_t off1 = 0;
     
    580573        size_t dest_off = 0;
    581574
    582         wchar_t ch;
     575        char32_t ch;
    583576        while ((ch = str_decode(src, &src_off, STR_NO_LIMIT)) != 0) {
    584577                if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
     
    613606        size_t dest_off = 0;
    614607
    615         wchar_t ch;
     608        char32_t ch;
    616609        while ((ch = str_decode(src, &src_off, n)) != 0) {
    617610                if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
     
    628621 * written will always be well-formed.
    629622 *
    630  * @param dest  Destination buffer.
    631  * @param size  Size of the destination buffer.
    632  * @param src   Source wide string.
    633  */
    634 void wstr_to_str(char *dest, size_t size, const wchar_t *src)
    635 {
    636         wchar_t ch;
     623 * @param dest Destination buffer.
     624 * @param size Size of the destination buffer.
     625 * @param src  Source wide string.
     626 */
     627void wstr_to_str(char *dest, size_t size, const char32_t *src)
     628{
     629        char32_t ch;
    637630        size_t src_idx;
    638631        size_t dest_off;
     
    659652 * @return Pointer to character in @a str or NULL if not found.
    660653 */
    661 char *str_chr(const char *str, wchar_t ch)
    662 {
    663         wchar_t acc;
     654char *str_chr(const char *str, char32_t ch)
     655{
     656        char32_t acc;
    664657        size_t off = 0;
    665658        size_t last = 0;
     
    688681 *
    689682 */
    690 bool wstr_linsert(wchar_t *str, wchar_t ch, size_t pos, size_t max_pos)
     683bool wstr_linsert(char32_t *str, char32_t ch, size_t pos, size_t max_pos)
    691684{
    692685        size_t len = wstr_length(str);
     
    716709 *
    717710 */
    718 bool wstr_remove(wchar_t *str, size_t pos)
     711bool wstr_remove(char32_t *str, size_t pos)
    719712{
    720713        size_t len = wstr_length(str);
Note: See TracChangeset for help on using the changeset viewer.