Changeset 28a5ebd in mainline for uspace/lib/c/generic/str.c


Ignore:
Timestamp:
2020-06-18T15:39:50Z (4 years ago)
Author:
Martin Decky <martin@…>
Branches:
lfn, master, serial, ticket/834-toolchain-update, topic/msim-upgrade, topic/simplify-dev-export
Children:
ce52c333
Parents:
4f663f3e
Message:

Use char32_t instead of wchar_t to represent UTF-32 strings

The intention of the native HelenOS string API has always been to
support Unicode in the UTF-8 and UTF-32 encodings as the sole character
representations and ignore the obsolete mess of older single-byte and
multibyte character encodings. Before C11, the wchar_t type has been
slightly misused for the purpose of the UTF-32 strings. The newer
char32_t type is obviously a much more suitable option. The standard
defines char32_t as uint_least32_t, thus we can take the liberty to fix
it to uint32_t.

To maintain compatibility with the C Standard, the putwchar(wchar_t)
functions have been replaced by our custom putuchar(char32_t) functions
where appropriate.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • uspace/lib/c/generic/str.c

    r4f663f3e r28a5ebd  
    4242 * strings, called just strings are encoded in UTF-8. Wide strings (encoded
    4343 * in UTF-32) are supported to a limited degree. A single character is
    44  * represented as wchar_t.@n
     44 * represented as char32_t.@n
    4545 *
    4646 * Overview of the terminology:@n
     
    5050 *  byte                  8 bits stored in uint8_t (unsigned 8 bit integer)
    5151 *
    52  *  character             UTF-32 encoded Unicode character, stored in wchar_t
    53  *                        (signed 32 bit integer), code points 0 .. 1114111
     52 *  character             UTF-32 encoded Unicode character, stored in char32_t
     53 *                        (unsigned 32 bit integer), code points 0 .. 1114111
    5454 *                        are valid
    5555 *
     
    6161 *
    6262 *  wide string           UTF-32 encoded NULL-terminated Unicode string,
    63  *                        wchar_t *
     63 *                        char32_t *
    6464 *
    6565 *  [wide] string size    number of BYTES in a [wide] string (excluding
     
    100100 * A specific character inside a [wide] string can be referred to by:@n
    101101 *
    102  *  pointer (char *, wchar_t *)
     102 *  pointer (char *, char32_t *)
    103103 *  byte offset (size_t)
    104104 *  character index (size_t)
     
    119119#include <mem.h>
    120120
    121 /** Check the condition if wchar_t is signed */
    122 #ifdef __WCHAR_UNSIGNED__
    123 #define WCHAR_SIGNED_CHECK(cond)  (true)
    124 #else
    125 #define WCHAR_SIGNED_CHECK(cond)  (cond)
    126 #endif
    127 
    128121/** Byte mask consisting of lowest @n bits (out of 8) */
    129122#define LO_MASK_8(n)  ((uint8_t) ((1 << (n)) - 1))
     
    153146 *
    154147 */
    155 wchar_t str_decode(const char *str, size_t *offset, size_t size)
     148char32_t str_decode(const char *str, size_t *offset, size_t size)
    156149{
    157150        if (*offset + 1 > size)
     
    190183                return U_SPECIAL;
    191184
    192         wchar_t ch = b0 & LO_MASK_8(b0_bits);
     185        char32_t ch = b0 & LO_MASK_8(b0_bits);
    193186
    194187        /* Decode continuation bytes */
     
    201194
    202195                /* Shift data bits to ch */
    203                 ch = (ch << CONT_BITS) | (wchar_t) (b & LO_MASK_8(CONT_BITS));
     196                ch = (ch << CONT_BITS) | (char32_t) (b & LO_MASK_8(CONT_BITS));
    204197                cbytes--;
    205198        }
     
    223216 *
    224217 */
    225 wchar_t str_decode_reverse(const char *str, size_t *offset, size_t size)
     218char32_t str_decode_reverse(const char *str, size_t *offset, size_t size)
    226219{
    227220        if (*offset == 0)
     
    266259 *         code was invalid.
    267260 */
    268 errno_t chr_encode(const wchar_t ch, char *str, size_t *offset, size_t size)
     261errno_t chr_encode(const char32_t ch, char *str, size_t *offset, size_t size)
    269262{
    270263        if (*offset >= size)
     
    352345 *
    353346 */
    354 size_t wstr_size(const wchar_t *str)
    355 {
    356         return (wstr_length(str) * sizeof(wchar_t));
     347size_t wstr_size(const char32_t *str)
     348{
     349        return (wstr_length(str) * sizeof(char32_t));
    357350}
    358351
     
    417410 *
    418411 */
    419 size_t wstr_nsize(const wchar_t *str, size_t max_size)
    420 {
    421         return (wstr_nlength(str, max_size) * sizeof(wchar_t));
     412size_t wstr_nsize(const char32_t *str, size_t max_size)
     413{
     414        return (wstr_nlength(str, max_size) * sizeof(char32_t));
    422415}
    423416
     
    435428 *
    436429 */
    437 size_t wstr_lsize(const wchar_t *str, size_t max_len)
    438 {
    439         return (wstr_nlength(str, max_len * sizeof(wchar_t)) * sizeof(wchar_t));
     430size_t wstr_lsize(const char32_t *str, size_t max_len)
     431{
     432        return (wstr_nlength(str, max_len * sizeof(char32_t)) * sizeof(char32_t));
    440433}
    441434
     
    465458 *
    466459 */
    467 size_t wstr_length(const wchar_t *wstr)
     460size_t wstr_length(const char32_t *wstr)
    468461{
    469462        size_t len = 0;
     
    502495 *
    503496 */
    504 size_t wstr_nlength(const wchar_t *str, size_t size)
     497size_t wstr_nlength(const char32_t *str, size_t size)
    505498{
    506499        size_t len = 0;
    507         size_t limit = ALIGN_DOWN(size, sizeof(wchar_t));
     500        size_t limit = ALIGN_DOWN(size, sizeof(char32_t));
    508501        size_t offset = 0;
    509502
    510503        while ((offset < limit) && (*str++ != 0)) {
    511504                len++;
    512                 offset += sizeof(wchar_t);
     505                offset += sizeof(char32_t);
    513506        }
    514507
     
    521514 * @return      Width of character in cells.
    522515 */
    523 size_t chr_width(wchar_t ch)
     516size_t chr_width(char32_t ch)
    524517{
    525518        return 1;
     
    535528        size_t width = 0;
    536529        size_t offset = 0;
    537         wchar_t ch;
     530        char32_t ch;
    538531
    539532        while ((ch = str_decode(str, &offset, STR_NO_LIMIT)) != 0)
     
    548541 *
    549542 */
    550 bool ascii_check(wchar_t ch)
    551 {
    552         if (WCHAR_SIGNED_CHECK(ch >= 0) && (ch <= 127))
     543bool ascii_check(char32_t ch)
     544{
     545        if (ch <= 127)
    553546                return true;
    554547
     
    561554 *
    562555 */
    563 bool chr_check(wchar_t ch)
    564 {
    565         if (WCHAR_SIGNED_CHECK(ch >= 0) && (ch <= 1114111))
     556bool chr_check(char32_t ch)
     557{
     558        if (ch <= 1114111)
    566559                return true;
    567560
     
    589582int str_cmp(const char *s1, const char *s2)
    590583{
    591         wchar_t c1 = 0;
    592         wchar_t c2 = 0;
     584        char32_t c1 = 0;
     585        char32_t c2 = 0;
    593586
    594587        size_t off1 = 0;
     
    636629int str_lcmp(const char *s1, const char *s2, size_t max_len)
    637630{
    638         wchar_t c1 = 0;
    639         wchar_t c2 = 0;
     631        char32_t c1 = 0;
     632        char32_t c2 = 0;
    640633
    641634        size_t off1 = 0;
     
    688681int str_casecmp(const char *s1, const char *s2)
    689682{
    690         wchar_t c1 = 0;
    691         wchar_t c2 = 0;
     683        char32_t c1 = 0;
     684        char32_t c2 = 0;
    692685
    693686        size_t off1 = 0;
     
    736729int str_lcasecmp(const char *s1, const char *s2, size_t max_len)
    737730{
    738         wchar_t c1 = 0;
    739         wchar_t c2 = 0;
     731        char32_t c1 = 0;
     732        char32_t c2 = 0;
    740733
    741734        size_t off1 = 0;
     
    780773bool str_test_prefix(const char *s, const char *p)
    781774{
    782         wchar_t c1 = 0;
    783         wchar_t c2 = 0;
     775        char32_t c1 = 0;
     776        char32_t c2 = 0;
    784777
    785778        size_t off1 = 0;
     
    850843        size_t dest_off = 0;
    851844
    852         wchar_t ch;
     845        char32_t ch;
    853846        while ((ch = str_decode(src, &src_off, STR_NO_LIMIT)) != 0) {
    854847                if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
     
    883876        size_t dest_off = 0;
    884877
    885         wchar_t ch;
     878        char32_t ch;
    886879        while ((ch = str_decode(src, &src_off, n)) != 0) {
    887880                if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
     
    987980 * @param src   Source wide string.
    988981 */
    989 void wstr_to_str(char *dest, size_t size, const wchar_t *src)
    990 {
    991         wchar_t ch;
     982void wstr_to_str(char *dest, size_t size, const char32_t *src)
     983{
     984        char32_t ch;
    992985        size_t src_idx;
    993986        size_t dest_off;
     
    10221015{
    10231016        size_t idx = 0, dest_off = 0;
    1024         wchar_t ch;
     1017        char32_t ch;
    10251018        errno_t rc = EOK;
    10261019
     
    10661059        size_t offset = 0;
    10671060        size_t idx = 0;
    1068         wchar_t c;
     1061        char32_t c;
    10691062
    10701063        assert(dlen > 0);
     
    11231116 * @return      New string.
    11241117 */
    1125 char *wstr_to_astr(const wchar_t *src)
     1118char *wstr_to_astr(const char32_t *src)
    11261119{
    11271120        char dbuf[STR_BOUNDS(1)];
    11281121        char *str;
    1129         wchar_t ch;
     1122        char32_t ch;
    11301123
    11311124        size_t src_idx;
     
    11731166 * @param src   Source string.
    11741167 */
    1175 void str_to_wstr(wchar_t *dest, size_t dlen, const char *src)
     1168void str_to_wstr(char32_t *dest, size_t dlen, const char *src)
    11761169{
    11771170        size_t offset;
    11781171        size_t di;
    1179         wchar_t c;
     1172        char32_t c;
    11801173
    11811174        assert(dlen > 0);
     
    12021195 * @param src   Source string.
    12031196 */
    1204 wchar_t *str_to_awstr(const char *str)
     1197char32_t *str_to_awstr(const char *str)
    12051198{
    12061199        size_t len = str_length(str);
    12071200
    1208         wchar_t *wstr = calloc(len + 1, sizeof(wchar_t));
     1201        char32_t *wstr = calloc(len + 1, sizeof(char32_t));
    12091202        if (wstr == NULL)
    12101203                return NULL;
     
    12211214 * @return Pointer to character in @a str or NULL if not found.
    12221215 */
    1223 char *str_chr(const char *str, wchar_t ch)
    1224 {
    1225         wchar_t acc;
     1216char *str_chr(const char *str, char32_t ch)
     1217{
     1218        char32_t acc;
    12261219        size_t off = 0;
    12271220        size_t last = 0;
     
    12631256 * @param ch  Character to remove.
    12641257 */
    1265 void str_rtrim(char *str, wchar_t ch)
     1258void str_rtrim(char *str, char32_t ch)
    12661259{
    12671260        size_t off = 0;
    12681261        size_t pos = 0;
    1269         wchar_t c;
     1262        char32_t c;
    12701263        bool update_last_chunk = true;
    12711264        char *last_chunk = NULL;
     
    12911284 * @param ch  Character to remove.
    12921285 */
    1293 void str_ltrim(char *str, wchar_t ch)
    1294 {
    1295         wchar_t acc;
     1286void str_ltrim(char *str, char32_t ch)
     1287{
     1288        char32_t acc;
    12961289        size_t off = 0;
    12971290        size_t pos = 0;
     
    13191312 * @return Pointer to character in @a str or NULL if not found.
    13201313 */
    1321 char *str_rchr(const char *str, wchar_t ch)
    1322 {
    1323         wchar_t acc;
     1314char *str_rchr(const char *str, char32_t ch)
     1315{
     1316        char32_t acc;
    13241317        size_t off = 0;
    13251318        size_t last = 0;
     
    13491342 *
    13501343 */
    1351 bool wstr_linsert(wchar_t *str, wchar_t ch, size_t pos, size_t max_pos)
     1344bool wstr_linsert(char32_t *str, char32_t ch, size_t pos, size_t max_pos)
    13521345{
    13531346        size_t len = wstr_length(str);
     
    13771370 *
    13781371 */
    1379 bool wstr_remove(wchar_t *str, size_t pos)
     1372bool wstr_remove(char32_t *str, size_t pos)
    13801373{
    13811374        size_t len = wstr_length(str);
     
    14741467        size_t cur;
    14751468        size_t tmp;
    1476         wchar_t ch;
     1469        char32_t ch;
    14771470
    14781471        /* Skip over leading delimiters. */
Note: See TracChangeset for help on using the changeset viewer.