Changeset 171f9a1 in mainline for uspace/lib/libc/generic/string.c


Ignore:
Timestamp:
2009-04-03T20:39:33Z (15 years ago)
Author:
Jiri Svoboda <jirik.svoboda@…>
Branches:
lfn, master, serial, ticket/834-toolchain-update, topic/msim-upgrade
Children:
cb01e1e
Parents:
7a2c479
Message:

Character encoding/decoding in uspace. Partially fix klog application.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • uspace/lib/libc/generic/string.c

    r7a2c479 r171f9a1  
    3939#include <ctype.h>
    4040#include <malloc.h>
     41#include <errno.h>
     42#include <string.h>
     43
     44/** Byte mask consisting of lowest @n bits (out of 8) */
     45#define LO_MASK_8(n)  ((uint8_t) ((1 << (n)) - 1))
     46
     47/** Byte mask consisting of lowest @n bits (out of 32) */
     48#define LO_MASK_32(n)  ((uint32_t) ((1 << (n)) - 1))
     49
     50/** Byte mask consisting of highest @n bits (out of 8) */
     51#define HI_MASK_8(n)  (~LO_MASK_8(8 - (n)))
     52
     53/** Number of data bits in a UTF-8 continuation byte */
     54#define CONT_BITS  6
     55
     56/** Decode a single character from a string.
     57 *
     58 * Decode a single character from a string of size @a size. Decoding starts
     59 * at @a offset and this offset is moved to the beginning of the next
     60 * character. In case of decoding error, offset generally advances at least
     61 * by one. However, offset is never moved beyond size.
     62 *
     63 * @param str    String (not necessarily NULL-terminated).
     64 * @param offset Byte offset in string where to start decoding.
     65 * @param size   Size of the string (in bytes).
     66 *
     67 * @return Value of decoded character, U_SPECIAL on decoding error or
     68 *         NULL if attempt to decode beyond @a size.
     69 *
     70 */
     71wchar_t str_decode(const char *str, size_t *offset, size_t size)
     72{
     73        if (*offset + 1 > size)
     74                return 0;
     75       
     76        /* First byte read from string */
     77        uint8_t b0 = (uint8_t) str[(*offset)++];
     78       
     79        /* Determine code length */
     80       
     81        unsigned int b0_bits;  /* Data bits in first byte */
     82        unsigned int cbytes;   /* Number of continuation bytes */
     83       
     84        if ((b0 & 0x80) == 0) {
     85                /* 0xxxxxxx (Plain ASCII) */
     86                b0_bits = 7;
     87                cbytes = 0;
     88        } else if ((b0 & 0xe0) == 0xc0) {
     89                /* 110xxxxx 10xxxxxx */
     90                b0_bits = 5;
     91                cbytes = 1;
     92        } else if ((b0 & 0xf0) == 0xe0) {
     93                /* 1110xxxx 10xxxxxx 10xxxxxx */
     94                b0_bits = 4;
     95                cbytes = 2;
     96        } else if ((b0 & 0xf8) == 0xf0) {
     97                /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
     98                b0_bits = 3;
     99                cbytes = 3;
     100        } else {
     101                /* 10xxxxxx -- unexpected continuation byte */
     102                return U_SPECIAL;
     103        }
     104       
     105        if (*offset + cbytes > size)
     106                return U_SPECIAL;
     107       
     108        wchar_t ch = b0 & LO_MASK_8(b0_bits);
     109       
     110        /* Decode continuation bytes */
     111        while (cbytes > 0) {
     112                uint8_t b = (uint8_t) str[(*offset)++];
     113               
     114                /* Must be 10xxxxxx */
     115                if ((b & 0xc0) != 0x80)
     116                        return U_SPECIAL;
     117               
     118                /* Shift data bits to ch */
     119                ch = (ch << CONT_BITS) | (wchar_t) (b & LO_MASK_8(CONT_BITS));
     120                cbytes--;
     121        }
     122       
     123        return ch;
     124}
     125
     126/** Encode a single character to string representation.
     127 *
     128 * Encode a single character to string representation (i.e. UTF-8) and store
     129 * it into a buffer at @a offset. Encoding starts at @a offset and this offset
     130 * is moved to the position where the next character can be written to.
     131 *
     132 * @param ch     Input character.
     133 * @param str    Output buffer.
     134 * @param offset Byte offset where to start writing.
     135 * @param size   Size of the output buffer (in bytes).
     136 *
     137 * @return EOK if the character was encoded successfully, EOVERFLOW if there
     138 *         was not enough space in the output buffer or EINVAL if the character
     139 *         code was invalid.
     140 */
     141int chr_encode(const wchar_t ch, char *str, size_t *offset, size_t size)
     142{
     143        if (*offset >= size)
     144                return EOVERFLOW;
     145       
     146        if (!chr_check(ch))
     147                return EINVAL;
     148       
     149        /* Unsigned version of ch (bit operations should only be done
     150           on unsigned types). */
     151        uint32_t cc = (uint32_t) ch;
     152       
     153        /* Determine how many continuation bytes are needed */
     154       
     155        unsigned int b0_bits;  /* Data bits in first byte */
     156        unsigned int cbytes;   /* Number of continuation bytes */
     157       
     158        if ((cc & ~LO_MASK_32(7)) == 0) {
     159                b0_bits = 7;
     160                cbytes = 0;
     161        } else if ((cc & ~LO_MASK_32(11)) == 0) {
     162                b0_bits = 5;
     163                cbytes = 1;
     164        } else if ((cc & ~LO_MASK_32(16)) == 0) {
     165                b0_bits = 4;
     166                cbytes = 2;
     167        } else if ((cc & ~LO_MASK_32(21)) == 0) {
     168                b0_bits = 3;
     169                cbytes = 3;
     170        } else {
     171                /* Codes longer than 21 bits are not supported */
     172                return EINVAL;
     173        }
     174       
     175        /* Check for available space in buffer */
     176        if (*offset + cbytes >= size)
     177                return EOVERFLOW;
     178       
     179        /* Encode continuation bytes */
     180        unsigned int i;
     181        for (i = cbytes; i > 0; i--) {
     182                str[*offset + i] = 0x80 | (cc & LO_MASK_32(CONT_BITS));
     183                cc = cc >> CONT_BITS;
     184        }
     185       
     186        /* Encode first byte */
     187        str[*offset] = (cc & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);
     188       
     189        /* Advance offset */
     190        *offset += cbytes + 1;
     191       
     192        return EOK;
     193}
     194
     195/** Check whether character is valid
     196 *
     197 * @return True if character is a valid Unicode code point.
     198 *
     199 */
     200bool chr_check(const wchar_t ch)
     201{
     202        if ((ch >= 0) && (ch <= 1114111))
     203                return true;
     204       
     205        return false;
     206}
    41207
    42208/** Count the number of characters in the string, not including terminating 0.
Note: See TracChangeset for help on using the changeset viewer.