/* * Copyright (c) 2025 Jiří Zárevúcky * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * - The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #if __STDC_HOSTED__ #include #endif static void _set_ilseq() { #ifdef errno errno = EILSEQ; #endif } static bool _is_low_surrogate(char16_t c) { return c >= 0xDC00 && c < 0xE000; } static bool _is_high_surrogate(char16_t c) { return c >= 0xD800 && c < 0xDC00; } static bool _is_surrogate(char16_t c) { return c >= 0xD800 && c < 0xE000; } #define UTF8_CONT(c, shift) (0x80 | (((c) >> (shift)) & 0x3F)) size_t c32rtomb(char *s, char32_t c, mbstate_t *mb) { if (!s) { // Equivalent to c32rtomb(buf, L’\0’, mb). return 1; } /* 1 byte encoding */ if (c < 0x80) { s[0] = c; return 1; } /* 2 byte encoding */ if (c < 0x800) { s[0] = 0b11000000 | (c >> 6); s[1] = UTF8_CONT(c, 0); return 2; } /* 3 byte encoding */ if (c < 0x10000) { if (_is_surrogate(c)) { /* illegal range for an unicode code point */ _set_ilseq(); return UCHAR_ILSEQ; } s[0] = 0b11100000 | (c >> 12); s[1] = UTF8_CONT(c, 6); s[2] = UTF8_CONT(c, 0); return 3; } /* 4 byte encoding */ if (c < 0x110000) { s[0] = 0b11110000 | (c >> 18); s[1] = UTF8_CONT(c, 12); s[2] = UTF8_CONT(c, 6); s[3] = UTF8_CONT(c, 0); return 4; } _set_ilseq(); return UCHAR_ILSEQ; } size_t mbrtoc16(char16_t *c, const char *s, size_t n, mbstate_t *mb) { #if __STDC_HOSTED__ static fibril_local mbstate_t global_state = { }; if (!mb) mb = &global_state; #else assert(mb); #endif char16_t dummy; if (!c) c = &dummy; if (!s) { /* Equivalent to mbrtoc16(NULL, "", 1, mb). */ if (mb->state) { _set_ilseq(); return UCHAR_ILSEQ; } else { return 0; } } if ((mb->state & 0xD000) == 0xD000) { /* mbstate_t contains the second surrogate character. */ /* mbrtoc32() will never set it to such value. */ *c = mb->state; mb->state = 0; return UCHAR_CONTINUED; } char32_t c32 = 0; size_t ret = mbrtoc32(&c32, s, n, mb); if (ret < INT_MAX) { if (c32 < 0x10000) { *c = c32; } else { /* Encode UTF-16 surrogates. */ mb->state = (c32 & 0x3FF) + 0xDC00; *c = (c32 >> 10) + 0xD7C0; } return ret; } return ret; } size_t c16rtomb(char *s, char16_t c, mbstate_t *mb) { #if __STDC_HOSTED__ static fibril_local mbstate_t global_state = { }; if (!mb) mb = &global_state; #else assert(mb); #endif if (!s) { // Equivalent to c16rtomb(buf, L’\0’, mb). if (mb->state) { _set_ilseq(); return UCHAR_ILSEQ; } else { return 1; } } if (!_is_surrogate(c)) { if (mb->state) { _set_ilseq(); return UCHAR_ILSEQ; } return c32rtomb(s, c, mb); } if (!mb->state) { mb->state = c; return 0; } char32_t c32; /* Decode UTF-16 surrogates. */ if (_is_low_surrogate(mb->state) && _is_high_surrogate(c)) { c32 = ((c - 0xD7C0) << 10) | (mb->state - 0xDC00); } else if (_is_high_surrogate(mb->state) && _is_low_surrogate(c)) { c32 = ((mb->state - 0xD7C0) << 10) | (c - 0xDC00); } else { _set_ilseq(); return UCHAR_ILSEQ; } mb->state = 0; return c32rtomb(s, c32, mb); }