source: mainline/common/stdc/uchar.c

Last change on this file was 65bf084, checked in by Jiří Zárevúcky <zarevucky.jiri@…>, 3 months ago

Implement both str_decode() and mbrtoc32() using one function

  • Property mode set to 100644
File size: 4.5 KB
Line 
1/*
2 * Copyright (c) 2025 Jiří Zárevúcky
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include <assert.h>
30#include <errno.h>
31#include <limits.h>
32#include <stdbool.h>
33#include <uchar.h>
34
35#if __STDC_HOSTED__
36#include <fibril.h>
37#endif
38
/**
 * Report an illegal byte/code unit sequence through errno.
 *
 * errno is assigned EILSEQ only when `errno` is defined as a macro in the
 * current build; otherwise this function does nothing.
 */
static void _set_ilseq(void)
{
#ifdef errno
	errno = EILSEQ;
#endif
}
45
/**
 * Test whether a UTF-16 code unit is a low (trailing) surrogate.
 *
 * @param c  Code unit to classify.
 * @return   true when c lies in 0xDC00..0xDFFF, false otherwise.
 */
static bool _is_low_surrogate(char16_t c)
{
	const char16_t first = 0xDC00;
	const char16_t last = 0xDFFF;

	if (c < first)
		return false;

	return c <= last;
}
50
/**
 * Test whether a UTF-16 code unit is a high (leading) surrogate.
 *
 * @param c  Code unit to classify.
 * @return   true when c lies in 0xD800..0xDBFF, false otherwise.
 */
static bool _is_high_surrogate(char16_t c)
{
	const char16_t first = 0xD800;
	const char16_t last = 0xDBFF;

	if (c < first)
		return false;

	return c <= last;
}
55
/**
 * Test whether a UTF-16 code unit is any surrogate (high or low).
 *
 * @param c  Code unit to classify.
 * @return   true when c lies in 0xD800..0xDFFF, false otherwise.
 */
static bool _is_surrogate(char16_t c)
{
	const char16_t first = 0xD800;
	const char16_t last = 0xDFFF;

	if (c < first)
		return false;

	return c <= last;
}
60
/*
 * Build a UTF-8 continuation byte: take the six bits of `c` that start at
 * bit position `shift` and mark them with the 0b10xxxxxx prefix.
 */
#define UTF8_CONT(c, shift) ((((c) >> (shift)) & 0x3F) | 0x80)
62
63size_t c32rtomb(char *s, char32_t c, mbstate_t *mb)
64{
65 if (!s) {
66 // Equivalent to c32rtomb(buf, L’\0’, mb).
67 return 1;
68 }
69
70 /* 1 byte encoding */
71 if (c < 0x80) {
72 s[0] = c;
73 return 1;
74 }
75
76 /* 2 byte encoding */
77 if (c < 0x800) {
78 s[0] = 0b11000000 | (c >> 6);
79 s[1] = UTF8_CONT(c, 0);
80 return 2;
81 }
82
83 /* 3 byte encoding */
84 if (c < 0x10000) {
85 if (_is_surrogate(c)) {
86 /* illegal range for an unicode code point */
87 _set_ilseq();
88 return UCHAR_ILSEQ;
89 }
90
91 s[0] = 0b11100000 | (c >> 12);
92 s[1] = UTF8_CONT(c, 6);
93 s[2] = UTF8_CONT(c, 0);
94 return 3;
95 }
96
97 /* 4 byte encoding */
98 if (c < 0x110000) {
99 s[0] = 0b11110000 | (c >> 18);
100 s[1] = UTF8_CONT(c, 12);
101 s[2] = UTF8_CONT(c, 6);
102 s[3] = UTF8_CONT(c, 0);
103 return 4;
104 }
105
106 _set_ilseq();
107 return UCHAR_ILSEQ;
108}
109
110size_t mbrtoc16(char16_t *c, const char *s, size_t n, mbstate_t *mb)
111{
112#if __STDC_HOSTED__
113 static fibril_local mbstate_t global_state = { };
114
115 if (!mb)
116 mb = &global_state;
117#else
118 assert(mb);
119#endif
120
121 char16_t dummy;
122
123 if (!c)
124 c = &dummy;
125
126 if (!s) {
127 /* Equivalent to mbrtoc16(NULL, "", 1, mb). */
128 if (mb->state) {
129 _set_ilseq();
130 return UCHAR_ILSEQ;
131 } else {
132 return 0;
133 }
134 }
135
136 if ((mb->state & 0xD000) == 0xD000) {
137 /* mbstate_t contains the second surrogate character. */
138 /* mbrtoc32() will never set it to such value. */
139 *c = mb->state;
140 mb->state = 0;
141 return UCHAR_CONTINUED;
142 }
143
144 char32_t c32 = 0;
145 size_t ret = mbrtoc32(&c32, s, n, mb);
146 if (ret < INT_MAX) {
147 if (c32 < 0x10000) {
148 *c = c32;
149 } else {
150 /* Encode UTF-16 surrogates. */
151 mb->state = (c32 & 0x3FF) + 0xDC00;
152 *c = (c32 >> 10) + 0xD7C0;
153 }
154 return ret;
155 }
156
157 return ret;
158}
159
160size_t c16rtomb(char *s, char16_t c, mbstate_t *mb)
161{
162#if __STDC_HOSTED__
163 static fibril_local mbstate_t global_state = { };
164
165 if (!mb)
166 mb = &global_state;
167#else
168 assert(mb);
169#endif
170
171 if (!s) {
172 // Equivalent to c16rtomb(buf, L’\0’, mb).
173 if (mb->state) {
174 _set_ilseq();
175 return UCHAR_ILSEQ;
176 } else {
177 return 1;
178 }
179 }
180
181 if (!_is_surrogate(c)) {
182 if (mb->state) {
183 _set_ilseq();
184 return UCHAR_ILSEQ;
185 }
186
187 return c32rtomb(s, c, mb);
188 }
189
190 if (!mb->state) {
191 mb->state = c;
192 return 0;
193 }
194
195 char32_t c32;
196
197 /* Decode UTF-16 surrogates. */
198 if (_is_low_surrogate(mb->state) && _is_high_surrogate(c)) {
199 c32 = ((c - 0xD7C0) << 10) | (mb->state - 0xDC00);
200 } else if (_is_high_surrogate(mb->state) && _is_low_surrogate(c)) {
201 c32 = ((mb->state - 0xD7C0) << 10) | (c - 0xDC00);
202 } else {
203 _set_ilseq();
204 return UCHAR_ILSEQ;
205 }
206
207 mb->state = 0;
208 return c32rtomb(s, c32, mb);
209}
Note: See TracBrowser for help on using the repository browser.