source: mainline/boot/generic/src/str.c@ 08e103d4

Last change on this file since 08e103d4 was 08e103d4, checked in by Jiří Zárevúcky <zarevucky.jiri@…>, 6 years ago

Use clearer naming for string length functions

This and the following commit change the names of functions, as well as
their documentation, to use unambiguous terms "bytes" and "code points"
instead of ambiguous terms "size", "length", and "characters".

  • Property mode set to 100644
File size: 11.7 KB
RevLine 
[4872160]1/*
2 * Copyright (c) 2001-2004 Jakub Jermar
[d066259]3 * Copyright (c) 2005 Martin Decky
4 * Copyright (c) 2008 Jiri Svoboda
5 * Copyright (c) 2011 Martin Sucha
6 * Copyright (c) 2011 Oleg Romanenko
[4872160]7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * - Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * - Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * - The name of the author may not be used to endorse or promote products
19 * derived from this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
/**
 * @file
 * @brief String functions.
 *
 * Strings and characters use the Universal Character Set (UCS). The standard
 * strings, called just strings, are encoded in UTF-8. Wide strings (encoded
 * in UTF-32) are supported to a limited degree. A single character is
 * represented as wchar_t.@n
 *
 * Overview of the terminology:@n
 *
 *  Term                  Meaning
 *  --------------------  ----------------------------------------------------
 *  byte                  8 bits stored in uint8_t (unsigned 8 bit integer)
 *
 *  character             UTF-32 encoded Unicode character, stored in wchar_t
 *                        (signed 32 bit integer), code points 0 .. 1114111
 *                        are valid
 *
 *  ASCII character       7 bit encoded ASCII character, stored in char
 *                        (usually signed 8 bit integer), code points 0 .. 127
 *                        are valid
 *
 *  string                UTF-8 encoded NULL-terminated Unicode string, char *
 *
 *  wide string           UTF-32 encoded NULL-terminated Unicode string,
 *                        wchar_t *
 *
 *  [wide] string size    number of BYTES in a [wide] string (excluding
 *                        the NULL-terminator), size_t
 *
 *  [wide] string length  number of CHARACTERS in a [wide] string (excluding
 *                        the NULL-terminator), size_t
 *
 *  [wide] string width   number of display cells on a monospace display taken
 *                        by a [wide] string, size_t
 *
 *
 * Overview of string metrics:@n
 *
 *  Metric  Abbrev.  Type     Meaning
 *  ------  ------   ------   -------------------------------------------------
 *  size    n        size_t   number of BYTES in a string (excluding the
 *                            NULL-terminator)
 *
 *  length  l        size_t   number of CHARACTERS in a string (excluding the
 *                            NULL-terminator)
 *
 *  width   w        size_t   number of display cells on a monospace display
 *                            taken by a string
 *
 *
 * Function naming prefixes:@n
 *
 *  chr_            operate on characters
 *  ascii_          operate on ASCII characters
 *  str_            operate on strings
 *  wstr_           operate on wide strings
 *
 *  [w]str_[n|l|w]  operate on a prefix limited by size, length
 *                  or width
 *
 *
 * A specific character inside a [wide] string can be referred to by:@n
 *
 *  pointer          (char *, wchar_t *)
 *  byte offset      (size_t)
 *  character index  (size_t)
 *
 */
103
[d066259]104#include <str.h>
105
[4872160]106#include <errno.h>
[d735e2e]107#include <stdbool.h>
108#include <stddef.h>
[10d65d70]109#include <stdint.h>
[4872160]110
/** Evaluate @a cond only if wchar_t is signed.
 *
 * When the compiler defines __WCHAR_UNSIGNED__ the condition is replaced
 * by a constant true, so that sign checks such as (ch >= 0) are not
 * evaluated on an unsigned type.
 */
#ifdef __WCHAR_UNSIGNED__
#define WCHAR_SIGNED_CHECK(cond)  (true)
#else
#define WCHAR_SIGNED_CHECK(cond)  (cond)
#endif

/** Byte mask consisting of lowest @a n bits (out of 8), 0 <= n <= 8 */
#define LO_MASK_8(n)  ((uint8_t) ((1U << (n)) - 1))

/** Mask consisting of lowest @a n bits (out of 32), 0 <= n < 32 */
#define LO_MASK_32(n)  ((uint32_t) ((UINT32_C(1) << (n)) - 1))

/** Byte mask consisting of highest @a n bits (out of 8), 0 <= n <= 8
 *
 * The result is narrowed back to uint8_t because the complement operator
 * promotes its operand to int and would otherwise set bits above bit 7.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6
129
130/** Decode a single character from a string.
131 *
132 * Decode a single character from a string of size @a size. Decoding starts
133 * at @a offset and this offset is moved to the beginning of the next
134 * character. In case of decoding error, offset generally advances at least
135 * by one. However, offset is never moved beyond size.
136 *
137 * @param str String (not necessarily NULL-terminated).
138 * @param offset Byte offset in string where to start decoding.
139 * @param size Size of the string (in bytes).
140 *
141 * @return Value of decoded character, U_SPECIAL on decoding error or
142 * NULL if attempt to decode beyond @a size.
143 *
144 */
145wchar_t str_decode(const char *str, size_t *offset, size_t size)
146{
147 if (*offset + 1 > size)
148 return 0;
[a35b458]149
[4872160]150 /* First byte read from string */
151 uint8_t b0 = (uint8_t) str[(*offset)++];
[a35b458]152
[4872160]153 /* Determine code length */
[a35b458]154
[4872160]155 unsigned int b0_bits; /* Data bits in first byte */
156 unsigned int cbytes; /* Number of continuation bytes */
[a35b458]157
[4872160]158 if ((b0 & 0x80) == 0) {
159 /* 0xxxxxxx (Plain ASCII) */
160 b0_bits = 7;
161 cbytes = 0;
162 } else if ((b0 & 0xe0) == 0xc0) {
163 /* 110xxxxx 10xxxxxx */
164 b0_bits = 5;
165 cbytes = 1;
166 } else if ((b0 & 0xf0) == 0xe0) {
167 /* 1110xxxx 10xxxxxx 10xxxxxx */
168 b0_bits = 4;
169 cbytes = 2;
170 } else if ((b0 & 0xf8) == 0xf0) {
171 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
172 b0_bits = 3;
173 cbytes = 3;
174 } else {
175 /* 10xxxxxx -- unexpected continuation byte */
176 return U_SPECIAL;
177 }
[a35b458]178
[4872160]179 if (*offset + cbytes > size)
180 return U_SPECIAL;
[a35b458]181
[4872160]182 wchar_t ch = b0 & LO_MASK_8(b0_bits);
[a35b458]183
[4872160]184 /* Decode continuation bytes */
185 while (cbytes > 0) {
186 uint8_t b = (uint8_t) str[(*offset)++];
[a35b458]187
[4872160]188 /* Must be 10xxxxxx */
189 if ((b & 0xc0) != 0x80)
190 return U_SPECIAL;
[a35b458]191
[4872160]192 /* Shift data bits to ch */
193 ch = (ch << CONT_BITS) | (wchar_t) (b & LO_MASK_8(CONT_BITS));
194 cbytes--;
195 }
[a35b458]196
[4872160]197 return ch;
198}
199
200/** Encode a single character to string representation.
201 *
202 * Encode a single character to string representation (i.e. UTF-8) and store
203 * it into a buffer at @a offset. Encoding starts at @a offset and this offset
204 * is moved to the position where the next character can be written to.
205 *
206 * @param ch Input character.
207 * @param str Output buffer.
208 * @param offset Byte offset where to start writing.
209 * @param size Size of the output buffer (in bytes).
210 *
211 * @return EOK if the character was encoded successfully, EOVERFLOW if there
212 * was not enough space in the output buffer or EINVAL if the character
213 * code was invalid.
214 */
[d066259]215errno_t chr_encode(const wchar_t ch, char *str, size_t *offset, size_t size)
[4872160]216{
217 if (*offset >= size)
218 return EOVERFLOW;
[a35b458]219
[4872160]220 if (!chr_check(ch))
221 return EINVAL;
[a35b458]222
[7c3fb9b]223 /*
224 * Unsigned version of ch (bit operations should only be done
225 * on unsigned types).
226 */
[4872160]227 uint32_t cc = (uint32_t) ch;
[a35b458]228
[4872160]229 /* Determine how many continuation bytes are needed */
[a35b458]230
[4872160]231 unsigned int b0_bits; /* Data bits in first byte */
232 unsigned int cbytes; /* Number of continuation bytes */
[a35b458]233
[4872160]234 if ((cc & ~LO_MASK_32(7)) == 0) {
235 b0_bits = 7;
236 cbytes = 0;
237 } else if ((cc & ~LO_MASK_32(11)) == 0) {
238 b0_bits = 5;
239 cbytes = 1;
240 } else if ((cc & ~LO_MASK_32(16)) == 0) {
241 b0_bits = 4;
242 cbytes = 2;
243 } else if ((cc & ~LO_MASK_32(21)) == 0) {
244 b0_bits = 3;
245 cbytes = 3;
246 } else {
247 /* Codes longer than 21 bits are not supported */
248 return EINVAL;
249 }
[a35b458]250
[4872160]251 /* Check for available space in buffer */
252 if (*offset + cbytes >= size)
253 return EOVERFLOW;
[a35b458]254
[4872160]255 /* Encode continuation bytes */
256 unsigned int i;
257 for (i = cbytes; i > 0; i--) {
258 str[*offset + i] = 0x80 | (cc & LO_MASK_32(CONT_BITS));
259 cc = cc >> CONT_BITS;
260 }
[a35b458]261
[4872160]262 /* Encode first byte */
263 str[*offset] = (cc & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);
[a35b458]264
[4872160]265 /* Advance offset */
266 *offset += cbytes + 1;
[a35b458]267
[4872160]268 return EOK;
269}
270
/** Get size of string.
 *
 * Get the number of bytes which are used by the string @a str (excluding the
 * null terminator).
 *
 * @param str Null-terminated string to consider.
 *
 * @return Number of bytes used by the string.
 *
 */
size_t str_bytes(const char *str)
{
	size_t size = 0;

	while (str[size] != 0)
		size++;

	return size;
}
290
291/** Get size of string with length limit.
292 *
293 * Get the number of bytes which are used by up to @a max_len first
294 * characters in the string @a str. If @a max_len is greater than
295 * the length of @a str, the entire string is measured (excluding the
296 * NULL-terminator).
297 *
298 * @param str String to consider.
299 * @param max_len Maximum number of characters to measure.
300 *
301 * @return Number of bytes used by the characters.
302 *
303 */
[08e103d4]304size_t str_lbytes(const char *str, size_t max_len)
[4872160]305{
306 size_t len = 0;
307 size_t offset = 0;
[a35b458]308
[4872160]309 while (len < max_len) {
310 if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
311 break;
[a35b458]312
[4872160]313 len++;
314 }
[a35b458]315
[4872160]316 return offset;
317}
318
319/** Get number of characters in a string.
320 *
321 * @param str NULL-terminated string.
322 *
323 * @return Number of characters in string.
324 *
325 */
[08e103d4]326size_t str_code_points(const char *str)
[4872160]327{
328 size_t len = 0;
329 size_t offset = 0;
[a35b458]330
[4872160]331 while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
332 len++;
[a35b458]333
[4872160]334 return len;
335}
336
/** Check whether character is plain ASCII.
 *
 * @param ch Character to check.
 *
 * @return True if character is plain ASCII (code point 0 .. 127).
 *
 */
bool ascii_check(wchar_t ch)
{
	/* The lower bound is only tested when wchar_t is a signed type */
	return WCHAR_SIGNED_CHECK(ch >= 0) && (ch <= 127);
}
349
/** Check whether character is valid
 *
 * @param ch Character to check.
 *
 * @return True if character is within the Unicode code point range
 *         (0 .. 1114111).
 *
 */
bool chr_check(wchar_t ch)
{
	/* The lower bound is only tested when wchar_t is a signed type */
	return WCHAR_SIGNED_CHECK(ch >= 0) && (ch <= 1114111);
}
362
363/** Compare two NULL terminated strings.
364 *
365 * Do a char-by-char comparison of two NULL-terminated strings.
[4efeab5]366 * The strings are considered equal iff their length is equal
367 * and both strings consist of the same sequence of characters.
368 *
[1772e6d]369 * A string S1 is less than another string S2 if it has a character with
370 * lower value at the first character position where the strings differ.
371 * If the strings differ in length, the shorter one is treated as if
372 * padded by characters with a value of zero.
[4872160]373 *
374 * @param s1 First string to compare.
375 * @param s2 Second string to compare.
376 *
[1772e6d]377 * @return 0 if the strings are equal, -1 if the first is less than the second,
378 * 1 if the second is less than the first.
[4872160]379 *
380 */
381int str_cmp(const char *s1, const char *s2)
382{
383 wchar_t c1 = 0;
384 wchar_t c2 = 0;
[a35b458]385
[4872160]386 size_t off1 = 0;
387 size_t off2 = 0;
[a35b458]388
[4872160]389 while (true) {
390 c1 = str_decode(s1, &off1, STR_NO_LIMIT);
391 c2 = str_decode(s2, &off2, STR_NO_LIMIT);
[a35b458]392
[4872160]393 if (c1 < c2)
394 return -1;
[a35b458]395
[4872160]396 if (c1 > c2)
397 return 1;
[a35b458]398
[d066259]399 if (c1 == 0 || c2 == 0)
[4872160]400 break;
401 }
[a35b458]402
[4872160]403 return 0;
404}
405
406/** Copy string.
407 *
408 * Copy source string @a src to destination buffer @a dest.
409 * No more than @a size bytes are written. If the size of the output buffer
410 * is at least one byte, the output string will always be well-formed, i.e.
411 * null-terminated and containing only complete characters.
412 *
413 * @param dest Destination buffer.
414 * @param count Size of the destination buffer (must be > 0).
415 * @param src Source string.
416 *
417 */
418void str_cpy(char *dest, size_t size, const char *src)
419{
420 size_t src_off = 0;
421 size_t dest_off = 0;
[a35b458]422
[4872160]423 wchar_t ch;
424 while ((ch = str_decode(src, &src_off, STR_NO_LIMIT)) != 0) {
425 if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
426 break;
427 }
[a35b458]428
[4872160]429 dest[dest_off] = '\0';
430}
431
432/** @}
433 */
Note: See TracBrowser for help on using the repository browser.