source: mainline/boot/generic/src/str.c@ 39916d6

Last change on this file since 39916d6 was d7f7a4a, checked in by Jiří Zárevúcky <zarevucky.jiri@…>, 3 years ago

Replace some license headers with SPDX identifier

Headers are replaced using tools/transorm-copyright.sh only
when it can be matched verbatim with the license header used
throughout most of the codebase.

  • Property mode set to 100644
File size: 10.2 KB
Line 
1/*
2 * SPDX-FileCopyrightText: 2001-2004 Jakub Jermar
3 * SPDX-FileCopyrightText: 2005 Martin Decky
4 * SPDX-FileCopyrightText: 2008 Jiri Svoboda
5 * SPDX-FileCopyrightText: 2011 Martin Sucha
6 * SPDX-FileCopyrightText: 2011 Oleg Romanenko
7 *
8 * SPDX-License-Identifier: BSD-3-Clause
9 */
10
11/**
12 * @file
13 * @brief String functions.
14 *
15 * Strings and characters use the Universal Character Set (UCS). The standard
16 * strings, called just strings are encoded in UTF-8. Wide strings (encoded
17 * in UTF-32) are supported to a limited degree. A single character is
18 * represented as char32_t.@n
19 *
20 * Overview of the terminology:@n
21 *
22 * Term Meaning
23 * -------------------- ----------------------------------------------------
24 * byte 8 bits stored in uint8_t (unsigned 8 bit integer)
25 *
26 * character UTF-32 encoded Unicode character, stored in char32_t
27 * (unsigned 32 bit integer), code points 0 .. 1114111
28 * are valid
29 *
30 * ASCII character 7 bit encoded ASCII character, stored in char
31 * (usually signed 8 bit integer), code points 0 .. 127
32 * are valid
33 *
34 * string UTF-8 encoded NULL-terminated Unicode string, char *
35 *
36 * wide string UTF-32 encoded NULL-terminated Unicode string,
37 * char32_t *
38 *
39 * [wide] string size number of BYTES in a [wide] string (excluding
40 * the NULL-terminator), size_t
41 *
42 * [wide] string length number of CHARACTERS in a [wide] string (excluding
43 * the NULL-terminator), size_t
44 *
45 * [wide] string width number of display cells on a monospace display taken
46 * by a [wide] string, size_t
47 *
48 *
49 * Overview of string metrics:@n
50 *
51 * Metric Abbrev. Type Meaning
52 * ------ ------ ------ -------------------------------------------------
53 * size n size_t number of BYTES in a string (excluding the
54 * NULL-terminator)
55 *
56 * length l size_t number of CHARACTERS in a string (excluding the
57 * null terminator)
58 *
59 * width w size_t number of display cells on a monospace display
60 * taken by a string
61 *
62 *
63 * Function naming prefixes:@n
64 *
65 * chr_ operate on characters
66 * ascii_ operate on ASCII characters
67 * str_ operate on strings
68 * wstr_ operate on wide strings
69 *
70 * [w]str_[n|l|w] operate on a prefix limited by size, length
71 * or width
72 *
73 *
74 * A specific character inside a [wide] string can be referred to by:@n
75 *
76 * pointer (char *, char32_t *)
77 * byte offset (size_t)
78 * character index (size_t)
79 *
80 */
81
82#include <str.h>
83
84#include <errno.h>
85#include <stdbool.h>
86#include <stddef.h>
87#include <stdint.h>
88
/** Byte mask consisting of lowest @a n bits (out of 8), valid for n in 0 .. 8.
 *
 * The shift is done on an unsigned constant so no signed arithmetic is
 * involved.
 */
#define LO_MASK_8(n)  ((uint8_t) ((1U << (n)) - 1U))

/** Byte mask consisting of lowest @a n bits (out of 32), valid for n in 0 .. 31.
 *
 * The constant is explicitly 32 bits wide and unsigned, so the shift is
 * well defined even where int is narrower than 32 bits.
 */
#define LO_MASK_32(n)  ((uint32_t) ((UINT32_C(1) << (n)) - 1U))

/** Byte mask consisting of highest @a n bits (out of 8), valid for n in 0 .. 8.
 *
 * The result of ~ is cast back to uint8_t: the operand is promoted to int
 * before the complement, which would otherwise leave all the upper bits set
 * and produce a negative value instead of an 8-bit mask.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6
100
101/** Decode a single character from a string.
102 *
103 * Decode a single character from a string of size @a size. Decoding starts
104 * at @a offset and this offset is moved to the beginning of the next
105 * character. In case of decoding error, offset generally advances at least
106 * by one. However, offset is never moved beyond size.
107 *
108 * @param str String (not necessarily NULL-terminated).
109 * @param offset Byte offset in string where to start decoding.
110 * @param size Size of the string (in bytes).
111 *
112 * @return Value of decoded character, U_SPECIAL on decoding error or
113 * NULL if attempt to decode beyond @a size.
114 *
115 */
116char32_t str_decode(const char *str, size_t *offset, size_t size)
117{
118 if (*offset + 1 > size)
119 return 0;
120
121 /* First byte read from string */
122 uint8_t b0 = (uint8_t) str[(*offset)++];
123
124 /* Determine code length */
125
126 unsigned int b0_bits; /* Data bits in first byte */
127 unsigned int cbytes; /* Number of continuation bytes */
128
129 if ((b0 & 0x80) == 0) {
130 /* 0xxxxxxx (Plain ASCII) */
131 b0_bits = 7;
132 cbytes = 0;
133 } else if ((b0 & 0xe0) == 0xc0) {
134 /* 110xxxxx 10xxxxxx */
135 b0_bits = 5;
136 cbytes = 1;
137 } else if ((b0 & 0xf0) == 0xe0) {
138 /* 1110xxxx 10xxxxxx 10xxxxxx */
139 b0_bits = 4;
140 cbytes = 2;
141 } else if ((b0 & 0xf8) == 0xf0) {
142 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
143 b0_bits = 3;
144 cbytes = 3;
145 } else {
146 /* 10xxxxxx -- unexpected continuation byte */
147 return U_SPECIAL;
148 }
149
150 if (*offset + cbytes > size)
151 return U_SPECIAL;
152
153 char32_t ch = b0 & LO_MASK_8(b0_bits);
154
155 /* Decode continuation bytes */
156 while (cbytes > 0) {
157 uint8_t b = (uint8_t) str[(*offset)++];
158
159 /* Must be 10xxxxxx */
160 if ((b & 0xc0) != 0x80)
161 return U_SPECIAL;
162
163 /* Shift data bits to ch */
164 ch = (ch << CONT_BITS) | (char32_t) (b & LO_MASK_8(CONT_BITS));
165 cbytes--;
166 }
167
168 return ch;
169}
170
171/** Encode a single character to string representation.
172 *
173 * Encode a single character to string representation (i.e. UTF-8) and store
174 * it into a buffer at @a offset. Encoding starts at @a offset and this offset
175 * is moved to the position where the next character can be written to.
176 *
177 * @param ch Input character.
178 * @param str Output buffer.
179 * @param offset Byte offset where to start writing.
180 * @param size Size of the output buffer (in bytes).
181 *
182 * @return EOK if the character was encoded successfully, EOVERFLOW if there
183 * was not enough space in the output buffer or EINVAL if the character
184 * code was invalid.
185 */
186errno_t chr_encode(const char32_t ch, char *str, size_t *offset, size_t size)
187{
188 if (*offset >= size)
189 return EOVERFLOW;
190
191 if (!chr_check(ch))
192 return EINVAL;
193
194 /*
195 * Unsigned version of ch (bit operations should only be done
196 * on unsigned types).
197 */
198 uint32_t cc = (uint32_t) ch;
199
200 /* Determine how many continuation bytes are needed */
201
202 unsigned int b0_bits; /* Data bits in first byte */
203 unsigned int cbytes; /* Number of continuation bytes */
204
205 if ((cc & ~LO_MASK_32(7)) == 0) {
206 b0_bits = 7;
207 cbytes = 0;
208 } else if ((cc & ~LO_MASK_32(11)) == 0) {
209 b0_bits = 5;
210 cbytes = 1;
211 } else if ((cc & ~LO_MASK_32(16)) == 0) {
212 b0_bits = 4;
213 cbytes = 2;
214 } else if ((cc & ~LO_MASK_32(21)) == 0) {
215 b0_bits = 3;
216 cbytes = 3;
217 } else {
218 /* Codes longer than 21 bits are not supported */
219 return EINVAL;
220 }
221
222 /* Check for available space in buffer */
223 if (*offset + cbytes >= size)
224 return EOVERFLOW;
225
226 /* Encode continuation bytes */
227 unsigned int i;
228 for (i = cbytes; i > 0; i--) {
229 str[*offset + i] = 0x80 | (cc & LO_MASK_32(CONT_BITS));
230 cc = cc >> CONT_BITS;
231 }
232
233 /* Encode first byte */
234 str[*offset] = (cc & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);
235
236 /* Advance offset */
237 *offset += cbytes + 1;
238
239 return EOK;
240}
241
/** Get size of string.
 *
 * Count the bytes occupied by the string @a str, not counting the
 * terminating zero byte.
 *
 * @param str String to measure.
 *
 * @return Number of bytes used by the string.
 *
 */
size_t str_size(const char *str)
{
	const char *end = str;

	/* Walk forward to the terminating zero byte. */
	while (*end != '\0')
		end++;

	return (size_t) (end - str);
}
261
262/** Get size of string with length limit.
263 *
264 * Get the number of bytes which are used by up to @a max_len first
265 * characters in the string @a str. If @a max_len is greater than
266 * the length of @a str, the entire string is measured (excluding the
267 * NULL-terminator).
268 *
269 * @param str String to consider.
270 * @param max_len Maximum number of characters to measure.
271 *
272 * @return Number of bytes used by the characters.
273 *
274 */
275size_t str_lsize(const char *str, size_t max_len)
276{
277 size_t len = 0;
278 size_t offset = 0;
279
280 while (len < max_len) {
281 if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
282 break;
283
284 len++;
285 }
286
287 return offset;
288}
289
290/** Get number of characters in a string.
291 *
292 * @param str NULL-terminated string.
293 *
294 * @return Number of characters in string.
295 *
296 */
297size_t str_length(const char *str)
298{
299 size_t len = 0;
300 size_t offset = 0;
301
302 while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
303 len++;
304
305 return len;
306}
307
/** Check whether character is plain ASCII.
 *
 * @param ch Character to test.
 *
 * @return True if the character code is in the range 0 .. 127.
 *
 */
bool ascii_check(char32_t ch)
{
	return ch <= 0x7f;
}
320
/** Check whether character is valid
 *
 * @param ch Character to test.
 *
 * @return True if the character code does not exceed 1114111 (0x10FFFF),
 *         the highest Unicode code point.
 *
 */
bool chr_check(char32_t ch)
{
	return ch <= 0x10ffff;
}
333
334/** Compare two NULL terminated strings.
335 *
336 * Do a char-by-char comparison of two NULL-terminated strings.
337 * The strings are considered equal iff their length is equal
338 * and both strings consist of the same sequence of characters.
339 *
340 * A string S1 is less than another string S2 if it has a character with
341 * lower value at the first character position where the strings differ.
342 * If the strings differ in length, the shorter one is treated as if
343 * padded by characters with a value of zero.
344 *
345 * @param s1 First string to compare.
346 * @param s2 Second string to compare.
347 *
348 * @return 0 if the strings are equal, -1 if the first is less than the second,
349 * 1 if the second is less than the first.
350 *
351 */
352int str_cmp(const char *s1, const char *s2)
353{
354 char32_t c1 = 0;
355 char32_t c2 = 0;
356
357 size_t off1 = 0;
358 size_t off2 = 0;
359
360 while (true) {
361 c1 = str_decode(s1, &off1, STR_NO_LIMIT);
362 c2 = str_decode(s2, &off2, STR_NO_LIMIT);
363
364 if (c1 < c2)
365 return -1;
366
367 if (c1 > c2)
368 return 1;
369
370 if (c1 == 0 || c2 == 0)
371 break;
372 }
373
374 return 0;
375}
376
377/** Copy string.
378 *
379 * Copy source string @a src to destination buffer @a dest.
380 * No more than @a size bytes are written. If the size of the output buffer
381 * is at least one byte, the output string will always be well-formed, i.e.
382 * null-terminated and containing only complete characters.
383 *
384 * @param dest Destination buffer.
385 * @param count Size of the destination buffer (must be > 0).
386 * @param src Source string.
387 *
388 */
389void str_cpy(char *dest, size_t size, const char *src)
390{
391 size_t src_off = 0;
392 size_t dest_off = 0;
393
394 char32_t ch;
395 while ((ch = str_decode(src, &src_off, STR_NO_LIMIT)) != 0) {
396 if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
397 break;
398 }
399
400 dest[dest_off] = '\0';
401}
402
403/** @}
404 */
Note: See TracBrowser for help on using the repository browser.