source: mainline/boot/generic/src/str.c@ a35b458

lfn serial ticket/834-toolchain-update topic/msim-upgrade topic/simplify-dev-export
Last change on this file since a35b458 was a35b458, checked in by Jiří Zárevúcky <zarevucky.jiri@…>, 8 years ago

style: Remove trailing whitespace on _all_ lines, including empty ones, for particular file types.

Command used: tools/srepl '\s\+$' '' -- *.c *.h *.py *.sh *.s *.S *.ag

Currently, whitespace on empty lines is very inconsistent.
There are two basic choices: Either remove the whitespace, or keep empty lines
indented to the level of surrounding code. The former is AFAICT more common,
and also much easier to do automatically.

Alternatively, we could write a script for automatic indentation, and use that
instead. However, if such a script exists, it's possible to use the indented
style locally, by having the editor apply relevant conversions on load/save,
without affecting remote repository. IMO, it makes more sense to adopt
the simpler rule.

  • Property mode set to 100644
File size: 11.5 KB
Line 
1/*
2 * Copyright (c) 2001-2004 Jakub Jermar
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/**
30 * @file
31 * @brief String functions.
32 *
33 * Strings and characters use the Universal Character Set (UCS). The standard
34 * strings, called just strings are encoded in UTF-8. Wide strings (encoded
35 * in UTF-32) are supported to a limited degree. A single character is
36 * represented as wchar_t.@n
37 *
38 * Overview of the terminology:@n
39 *
40 * Term Meaning
41 * -------------------- ----------------------------------------------------
42 * byte 8 bits stored in uint8_t (unsigned 8 bit integer)
43 *
44 * character UTF-32 encoded Unicode character, stored in wchar_t
45 * (signed 32 bit integer), code points 0 .. 1114111
46 * are valid
47 *
48 * ASCII character 7 bit encoded ASCII character, stored in char
49 * (usually signed 8 bit integer), code points 0 .. 127
50 * are valid
51 *
52 * string UTF-8 encoded NULL-terminated Unicode string, char *
53 *
54 * wide string UTF-32 encoded NULL-terminated Unicode string,
55 * wchar_t *
56 *
57 * [wide] string size number of BYTES in a [wide] string (excluding
58 * the NULL-terminator), size_t
59 *
60 * [wide] string length number of CHARACTERS in a [wide] string (excluding
61 * the NULL-terminator), size_t
62 *
63 * [wide] string width number of display cells on a monospace display taken
64 * by a [wide] string, size_t
65 *
66 *
67 * Overview of string metrics:@n
68 *
69 * Metric Abbrev. Type Meaning
70 * ------ ------ ------ -------------------------------------------------
71 * size n size_t number of BYTES in a string (excluding the
72 * NULL-terminator)
73 *
74 * length l size_t number of CHARACTERS in a string (excluding the
75 * null terminator)
76 *
77 * width w size_t number of display cells on a monospace display
78 * taken by a string
79 *
80 *
81 * Function naming prefixes:@n
82 *
83 * chr_ operate on characters
84 * ascii_ operate on ASCII characters
85 * str_ operate on strings
86 * wstr_ operate on wide strings
87 *
88 * [w]str_[n|l|w] operate on a prefix limited by size, length
89 * or width
90 *
91 *
92 * A specific character inside a [wide] string can be referred to by:@n
93 *
94 * pointer (char *, wchar_t *)
95 * byte offset (size_t)
96 * character index (size_t)
97 *
98 */
99
100#include <errno.h>
101#include <stdbool.h>
102#include <stddef.h>
103#include <str.h>
104
/** Evaluate @a cond only if wchar_t is a signed type.
 *
 * When the compiler defines __WCHAR_UNSIGNED__, a sign test such as
 * (ch >= 0) is trivially true, so the condition is replaced by the
 * constant true instead of being evaluated.
 */
#ifdef __WCHAR_UNSIGNED__
	#define WCHAR_SIGNED_CHECK(cond)  (true)
#else
	#define WCHAR_SIGNED_CHECK(cond)  (cond)
#endif

/** Byte mask consisting of lowest @n bits (out of 8), 0 <= n <= 8.
 *
 * The shift is done on an unsigned operand so that it never touches
 * a sign bit.
 */
#define LO_MASK_8(n)  ((uint8_t) ((1U << (n)) - 1))

/** Mask consisting of lowest @n bits (out of 32), 0 <= n <= 31.
 *
 * The shift is done in uint32_t; shifting a plain (signed) 1 by 31
 * would be undefined behavior.
 */
#define LO_MASK_32(n)  ((uint32_t) (((uint32_t) 1 << (n)) - 1))

/** Byte mask consisting of highest @n bits (out of 8), 0 <= n <= 8.
 *
 * The complement operator works on the promoted (int) value, so the
 * result is cast back to uint8_t to obtain a real 8-bit mask rather
 * than a negative integer.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6
123
124/** Decode a single character from a string.
125 *
126 * Decode a single character from a string of size @a size. Decoding starts
127 * at @a offset and this offset is moved to the beginning of the next
128 * character. In case of decoding error, offset generally advances at least
129 * by one. However, offset is never moved beyond size.
130 *
131 * @param str String (not necessarily NULL-terminated).
132 * @param offset Byte offset in string where to start decoding.
133 * @param size Size of the string (in bytes).
134 *
135 * @return Value of decoded character, U_SPECIAL on decoding error or
136 * NULL if attempt to decode beyond @a size.
137 *
138 */
139wchar_t str_decode(const char *str, size_t *offset, size_t size)
140{
141 if (*offset + 1 > size)
142 return 0;
143
144 /* First byte read from string */
145 uint8_t b0 = (uint8_t) str[(*offset)++];
146
147 /* Determine code length */
148
149 unsigned int b0_bits; /* Data bits in first byte */
150 unsigned int cbytes; /* Number of continuation bytes */
151
152 if ((b0 & 0x80) == 0) {
153 /* 0xxxxxxx (Plain ASCII) */
154 b0_bits = 7;
155 cbytes = 0;
156 } else if ((b0 & 0xe0) == 0xc0) {
157 /* 110xxxxx 10xxxxxx */
158 b0_bits = 5;
159 cbytes = 1;
160 } else if ((b0 & 0xf0) == 0xe0) {
161 /* 1110xxxx 10xxxxxx 10xxxxxx */
162 b0_bits = 4;
163 cbytes = 2;
164 } else if ((b0 & 0xf8) == 0xf0) {
165 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
166 b0_bits = 3;
167 cbytes = 3;
168 } else {
169 /* 10xxxxxx -- unexpected continuation byte */
170 return U_SPECIAL;
171 }
172
173 if (*offset + cbytes > size)
174 return U_SPECIAL;
175
176 wchar_t ch = b0 & LO_MASK_8(b0_bits);
177
178 /* Decode continuation bytes */
179 while (cbytes > 0) {
180 uint8_t b = (uint8_t) str[(*offset)++];
181
182 /* Must be 10xxxxxx */
183 if ((b & 0xc0) != 0x80)
184 return U_SPECIAL;
185
186 /* Shift data bits to ch */
187 ch = (ch << CONT_BITS) | (wchar_t) (b & LO_MASK_8(CONT_BITS));
188 cbytes--;
189 }
190
191 return ch;
192}
193
194/** Encode a single character to string representation.
195 *
196 * Encode a single character to string representation (i.e. UTF-8) and store
197 * it into a buffer at @a offset. Encoding starts at @a offset and this offset
198 * is moved to the position where the next character can be written to.
199 *
200 * @param ch Input character.
201 * @param str Output buffer.
202 * @param offset Byte offset where to start writing.
203 * @param size Size of the output buffer (in bytes).
204 *
205 * @return EOK if the character was encoded successfully, EOVERFLOW if there
206 * was not enough space in the output buffer or EINVAL if the character
207 * code was invalid.
208 */
209int chr_encode(const wchar_t ch, char *str, size_t *offset, size_t size)
210{
211 if (*offset >= size)
212 return EOVERFLOW;
213
214 if (!chr_check(ch))
215 return EINVAL;
216
217 /* Unsigned version of ch (bit operations should only be done
218 on unsigned types). */
219 uint32_t cc = (uint32_t) ch;
220
221 /* Determine how many continuation bytes are needed */
222
223 unsigned int b0_bits; /* Data bits in first byte */
224 unsigned int cbytes; /* Number of continuation bytes */
225
226 if ((cc & ~LO_MASK_32(7)) == 0) {
227 b0_bits = 7;
228 cbytes = 0;
229 } else if ((cc & ~LO_MASK_32(11)) == 0) {
230 b0_bits = 5;
231 cbytes = 1;
232 } else if ((cc & ~LO_MASK_32(16)) == 0) {
233 b0_bits = 4;
234 cbytes = 2;
235 } else if ((cc & ~LO_MASK_32(21)) == 0) {
236 b0_bits = 3;
237 cbytes = 3;
238 } else {
239 /* Codes longer than 21 bits are not supported */
240 return EINVAL;
241 }
242
243 /* Check for available space in buffer */
244 if (*offset + cbytes >= size)
245 return EOVERFLOW;
246
247 /* Encode continuation bytes */
248 unsigned int i;
249 for (i = cbytes; i > 0; i--) {
250 str[*offset + i] = 0x80 | (cc & LO_MASK_32(CONT_BITS));
251 cc = cc >> CONT_BITS;
252 }
253
254 /* Encode first byte */
255 str[*offset] = (cc & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);
256
257 /* Advance offset */
258 *offset += cbytes + 1;
259
260 return EOK;
261}
262
/** Get size of string.
 *
 * Count the bytes occupied by the string @a str, not counting the
 * terminating zero byte.
 *
 * @param str String to measure (must be zero-terminated).
 *
 * @return Number of bytes preceding the terminator.
 *
 */
size_t str_size(const char *str)
{
	const char *end = str;

	/* Walk to the terminator and measure the distance travelled */
	while (*end != 0)
		end++;

	return (size_t) (end - str);
}
282
283/** Get size of string with length limit.
284 *
285 * Get the number of bytes which are used by up to @a max_len first
286 * characters in the string @a str. If @a max_len is greater than
287 * the length of @a str, the entire string is measured (excluding the
288 * NULL-terminator).
289 *
290 * @param str String to consider.
291 * @param max_len Maximum number of characters to measure.
292 *
293 * @return Number of bytes used by the characters.
294 *
295 */
296size_t str_lsize(const char *str, size_t max_len)
297{
298 size_t len = 0;
299 size_t offset = 0;
300
301 while (len < max_len) {
302 if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
303 break;
304
305 len++;
306 }
307
308 return offset;
309}
310
311/** Get number of characters in a string.
312 *
313 * @param str NULL-terminated string.
314 *
315 * @return Number of characters in string.
316 *
317 */
318size_t str_length(const char *str)
319{
320 size_t len = 0;
321 size_t offset = 0;
322
323 while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
324 len++;
325
326 return len;
327}
328
/** Check whether character is plain ASCII.
 *
 * @param ch Character to test.
 *
 * @return True if @a ch lies in the range 0 .. 127.
 *
 */
bool ascii_check(wchar_t ch)
{
#ifndef __WCHAR_UNSIGNED__
	/* Only a signed wchar_t can hold a negative value */
	if (ch < 0)
		return false;
#endif

	return ch <= 127;
}
341
/** Check whether character is valid.
 *
 * @param ch Character to test.
 *
 * @return True if @a ch lies in the range 0 .. 1114111 (0x10ffff).
 *
 */
bool chr_check(wchar_t ch)
{
#ifndef __WCHAR_UNSIGNED__
	/* Only a signed wchar_t can hold a negative value */
	if (ch < 0)
		return false;
#endif

	return ch <= 1114111;
}
354
355/** Compare two NULL terminated strings.
356 *
357 * Do a char-by-char comparison of two NULL-terminated strings.
358 * The strings are considered equal iff their length is equal
359 * and both strings consist of the same sequence of characters.
360 *
361 * A string S1 is less than another string S2 if it has a character with
362 * lower value at the first character position where the strings differ.
363 * If the strings differ in length, the shorter one is treated as if
364 * padded by characters with a value of zero.
365 *
366 * @param s1 First string to compare.
367 * @param s2 Second string to compare.
368 *
369 * @return 0 if the strings are equal, -1 if the first is less than the second,
370 * 1 if the second is less than the first.
371 *
372 */
373int str_cmp(const char *s1, const char *s2)
374{
375 wchar_t c1 = 0;
376 wchar_t c2 = 0;
377
378 size_t off1 = 0;
379 size_t off2 = 0;
380
381 while (true) {
382 c1 = str_decode(s1, &off1, STR_NO_LIMIT);
383 c2 = str_decode(s2, &off2, STR_NO_LIMIT);
384
385 if (c1 < c2)
386 return -1;
387
388 if (c1 > c2)
389 return 1;
390
391 if ((c1 == 0) || (c2 == 0))
392 break;
393 }
394
395 return 0;
396}
397
398/** Copy string.
399 *
400 * Copy source string @a src to destination buffer @a dest.
401 * No more than @a size bytes are written. If the size of the output buffer
402 * is at least one byte, the output string will always be well-formed, i.e.
403 * null-terminated and containing only complete characters.
404 *
405 * @param dest Destination buffer.
406 * @param count Size of the destination buffer (must be > 0).
407 * @param src Source string.
408 *
409 */
410void str_cpy(char *dest, size_t size, const char *src)
411{
412 size_t src_off = 0;
413 size_t dest_off = 0;
414
415 wchar_t ch;
416 while ((ch = str_decode(src, &src_off, STR_NO_LIMIT)) != 0) {
417 if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
418 break;
419 }
420
421 dest[dest_off] = '\0';
422}
423
424/** @}
425 */
Note: See TracBrowser for help on using the repository browser.