source: mainline/kernel/generic/src/lib/string.c@ 32704cb

lfn serial ticket/834-toolchain-update topic/msim-upgrade topic/simplify-dev-export
Last change on this file since 32704cb was 32704cb, checked in by Jiri Svoboda <jirik.svoboda@…>, 16 years ago

Slightly decompile character encoder.

  • Property mode set to 100644
File size: 9.6 KB
Line 
1/*
2 * Copyright (c) 2001-2004 Jakub Jermar
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** @addtogroup generic
30 * @{
31 */
32
33/**
34 * @file
35 * @brief Miscellaneous functions.
36 */
37
38#include <string.h>
39#include <print.h>
40#include <cpu.h>
41#include <arch/asm.h>
42#include <arch.h>
43#include <console/kconsole.h>
44
/** Substitute character returned by utf8_decode() for undecodable input. */
char invalch = '?';

/** Byte mask consisting of lowest @n bits (out of eight), 0 <= @n <= 8.
 *
 * The shift is performed on an unsigned operand to keep the arithmetic
 * free of signed overflow.
 */
#define LO_MASK_8(n)  ((uint8_t) ((1U << (n)) - 1))

/** Mask consisting of lowest @n bits (out of 32), 0 <= @n <= 31.
 *
 * The shifted operand is a 32-bit unsigned value so that @n == 31 does
 * not overflow a signed int.
 */
#define LO_MASK_32(n)  ((uint32_t) ((((uint32_t) 1) << (n)) - 1))

/** Byte mask consisting of highest @n bits (out of eight), 0 <= @n <= 8.
 *
 * Operator ~ promotes its operand to int and would set every bit above
 * bit 7 as well, hence the result is truncated back to a byte.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte. */
#define CONT_BITS  6
58
59/** Decode a single UTF-8 character from a NULL-terminated string.
60 *
61 * Decode a single UTF-8 character from a plain char NULL-terminated
62 * string. Decoding starts at @index and this index is incremented
63 * if the current UTF-8 string is encoded in more than a single byte.
64 *
65 * @param str Plain character NULL-terminated string.
66 * @param index Index (counted in plain characters) where to start
67 * the decoding.
68 * @param limit Maximal allowed value of index.
69 *
70 * @return Decoded character in UTF-32 or '?' if the encoding is wrong.
71 *
72 */
73wchar_t utf8_decode(const char *str, index_t *index, index_t limit)
74{
75 uint8_t b0, b; /* Bytes read from str. */
76 wchar_t ch;
77
78 int b0_bits; /* Data bits in first byte. */
79 int cbytes; /* Number of continuation bytes. */
80
81 if (*index > limit)
82 return invalch;
83
84 b0 = (uint8_t) str[*index];
85
86 /* Determine code length. */
87
88 if ((b0 & 0x80) == 0) {
89 /* 0xxxxxxx (Plain ASCII) */
90 b0_bits = 7;
91 cbytes = 0;
92 } else if ((b0 & 0xe0) == 0xc0) {
93 /* 110xxxxx 10xxxxxx */
94 b0_bits = 5;
95 cbytes = 1;
96 } else if ((b0 & 0xf0) == 0xe0) {
97 /* 1110xxxx 10xxxxxx 10xxxxxx */
98 b0_bits = 4;
99 cbytes = 2;
100 } else if ((b0 & 0xf8) == 0xf0) {
101 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
102 b0_bits = 3;
103 cbytes = 3;
104 } else {
105 /* 10xxxxxx -- unexpected continuation byte. */
106 return invalch;
107 }
108
109 if (*index + cbytes > limit) {
110 return invalch;
111 }
112
113 ch = b0 & LO_MASK_8(b0_bits);
114
115 /* Decode continuation bytes. */
116 while (cbytes > 0) {
117 b = (uint8_t) str[*index + 1];
118 ++(*index);
119
120 /* Must be 10xxxxxx. */
121 if ((b & 0xc0) != 0x80) {
122 return invalch;
123 }
124
125 /* Shift data bits to ch. */
126 ch = (ch << CONT_BITS) | (wchar_t) (b & LO_MASK_8(CONT_BITS));
127 --cbytes;
128 }
129
130 return ch;
131}
132
133/** Encode a single UTF-32 character as UTF-8
134 *
135 * Encode a single UTF-32 character as UTF-8 and store it into
136 * the given buffer at @index. Encoding starts at @index and
137 * this index is incremented if the UTF-8 character takes
138 * more than a single byte.
139 *
140 * @param ch Input UTF-32 character.
141 * @param str Output buffer.
142 * @param index Index (counted in plain characters) where to start
143 * the encoding
144 * @param limit Maximal allowed value of index.
145 *
146 * @return True if the character was encoded or false if there is not
147 * enought space in the output buffer or the character is invalid
148 * Unicode code point.
149 *
150 */
151bool utf8_encode(const wchar_t ch, char *str, index_t *index, index_t limit)
152{
153 uint32_t cc; /* Unsigned version of ch. */
154
155 int cbytes; /* Number of continuation bytes. */
156 int b0_bits; /* Number of data bits in first byte. */
157 int i;
158
159 if (*index > limit)
160 return false;
161
162 if (ch < 0)
163 return false;
164
165 /* Bit operations should only be done on unsigned numbers. */
166 cc = (uint32_t) ch;
167
168 /* Determine how many continuation bytes are needed. */
169 if ((cc & ~LO_MASK_32(7)) == 0) {
170 b0_bits = 7;
171 cbytes = 0;
172 } else if ((cc & ~LO_MASK_32(11)) == 0) {
173 b0_bits = 5;
174 cbytes = 1;
175 } else if ((cc & ~LO_MASK_32(16)) == 0) {
176 b0_bits = 4;
177 cbytes = 2;
178 } else if ((cc & ~LO_MASK_32(21)) == 0) {
179 b0_bits = 3;
180 cbytes = 3;
181 } else {
182 /* Codes longer than 21 bits are not supported. */
183 return false;
184 }
185
186 /* Check for available space in buffer. */
187 if (*index + cbytes > limit)
188 return false;
189
190 /* Encode continuation bytes. */
191 for (i = cbytes; i > 0; --i) {
192 str[*index + i] = 0x80 | (cc & LO_MASK_32(CONT_BITS));
193 cc = cc >> CONT_BITS;
194 }
195
196 /* Encode first byte. */
197 str[*index] = (cc & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);
198
199 /* Advance index. */
200 *index += cbytes;
201
202 return true;
203}
204
205/** Get bytes used by UTF-8 characters.
206 *
207 * Get the number of bytes (count of plain characters) which
208 * are used by a given count of UTF-8 characters in a string.
209 * As UTF-8 encoding is multibyte, there is no constant
210 * correspondence between number of characters and used bytes.
211 *
212 * @param str UTF-8 string to consider.
213 * @param count Number of UTF-8 characters to count.
214 *
215 * @return Number of bytes used by the characters.
216 *
217 */
218size_t utf8_count_bytes(const char *str, count_t count)
219{
220 size_t size = 0;
221 index_t index = 0;
222
223 while ((utf8_decode(str, &index, UTF8_NO_LIMIT) != 0) && (size < count)) {
224 size++;
225 index++;
226 }
227
228 return index;
229}
230
/** Check whether character is plain ASCII.
 *
 * @param ch Character to test.
 *
 * @return True if @a ch lies in the range 0 to 127 inclusive.
 *
 */
bool ascii_check(const wchar_t ch)
{
	return (ch >= 0) && (ch <= 127);
}
243
/** Check whether character is Unicode.
 *
 * @param ch Character to test.
 *
 * @return True if @a ch lies in the range 0 to 1114111 (0x10ffff)
 *         inclusive.
 *
 */
bool unicode_check(const wchar_t ch)
{
	if (ch < 0)
		return false;

	return ch <= 1114111;
}
256
/** Return number of plain characters in a string.
 *
 * @param str NULL-terminated string.
 *
 * @return Number of characters preceding the terminating zero in str.
 *
 */
size_t strlen(const char *str)
{
	const char *end = str;

	/* Walk to the terminator; the distance travelled is the length. */
	while (*end != '\0')
		end++;

	return (size_t) (end - str);
}
271
272/** Return number of UTF-8 characters in a string.
273 *
274 * @param str NULL-terminated UTF-8 string.
275 *
276 * @return Number of UTF-8 characters in str.
277 *
278 */
279size_t strlen_utf8(const char *str)
280{
281 size_t size = 0;
282 index_t index = 0;
283
284 while (utf8_decode(str, &index, UTF8_NO_LIMIT) != 0) {
285 size++;
286 index++;
287 }
288
289 return size;
290}
291
/** Return number of UTF-32 characters in a string.
 *
 * @param str NULL-terminated UTF-32 string.
 *
 * @return Number of UTF-32 characters preceding the terminating zero.
 *
 */
size_t strlen_utf32(const wchar_t *str)
{
	const wchar_t *end = str;

	/* Walk to the terminator; the distance travelled is the length. */
	while (*end != 0)
		end++;

	return (size_t) (end - str);
}
306
/** Compare two NULL terminated strings
 *
 * Do a byte-by-byte comparison of two NULL terminated strings.
 * The strings are equal iff they have the same length and consist of
 * the same bytes. A string that is a proper prefix of the other one is
 * the smaller of the two.
 *
 * Bytes are compared as unsigned values so that the ordering of bytes
 * above 0x7f does not depend on whether plain char is signed on the
 * target architecture.
 *
 * @param src First string to compare.
 * @param dst Second string to compare.
 *
 * @return 0 if the strings are equal, -1 if first is smaller, 1 if second smaller.
 *
 */
int strcmp(const char *src, const char *dst)
{
	const unsigned char *a = (const unsigned char *) src;
	const unsigned char *b = (const unsigned char *) dst;

	/* Skip the common prefix. */
	while ((*a != '\0') && (*a == *b)) {
		a++;
		b++;
	}

	if (*a == *b)
		return 0;

	/* A terminator compares lower than any other byte. */
	return (*a < *b) ? -1 : 1;
}
335
336
337/** Compare two NULL terminated strings
338 *
339 * Do a char-by-char comparison of two NULL terminated strings.
340 * The strings are considered equal iff they consist of the same
341 * characters on the minimum of their lengths and specified maximal
342 * length.
343 *
344 * @param src First string to compare.
345 * @param dst Second string to compare.
346 * @param len Maximal length for comparison.
347 *
348 * @return 0 if the strings are equal, -1 if first is smaller, 1 if second smaller.
349 *
350 */
351int strncmp(const char *src, const char *dst, size_t len)
352{
353 unsigned int i;
354
355 for (i = 0; (*src) && (*dst) && (i < len); src++, dst++, i++) {
356 if (*src < *dst)
357 return -1;
358
359 if (*src > *dst)
360 return 1;
361 }
362
363 if (i == len || *src == *dst)
364 return 0;
365
366 if (!*src)
367 return -1;
368
369 return 1;
370}
371
372
373
/** Copy NULL terminated string.
 *
 * Copy characters from string 'src' to 'dest', writing at most 'len'
 * bytes including the terminating '\0'. If 'src' together with its
 * terminator fits into 'len' bytes, it is copied whole; otherwise only
 * the first 'len' - 1 characters are copied and a '\0' is stored in the
 * last byte of the buffer. The result is therefore always terminated,
 * unless 'len' is zero, in which case nothing is written at all.
 *
 * Note that, unlike the standard C function of the same name, the rest
 * of the buffer is not zero-filled and no value is returned.
 *
 * @param dest Destination buffer.
 * @param src  Source string.
 * @param len  Size of destination buffer.
 *
 */
void strncpy(char *dest, const char *src, size_t len)
{
	size_t i;

	/* An empty buffer cannot even hold the terminator. */
	if (len == 0)
		return;

	for (i = 0; i < len; i++) {
		dest[i] = src[i];

		/* Terminator copied, the whole string fitted. */
		if (dest[i] == '\0')
			return;
	}

	/* Source was too long; truncate it inside the buffer. */
	dest[len - 1] = '\0';
}
396
397/** Find first occurence of character in string.
398 *
399 * @param s String to search.
400 * @param i Character to look for.
401 *
402 * @return Pointer to character in @a s or NULL if not found.
403 */
404extern char *strchr(const char *s, int i)
405{
406 while (*s != '\0') {
407 if (*s == i)
408 return (char *) s;
409 ++s;
410 }
411
412 return NULL;
413}
414
415/** @}
416 */
Note: See TracBrowser for help on using the repository browser.