source: mainline/uspace/lib/libc/generic/string.c@ 171f9a1

lfn serial ticket/834-toolchain-update topic/msim-upgrade topic/simplify-dev-export
Last change on this file since 171f9a1 was 171f9a1, checked in by Jiri Svoboda <jirik.svoboda@…>, 16 years ago

Character encoding/decoding in uspace. Partially fix klog application.

  • Property mode set to 100644
File size: 12.1 KB
Line 
1/*
2 * Copyright (c) 2005 Martin Decky
3 * Copyright (c) 2008 Jiri Svoboda
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * - Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * - Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * - The name of the author may not be used to endorse or promote products
16 * derived from this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29
30/** @addtogroup libc
31 * @{
32 */
33/** @file
34 */
35
36#include <string.h>
37#include <stdlib.h>
38#include <limits.h>
39#include <ctype.h>
40#include <malloc.h>
41#include <errno.h>
42#include <string.h>
43
/** Byte mask consisting of lowest @n bits (out of 8), for 0 <= @n <= 8.
 *
 * The shift is done on an unsigned operand so that no signed overflow
 * can occur.
 */
#define LO_MASK_8(n)  ((uint8_t) ((1U << (n)) - 1))

/** Bit mask consisting of lowest @n bits (out of 32), for 0 <= @n <= 32.
 *
 * The shift is done in 64 bits so that even the full-width mask (@n == 32)
 * is well defined.
 */
#define LO_MASK_32(n)  ((uint32_t) ((((uint64_t) 1) << (n)) - 1))

/** Byte mask consisting of highest @n bits (out of 8), for 0 <= @n <= 8.
 *
 * The complement is cast back to uint8_t, otherwise integer promotion
 * would leave all bits above bit 7 set in the result.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6
55
56/** Decode a single character from a string.
57 *
58 * Decode a single character from a string of size @a size. Decoding starts
59 * at @a offset and this offset is moved to the beginning of the next
60 * character. In case of decoding error, offset generally advances at least
61 * by one. However, offset is never moved beyond size.
62 *
63 * @param str String (not necessarily NULL-terminated).
64 * @param offset Byte offset in string where to start decoding.
65 * @param size Size of the string (in bytes).
66 *
67 * @return Value of decoded character, U_SPECIAL on decoding error or
68 * NULL if attempt to decode beyond @a size.
69 *
70 */
71wchar_t str_decode(const char *str, size_t *offset, size_t size)
72{
73 if (*offset + 1 > size)
74 return 0;
75
76 /* First byte read from string */
77 uint8_t b0 = (uint8_t) str[(*offset)++];
78
79 /* Determine code length */
80
81 unsigned int b0_bits; /* Data bits in first byte */
82 unsigned int cbytes; /* Number of continuation bytes */
83
84 if ((b0 & 0x80) == 0) {
85 /* 0xxxxxxx (Plain ASCII) */
86 b0_bits = 7;
87 cbytes = 0;
88 } else if ((b0 & 0xe0) == 0xc0) {
89 /* 110xxxxx 10xxxxxx */
90 b0_bits = 5;
91 cbytes = 1;
92 } else if ((b0 & 0xf0) == 0xe0) {
93 /* 1110xxxx 10xxxxxx 10xxxxxx */
94 b0_bits = 4;
95 cbytes = 2;
96 } else if ((b0 & 0xf8) == 0xf0) {
97 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
98 b0_bits = 3;
99 cbytes = 3;
100 } else {
101 /* 10xxxxxx -- unexpected continuation byte */
102 return U_SPECIAL;
103 }
104
105 if (*offset + cbytes > size)
106 return U_SPECIAL;
107
108 wchar_t ch = b0 & LO_MASK_8(b0_bits);
109
110 /* Decode continuation bytes */
111 while (cbytes > 0) {
112 uint8_t b = (uint8_t) str[(*offset)++];
113
114 /* Must be 10xxxxxx */
115 if ((b & 0xc0) != 0x80)
116 return U_SPECIAL;
117
118 /* Shift data bits to ch */
119 ch = (ch << CONT_BITS) | (wchar_t) (b & LO_MASK_8(CONT_BITS));
120 cbytes--;
121 }
122
123 return ch;
124}
125
126/** Encode a single character to string representation.
127 *
128 * Encode a single character to string representation (i.e. UTF-8) and store
129 * it into a buffer at @a offset. Encoding starts at @a offset and this offset
130 * is moved to the position where the next character can be written to.
131 *
132 * @param ch Input character.
133 * @param str Output buffer.
134 * @param offset Byte offset where to start writing.
135 * @param size Size of the output buffer (in bytes).
136 *
137 * @return EOK if the character was encoded successfully, EOVERFLOW if there
138 * was not enough space in the output buffer or EINVAL if the character
139 * code was invalid.
140 */
141int chr_encode(const wchar_t ch, char *str, size_t *offset, size_t size)
142{
143 if (*offset >= size)
144 return EOVERFLOW;
145
146 if (!chr_check(ch))
147 return EINVAL;
148
149 /* Unsigned version of ch (bit operations should only be done
150 on unsigned types). */
151 uint32_t cc = (uint32_t) ch;
152
153 /* Determine how many continuation bytes are needed */
154
155 unsigned int b0_bits; /* Data bits in first byte */
156 unsigned int cbytes; /* Number of continuation bytes */
157
158 if ((cc & ~LO_MASK_32(7)) == 0) {
159 b0_bits = 7;
160 cbytes = 0;
161 } else if ((cc & ~LO_MASK_32(11)) == 0) {
162 b0_bits = 5;
163 cbytes = 1;
164 } else if ((cc & ~LO_MASK_32(16)) == 0) {
165 b0_bits = 4;
166 cbytes = 2;
167 } else if ((cc & ~LO_MASK_32(21)) == 0) {
168 b0_bits = 3;
169 cbytes = 3;
170 } else {
171 /* Codes longer than 21 bits are not supported */
172 return EINVAL;
173 }
174
175 /* Check for available space in buffer */
176 if (*offset + cbytes >= size)
177 return EOVERFLOW;
178
179 /* Encode continuation bytes */
180 unsigned int i;
181 for (i = cbytes; i > 0; i--) {
182 str[*offset + i] = 0x80 | (cc & LO_MASK_32(CONT_BITS));
183 cc = cc >> CONT_BITS;
184 }
185
186 /* Encode first byte */
187 str[*offset] = (cc & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);
188
189 /* Advance offset */
190 *offset += cbytes + 1;
191
192 return EOK;
193}
194
/** Check whether a character value lies in the Unicode code point range.
 *
 * @param ch Character value to check.
 *
 * @return True if @a ch is within 0 .. 0x10ffff (inclusive), false
 *         otherwise.
 *
 */
bool chr_check(const wchar_t ch)
{
	/* 0x10ffff == 1114111 is the highest Unicode code point */
	return (ch >= 0) && (ch <= 0x10ffff);
}
207
/** Get the length of a string in bytes, not counting the terminating 0.
 *
 * @param str Zero-terminated string.
 * @return Number of bytes preceding the terminating zero byte.
 */
size_t strlen(const char *str)
{
	const char *end = str;

	/* Walk forward until the terminator is reached */
	while (*end != '\0')
		end++;

	return (size_t) (end - str);
}
222
/** Compare two zero-terminated strings byte by byte.
 *
 * Bytes are compared as unsigned char values, so the ordering does not
 * depend on whether plain char is signed.
 *
 * @param a First string.
 * @param b Second string.
 * @return Zero if the strings are equal, a negative value if @a a sorts
 *         before @a b, a positive value if it sorts after.
 */
int strcmp(const char *a, const char *b)
{
	const unsigned char *pa = (const unsigned char *) a;
	const unsigned char *pb = (const unsigned char *) b;

	/* Stop at the first difference or at the end of a (if b ends
	   first, the bytes differ there anyway). */
	while ((*pa != '\0') && (*pa == *pb)) {
		pa++;
		pb++;
	}

	return (int) *pa - (int) *pb;
}
232
/** Compare at most @a n leading bytes of two zero-terminated strings.
 *
 * Bytes are compared as unsigned char values, so the ordering does not
 * depend on whether plain char is signed.
 *
 * @param a First string.
 * @param b Second string.
 * @param n Maximum number of bytes to compare.
 * @return Zero if the compared parts are equal (always the case for
 *         @a n == 0), a negative value if @a a sorts before @a b,
 *         a positive value if it sorts after.
 */
int strncmp(const char *a, const char *b, size_t n)
{
	const unsigned char *pa = (const unsigned char *) a;
	const unsigned char *pb = (const unsigned char *) b;

	for (; n > 0; n--, pa++, pb++) {
		if (*pa != *pb)
			return (int) *pa - (int) *pb;

		/* Both strings ended at the same position */
		if (*pa == '\0')
			break;
	}

	return 0;
}
243
/** Compare two zero-terminated strings, ignoring case.
 *
 * Both strings are compared byte by byte after folding each byte with
 * tolower(). Bytes are converted to unsigned char first, because passing
 * a negative value (other than EOF) to tolower() is undefined.
 *
 * @param a First string.
 * @param b Second string.
 * @return Zero if the strings are equal regardless of case, otherwise the
 *         difference of the first pair of lower-cased bytes that differ.
 */
int stricmp(const char *a, const char *b)
{
	for (;; a++, b++) {
		int la = tolower((unsigned char) *a);
		int lb = tolower((unsigned char) *b);

		/* Report the first difference, or equality once a ends */
		if ((la != lb) || (*a == '\0'))
			return la - lb;
	}
}
253
/** Return pointer to the first occurrence of character c in string.
 *
 * The terminating zero byte counts as part of the string, so searching
 * for '\0' yields a pointer to the terminator.
 *
 * @param str Scanned string.
 * @param c   Searched character (taken as one byte).
 * @return Pointer to the matched character or NULL if it is not
 *         found in given string.
 */
char *strchr(const char *str, int c)
{
	const char wanted = (char) c;

	for (;; str++) {
		/* Test for a match first so that the terminator can match */
		if (*str == wanted)
			return (char *) str;

		if (*str == '\0')
			return NULL;
	}
}
271
/** Return pointer to the last occurrence of character c in string.
 *
 * The terminating zero byte counts as part of the string, so searching
 * for '\0' yields a pointer to the terminator.
 *
 * @param str Scanned string.
 * @param c   Searched character (taken as one byte).
 * @return Pointer to the matched character or NULL if it is not
 *         found in given string.
 */
char *strrchr(const char *str, int c)
{
	const char wanted = (char) c;
	const char *last = NULL;

	/* The loop body also runs for the terminator itself */
	do {
		if (*str == wanted)
			last = str;
	} while (*str++ != '\0');

	return (char *) last;
}
291
/** Get the numeric value of a digit character.
 *
 * @param c Character to convert.
 * @return 0-9 for decimal digits, 10-35 for letters (either case),
 *         -1 if @a c is not a digit in any supported base.
 */
static int strtoul_digit_value(char c)
{
	if ((c >= '0') && (c <= '9'))
		return c - '0';

	if ((c >= 'a') && (c <= 'z'))
		return c - 'a' + 10;

	if ((c >= 'A') && (c <= 'Z'))
		return c - 'A' + 10;

	return -1;
}

/** Convert string to a number.
 * Core of strtol and strtoul functions.
 *
 * Skips leading whitespace, accepts an optional sign and (for base 0 or 16)
 * an optional `0x'/`0X' prefix, then converts the longest run of digits
 * that are valid in the given base. The prefix is consumed only when a
 * hexadecimal digit follows it; otherwise just the leading `0' is converted.
 *
 * @param nptr   Pointer to string.
 * @param endptr If not NULL, function stores here pointer to the first
 *               character that was not converted. If no digits were
 *               found or the base is invalid, @a nptr is stored.
 * @param base   Zero or number between 2 and 36 inclusive.
 * @param sgn    It's set to 1 if minus found; it is left untouched
 *               otherwise.
 * @return Magnitude of the converted number, 0 if no conversion could be
 *         performed, ULONG_MAX if the value does not fit into
 *         unsigned long.
 */
static unsigned long
_strtoul(const char *nptr, char **endptr, int base, char *sgn)
{
	const char *str = nptr;

	if ((base != 0) && ((base < 2) || (base > 36))) {
		/* FIXME: set errno to EINVAL */
		if (endptr)
			*endptr = (char *) nptr;
		return 0;
	}

	/* isspace() requires a value representable as unsigned char */
	while (isspace((unsigned char) *str))
		str++;

	if (*str == '-') {
		*sgn = 1;
		++str;
	} else if (*str == '+') {
		++str;
	}

	/* Optional hexadecimal prefix, taken only if a hex digit follows */
	if (((base == 0) || (base == 16)) && (str[0] == '0') &&
	    ((str[1] == 'x') || (str[1] == 'X'))) {
		int first = strtoul_digit_value(str[2]);

		if ((first >= 0) && (first < 16)) {
			base = 16;
			str += 2;
		}
	}

	/* Base 0 without hex prefix: leading zero means octal */
	if (base == 0)
		base = (*str == '0') ? 8 : 10;

	const char *digits = str;
	unsigned long result = 0;
	int overflow = 0;

	for (;; str++) {
		int digit = strtoul_digit_value(*str);

		/* A digit must be strictly smaller than the base */
		if ((digit < 0) || (digit >= base))
			break;

		/* After an overflow keep consuming digits so that endptr
		   ends up behind the whole number. */
		if (overflow)
			continue;

		/* Would result * base + digit exceed ULONG_MAX? */
		if (result > (ULONG_MAX - (unsigned long) digit) /
		    (unsigned long) base) {
			/* FIXME: errno = ERANGE */
			overflow = 1;
			continue;
		}

		result = result * (unsigned long) base + (unsigned long) digit;
	}

	if (str == digits) {
		/*
		 * No number was found => first invalid character is the first
		 * character of the string.
		 */
		/* FIXME: set errno to EINVAL */
		str = nptr;
		result = 0;
	}

	if (endptr)
		*endptr = (char *) str;

	return overflow ? ULONG_MAX : result;
}
384
/** Convert initial part of string to long int according to given base.
 * The number may begin with an arbitrary number of whitespaces followed by
 * optional sign (`+' or `-'). If the base is 0 or 16, the prefix `0x' may be
 * inserted and the number will be taken as hexadecimal one. If the base is 0
 * and the number begins with a zero, the number will be taken as octal one
 * (as with base 8). Otherwise the base 0 is taken as decimal.
 *
 * Values outside the range of long int are clamped to LONG_MIN or
 * LONG_MAX according to the sign.
 *
 * @param nptr   Pointer to string.
 * @param endptr If not NULL, function stores here pointer to the first
 *               invalid character.
 * @param base   Zero or number between 2 and 36 inclusive.
 * @return Result of conversion.
 */
long int strtol(const char *nptr, char **endptr, int base)
{
	char sgn = 0;
	unsigned long magnitude = _strtoul(nptr, endptr, base, &sgn);

	if (sgn) {
		if (magnitude > (unsigned long) LONG_MAX) {
			/*
			 * A magnitude of exactly LONG_MAX + 1 is the valid
			 * value LONG_MIN; anything larger is out of range.
			 */
			/* FIXME: set ERANGE to errno in the latter case */
			return LONG_MIN;
		}

		return -(long) magnitude;
	}

	if (magnitude > (unsigned long) LONG_MAX) {
		/* FIXME: set ERANGE to errno */
		return LONG_MAX;
	}

	return (long) magnitude;
}
416
417
/** Convert initial part of string to unsigned long according to given base.
 * The number may begin with an arbitrary number of whitespaces followed by
 * optional sign (`+' or `-'). If the base is 0 or 16, the prefix `0x' may be
 * inserted and the number will be taken as hexadecimal one. If the base is 0
 * and the number begins with a zero, the number will be taken as octal one
 * (as with base 8). Otherwise the base 0 is taken as decimal.
 *
 * @param nptr   Pointer to string.
 * @param endptr If not NULL, function stores here pointer to the first
 *               invalid character.
 * @param base   Zero or number between 2 and 36 inclusive.
 * @return Result of conversion; when a minus sign was present, the
 *         converted value negated in unsigned arithmetic.
 */
unsigned long strtoul(const char *nptr, char **endptr, int base)
{
	char sgn = 0;
	unsigned long magnitude = _strtoul(nptr, endptr, base, &sgn);

	if (sgn) {
		/* Unsigned negation wraps around modulo ULONG_MAX + 1 */
		return -magnitude;
	}

	return magnitude;
}
440
/** Copy a zero-terminated string, including the terminator.
 *
 * @param dest Destination buffer; the caller must make sure it can hold
 *             the whole string including the terminating zero.
 * @param src  Source string.
 * @return @a dest.
 */
char *strcpy(char *dest, const char *src)
{
	size_t pos = 0;
	char ch;

	/* Copy up to and including the terminating zero */
	do {
		ch = src[pos];
		dest[pos] = ch;
		pos++;
	} while (ch != '\0');

	return dest;
}
449
/** Copy at most @a n bytes of a zero-terminated string.
 *
 * Copying stops after the terminating zero has been copied or after
 * @a n bytes, whichever comes first. If @a src is @a n bytes or longer,
 * the result is NOT zero-terminated. Unlike the ISO C strncpy(), the
 * rest of the destination is not padded with zeros.
 *
 * @param dest Destination buffer of at least @a n bytes.
 * @param src  Source string.
 * @param n    Maximum number of bytes to write; nothing is written
 *             when it is zero.
 * @return @a dest.
 */
char *strncpy(char *dest, const char *src, size_t n)
{
	for (size_t pos = 0; pos < n; pos++) {
		char ch = src[pos];

		dest[pos] = ch;
		if (ch == '\0')
			break;
	}

	return dest;
}
458
/** Append a zero-terminated string to another one.
 *
 * @param dest String to append to; the caller must make sure the buffer
 *             can hold the concatenated result including the terminator.
 * @param src  String to be appended.
 * @return @a dest.
 */
char *strcat(char *dest, const char *src)
{
	char *tail = dest;

	/* Locate the terminator of the destination string */
	while (*tail != '\0')
		tail++;

	/* Copy the source there, terminator included */
	for (;;) {
		char ch = *src++;

		*tail++ = ch;
		if (ch == '\0')
			break;
	}

	return dest;
}
469
/** Create a newly allocated copy of a zero-terminated string.
 *
 * @param s1 String to duplicate.
 * @return Pointer to the copy obtained from malloc(), or NULL if the
 *         allocation failed.
 */
char *strdup(const char *s1)
{
	/* Size of the copy, terminating zero included */
	size_t bytes = strlen(s1) + 1;
	char *copy = malloc(bytes);

	if (copy != NULL)
		memcpy(copy, s1, bytes);

	return copy;
}
480
/** Split a string into tokens, keeping the position between calls.
 *
 * Forwards to strtok_r(), using a function-local static pointer as the
 * saved position, so the state is shared by all callers.
 *
 * @param s     String to tokenize, or NULL to continue with the string
 *              from the previous call.
 * @param delim Set of delimiter characters.
 * @return Whatever strtok_r() returns for these arguments.
 */
char *strtok(char *s, const char *delim)
{
	/* Position saved between successive calls */
	static char *saved;

	return strtok_r(s, delim, &saved);
}
487
/** Split a string into tokens (reentrant variant).
 *
 * Skips leading delimiters, then returns the following run of
 * non-delimiter characters as a token. The delimiter that ends the token
 * is overwritten with a zero byte, so the input string is modified.
 *
 * @param s     String to tokenize, or NULL to continue from the position
 *              stored in @a *next.
 * @param delim Set of delimiter characters.
 * @param next  Where the position to continue from is kept between calls.
 * @return Pointer to the next token, or NULL if there are no more tokens
 *         (also when both @a s and @a *next are NULL).
 */
char *strtok_r(char *s, const char *delim, char **next)
{
	if (s == NULL)
		s = *next;

	/* Nothing to continue from */
	if (s == NULL)
		return NULL;

	/* Skip over leading delimiters. */
	while ((*s != '\0') && (strchr(delim, *s) != NULL))
		s++;

	char *start = s;

	/* Skip over token characters. */
	while ((*s != '\0') && (strchr(delim, *s) == NULL))
		s++;

	if (*s == '\0') {
		/* End of input: later calls will find no more tokens */
		*next = s;
		return (s == start) ? NULL : start;
	}

	/* Overwrite the delimiter with a terminator and resume behind it. */
	*s = '\0';
	*next = s + 1;

	return start;
}
512
513/** @}
514 */
Note: See TracBrowser for help on using the repository browser.