Context Navigation

source: mainline/kernel/generic/src/lib/string.c@ fa5526d

Visit:

lfn serial ticket/834-toolchain-update topic/msim-upgrade topic/simplify-dev-export

Last change on this file since fa5526d was 98000fb, checked in by Martin Decky <martin@…>, 16 years ago
remove redundant index_t and count_t types (which were always quite ambiguous and not actually needed)
Property mode set to `100644`
File size: 17.5 KB

Line
1	/*
2	* Copyright (c) 2001-2004 Jakub Jermar
3	* All rights reserved.
4	*
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions
7	* are met:
8	*
9	* - Redistributions of source code must retain the above copyright
10	* notice, this list of conditions and the following disclaimer.
11	* - Redistributions in binary form must reproduce the above copyright
12	* notice, this list of conditions and the following disclaimer in the
13	* documentation and/or other materials provided with the distribution.
14	* - The name of the author may not be used to endorse or promote products
15	* derived from this software without specific prior written permission.
16	*
17	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27	*/
28
29	/** @addtogroup generic
30	* @{
31	*/
32
33	/**
34	* @file
35	* @brief String functions.
36	*
37	* Strings and characters use the Universal Character Set (UCS). Standard
38	* strings (called simply "strings") are encoded in UTF-8. Wide strings
39	* (encoded in UTF-32) are supported to a limited degree. A single character
40	* is represented as wchar_t.@n
41	*
42	* Overview of the terminology:@n
43	*
44	* Term Meaning
45	* -------------------- ----------------------------------------------------
46	* byte 8 bits stored in uint8_t (unsigned 8 bit integer)
47	*
48	* character UTF-32 encoded Unicode character, stored in wchar_t
49	* (signed 32 bit integer), code points 0 .. 1114111
50	* are valid
51	*
52	* ASCII character 7 bit encoded ASCII character, stored in char
53	* (usually signed 8 bit integer), code points 0 .. 127
54	* are valid
55	*
56	* string UTF-8 encoded NULL-terminated Unicode string, char *
57	*
58	* wide string UTF-32 encoded NULL-terminated Unicode string,
59	* wchar_t *
60	*
61	* [wide] string size number of BYTES in a [wide] string (excluding
62	* the NULL-terminator), size_t
63	*
64	* [wide] string length number of CHARACTERS in a [wide] string (excluding
65	* the NULL-terminator), size_t
66	*
67	* [wide] string width number of display cells on a monospace display taken
68	* by a [wide] string, size_t
69	*
70	*
71	* Overview of string metrics:@n
72	*
73	* Metric Abbrev. Type Meaning
74	* ------ ------ ------ -------------------------------------------------
75	* size n size_t number of BYTES in a string (excluding the
76	* NULL-terminator)
77	*
78	* length l size_t number of CHARACTERS in a string (excluding the
79	* null terminator)
80	*
81	* width w size_t number of display cells on a monospace display
82	* taken by a string
83	*
84	*
85	* Function naming prefixes:@n
86	*
87	* chr_ operate on characters
88	* ascii_ operate on ASCII characters
89	* str_ operate on strings
90	* wstr_ operate on wide strings
91	*
92	* [w]str_[n|l|w] operate on a prefix limited by size, length
93	* or width
94	*
95	*
96	* A specific character inside a [wide] string can be referred to by:@n
97	*
98	* pointer (char *, wchar_t *)
99	* byte offset (size_t)
100	* character index (size_t)
101	*
102	*/
103
104	#include <string.h>
105	#include <print.h>
106	#include <cpu.h>
107	#include <arch/asm.h>
108	#include <arch.h>
109	#include <errno.h>
110	#include <align.h>
111	#include <debug.h>
112
/** Byte mask consisting of the lowest @a n bits (out of 8), 0 <= n <= 8 */
#define LO_MASK_8(n)  ((uint8_t) ((1U << (n)) - 1))

/** Mask consisting of the lowest @a n bits (out of 32), 0 <= n <= 31 */
#define LO_MASK_32(n)  ((uint32_t) (((uint32_t) 1 << (n)) - 1))

/** Byte mask consisting of the highest @a n bits (out of 8), 0 <= n <= 8
 *
 * The result is cast back to uint8_t because the complement operator
 * promotes its operand to int and would otherwise set all upper bits.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6
124
125	/** Decode a single character from a string.
126	*
127	* Decode a single character from a string of size @a size. Decoding starts
128	* at @a offset and this offset is moved to the beginning of the next
129	* character. In case of decoding error, offset generally advances at least
130	* by one. However, offset is never moved beyond size.
131	*
132	* @param str String (not necessarily NULL-terminated).
133	* @param offset Byte offset in string where to start decoding.
134	* @param size Size of the string (in bytes).
135	*
136	* @return Value of decoded character, U_SPECIAL on decoding error or
137	* NULL if attempt to decode beyond @a size.
138	*
139	*/
140	wchar_t str_decode(const char str, size_t offset, size_t size)
141	{
142	if (*offset + 1 > size)
143	return 0;
144
145	/* First byte read from string */
146	uint8_t b0 = (uint8_t) str[(*offset)++];
147
148	/* Determine code length */
149
150	unsigned int b0_bits; /* Data bits in first byte */
151	unsigned int cbytes; /* Number of continuation bytes */
152
153	if ((b0 & 0x80) == 0) {
154	/* 0xxxxxxx (Plain ASCII) */
155	b0_bits = 7;
156	cbytes = 0;
157	} else if ((b0 & 0xe0) == 0xc0) {
158	/* 110xxxxx 10xxxxxx */
159	b0_bits = 5;
160	cbytes = 1;
161	} else if ((b0 & 0xf0) == 0xe0) {
162	/* 1110xxxx 10xxxxxx 10xxxxxx */
163	b0_bits = 4;
164	cbytes = 2;
165	} else if ((b0 & 0xf8) == 0xf0) {
166	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
167	b0_bits = 3;
168	cbytes = 3;
169	} else {
170	/* 10xxxxxx -- unexpected continuation byte */
171	return U_SPECIAL;
172	}
173
174	if (*offset + cbytes > size)
175	return U_SPECIAL;
176
177	wchar_t ch = b0 & LO_MASK_8(b0_bits);
178
179	/* Decode continuation bytes */
180	while (cbytes > 0) {
181	uint8_t b = (uint8_t) str[(*offset)++];
182
183	/* Must be 10xxxxxx */
184	if ((b & 0xc0) != 0x80)
185	return U_SPECIAL;
186
187	/* Shift data bits to ch */
188	ch = (ch << CONT_BITS) \| (wchar_t) (b & LO_MASK_8(CONT_BITS));
189	cbytes--;
190	}
191
192	return ch;
193	}
194
195	/** Encode a single character to string representation.
196	*
197	* Encode a single character to string representation (i.e. UTF-8) and store
198	* it into a buffer at @a offset. Encoding starts at @a offset and this offset
199	* is moved to the position where the next character can be written to.
200	*
201	* @param ch Input character.
202	* @param str Output buffer.
203	* @param offset Byte offset where to start writing.
204	* @param size Size of the output buffer (in bytes).
205	*
206	* @return EOK if the character was encoded successfully, EOVERFLOW if there
207	* was not enough space in the output buffer or EINVAL if the character
208	* code was invalid.
209	*/
210	int chr_encode(wchar_t ch, char str, size_t offset, size_t size)
211	{
212	if (*offset >= size)
213	return EOVERFLOW;
214
215	if (!chr_check(ch))
216	return EINVAL;
217
218	/* Unsigned version of ch (bit operations should only be done
219	on unsigned types). */
220	uint32_t cc = (uint32_t) ch;
221
222	/* Determine how many continuation bytes are needed */
223
224	unsigned int b0_bits; /* Data bits in first byte */
225	unsigned int cbytes; /* Number of continuation bytes */
226
227	if ((cc & ~LO_MASK_32(7)) == 0) {
228	b0_bits = 7;
229	cbytes = 0;
230	} else if ((cc & ~LO_MASK_32(11)) == 0) {
231	b0_bits = 5;
232	cbytes = 1;
233	} else if ((cc & ~LO_MASK_32(16)) == 0) {
234	b0_bits = 4;
235	cbytes = 2;
236	} else if ((cc & ~LO_MASK_32(21)) == 0) {
237	b0_bits = 3;
238	cbytes = 3;
239	} else {
240	/* Codes longer than 21 bits are not supported */
241	return EINVAL;
242	}
243
244	/* Check for available space in buffer */
245	if (*offset + cbytes >= size)
246	return EOVERFLOW;
247
248	/* Encode continuation bytes */
249	unsigned int i;
250	for (i = cbytes; i > 0; i--) {
251	str[*offset + i] = 0x80 \| (cc & LO_MASK_32(CONT_BITS));
252	cc = cc >> CONT_BITS;
253	}
254
255	/* Encode first byte */
256	str[*offset] = (cc & LO_MASK_32(b0_bits)) \| HI_MASK_8(8 - b0_bits - 1);
257
258	/* Advance offset */
259	*offset += cbytes + 1;
260
261	return EOK;
262	}
263
/** Get size of string.
 *
 * Get the number of bytes which are used by the string @a str (excluding
 * the NULL-terminator).
 *
 * @param str String to consider.
 *
 * @return Number of bytes used by the string.
 *
 */
size_t str_size(const char *str)
{
	const char *end = str;

	/* Walk to the terminating zero byte */
	while (*end != '\0')
		end++;

	return (size_t) (end - str);
}
283
/** Get size of wide string.
 *
 * Get the number of bytes which are used by the wide string @a str
 * (excluding the NULL-terminator).
 *
 * @param str Wide string to consider.
 *
 * @return Number of bytes used by the wide string.
 *
 */
size_t wstr_size(const wchar_t *str)
{
	/* Every wide character occupies exactly sizeof(wchar_t) bytes */
	return (wstr_length(str) * sizeof(wchar_t));
}
298
299	/** Get size of string with length limit.
300	*
301	* Get the number of bytes which are used by up to @a max_len first
302	* characters in the string @a str. If @a max_len is greater than
303	* the length of @a str, the entire string is measured (excluding the
304	* NULL-terminator).
305	*
306	* @param str String to consider.
307	* @param max_len Maximum number of characters to measure.
308	*
309	* @return Number of bytes used by the characters.
310	*
311	*/
312	size_t str_lsize(const char *str, size_t max_len)
313	{
314	size_t len = 0;
315	size_t offset = 0;
316
317	while (len < max_len) {
318	if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
319	break;
320
321	len++;
322	}
323
324	return offset;
325	}
326
/** Get size of wide string with length limit.
 *
 * Get the number of bytes which are used by up to @a max_len first
 * wide characters in the wide string @a str. If @a max_len is greater than
 * the length of @a str, the entire wide string is measured (excluding the
 * NULL-terminator).
 *
 * @param str     Wide string to consider.
 * @param max_len Maximum number of wide characters to measure.
 *
 * @return Number of bytes used by the wide characters.
 *
 */
size_t wstr_lsize(const wchar_t *str, size_t max_len)
{
	/* Largest character count whose size in bytes still fits in size_t */
	const size_t max_chars = ((size_t) -1) / sizeof(wchar_t);

	/* Clamp so that the multiplication below cannot wrap around */
	if (max_len > max_chars)
		max_len = max_chars;

	return (wstr_nlength(str, max_len * sizeof(wchar_t)) * sizeof(wchar_t));
}
344
345	/** Get number of characters in a string.
346	*
347	* @param str NULL-terminated string.
348	*
349	* @return Number of characters in string.
350	*
351	*/
352	size_t str_length(const char *str)
353	{
354	size_t len = 0;
355	size_t offset = 0;
356
357	while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
358	len++;
359
360	return len;
361	}
362
/** Get number of characters in a wide string.
 *
 * @param wstr NULL-terminated wide string.
 *
 * @return Number of characters in @a wstr.
 *
 */
size_t wstr_length(const wchar_t *wstr)
{
	size_t len = 0;

	/* Index forward until the terminating zero character */
	while (wstr[len] != 0)
		len++;

	return len;
}
379
/** Get number of characters in a string with size limit.
 *
 * Counting stops at the NULL-terminator or after @a size bytes,
 * whichever comes first.
 *
 * @param str  NULL-terminated string.
 * @param size Maximum number of bytes to consider.
 *
 * @return Number of characters in string.
 *
 */
size_t str_nlength(const char *str, size_t size)
{
	size_t offset = 0;
	size_t len = 0;

	/* str_decode() returns 0 both for the terminator and at the limit */
	while (str_decode(str, &offset, size) != 0)
		len++;

	return len;
}
398
/** Get number of characters in a wide string with size limit.
 *
 * Counting stops at the NULL-terminator or when the next wide character
 * would no longer fit entirely within the first @a size bytes.
 *
 * @param str  NULL-terminated wide string.
 * @param size Maximum number of bytes to consider.
 *
 * @return Number of characters in wide string.
 *
 */
size_t wstr_nlength(const wchar_t *str, size_t size)
{
	/* Only whole wide characters within @a size bytes are considered */
	size_t limit = size / sizeof(wchar_t);
	size_t len = 0;

	while ((len < limit) && (str[len] != 0))
		len++;

	return len;
}
420
/** Check whether character is plain ASCII.
 *
 * @param ch Character to check.
 *
 * @return True if character is plain ASCII (code points 0 .. 127).
 *
 */
bool ascii_check(wchar_t ch)
{
	return ((ch >= 0) && (ch <= 127));
}
433
/** Check whether character is valid.
 *
 * Only the range 0 .. 1114111 (0x10ffff) is checked; code points in the
 * surrogate range are not excluded.
 *
 * @param ch Character to check.
 *
 * @return True if character is a valid Unicode code point.
 *
 */
bool chr_check(wchar_t ch)
{
	return ((ch >= 0) && (ch <= 1114111));
}
446
447	/** Compare two NULL terminated strings.
448	*
449	* Do a char-by-char comparison of two NULL-terminated strings.
450	* The strings are considered equal iff they consist of the same
451	* characters on the minimum of their lengths.
452	*
453	* @param s1 First string to compare.
454	* @param s2 Second string to compare.
455	*
456	* @return 0 if the strings are equal, -1 if first is smaller,
457	* 1 if second smaller.
458	*
459	*/
460	int str_cmp(const char s1, const char s2)
461	{
462	wchar_t c1 = 0;
463	wchar_t c2 = 0;
464
465	size_t off1 = 0;
466	size_t off2 = 0;
467
468	while (true) {
469	c1 = str_decode(s1, &off1, STR_NO_LIMIT);
470	c2 = str_decode(s2, &off2, STR_NO_LIMIT);
471
472	if (c1 < c2)
473	return -1;
474
475	if (c1 > c2)
476	return 1;
477
478	if (c1 == 0 \|\| c2 == 0)
479	break;
480	}
481
482	return 0;
483	}
484
485	/** Compare two NULL terminated strings with length limit.
486	*
487	* Do a char-by-char comparison of two NULL-terminated strings.
488	* The strings are considered equal iff they consist of the same
489	* characters on the minimum of their lengths and the length limit.
490	*
491	* @param s1 First string to compare.
492	* @param s2 Second string to compare.
493	* @param max_len Maximum number of characters to consider.
494	*
495	* @return 0 if the strings are equal, -1 if first is smaller,
496	* 1 if second smaller.
497	*
498	*/
499	int str_lcmp(const char s1, const char s2, size_t max_len)
500	{
501	wchar_t c1 = 0;
502	wchar_t c2 = 0;
503
504	size_t off1 = 0;
505	size_t off2 = 0;
506
507	size_t len = 0;
508
509	while (true) {
510	if (len >= max_len)
511	break;
512
513	c1 = str_decode(s1, &off1, STR_NO_LIMIT);
514	c2 = str_decode(s2, &off2, STR_NO_LIMIT);
515
516	if (c1 < c2)
517	return -1;
518
519	if (c1 > c2)
520	return 1;
521
522	if (c1 == 0 \|\| c2 == 0)
523	break;
524
525	++len;
526	}
527
528	return 0;
529
530	}
531
532	/** Copy string.
533	*
534	* Copy source string @a src to destination buffer @a dest.
535	* No more than @a size bytes are written. If the size of the output buffer
536	* is at least one byte, the output string will always be well-formed, i.e.
537	* null-terminated and containing only complete characters.
538	*
539	* @param dst Destination buffer.
540	* @param count Size of the destination buffer (must be > 0).
541	* @param src Source string.
542	*/
543	void str_cpy(char dest, size_t size, const char src)
544	{
545	wchar_t ch;
546	size_t src_off;
547	size_t dest_off;
548
549	/* There must be space for a null terminator in the buffer. */
550	ASSERT(size > 0);
551
552	src_off = 0;
553	dest_off = 0;
554
555	while ((ch = str_decode(src, &src_off, STR_NO_LIMIT)) != 0) {
556	if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
557	break;
558	}
559
560	dest[dest_off] = '\0';
561	}
562
563	/** Copy size-limited substring.
564	*
565	* Copy prefix of string @a src of max. size @a size to destination buffer
566	* @a dest. No more than @a size bytes are written. The output string will
567	* always be well-formed, i.e. null-terminated and containing only complete
568	* characters.
569	*
570	* No more than @a n bytes are read from the input string, so it does not
571	* have to be null-terminated.
572	*
573	* @param dst Destination buffer.
574	* @param count Size of the destination buffer (must be > 0).
575	* @param src Source string.
576	* @param n Maximum number of bytes to read from @a src.
577	*/
578	void str_ncpy(char dest, size_t size, const char src, size_t n)
579	{
580	wchar_t ch;
581	size_t src_off;
582	size_t dest_off;
583
584	/* There must be space for a null terminator in the buffer. */
585	ASSERT(size > 0);
586
587	src_off = 0;
588	dest_off = 0;
589
590	while ((ch = str_decode(src, &src_off, n)) != 0) {
591	if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
592	break;
593	}
594
595	dest[dest_off] = '\0';
596	}
597
598	/** Copy NULL-terminated wide string to string
599	*
600	* Copy source wide string @a src to destination buffer @a dst.
601	* No more than @a size bytes are written. NULL-terminator is always
602	* written after the last succesfully copied character (i.e. if the
603	* destination buffer is has at least 1 byte, it will be always
604	* NULL-terminated).
605	*
606	* @param src Source wide string.
607	* @param dst Destination buffer.
608	* @param count Size of the destination buffer.
609	*
610	*/
611	void wstr_nstr(char dst, const wchar_t src, size_t size)
612	{
613	/* No space for the NULL-terminator in the buffer */
614	if (size == 0)
615	return;
616
617	wchar_t ch;
618	size_t src_idx = 0;
619	size_t dst_off = 0;
620
621	while ((ch = src[src_idx++]) != 0) {
622	if (chr_encode(ch, dst, &dst_off, size) != EOK)
623	break;
624	}
625
626	if (dst_off >= size)
627	dst[size - 1] = 0;
628	else
629	dst[dst_off] = 0;
630	}
631
632	/** Find first occurence of character in string.
633	*
634	* @param str String to search.
635	* @param ch Character to look for.
636	*
637	* @return Pointer to character in @a str or NULL if not found.
638	*
639	*/
640	const char str_chr(const char str, wchar_t ch)
641	{
642	wchar_t acc;
643	size_t off = 0;
644	size_t last = 0;
645
646	while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
647	if (acc == ch)
648	return (str + last);
649	last = off;
650	}
651
652	return NULL;
653	}
654
/** Insert a wide character into a wide string.
 *
 * Insert a wide character into a wide string at position
 * @a pos. The characters after the position (including the
 * NULL-terminator) are shifted by one.
 *
 * NOTE(review): only the insert position is checked against @a max_pos,
 * not the resulting length of the string. The caller presumably has to
 * guarantee that the buffer can hold one more character -- confirm
 * against the callers.
 *
 * @param str     String to insert to.
 * @param ch      Character to insert.
 * @param pos     Character index where to insert.
 * @param max_pos Characters in the buffer.
 *
 * @return True if the insertion was successful, false if the position
 *         is out of bounds.
 *
 */
bool wstr_linsert(wchar_t *str, wchar_t ch, size_t pos, size_t max_pos)
{
	size_t len = wstr_length(str);

	if ((pos > len) || (pos + 1 > max_pos))
		return false;

	/*
	 * Move the tail (terminator first) one position to the right.
	 * Iterating over the destination index keeps the loop condition
	 * free of unsigned wraparound even for pos == 0.
	 */
	size_t i;
	for (i = len + 1; i > pos; i--)
		str[i] = str[i - 1];

	str[pos] = ch;

	return true;
}
684
/** Remove a wide character from a wide string.
 *
 * Remove a wide character from a wide string at position
 * @a pos. The characters after the position (including the
 * NULL-terminator) are shifted by one.
 *
 * @param str String to remove from.
 * @param pos Character index to remove.
 *
 * @return True if the removal was successful, false if the position
 *         is out of bounds.
 *
 */
bool wstr_remove(wchar_t *str, size_t pos)
{
	size_t len = wstr_length(str);

	if (pos >= len)
		return false;

	/* Move the tail, terminator included, one position to the left */
	size_t i;
	for (i = pos; i < len; i++)
		str[i] = str[i + 1];

	return true;
}
710
711	/** @}
712	*/

Note: See TracBrowser for help on using the repository browser.

Download in other formats: