Context Navigation

source: mainline/kernel/generic/src/lib/string.c@ 82bb9c1

Visit:

lfn serial ticket/834-toolchain-update topic/msim-upgrade topic/simplify-dev-export

Last change on this file since 82bb9c1 was 82bb9c1, checked in by Jiri Svoboda <jirik.svoboda@…>, 16 years ago
Consider character display width somewhat. Explain naming scheme.
Property mode set to `100644`
File size: 11.8 KB

Line
1	/*
2	* Copyright (c) 2001-2004 Jakub Jermar
3	* All rights reserved.
4	*
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions
7	* are met:
8	*
9	* - Redistributions of source code must retain the above copyright
10	* notice, this list of conditions and the following disclaimer.
11	* - Redistributions in binary form must reproduce the above copyright
12	* notice, this list of conditions and the following disclaimer in the
13	* documentation and/or other materials provided with the distribution.
14	* - The name of the author may not be used to endorse or promote products
15	* derived from this software without specific prior written permission.
16	*
17	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27	*/
28
29	/** @addtogroup generic
30	* @{
31	*/
32
33	/**
34	* @file
35	* @brief String functions.
36	*
37	* Strings and characters use the Universal Character Set (UCS). The standard
38	* strings, called just strings, are encoded in UTF-8. Wide strings (encoded
39	* in UTF-32) are supported to a limited degree. A single character is
40	* represented as wchar_t.
41	*
42	* Strings have the following metrics:
43	*
44	* Metric Abbrev. Meaning
45	* ------ ------ -------
46	* size n Number of bytes the string is encoded into, excluding
47	* the null terminator.
48	* length l The number of characters in the string, excluding
49	* the null terminator.
50	* width w The number of character cells the string takes up on a
51	* monospace display.
52	*
53	* Naming scheme:
54	*
55	* chr_xxx operate on characters
56	* str_xxx operate on strings
57	* wstr_xxx operate on wide strings
58	*
59	* [w]str_[n|l|w]xxx operate on a prefix limited by size, length
60	* or width.
61	*/
62
63	#include <string.h>
64	#include <print.h>
65	#include <cpu.h>
66	#include <arch/asm.h>
67	#include <arch.h>
68	#include <errno.h>
69	#include <console/kconsole.h>
70
/** Character returned in place of input that cannot be decoded. */
char invalch = '?';

/** Byte mask consisting of lowest @n bits (out of eight).
 *
 * The shift is done on an unsigned operand; @n must be in the range 0..8.
 */
#define LO_MASK_8(n)  ((uint8_t) ((1U << (n)) - 1))

/** Mask consisting of lowest @n bits (out of 32).
 *
 * The shift is done on a 32-bit unsigned operand so that setting the high
 * bits does not overflow a signed int; @n must be in the range 0..31.
 */
#define LO_MASK_32(n)  ((uint32_t) (((uint32_t) 1 << (n)) - 1))

/** Byte mask consisting of highest @n bits (out of eight).
 *
 * The complement is computed on a promoted int, so the result is narrowed
 * back to a byte to drop the bits above bit 7.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte. */
#define CONT_BITS  6
84
85	/** Decode a single character from a substring.
86	*
87	* Decode a single character from a substring of size @a sz. Decoding starts
88	* at @a offset and this offset is moved to the beginning of the next
89	* character. In case of decoding error, offset generally advances at least
90	* by one. However, offset is never moved beyond (str + sz).
91	*
92	* @param str String (not necessarily NULL-terminated).
93	* @param index Index (counted in plain characters) where to start
94	* the decoding.
95	* @param limit Size of the substring.
96	*
97	* @return Value of decoded character or '?' on decoding error.
98	*/
99	wchar_t chr_decode(const char str, size_t offset, size_t sz)
100	{
101	uint8_t b0, b; /* Bytes read from str. */
102	wchar_t ch;
103
104	int b0_bits; /* Data bits in first byte. */
105	int cbytes; /* Number of continuation bytes. */
106
107	if (*offset + 1 > sz)
108	return invalch;
109
110	b0 = (uint8_t) str[(*offset)++];
111
112	/* Determine code length. */
113
114	if ((b0 & 0x80) == 0) {
115	/* 0xxxxxxx (Plain ASCII) */
116	b0_bits = 7;
117	cbytes = 0;
118	} else if ((b0 & 0xe0) == 0xc0) {
119	/* 110xxxxx 10xxxxxx */
120	b0_bits = 5;
121	cbytes = 1;
122	} else if ((b0 & 0xf0) == 0xe0) {
123	/* 1110xxxx 10xxxxxx 10xxxxxx */
124	b0_bits = 4;
125	cbytes = 2;
126	} else if ((b0 & 0xf8) == 0xf0) {
127	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
128	b0_bits = 3;
129	cbytes = 3;
130	} else {
131	/* 10xxxxxx -- unexpected continuation byte. */
132	return invalch;
133	}
134
135	if (*offset + cbytes > sz) {
136	return invalch;
137	}
138
139	ch = b0 & LO_MASK_8(b0_bits);
140
141	/* Decode continuation bytes. */
142	while (cbytes > 0) {
143	b = (uint8_t) str[(*offset)++];
144
145	/* Must be 10xxxxxx. */
146	if ((b & 0xc0) != 0x80) {
147	return invalch;
148	}
149
150	/* Shift data bits to ch. */
151	ch = (ch << CONT_BITS) \| (wchar_t) (b & LO_MASK_8(CONT_BITS));
152	--cbytes;
153	}
154
155	return ch;
156	}
157
158	/** Encode a single character to string representation.
159	*
160	* Encode a single character to string representation (i.e. UTF-8) and store
161	* it into a buffer at @a offset. Encoding starts at @a offset and this offset
162	* is moved to the position where the next character can be written to.
163	*
164	* @param ch Input character.
165	* @param str Output buffer.
166	* @param offset Offset (in bytes) where to start writing.
167	* @param sz Size of the output buffer.
168	*
169	* @return EOK if the character was encoded successfully, EOVERFLOW if there
170	* was not enough space in the output buffer or EINVAL if the character
171	* code was invalid.
172	*/
173	int chr_encode(wchar_t ch, char str, size_t offset, size_t sz)
174	{
175	uint32_t cc; /* Unsigned version of ch. */
176
177	int cbytes; /* Number of continuation bytes. */
178	int b0_bits; /* Number of data bits in first byte. */
179	int i;
180
181	if (*offset >= sz)
182	return EOVERFLOW;
183
184	if (ch < 0)
185	return EINVAL;
186
187	/* Bit operations should only be done on unsigned numbers. */
188	cc = (uint32_t) ch;
189
190	/* Determine how many continuation bytes are needed. */
191	if ((cc & ~LO_MASK_32(7)) == 0) {
192	b0_bits = 7;
193	cbytes = 0;
194	} else if ((cc & ~LO_MASK_32(11)) == 0) {
195	b0_bits = 5;
196	cbytes = 1;
197	} else if ((cc & ~LO_MASK_32(16)) == 0) {
198	b0_bits = 4;
199	cbytes = 2;
200	} else if ((cc & ~LO_MASK_32(21)) == 0) {
201	b0_bits = 3;
202	cbytes = 3;
203	} else {
204	/* Codes longer than 21 bits are not supported. */
205	return EINVAL;
206	}
207
208	/* Check for available space in buffer. */
209	if (*offset + cbytes >= sz)
210	return EOVERFLOW;
211
212	/* Encode continuation bytes. */
213	for (i = cbytes; i > 0; --i) {
214	str[*offset + i] = 0x80 \| (cc & LO_MASK_32(CONT_BITS));
215	cc = cc >> CONT_BITS;
216	}
217
218	/* Encode first byte. */
219	str[*offset] = (cc & LO_MASK_32(b0_bits)) \| HI_MASK_8(8 - b0_bits - 1);
220
221	/* Advance offset. */
222	*offset += (1 + cbytes);
223
224	return EOK;
225	}
226
227	/** Get display width of character.
228	*
229	* @param ch The character.
230	* @return Character width in display cells.
231	*/
232	count_t chr_width(wchar_t ch)
233	{
234	return 1;
235	}
236
237	/** Get size of string, with length limit.
238	*
239	* Get the number of bytes which are used by up to @a max_len first
240	* characters in the string @a str. If @a max_len is greater than
241	* the length of @a str, the entire string is measured.
242	*
243	* @param str String to consider.
244	* @param count Maximum number of characters to measure.
245	*
246	* @return Number of bytes used by the characters.
247	*/
248	size_t str_lsize(const char *str, count_t max_len)
249	{
250	count_t len = 0;
251	size_t cur = 0;
252	size_t prev;
253	wchar_t ch;
254
255	while (true) {
256	prev = cur;
257	if (len >= max_len)
258	break;
259	ch = chr_decode(str, &cur, UTF8_NO_LIMIT);
260	if (ch == '\0') break;
261
262	len++;
263	}
264
265	return prev;
266	}
267
268	/** Get size of string, with width limit.
269	*
270	* Get the number of bytes which are used by the longest prefix of @a str
271	* that can fit into @a max_width display cells.
272	*
273	* @param str String to consider.
274	* @param count Maximum number of display cells.
275	*
276	* @return Number of bytes used by the characters that fit.
277	*/
278	size_t str_wsize(const char *str, count_t max_width)
279	{
280	count_t width = 0;
281	size_t cur = 0;
282	size_t prev;
283	wchar_t ch;
284
285	while (true) {
286	prev = cur;
287	if (width >= max_width)
288	break;
289	ch = chr_decode(str, &cur, UTF8_NO_LIMIT);
290	if (ch == '\0') break;
291
292	width += chr_width(ch);
293	}
294
295	return prev;
296	}
297
298
299	/** Get length of wide string, with width limit.
300	*
301	* Get the number of characters in a wide string that can fit into @a max_width
302	* display cells.
303	*
304	* @param wstr Wide string to consider.
305	* @param count Maximum number of display cells.
306	*
307	* @return Number of bytes used by the characters that fit.
308	*/
309	count_t wstr_wlength(const wchar_t *wstr, count_t max_width)
310	{
311	count_t width = 0;
312	index_t cur = 0;
313
314	while (true) {
315	if (width >= max_width)
316	break;
317	if (wstr[cur] == '\0') break;
318
319	width += chr_width(wstr[cur]);
320	++cur;
321	}
322
323	return (count_t) cur;
324	}
325
/** Check whether character is plain ASCII.
 *
 * @param ch Character to test.
 *
 * @return True if @a ch lies in the range 0 to 127 (inclusive).
 *
 */
bool ascii_check(const wchar_t ch)
{
	return (ch >= 0) && (ch <= 0x7f);
}
338
/** Check whether character is Unicode.
 *
 * @param ch Character to test.
 *
 * @return True if @a ch lies in the range 0 to 0x10FFFF (inclusive).
 */
bool unicode_check(const wchar_t ch)
{
	return (ch >= 0) && (ch <= 0x10ffff);
}
350
/** Return number of bytes the string occupies.
 *
 * @param str NULL-terminated string.
 *
 * @return Number of bytes in @a str, not counting the null terminator.
 */
size_t str_size(const char *str)
{
	const char *end = str;

	/* Walk to the terminator; the distance travelled is the size. */
	while (*end != '\0')
		end++;

	return (size_t) (end - str);
}
366
367	/** Return number of characters in a string.
368	*
369	* @param str NULL-terminated string.
370	* @return Number of characters in string.
371	*/
372	count_t str_length(const char *str)
373	{
374	count_t len = 0;
375	size_t offset = 0;
376
377	while (chr_decode(str, &offset, UTF8_NO_LIMIT) != 0) {
378	len++;
379	}
380
381	return len;
382	}
383
384	/** Return number of characters in a wide string.
385	*
386	* @param str NULL-terminated wide string.
387	* @return Number of characters in @a str.
388	*/
389	count_t wstr_length(const wchar_t *wstr)
390	{
391	count_t len;
392
393	len = 0;
394	while (*wstr++ != '\0')
395	++len;
396
397	return len;
398	}
399
/** Compare two NULL terminated strings
 *
 * Do a char-by-char comparison of two NULL terminated strings.
 * The strings are considered equal iff they have the same length and
 * consist of the same characters. If one string is a proper prefix of
 * the other, the shorter one is considered smaller. Characters are
 * compared as plain char values.
 *
 * @param src First string to compare.
 * @param dst Second string to compare.
 *
 * @return 0 if the strings are equal, -1 if first is smaller, 1 if second smaller.
 *
 */
int strcmp(const char *src, const char *dst)
{
	/* Walk both strings while neither has reached its terminator. */
	for (; *src && *dst; src++, dst++) {
		if (*src < *dst)
			return -1;
		if (*src > *dst)
			return 1;
	}

	/* At least one string ended; equal only if both did. */
	if (*src == *dst)
		return 0;

	/* The string that ended first is the smaller one. */
	if (!*src)
		return -1;

	return 1;
}
428
429
430	/** Compare two NULL terminated strings
431	*
432	* Do a char-by-char comparison of two NULL terminated strings.
433	* The strings are considered equal iff they consist of the same
434	* characters on the minimum of their lengths and specified maximal
435	* length.
436	*
437	* @param src First string to compare.
438	* @param dst Second string to compare.
439	* @param len Maximal length for comparison.
440	*
441	* @return 0 if the strings are equal, -1 if first is smaller, 1 if second smaller.
442	*/
443	int strncmp(const char src, const char dst, size_t len)
444	{
445	unsigned int i;
446
447	for (i = 0; (src) && (dst) && (i < len); src++, dst++, i++) {
448	if (src < dst)
449	return -1;
450
451	if (src > dst)
452	return 1;
453	}
454
455	if (i == len \|\| src == dst)
456	return 0;
457
458	if (!*src)
459	return -1;
460
461	return 1;
462	}
463
464
465
/** Copy NULL terminated string.
 *
 * Copy characters from string 'src' to 'dest', writing at most 'len'
 * bytes including the null terminator. If 'src' (with its terminator)
 * does not fit, the copy is truncated to 'len' - 1 characters and
 * terminated, so 'dest' is always NULL terminated when 'len' is non-zero.
 * If 'len' is zero, nothing is written.
 *
 * Unlike the standard C function of the same name, the rest of the
 * buffer is not padded and nothing is returned.
 *
 * @param src  Source string.
 * @param dest Destination buffer.
 * @param len  Size of destination buffer.
 */
void strncpy(char *dest, const char *src, size_t len)
{
	size_t i;

	/* No room even for the terminator; avoid writing before dest. */
	if (len == 0)
		return;

	for (i = 0; i < len; i++) {
		/* Stop once the terminator has been copied. */
		if (!(dest[i] = src[i]))
			return;
	}

	/* Source did not fit: overwrite the last copied byte. */
	dest[len - 1] = '\0';
}
487
/** Find first occurrence of character in string.
 *
 * The string is scanned up to, but not including, its null terminator,
 * so searching for '\0' yields NULL.
 *
 * @param s String to search.
 * @param i Character to look for; it is converted to char before
 *          comparing so that values above 127 match regardless of
 *          whether char is signed.
 *
 * @return Pointer to character in @a s or NULL if not found.
 */
char *strchr(const char *s, int i)
{
	const char wanted = (char) i;

	while (*s != '\0') {
		if (*s == wanted)
			return (char *) s;
		++s;
	}

	return NULL;
}
505
506	/** @}
507	*/

Note: See TracBrowser for help on using the repository browser.

Download in other formats: