Context Navigation

str.c@ de1712e

Visit:

lfn serial ticket/834-toolchain-update topic/msim-upgrade topic/simplify-dev-export

Last change on this file since de1712e was 1772e6d, checked in by Martin Sucha <sucha14@…>, 13 years ago

Update documentation for str_cmp and str_lcmp.

This is a modified version of formulation suggested by Jiri Zarevucky.

Property mode set to 100644

File size: 11.5 KB

Rev	Line
[4872160]	1	/*
	2	* Copyright (c) 2001-2004 Jakub Jermar
	3	* All rights reserved.
	4	*
	5	* Redistribution and use in source and binary forms, with or without
	6	* modification, are permitted provided that the following conditions
	7	* are met:
	8	*
	9	* - Redistributions of source code must retain the above copyright
	10	* notice, this list of conditions and the following disclaimer.
	11	* - Redistributions in binary form must reproduce the above copyright
	12	* notice, this list of conditions and the following disclaimer in the
	13	* documentation and/or other materials provided with the distribution.
	14	* - The name of the author may not be used to endorse or promote products
	15	* derived from this software without specific prior written permission.
	16	*
	17	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
	18	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
	19	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
	20	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
	21	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
	22	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	23	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	24	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	25	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	26	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	27	*/
	28
	29	/**
	30	* @file
	31	* @brief String functions.
	32	*
	33	* Strings and characters use the Universal Character Set (UCS). The standard
	34	* strings, called just strings are encoded in UTF-8. Wide strings (encoded
	35	* in UTF-32) are supported to a limited degree. A single character is
	36	* represented as wchar_t.@n
	37	*
	38	* Overview of the terminology:@n
	39	*
	40	* Term Meaning
	41	* -------------------- ----------------------------------------------------
	42	* byte 8 bits stored in uint8_t (unsigned 8 bit integer)
	43	*
	44	* character UTF-32 encoded Unicode character, stored in wchar_t
	45	* (signed 32 bit integer), code points 0 .. 1114111
	46	* are valid
	47	*
	48	* ASCII character 7 bit encoded ASCII character, stored in char
	49	* (usually signed 8 bit integer), code points 0 .. 127
	50	* are valid
	51	*
	52	* string UTF-8 encoded NULL-terminated Unicode string, char *
	53	*
	54	* wide string UTF-32 encoded NULL-terminated Unicode string,
	55	* wchar_t *
	56	*
	57	* [wide] string size number of BYTES in a [wide] string (excluding
	58	* the NULL-terminator), size_t
	59	*
	60	* [wide] string length number of CHARACTERS in a [wide] string (excluding
	61	* the NULL-terminator), size_t
	62	*
	63	* [wide] string width number of display cells on a monospace display taken
	64	* by a [wide] string, size_t
	65	*
	66	*
	67	* Overview of string metrics:@n
	68	*
	69	* Metric Abbrev. Type Meaning
	70	* ------ ------ ------ -------------------------------------------------
	71	* size n size_t number of BYTES in a string (excluding the
	72	* NULL-terminator)
	73	*
	74	* length l size_t number of CHARACTERS in a string (excluding the
	75	* null terminator)
	76	*
	77	* width w size_t number of display cells on a monospace display
	78	* taken by a string
	79	*
	80	*
	81	* Function naming prefixes:@n
	82	*
	83	* chr_ operate on characters
	84	* ascii_ operate on ASCII characters
	85	* str_ operate on strings
	86	* wstr_ operate on wide strings
	87	*
	88	* [w]str_[n|l|w] operate on a prefix limited by size, length
	89	* or width
	90	*
	91	*
	92	* A specific character inside a [wide] string can be referred to by:@n
	93	*
	94	* pointer (char *, wchar_t *)
	95	* byte offset (size_t)
	96	* character index (size_t)
	97	*
	98	*/
	99
	100	#include <str.h>
	101	#include <errno.h>
	102
/** Evaluate a condition that is only meaningful for a signed wchar_t.
 *
 * If WCHAR_IS_UNSIGNED is defined, the condition is not evaluated at all
 * and the macro expands to a constant true; otherwise it expands to
 * @a cond itself.
 */
#if !defined(WCHAR_IS_UNSIGNED)
#define WCHAR_SIGNED_CHECK(cond)  (cond)
#else
#define WCHAR_SIGNED_CHECK(cond)  (true)
#endif
	109
/** Byte mask consisting of the lowest @a n bits (out of 8), 0 <= n <= 8.
 *
 * The shift is done on an unsigned operand so that no signed arithmetic
 * is involved.
 */
#define LO_MASK_8(n)  ((uint8_t) ((1U << (n)) - 1))

/** Mask consisting of the lowest @a n bits (out of 32), 0 <= n <= 32.
 *
 * The shift is done in 64 bits so that n == 31 and n == 32 are well defined.
 */
#define LO_MASK_32(n)  ((uint32_t) ((((uint64_t) 1) << (n)) - 1))

/** Byte mask consisting of the highest @a n bits (out of 8), 0 <= n <= 8.
 *
 * The complement is truncated back to a byte; without the cast the
 * integer promotion of the operand would leave all upper bits set.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6
	121
	122	/** Decode a single character from a string.
	123	*
	124	* Decode a single character from a string of size @a size. Decoding starts
	125	* at @a offset and this offset is moved to the beginning of the next
	126	* character. In case of decoding error, offset generally advances at least
	127	* by one. However, offset is never moved beyond size.
	128	*
	129	* @param str String (not necessarily NULL-terminated).
	130	* @param offset Byte offset in string where to start decoding.
	131	* @param size Size of the string (in bytes).
	132	*
	133	* @return Value of decoded character, U_SPECIAL on decoding error or
	134	* NULL if attempt to decode beyond @a size.
	135	*
	136	*/
	137	wchar_t str_decode(const char str, size_t offset, size_t size)
	138	{
	139	if (*offset + 1 > size)
	140	return 0;
	141
	142	/* First byte read from string */
	143	uint8_t b0 = (uint8_t) str[(*offset)++];
	144
	145	/* Determine code length */
	146
	147	unsigned int b0_bits; /* Data bits in first byte */
	148	unsigned int cbytes; /* Number of continuation bytes */
	149
	150	if ((b0 & 0x80) == 0) {
	151	/* 0xxxxxxx (Plain ASCII) */
	152	b0_bits = 7;
	153	cbytes = 0;
	154	} else if ((b0 & 0xe0) == 0xc0) {
	155	/* 110xxxxx 10xxxxxx */
	156	b0_bits = 5;
	157	cbytes = 1;
	158	} else if ((b0 & 0xf0) == 0xe0) {
	159	/* 1110xxxx 10xxxxxx 10xxxxxx */
	160	b0_bits = 4;
	161	cbytes = 2;
	162	} else if ((b0 & 0xf8) == 0xf0) {
	163	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	164	b0_bits = 3;
	165	cbytes = 3;
	166	} else {
	167	/* 10xxxxxx -- unexpected continuation byte */
	168	return U_SPECIAL;
	169	}
	170
	171	if (*offset + cbytes > size)
	172	return U_SPECIAL;
	173
	174	wchar_t ch = b0 & LO_MASK_8(b0_bits);
	175
	176	/* Decode continuation bytes */
	177	while (cbytes > 0) {
	178	uint8_t b = (uint8_t) str[(*offset)++];
	179
	180	/* Must be 10xxxxxx */
	181	if ((b & 0xc0) != 0x80)
	182	return U_SPECIAL;
	183
	184	/* Shift data bits to ch */
	185	ch = (ch << CONT_BITS) \| (wchar_t) (b & LO_MASK_8(CONT_BITS));
	186	cbytes--;
	187	}
	188
	189	return ch;
	190	}
	191
	192	/** Encode a single character to string representation.
	193	*
	194	* Encode a single character to string representation (i.e. UTF-8) and store
	195	* it into a buffer at @a offset. Encoding starts at @a offset and this offset
	196	* is moved to the position where the next character can be written to.
	197	*
	198	* @param ch Input character.
	199	* @param str Output buffer.
	200	* @param offset Byte offset where to start writing.
	201	* @param size Size of the output buffer (in bytes).
	202	*
	203	* @return EOK if the character was encoded successfully, EOVERFLOW if there
	204	* was not enough space in the output buffer or EINVAL if the character
	205	* code was invalid.
	206	*/
[8e893ae]	207	int chr_encode(const wchar_t ch, char str, size_t offset, size_t size)
[4872160]	208	{
	209	if (*offset >= size)
	210	return EOVERFLOW;
	211
	212	if (!chr_check(ch))
	213	return EINVAL;
	214
	215	/* Unsigned version of ch (bit operations should only be done
	216	on unsigned types). */
	217	uint32_t cc = (uint32_t) ch;
	218
	219	/* Determine how many continuation bytes are needed */
	220
	221	unsigned int b0_bits; /* Data bits in first byte */
	222	unsigned int cbytes; /* Number of continuation bytes */
	223
	224	if ((cc & ~LO_MASK_32(7)) == 0) {
	225	b0_bits = 7;
	226	cbytes = 0;
	227	} else if ((cc & ~LO_MASK_32(11)) == 0) {
	228	b0_bits = 5;
	229	cbytes = 1;
	230	} else if ((cc & ~LO_MASK_32(16)) == 0) {
	231	b0_bits = 4;
	232	cbytes = 2;
	233	} else if ((cc & ~LO_MASK_32(21)) == 0) {
	234	b0_bits = 3;
	235	cbytes = 3;
	236	} else {
	237	/* Codes longer than 21 bits are not supported */
	238	return EINVAL;
	239	}
	240
	241	/* Check for available space in buffer */
	242	if (*offset + cbytes >= size)
	243	return EOVERFLOW;
	244
	245	/* Encode continuation bytes */
	246	unsigned int i;
	247	for (i = cbytes; i > 0; i--) {
	248	str[*offset + i] = 0x80 \| (cc & LO_MASK_32(CONT_BITS));
	249	cc = cc >> CONT_BITS;
	250	}
	251
	252	/* Encode first byte */
	253	str[*offset] = (cc & LO_MASK_32(b0_bits)) \| HI_MASK_8(8 - b0_bits - 1);
	254
	255	/* Advance offset */
	256	*offset += cbytes + 1;
	257
	258	return EOK;
	259	}
	260
	261	/** Get size of string.
	262	*
	263	* Get the number of bytes which are used by the string @a str (excluding the
	264	* NULL-terminator).
	265	*
	266	* @param str String to consider.
	267	*
	268	* @return Number of bytes used by the string
	269	*
	270	*/
	271	size_t str_size(const char *str)
	272	{
	273	size_t size = 0;
	274
	275	while (*str++ != 0)
	276	size++;
	277
	278	return size;
	279	}
	280
	281	/** Get size of string with length limit.
	282	*
	283	* Get the number of bytes which are used by up to @a max_len first
	284	* characters in the string @a str. If @a max_len is greater than
	285	* the length of @a str, the entire string is measured (excluding the
	286	* NULL-terminator).
	287	*
	288	* @param str String to consider.
	289	* @param max_len Maximum number of characters to measure.
	290	*
	291	* @return Number of bytes used by the characters.
	292	*
	293	*/
	294	size_t str_lsize(const char *str, size_t max_len)
	295	{
	296	size_t len = 0;
	297	size_t offset = 0;
	298
	299	while (len < max_len) {
	300	if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
	301	break;
	302
	303	len++;
	304	}
	305
	306	return offset;
	307	}
	308
	309	/** Get number of characters in a string.
	310	*
	311	* @param str NULL-terminated string.
	312	*
	313	* @return Number of characters in string.
	314	*
	315	*/
	316	size_t str_length(const char *str)
	317	{
	318	size_t len = 0;
	319	size_t offset = 0;
	320
	321	while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
	322	len++;
	323
	324	return len;
	325	}
	326
	327	/** Check whether character is plain ASCII.
	328	*
	329	* @return True if character is plain ASCII.
	330	*
	331	*/
	332	bool ascii_check(wchar_t ch)
	333	{
[8e893ae]	334	if (WCHAR_SIGNED_CHECK(ch >= 0) && (ch <= 127))
[4872160]	335	return true;
	336
	337	return false;
	338	}
	339
	340	/** Check whether character is valid
	341	*
	342	* @return True if character is a valid Unicode code point.
	343	*
	344	*/
	345	bool chr_check(wchar_t ch)
	346	{
[8e893ae]	347	if (WCHAR_SIGNED_CHECK(ch >= 0) && (ch <= 1114111))
[4872160]	348	return true;
	349
	350	return false;
	351	}
	352
	353	/** Compare two NULL terminated strings.
	354	*
	355	* Do a char-by-char comparison of two NULL-terminated strings.
[4efeab5]	356	* The strings are considered equal iff their length is equal
	357	* and both strings consist of the same sequence of characters.
	358	*
[1772e6d]	359	* A string S1 is less than another string S2 if it has a character with
	360	* lower value at the first character position where the strings differ.
	361	* If the strings differ in length, the shorter one is treated as if
	362	* padded by characters with a value of zero.
[4872160]	363	*
	364	* @param s1 First string to compare.
	365	* @param s2 Second string to compare.
	366	*
[1772e6d]	367	* @return 0 if the strings are equal, -1 if the first is less than the second,
	368	* 1 if the second is less than the first.
[4872160]	369	*
	370	*/
	371	int str_cmp(const char s1, const char s2)
	372	{
	373	wchar_t c1 = 0;
	374	wchar_t c2 = 0;
	375
	376	size_t off1 = 0;
	377	size_t off2 = 0;
	378
	379	while (true) {
	380	c1 = str_decode(s1, &off1, STR_NO_LIMIT);
	381	c2 = str_decode(s2, &off2, STR_NO_LIMIT);
	382
	383	if (c1 < c2)
	384	return -1;
	385
	386	if (c1 > c2)
	387	return 1;
	388
	389	if ((c1 == 0) \|\| (c2 == 0))
	390	break;
	391	}
	392
	393	return 0;
	394	}
	395
	396	/** Copy string.
	397	*
	398	* Copy source string @a src to destination buffer @a dest.
	399	* No more than @a size bytes are written. If the size of the output buffer
	400	* is at least one byte, the output string will always be well-formed, i.e.
	401	* null-terminated and containing only complete characters.
	402	*
	403	* @param dest Destination buffer.
	404	* @param count Size of the destination buffer (must be > 0).
	405	* @param src Source string.
	406	*
	407	*/
	408	void str_cpy(char dest, size_t size, const char src)
	409	{
	410	size_t src_off = 0;
	411	size_t dest_off = 0;
	412
	413	wchar_t ch;
	414	while ((ch = str_decode(src, &src_off, STR_NO_LIMIT)) != 0) {
	415	if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
	416	break;
	417	}
	418
	419	dest[dest_off] = '\0';
	420	}
	421
	422	/** @}
	423	*/

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: mainline/boot/generic/src/str.c@ de1712e

Download in other formats: