Context Navigation

source: mainline/boot/generic/src/str.c@ 002fd5f

Visit:

lfn serial ticket/834-toolchain-update topic/msim-upgrade topic/simplify-dev-export

Last change on this file since 002fd5f was 002fd5f, checked in by jzr <zarevucky.jiri@…>, 8 years ago
Replace parts of system headers with <_bits/…>.
Property mode set to `100644`
File size: 11.5 KB

Rev	Line
[4872160]	1	/*
	2	* Copyright (c) 2001-2004 Jakub Jermar
	3	* All rights reserved.
	4	*
	5	* Redistribution and use in source and binary forms, with or without
	6	* modification, are permitted provided that the following conditions
	7	* are met:
	8	*
	9	* - Redistributions of source code must retain the above copyright
	10	* notice, this list of conditions and the following disclaimer.
	11	* - Redistributions in binary form must reproduce the above copyright
	12	* notice, this list of conditions and the following disclaimer in the
	13	* documentation and/or other materials provided with the distribution.
	14	* - The name of the author may not be used to endorse or promote products
	15	* derived from this software without specific prior written permission.
	16	*
	17	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
	18	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
	19	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
	20	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
	21	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
	22	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	23	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	24	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	25	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	26	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	27	*/
	28
	29	/**
	30	* @file
	31	* @brief String functions.
	32	*
	33	* Strings and characters use the Universal Character Set (UCS). The standard
	34	* strings, called just strings are encoded in UTF-8. Wide strings (encoded
	35	* in UTF-32) are supported to a limited degree. A single character is
	36	* represented as wchar_t.@n
	37	*
	38	* Overview of the terminology:@n
	39	*
	40	* Term Meaning
	41	* -------------------- ----------------------------------------------------
	42	* byte 8 bits stored in uint8_t (unsigned 8 bit integer)
	43	*
	44	* character UTF-32 encoded Unicode character, stored in wchar_t
	45	* (signed 32 bit integer), code points 0 .. 1114111
	46	* are valid
	47	*
	48	* ASCII character 7 bit encoded ASCII character, stored in char
	49	* (usually signed 8 bit integer), code points 0 .. 127
	50	* are valid
	51	*
	52	* string UTF-8 encoded NULL-terminated Unicode string, char *
	53	*
	54	* wide string UTF-32 encoded NULL-terminated Unicode string,
	55	* wchar_t *
	56	*
	57	* [wide] string size number of BYTES in a [wide] string (excluding
	58	* the NULL-terminator), size_t
	59	*
	60	* [wide] string length number of CHARACTERS in a [wide] string (excluding
	61	* the NULL-terminator), size_t
	62	*
	63	* [wide] string width number of display cells on a monospace display taken
	64	* by a [wide] string, size_t
	65	*
	66	*
	67	* Overview of string metrics:@n
	68	*
	69	* Metric Abbrev. Type Meaning
	70	* ------ ------ ------ -------------------------------------------------
	71	* size n size_t number of BYTES in a string (excluding the
	72	* NULL-terminator)
	73	*
	74	* length l size_t number of CHARACTERS in a string (excluding the
	75	* null terminator)
	76	*
	77	* width w size_t number of display cells on a monospace display
	78	* taken by a string
	79	*
	80	*
	81	* Function naming prefixes:@n
	82	*
	83	* chr_ operate on characters
	84	* ascii_ operate on ASCII characters
	85	* str_ operate on strings
	86	* wstr_ operate on wide strings
	87	*
	88	* [w]str_[n|l|w] operate on a prefix limited by size, length
	89	* or width
	90	*
	91	*
	92	* A specific character inside a [wide] string can be referred to by:@n
	93	*
	94	* pointer (char *, wchar_t *)
	95	* byte offset (size_t)
	96	* character index (size_t)
	97	*
	98	*/
	99
	100	#include <errno.h>
[d735e2e]	101	#include <stdbool.h>
	102	#include <stddef.h>
	103	#include <str.h>
[4872160]	104
/** Evaluate @a cond only when wchar_t is a signed type.
 *
 * If the compiler defines __WCHAR_UNSIGNED__, the macro expands to
 * true and @a cond is not evaluated at all; otherwise it expands to
 * @a cond itself.
 */
#ifdef __WCHAR_UNSIGNED__
#define WCHAR_SIGNED_CHECK(cond)  (true)
#else
#define WCHAR_SIGNED_CHECK(cond)  (cond)
#endif

/** Byte mask consisting of lowest @a n bits (out of 8), 0 <= n <= 8 */
#define LO_MASK_8(n)  ((uint8_t) ((1U << (n)) - 1))

/** Mask consisting of lowest @a n bits (out of 32), 0 <= n <= 31
 *
 * The shift is done on an unsigned 32-bit operand so that no signed
 * overflow can occur for any n in the supported range.
 */
#define LO_MASK_32(n)  ((uint32_t) (((uint32_t) 1 << (n)) - 1))

/** Byte mask consisting of highest @a n bits (out of 8), 0 <= n <= 8
 *
 * The complement is computed on a promoted int, so the result is cast
 * back to uint8_t to keep it a genuine 8-bit mask.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6
	123
	124	/** Decode a single character from a string.
	125	*
	126	* Decode a single character from a string of size @a size. Decoding starts
	127	* at @a offset and this offset is moved to the beginning of the next
	128	* character. In case of decoding error, offset generally advances at least
	129	* by one. However, offset is never moved beyond size.
	130	*
	131	* @param str String (not necessarily NULL-terminated).
	132	* @param offset Byte offset in string where to start decoding.
	133	* @param size Size of the string (in bytes).
	134	*
	135	* @return Value of decoded character, U_SPECIAL on decoding error or
	136	* NULL if attempt to decode beyond @a size.
	137	*
	138	*/
	139	wchar_t str_decode(const char str, size_t offset, size_t size)
	140	{
	141	if (*offset + 1 > size)
	142	return 0;
	143
	144	/* First byte read from string */
	145	uint8_t b0 = (uint8_t) str[(*offset)++];
	146
	147	/* Determine code length */
	148
	149	unsigned int b0_bits; /* Data bits in first byte */
	150	unsigned int cbytes; /* Number of continuation bytes */
	151
	152	if ((b0 & 0x80) == 0) {
	153	/* 0xxxxxxx (Plain ASCII) */
	154	b0_bits = 7;
	155	cbytes = 0;
	156	} else if ((b0 & 0xe0) == 0xc0) {
	157	/* 110xxxxx 10xxxxxx */
	158	b0_bits = 5;
	159	cbytes = 1;
	160	} else if ((b0 & 0xf0) == 0xe0) {
	161	/* 1110xxxx 10xxxxxx 10xxxxxx */
	162	b0_bits = 4;
	163	cbytes = 2;
	164	} else if ((b0 & 0xf8) == 0xf0) {
	165	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	166	b0_bits = 3;
	167	cbytes = 3;
	168	} else {
	169	/* 10xxxxxx -- unexpected continuation byte */
	170	return U_SPECIAL;
	171	}
	172
	173	if (*offset + cbytes > size)
	174	return U_SPECIAL;
	175
	176	wchar_t ch = b0 & LO_MASK_8(b0_bits);
	177
	178	/* Decode continuation bytes */
	179	while (cbytes > 0) {
	180	uint8_t b = (uint8_t) str[(*offset)++];
	181
	182	/* Must be 10xxxxxx */
	183	if ((b & 0xc0) != 0x80)
	184	return U_SPECIAL;
	185
	186	/* Shift data bits to ch */
	187	ch = (ch << CONT_BITS) \| (wchar_t) (b & LO_MASK_8(CONT_BITS));
	188	cbytes--;
	189	}
	190
	191	return ch;
	192	}
	193
	194	/** Encode a single character to string representation.
	195	*
	196	* Encode a single character to string representation (i.e. UTF-8) and store
	197	* it into a buffer at @a offset. Encoding starts at @a offset and this offset
	198	* is moved to the position where the next character can be written to.
	199	*
	200	* @param ch Input character.
	201	* @param str Output buffer.
	202	* @param offset Byte offset where to start writing.
	203	* @param size Size of the output buffer (in bytes).
	204	*
	205	* @return EOK if the character was encoded successfully, EOVERFLOW if there
	206	* was not enough space in the output buffer or EINVAL if the character
	207	* code was invalid.
	208	*/
[8e893ae]	209	int chr_encode(const wchar_t ch, char str, size_t offset, size_t size)
[4872160]	210	{
	211	if (*offset >= size)
	212	return EOVERFLOW;
	213
	214	if (!chr_check(ch))
	215	return EINVAL;
	216
	217	/* Unsigned version of ch (bit operations should only be done
	218	on unsigned types). */
	219	uint32_t cc = (uint32_t) ch;
	220
	221	/* Determine how many continuation bytes are needed */
	222
	223	unsigned int b0_bits; /* Data bits in first byte */
	224	unsigned int cbytes; /* Number of continuation bytes */
	225
	226	if ((cc & ~LO_MASK_32(7)) == 0) {
	227	b0_bits = 7;
	228	cbytes = 0;
	229	} else if ((cc & ~LO_MASK_32(11)) == 0) {
	230	b0_bits = 5;
	231	cbytes = 1;
	232	} else if ((cc & ~LO_MASK_32(16)) == 0) {
	233	b0_bits = 4;
	234	cbytes = 2;
	235	} else if ((cc & ~LO_MASK_32(21)) == 0) {
	236	b0_bits = 3;
	237	cbytes = 3;
	238	} else {
	239	/* Codes longer than 21 bits are not supported */
	240	return EINVAL;
	241	}
	242
	243	/* Check for available space in buffer */
	244	if (*offset + cbytes >= size)
	245	return EOVERFLOW;
	246
	247	/* Encode continuation bytes */
	248	unsigned int i;
	249	for (i = cbytes; i > 0; i--) {
	250	str[*offset + i] = 0x80 \| (cc & LO_MASK_32(CONT_BITS));
	251	cc = cc >> CONT_BITS;
	252	}
	253
	254	/* Encode first byte */
	255	str[*offset] = (cc & LO_MASK_32(b0_bits)) \| HI_MASK_8(8 - b0_bits - 1);
	256
	257	/* Advance offset */
	258	*offset += cbytes + 1;
	259
	260	return EOK;
	261	}
	262
	263	/** Get size of string.
	264	*
	265	* Get the number of bytes which are used by the string @a str (excluding the
	266	* NULL-terminator).
	267	*
	268	* @param str String to consider.
	269	*
	270	* @return Number of bytes used by the string
	271	*
	272	*/
	273	size_t str_size(const char *str)
	274	{
	275	size_t size = 0;
	276
	277	while (*str++ != 0)
	278	size++;
	279
	280	return size;
	281	}
	282
	283	/** Get size of string with length limit.
	284	*
	285	* Get the number of bytes which are used by up to @a max_len first
	286	* characters in the string @a str. If @a max_len is greater than
	287	* the length of @a str, the entire string is measured (excluding the
	288	* NULL-terminator).
	289	*
	290	* @param str String to consider.
	291	* @param max_len Maximum number of characters to measure.
	292	*
	293	* @return Number of bytes used by the characters.
	294	*
	295	*/
	296	size_t str_lsize(const char *str, size_t max_len)
	297	{
	298	size_t len = 0;
	299	size_t offset = 0;
	300
	301	while (len < max_len) {
	302	if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
	303	break;
	304
	305	len++;
	306	}
	307
	308	return offset;
	309	}
	310
	311	/** Get number of characters in a string.
	312	*
	313	* @param str NULL-terminated string.
	314	*
	315	* @return Number of characters in string.
	316	*
	317	*/
	318	size_t str_length(const char *str)
	319	{
	320	size_t len = 0;
	321	size_t offset = 0;
	322
	323	while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
	324	len++;
	325
	326	return len;
	327	}
	328
	329	/** Check whether character is plain ASCII.
	330	*
	331	* @return True if character is plain ASCII.
	332	*
	333	*/
	334	bool ascii_check(wchar_t ch)
	335	{
[8e893ae]	336	if (WCHAR_SIGNED_CHECK(ch >= 0) && (ch <= 127))
[4872160]	337	return true;
	338
	339	return false;
	340	}
	341
	342	/** Check whether character is valid
	343	*
	344	* @return True if character is a valid Unicode code point.
	345	*
	346	*/
	347	bool chr_check(wchar_t ch)
	348	{
[8e893ae]	349	if (WCHAR_SIGNED_CHECK(ch >= 0) && (ch <= 1114111))
[4872160]	350	return true;
	351
	352	return false;
	353	}
	354
	355	/** Compare two NULL terminated strings.
	356	*
	357	* Do a char-by-char comparison of two NULL-terminated strings.
[4efeab5]	358	* The strings are considered equal iff their length is equal
	359	* and both strings consist of the same sequence of characters.
	360	*
[1772e6d]	361	* A string S1 is less than another string S2 if it has a character with
	362	* lower value at the first character position where the strings differ.
	363	* If the strings differ in length, the shorter one is treated as if
	364	* padded by characters with a value of zero.
[4872160]	365	*
	366	* @param s1 First string to compare.
	367	* @param s2 Second string to compare.
	368	*
[1772e6d]	369	* @return 0 if the strings are equal, -1 if the first is less than the second,
	370	* 1 if the second is less than the first.
[4872160]	371	*
	372	*/
	373	int str_cmp(const char s1, const char s2)
	374	{
	375	wchar_t c1 = 0;
	376	wchar_t c2 = 0;
	377
	378	size_t off1 = 0;
	379	size_t off2 = 0;
	380
	381	while (true) {
	382	c1 = str_decode(s1, &off1, STR_NO_LIMIT);
	383	c2 = str_decode(s2, &off2, STR_NO_LIMIT);
	384
	385	if (c1 < c2)
	386	return -1;
	387
	388	if (c1 > c2)
	389	return 1;
	390
	391	if ((c1 == 0) \|\| (c2 == 0))
	392	break;
	393	}
	394
	395	return 0;
	396	}
	397
	398	/** Copy string.
	399	*
	400	* Copy source string @a src to destination buffer @a dest.
	401	* No more than @a size bytes are written. If the size of the output buffer
	402	* is at least one byte, the output string will always be well-formed, i.e.
	403	* null-terminated and containing only complete characters.
	404	*
	405	* @param dest Destination buffer.
	406	* @param count Size of the destination buffer (must be > 0).
	407	* @param src Source string.
	408	*
	409	*/
	410	void str_cpy(char dest, size_t size, const char src)
	411	{
	412	size_t src_off = 0;
	413	size_t dest_off = 0;
	414
	415	wchar_t ch;
	416	while ((ch = str_decode(src, &src_off, STR_NO_LIMIT)) != 0) {
	417	if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
	418	break;
	419	}
	420
	421	dest[dest_off] = '\0';
	422	}
	423
	424	/** @}
	425	*/

Note: See TracBrowser for help on using the repository browser.

Download in other formats: