Context Navigation

source: mainline/boot/generic/src/str.c@ d066259

Visit:

lfn serial ticket/834-toolchain-update topic/msim-upgrade topic/simplify-dev-export

Last change on this file since d066259 was d066259, checked in by Jiří Zárevúcky <zarevucky.jiri@…>, 6 years ago
Synchronize str.c/str.h across boot/kernel/uspace
Property mode set to `100644`
File size: 11.7 KB

Rev	Line
[4872160]	1	/*
	2	* Copyright (c) 2001-2004 Jakub Jermar
[d066259]	3	* Copyright (c) 2005 Martin Decky
	4	* Copyright (c) 2008 Jiri Svoboda
	5	* Copyright (c) 2011 Martin Sucha
	6	* Copyright (c) 2011 Oleg Romanenko
[4872160]	7	* All rights reserved.
	8	*
	9	* Redistribution and use in source and binary forms, with or without
	10	* modification, are permitted provided that the following conditions
	11	* are met:
	12	*
	13	* - Redistributions of source code must retain the above copyright
	14	* notice, this list of conditions and the following disclaimer.
	15	* - Redistributions in binary form must reproduce the above copyright
	16	* notice, this list of conditions and the following disclaimer in the
	17	* documentation and/or other materials provided with the distribution.
	18	* - The name of the author may not be used to endorse or promote products
	19	* derived from this software without specific prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
	22	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
	23	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
	24	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
	25	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
	26	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	27	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	28	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	29	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	30	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	31	*/
	32
	33	/**
	34	* @file
	35	* @brief String functions.
	36	*
	37	* Strings and characters use the Universal Character Set (UCS). The standard
	38	* strings, called just strings are encoded in UTF-8. Wide strings (encoded
	39	* in UTF-32) are supported to a limited degree. A single character is
	40	* represented as wchar_t.@n
	41	*
	42	* Overview of the terminology:@n
	43	*
	44	* Term Meaning
	45	* -------------------- ----------------------------------------------------
	46	* byte 8 bits stored in uint8_t (unsigned 8 bit integer)
	47	*
	48	* character UTF-32 encoded Unicode character, stored in wchar_t
	49	* (signed 32 bit integer), code points 0 .. 1114111
	50	* are valid
	51	*
	52	* ASCII character 7 bit encoded ASCII character, stored in char
	53	* (usually signed 8 bit integer), code points 0 .. 127
	54	* are valid
	55	*
	56	* string UTF-8 encoded NULL-terminated Unicode string, char *
	57	*
	58	* wide string UTF-32 encoded NULL-terminated Unicode string,
	59	* wchar_t *
	60	*
	61	* [wide] string size number of BYTES in a [wide] string (excluding
	62	* the NULL-terminator), size_t
	63	*
	64	* [wide] string length number of CHARACTERS in a [wide] string (excluding
	65	* the NULL-terminator), size_t
	66	*
	67	* [wide] string width number of display cells on a monospace display taken
	68	* by a [wide] string, size_t
	69	*
	70	*
	71	* Overview of string metrics:@n
	72	*
	73	* Metric Abbrev. Type Meaning
	74	* ------ ------ ------ -------------------------------------------------
	75	* size n size_t number of BYTES in a string (excluding the
	76	* NULL-terminator)
	77	*
	78	* length l size_t number of CHARACTERS in a string (excluding the
	79	* null terminator)
	80	*
	81	* width w size_t number of display cells on a monospace display
	82	* taken by a string
	83	*
	84	*
	85	* Function naming prefixes:@n
	86	*
	87	* chr_ operate on characters
	88	* ascii_ operate on ASCII characters
	89	* str_ operate on strings
	90	* wstr_ operate on wide strings
	91	*
	92	* [w]str_[n|l|w] operate on a prefix limited by size, length
	93	* or width
	94	*
	95	*
	96	* A specific character inside a [wide] string can be referred to by:@n
	97	*
	98	* pointer (char *, wchar_t *)
	99	* byte offset (size_t)
	100	* character index (size_t)
	101	*
	102	*/
	103
[d066259]	104	#include <str.h>
	105
[4872160]	106	#include <errno.h>
[d735e2e]	107	#include <stdbool.h>
	108	#include <stddef.h>
[10d65d70]	109	#include <stdint.h>
[4872160]	110
/**
 * Evaluate @a cond only if wchar_t is a signed type.
 *
 * When wchar_t is unsigned, a test such as (ch >= 0) is always true, so the
 * macro expands to the constant true instead of the condition.
 */
#ifdef __WCHAR_UNSIGNED__
#define WCHAR_SIGNED_CHECK(cond)  (true)
#else
#define WCHAR_SIGNED_CHECK(cond)  (cond)
#endif

/**
 * Byte mask consisting of lowest @a n bits (out of 8), 0 <= n <= 8.
 *
 * The shift is done on an unsigned operand so that no signed overflow
 * can occur.
 */
#define LO_MASK_8(n)  ((uint8_t) ((1u << (n)) - 1u))

/**
 * Mask consisting of lowest @a n bits (out of 32), 0 <= n <= 31.
 *
 * The shift is done on a 32-bit unsigned operand; shifting a signed 1 into
 * the sign bit (n == 31) would be undefined behavior.
 */
#define LO_MASK_32(n)  ((uint32_t) ((UINT32_C(1) << (n)) - UINT32_C(1)))

/**
 * Byte mask consisting of highest @a n bits (out of 8), 0 <= n <= 8.
 *
 * The complement is computed after integer promotion, so the result is
 * narrowed back to uint8_t to drop the promoted high bits.
 */
#define HI_MASK_8(n)  ((uint8_t) (~LO_MASK_8(8 - (n))))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6
	129
	130	/** Decode a single character from a string.
	131	*
	132	* Decode a single character from a string of size @a size. Decoding starts
	133	* at @a offset and this offset is moved to the beginning of the next
	134	* character. In case of decoding error, offset generally advances at least
	135	* by one. However, offset is never moved beyond size.
	136	*
	137	* @param str String (not necessarily NULL-terminated).
	138	* @param offset Byte offset in string where to start decoding.
	139	* @param size Size of the string (in bytes).
	140	*
	141	* @return Value of decoded character, U_SPECIAL on decoding error or
	142	* NULL if attempt to decode beyond @a size.
	143	*
	144	*/
	145	wchar_t str_decode(const char str, size_t offset, size_t size)
	146	{
	147	if (*offset + 1 > size)
	148	return 0;
[a35b458]	149
[4872160]	150	/* First byte read from string */
	151	uint8_t b0 = (uint8_t) str[(*offset)++];
[a35b458]	152
[4872160]	153	/* Determine code length */
[a35b458]	154
[4872160]	155	unsigned int b0_bits; /* Data bits in first byte */
	156	unsigned int cbytes; /* Number of continuation bytes */
[a35b458]	157
[4872160]	158	if ((b0 & 0x80) == 0) {
	159	/* 0xxxxxxx (Plain ASCII) */
	160	b0_bits = 7;
	161	cbytes = 0;
	162	} else if ((b0 & 0xe0) == 0xc0) {
	163	/* 110xxxxx 10xxxxxx */
	164	b0_bits = 5;
	165	cbytes = 1;
	166	} else if ((b0 & 0xf0) == 0xe0) {
	167	/* 1110xxxx 10xxxxxx 10xxxxxx */
	168	b0_bits = 4;
	169	cbytes = 2;
	170	} else if ((b0 & 0xf8) == 0xf0) {
	171	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	172	b0_bits = 3;
	173	cbytes = 3;
	174	} else {
	175	/* 10xxxxxx -- unexpected continuation byte */
	176	return U_SPECIAL;
	177	}
[a35b458]	178
[4872160]	179	if (*offset + cbytes > size)
	180	return U_SPECIAL;
[a35b458]	181
[4872160]	182	wchar_t ch = b0 & LO_MASK_8(b0_bits);
[a35b458]	183
[4872160]	184	/* Decode continuation bytes */
	185	while (cbytes > 0) {
	186	uint8_t b = (uint8_t) str[(*offset)++];
[a35b458]	187
[4872160]	188	/* Must be 10xxxxxx */
	189	if ((b & 0xc0) != 0x80)
	190	return U_SPECIAL;
[a35b458]	191
[4872160]	192	/* Shift data bits to ch */
	193	ch = (ch << CONT_BITS) \| (wchar_t) (b & LO_MASK_8(CONT_BITS));
	194	cbytes--;
	195	}
[a35b458]	196
[4872160]	197	return ch;
	198	}
	199
	200	/** Encode a single character to string representation.
	201	*
	202	* Encode a single character to string representation (i.e. UTF-8) and store
	203	* it into a buffer at @a offset. Encoding starts at @a offset and this offset
	204	* is moved to the position where the next character can be written to.
	205	*
	206	* @param ch Input character.
	207	* @param str Output buffer.
	208	* @param offset Byte offset where to start writing.
	209	* @param size Size of the output buffer (in bytes).
	210	*
	211	* @return EOK if the character was encoded successfully, EOVERFLOW if there
	212	* was not enough space in the output buffer or EINVAL if the character
	213	* code was invalid.
	214	*/
[d066259]	215	errno_t chr_encode(const wchar_t ch, char str, size_t offset, size_t size)
[4872160]	216	{
	217	if (*offset >= size)
	218	return EOVERFLOW;
[a35b458]	219
[4872160]	220	if (!chr_check(ch))
	221	return EINVAL;
[a35b458]	222
[7c3fb9b]	223	/*
	224	* Unsigned version of ch (bit operations should only be done
	225	* on unsigned types).
	226	*/
[4872160]	227	uint32_t cc = (uint32_t) ch;
[a35b458]	228
[4872160]	229	/* Determine how many continuation bytes are needed */
[a35b458]	230
[4872160]	231	unsigned int b0_bits; /* Data bits in first byte */
	232	unsigned int cbytes; /* Number of continuation bytes */
[a35b458]	233
[4872160]	234	if ((cc & ~LO_MASK_32(7)) == 0) {
	235	b0_bits = 7;
	236	cbytes = 0;
	237	} else if ((cc & ~LO_MASK_32(11)) == 0) {
	238	b0_bits = 5;
	239	cbytes = 1;
	240	} else if ((cc & ~LO_MASK_32(16)) == 0) {
	241	b0_bits = 4;
	242	cbytes = 2;
	243	} else if ((cc & ~LO_MASK_32(21)) == 0) {
	244	b0_bits = 3;
	245	cbytes = 3;
	246	} else {
	247	/* Codes longer than 21 bits are not supported */
	248	return EINVAL;
	249	}
[a35b458]	250
[4872160]	251	/* Check for available space in buffer */
	252	if (*offset + cbytes >= size)
	253	return EOVERFLOW;
[a35b458]	254
[4872160]	255	/* Encode continuation bytes */
	256	unsigned int i;
	257	for (i = cbytes; i > 0; i--) {
	258	str[*offset + i] = 0x80 \| (cc & LO_MASK_32(CONT_BITS));
	259	cc = cc >> CONT_BITS;
	260	}
[a35b458]	261
[4872160]	262	/* Encode first byte */
	263	str[*offset] = (cc & LO_MASK_32(b0_bits)) \| HI_MASK_8(8 - b0_bits - 1);
[a35b458]	264
[4872160]	265	/* Advance offset */
	266	*offset += cbytes + 1;
[a35b458]	267
[4872160]	268	return EOK;
	269	}
	270
	271	/** Get size of string.
	272	*
	273	* Get the number of bytes which are used by the string @a str (excluding the
	274	* NULL-terminator).
	275	*
	276	* @param str String to consider.
	277	*
	278	* @return Number of bytes used by the string
	279	*
	280	*/
	281	size_t str_size(const char *str)
	282	{
	283	size_t size = 0;
[a35b458]	284
[4872160]	285	while (*str++ != 0)
	286	size++;
[a35b458]	287
[4872160]	288	return size;
	289	}
	290
	291	/** Get size of string with length limit.
	292	*
	293	* Get the number of bytes which are used by up to @a max_len first
	294	* characters in the string @a str. If @a max_len is greater than
	295	* the length of @a str, the entire string is measured (excluding the
	296	* NULL-terminator).
	297	*
	298	* @param str String to consider.
	299	* @param max_len Maximum number of characters to measure.
	300	*
	301	* @return Number of bytes used by the characters.
	302	*
	303	*/
	304	size_t str_lsize(const char *str, size_t max_len)
	305	{
	306	size_t len = 0;
	307	size_t offset = 0;
[a35b458]	308
[4872160]	309	while (len < max_len) {
	310	if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
	311	break;
[a35b458]	312
[4872160]	313	len++;
	314	}
[a35b458]	315
[4872160]	316	return offset;
	317	}
	318
	319	/** Get number of characters in a string.
	320	*
	321	* @param str NULL-terminated string.
	322	*
	323	* @return Number of characters in string.
	324	*
	325	*/
	326	size_t str_length(const char *str)
	327	{
	328	size_t len = 0;
	329	size_t offset = 0;
[a35b458]	330
[4872160]	331	while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
	332	len++;
[a35b458]	333
[4872160]	334	return len;
	335	}
	336
	337	/** Check whether character is plain ASCII.
	338	*
	339	* @return True if character is plain ASCII.
	340	*
	341	*/
	342	bool ascii_check(wchar_t ch)
	343	{
[8e893ae]	344	if (WCHAR_SIGNED_CHECK(ch >= 0) && (ch <= 127))
[4872160]	345	return true;
[a35b458]	346
[4872160]	347	return false;
	348	}
	349
	350	/** Check whether character is valid
	351	*
	352	* @return True if character is a valid Unicode code point.
	353	*
	354	*/
	355	bool chr_check(wchar_t ch)
	356	{
[8e893ae]	357	if (WCHAR_SIGNED_CHECK(ch >= 0) && (ch <= 1114111))
[4872160]	358	return true;
[a35b458]	359
[4872160]	360	return false;
	361	}
	362
	363	/** Compare two NULL terminated strings.
	364	*
	365	* Do a char-by-char comparison of two NULL-terminated strings.
[4efeab5]	366	* The strings are considered equal iff their length is equal
	367	* and both strings consist of the same sequence of characters.
	368	*
[1772e6d]	369	* A string S1 is less than another string S2 if it has a character with
	370	* lower value at the first character position where the strings differ.
	371	* If the strings differ in length, the shorter one is treated as if
	372	* padded by characters with a value of zero.
[4872160]	373	*
	374	* @param s1 First string to compare.
	375	* @param s2 Second string to compare.
	376	*
[1772e6d]	377	* @return 0 if the strings are equal, -1 if the first is less than the second,
	378	* 1 if the second is less than the first.
[4872160]	379	*
	380	*/
	381	int str_cmp(const char s1, const char s2)
	382	{
	383	wchar_t c1 = 0;
	384	wchar_t c2 = 0;
[a35b458]	385
[4872160]	386	size_t off1 = 0;
	387	size_t off2 = 0;
[a35b458]	388
[4872160]	389	while (true) {
	390	c1 = str_decode(s1, &off1, STR_NO_LIMIT);
	391	c2 = str_decode(s2, &off2, STR_NO_LIMIT);
[a35b458]	392
[4872160]	393	if (c1 < c2)
	394	return -1;
[a35b458]	395
[4872160]	396	if (c1 > c2)
	397	return 1;
[a35b458]	398
[d066259]	399	if (c1 == 0 \|\| c2 == 0)
[4872160]	400	break;
	401	}
[a35b458]	402
[4872160]	403	return 0;
	404	}
	405
	406	/** Copy string.
	407	*
	408	* Copy source string @a src to destination buffer @a dest.
	409	* No more than @a size bytes are written. If the size of the output buffer
	410	* is at least one byte, the output string will always be well-formed, i.e.
	411	* null-terminated and containing only complete characters.
	412	*
	413	* @param dest Destination buffer.
	414	* @param count Size of the destination buffer (must be > 0).
	415	* @param src Source string.
	416	*
	417	*/
	418	void str_cpy(char dest, size_t size, const char src)
	419	{
	420	size_t src_off = 0;
	421	size_t dest_off = 0;
[a35b458]	422
[4872160]	423	wchar_t ch;
	424	while ((ch = str_decode(src, &src_off, STR_NO_LIMIT)) != 0) {
	425	if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
	426	break;
	427	}
[a35b458]	428
[4872160]	429	dest[dest_off] = '\0';
	430	}
	431
	432	/** @}
	433	*/

Note: See TracBrowser for help on using the repository browser.

Download in other formats: