Context Navigation

str.c@ 08e103d4

Visit:

Last change on this file since 08e103d4 was 08e103d4, checked in by Jiří Zárevúcky <zarevucky.jiri@…>, 6 years ago

Use clearer naming for string length functions

This and the following commit change the names of functions, as well as
their documentation, to use unambiguous terms "bytes" and "code points"
instead of ambiguous terms "size", "length", and "characters".

Property mode set to 100644

File size: 11.7 KB

Rev	Line
[4872160]	1	/*
	2	* Copyright (c) 2001-2004 Jakub Jermar
[d066259]	3	* Copyright (c) 2005 Martin Decky
	4	* Copyright (c) 2008 Jiri Svoboda
	5	* Copyright (c) 2011 Martin Sucha
	6	* Copyright (c) 2011 Oleg Romanenko
[4872160]	7	* All rights reserved.
	8	*
	9	* Redistribution and use in source and binary forms, with or without
	10	* modification, are permitted provided that the following conditions
	11	* are met:
	12	*
	13	* - Redistributions of source code must retain the above copyright
	14	* notice, this list of conditions and the following disclaimer.
	15	* - Redistributions in binary form must reproduce the above copyright
	16	* notice, this list of conditions and the following disclaimer in the
	17	* documentation and/or other materials provided with the distribution.
	18	* - The name of the author may not be used to endorse or promote products
	19	* derived from this software without specific prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
	22	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
	23	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
	24	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
	25	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
	26	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	27	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	28	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	29	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	30	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	31	*/
	32
	33	/**
	34	* @file
	35	* @brief String functions.
	36	*
	37	* Strings and characters use the Universal Character Set (UCS). The standard
	38	* strings, called just strings are encoded in UTF-8. Wide strings (encoded
	39	* in UTF-32) are supported to a limited degree. A single character is
	40	* represented as wchar_t.@n
	41	*
	42	* Overview of the terminology:@n
	43	*
	44	* Term Meaning
	45	* -------------------- ----------------------------------------------------
	46	* byte 8 bits stored in uint8_t (unsigned 8 bit integer)
	47	*
	48	* character UTF-32 encoded Unicode character, stored in wchar_t
	49	* (signed 32 bit integer), code points 0 .. 1114111
	50	* are valid
	51	*
	52	* ASCII character 7 bit encoded ASCII character, stored in char
	53	* (usually signed 8 bit integer), code points 0 .. 127
	54	* are valid
	55	*
	56	* string UTF-8 encoded NULL-terminated Unicode string, char *
	57	*
	58	* wide string UTF-32 encoded NULL-terminated Unicode string,
	59	* wchar_t *
	60	*
	61	* [wide] string size number of BYTES in a [wide] string (excluding
	62	* the NULL-terminator), size_t
	63	*
	64	* [wide] string length number of CHARACTERS in a [wide] string (excluding
	65	* the NULL-terminator), size_t
	66	*
	67	* [wide] string width number of display cells on a monospace display taken
	68	* by a [wide] string, size_t
	69	*
	70	*
	71	* Overview of string metrics:@n
	72	*
	73	* Metric Abbrev. Type Meaning
	74	* ------ ------ ------ -------------------------------------------------
	75	* size n size_t number of BYTES in a string (excluding the
	76	* NULL-terminator)
	77	*
	78	* length l size_t number of CHARACTERS in a string (excluding the
	79	* null terminator)
	80	*
	81	* width w size_t number of display cells on a monospace display
	82	* taken by a string
	83	*
	84	*
	85	* Function naming prefixes:@n
	86	*
	87	* chr_ operate on characters
	88	* ascii_ operate on ASCII characters
	89	* str_ operate on strings
	90	* wstr_ operate on wide strings
	91	*
	92	* [w]str_[n|l|w] operate on a prefix limited by size, length
	93	* or width
	94	*
	95	*
	96	* A specific character inside a [wide] string can be referred to by:@n
	97	*
	98	* pointer (char *, wchar_t *)
	99	* byte offset (size_t)
	100	* character index (size_t)
	101	*
	102	*/
	103
[d066259]	104	#include <str.h>
	105
[4872160]	106	#include <errno.h>
[d735e2e]	107	#include <stdbool.h>
	108	#include <stddef.h>
[10d65d70]	109	#include <stdint.h>
[4872160]	110
/** Guard for conditions that only make sense when wchar_t is signed.
 *
 * When the compiler defines __WCHAR_UNSIGNED__ the condition is replaced
 * by a constant true; otherwise the condition is evaluated as written.
 */
#ifndef __WCHAR_UNSIGNED__
#define WCHAR_SIGNED_CHECK(cond)  (cond)
#else
#define WCHAR_SIGNED_CHECK(cond)  (true)
#endif
	117
/** Byte mask consisting of the lowest @a n bits (out of 8), 0 <= n <= 8.
 *
 * The shift is done on an unsigned operand so that no signed arithmetic
 * is involved.
 */
#define LO_MASK_8(n)  ((uint8_t) ((1U << (n)) - 1))

/** Mask consisting of the lowest @a n bits (out of 32), 0 <= n <= 31.
 *
 * Uses a 32-bit unsigned one so that shifting into bit 31 is well defined.
 */
#define LO_MASK_32(n)  ((uint32_t) ((UINT32_C(1) << (n)) - 1))

/** Byte mask consisting of the highest @a n bits (out of 8), 0 <= n <= 8.
 *
 * The complement is computed after integer promotion, so the result is
 * cast back to uint8_t to yield a genuine 8-bit mask.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6
	129
	130	/** Decode a single character from a string.
	131	*
	132	* Decode a single character from a string of size @a size. Decoding starts
	133	* at @a offset and this offset is moved to the beginning of the next
	134	* character. In case of decoding error, offset generally advances at least
	135	* by one. However, offset is never moved beyond size.
	136	*
	137	* @param str String (not necessarily NULL-terminated).
	138	* @param offset Byte offset in string where to start decoding.
	139	* @param size Size of the string (in bytes).
	140	*
	141	* @return Value of decoded character, U_SPECIAL on decoding error or
	142	* NULL if attempt to decode beyond @a size.
	143	*
	144	*/
	145	wchar_t str_decode(const char str, size_t offset, size_t size)
	146	{
	147	if (*offset + 1 > size)
	148	return 0;
[a35b458]	149
[4872160]	150	/* First byte read from string */
	151	uint8_t b0 = (uint8_t) str[(*offset)++];
[a35b458]	152
[4872160]	153	/* Determine code length */
[a35b458]	154
[4872160]	155	unsigned int b0_bits; /* Data bits in first byte */
	156	unsigned int cbytes; /* Number of continuation bytes */
[a35b458]	157
[4872160]	158	if ((b0 & 0x80) == 0) {
	159	/* 0xxxxxxx (Plain ASCII) */
	160	b0_bits = 7;
	161	cbytes = 0;
	162	} else if ((b0 & 0xe0) == 0xc0) {
	163	/* 110xxxxx 10xxxxxx */
	164	b0_bits = 5;
	165	cbytes = 1;
	166	} else if ((b0 & 0xf0) == 0xe0) {
	167	/* 1110xxxx 10xxxxxx 10xxxxxx */
	168	b0_bits = 4;
	169	cbytes = 2;
	170	} else if ((b0 & 0xf8) == 0xf0) {
	171	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	172	b0_bits = 3;
	173	cbytes = 3;
	174	} else {
	175	/* 10xxxxxx -- unexpected continuation byte */
	176	return U_SPECIAL;
	177	}
[a35b458]	178
[4872160]	179	if (*offset + cbytes > size)
	180	return U_SPECIAL;
[a35b458]	181
[4872160]	182	wchar_t ch = b0 & LO_MASK_8(b0_bits);
[a35b458]	183
[4872160]	184	/* Decode continuation bytes */
	185	while (cbytes > 0) {
	186	uint8_t b = (uint8_t) str[(*offset)++];
[a35b458]	187
[4872160]	188	/* Must be 10xxxxxx */
	189	if ((b & 0xc0) != 0x80)
	190	return U_SPECIAL;
[a35b458]	191
[4872160]	192	/* Shift data bits to ch */
	193	ch = (ch << CONT_BITS) \| (wchar_t) (b & LO_MASK_8(CONT_BITS));
	194	cbytes--;
	195	}
[a35b458]	196
[4872160]	197	return ch;
	198	}
	199
	200	/** Encode a single character to string representation.
	201	*
	202	* Encode a single character to string representation (i.e. UTF-8) and store
	203	* it into a buffer at @a offset. Encoding starts at @a offset and this offset
	204	* is moved to the position where the next character can be written to.
	205	*
	206	* @param ch Input character.
	207	* @param str Output buffer.
	208	* @param offset Byte offset where to start writing.
	209	* @param size Size of the output buffer (in bytes).
	210	*
	211	* @return EOK if the character was encoded successfully, EOVERFLOW if there
	212	* was not enough space in the output buffer or EINVAL if the character
	213	* code was invalid.
	214	*/
[d066259]	215	errno_t chr_encode(const wchar_t ch, char str, size_t offset, size_t size)
[4872160]	216	{
	217	if (*offset >= size)
	218	return EOVERFLOW;
[a35b458]	219
[4872160]	220	if (!chr_check(ch))
	221	return EINVAL;
[a35b458]	222
[7c3fb9b]	223	/*
	224	* Unsigned version of ch (bit operations should only be done
	225	* on unsigned types).
	226	*/
[4872160]	227	uint32_t cc = (uint32_t) ch;
[a35b458]	228
[4872160]	229	/* Determine how many continuation bytes are needed */
[a35b458]	230
[4872160]	231	unsigned int b0_bits; /* Data bits in first byte */
	232	unsigned int cbytes; /* Number of continuation bytes */
[a35b458]	233
[4872160]	234	if ((cc & ~LO_MASK_32(7)) == 0) {
	235	b0_bits = 7;
	236	cbytes = 0;
	237	} else if ((cc & ~LO_MASK_32(11)) == 0) {
	238	b0_bits = 5;
	239	cbytes = 1;
	240	} else if ((cc & ~LO_MASK_32(16)) == 0) {
	241	b0_bits = 4;
	242	cbytes = 2;
	243	} else if ((cc & ~LO_MASK_32(21)) == 0) {
	244	b0_bits = 3;
	245	cbytes = 3;
	246	} else {
	247	/* Codes longer than 21 bits are not supported */
	248	return EINVAL;
	249	}
[a35b458]	250
[4872160]	251	/* Check for available space in buffer */
	252	if (*offset + cbytes >= size)
	253	return EOVERFLOW;
[a35b458]	254
[4872160]	255	/* Encode continuation bytes */
	256	unsigned int i;
	257	for (i = cbytes; i > 0; i--) {
	258	str[*offset + i] = 0x80 \| (cc & LO_MASK_32(CONT_BITS));
	259	cc = cc >> CONT_BITS;
	260	}
[a35b458]	261
[4872160]	262	/* Encode first byte */
	263	str[*offset] = (cc & LO_MASK_32(b0_bits)) \| HI_MASK_8(8 - b0_bits - 1);
[a35b458]	264
[4872160]	265	/* Advance offset */
	266	*offset += cbytes + 1;
[a35b458]	267
[4872160]	268	return EOK;
	269	}
	270
	271	/** Get size of string.
	272	*
	273	* Get the number of bytes which are used by the string @a str (excluding the
	274	* NULL-terminator).
	275	*
	276	* @param str String to consider.
	277	*
	278	* @return Number of bytes used by the string
	279	*
	280	*/
[08e103d4]	281	size_t str_bytes(const char *str)
[4872160]	282	{
	283	size_t size = 0;
[a35b458]	284
[4872160]	285	while (*str++ != 0)
	286	size++;
[a35b458]	287
[4872160]	288	return size;
	289	}
	290
	291	/** Get size of string with length limit.
	292	*
	293	* Get the number of bytes which are used by up to @a max_len first
	294	* characters in the string @a str. If @a max_len is greater than
	295	* the length of @a str, the entire string is measured (excluding the
	296	* NULL-terminator).
	297	*
	298	* @param str String to consider.
	299	* @param max_len Maximum number of characters to measure.
	300	*
	301	* @return Number of bytes used by the characters.
	302	*
	303	*/
[08e103d4]	304	size_t str_lbytes(const char *str, size_t max_len)
[4872160]	305	{
	306	size_t len = 0;
	307	size_t offset = 0;
[a35b458]	308
[4872160]	309	while (len < max_len) {
	310	if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
	311	break;
[a35b458]	312
[4872160]	313	len++;
	314	}
[a35b458]	315
[4872160]	316	return offset;
	317	}
	318
	319	/** Get number of characters in a string.
	320	*
	321	* @param str NULL-terminated string.
	322	*
	323	* @return Number of characters in string.
	324	*
	325	*/
[08e103d4]	326	size_t str_code_points(const char *str)
[4872160]	327	{
	328	size_t len = 0;
	329	size_t offset = 0;
[a35b458]	330
[4872160]	331	while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
	332	len++;
[a35b458]	333
[4872160]	334	return len;
	335	}
	336
	337	/** Check whether character is plain ASCII.
	338	*
	339	* @return True if character is plain ASCII.
	340	*
	341	*/
	342	bool ascii_check(wchar_t ch)
	343	{
[8e893ae]	344	if (WCHAR_SIGNED_CHECK(ch >= 0) && (ch <= 127))
[4872160]	345	return true;
[a35b458]	346
[4872160]	347	return false;
	348	}
	349
	350	/** Check whether character is valid
	351	*
	352	* @return True if character is a valid Unicode code point.
	353	*
	354	*/
	355	bool chr_check(wchar_t ch)
	356	{
[8e893ae]	357	if (WCHAR_SIGNED_CHECK(ch >= 0) && (ch <= 1114111))
[4872160]	358	return true;
[a35b458]	359
[4872160]	360	return false;
	361	}
	362
	363	/** Compare two NULL terminated strings.
	364	*
	365	* Do a char-by-char comparison of two NULL-terminated strings.
[4efeab5]	366	* The strings are considered equal iff their length is equal
	367	* and both strings consist of the same sequence of characters.
	368	*
[1772e6d]	369	* A string S1 is less than another string S2 if it has a character with
	370	* lower value at the first character position where the strings differ.
	371	* If the strings differ in length, the shorter one is treated as if
	372	* padded by characters with a value of zero.
[4872160]	373	*
	374	* @param s1 First string to compare.
	375	* @param s2 Second string to compare.
	376	*
[1772e6d]	377	* @return 0 if the strings are equal, -1 if the first is less than the second,
	378	* 1 if the second is less than the first.
[4872160]	379	*
	380	*/
	381	int str_cmp(const char s1, const char s2)
	382	{
	383	wchar_t c1 = 0;
	384	wchar_t c2 = 0;
[a35b458]	385
[4872160]	386	size_t off1 = 0;
	387	size_t off2 = 0;
[a35b458]	388
[4872160]	389	while (true) {
	390	c1 = str_decode(s1, &off1, STR_NO_LIMIT);
	391	c2 = str_decode(s2, &off2, STR_NO_LIMIT);
[a35b458]	392
[4872160]	393	if (c1 < c2)
	394	return -1;
[a35b458]	395
[4872160]	396	if (c1 > c2)
	397	return 1;
[a35b458]	398
[d066259]	399	if (c1 == 0 \|\| c2 == 0)
[4872160]	400	break;
	401	}
[a35b458]	402
[4872160]	403	return 0;
	404	}
	405
	406	/** Copy string.
	407	*
	408	* Copy source string @a src to destination buffer @a dest.
	409	* No more than @a size bytes are written. If the size of the output buffer
	410	* is at least one byte, the output string will always be well-formed, i.e.
	411	* null-terminated and containing only complete characters.
	412	*
	413	* @param dest Destination buffer.
	414	* @param count Size of the destination buffer (must be > 0).
	415	* @param src Source string.
	416	*
	417	*/
	418	void str_cpy(char dest, size_t size, const char src)
	419	{
	420	size_t src_off = 0;
	421	size_t dest_off = 0;
[a35b458]	422
[4872160]	423	wchar_t ch;
	424	while ((ch = str_decode(src, &src_off, STR_NO_LIMIT)) != 0) {
	425	if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
	426	break;
	427	}
[a35b458]	428
[4872160]	429	dest[dest_off] = '\0';
	430	}
	431
	432	/** @}
	433	*/

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: mainline/boot/generic/src/str.c@ 08e103d4

Download in other formats: