Context Navigation

str.c@ de38873

Visit:

serial ticket/834-toolchain-update topic/msim-upgrade topic/simplify-dev-export

Last change on this file since de38873 was 28a5ebd, checked in by Martin Decky <martin@…>, 5 years ago

Use char32_t instead of wchar_t to represent UTF-32 strings

The intention of the native HelenOS string API has been always to
support Unicode in the UTF-8 and UTF-32 encodings as the sole character
representations and ignore the obsolete mess of older single-byte and
multibyte character encodings. Before C11, the wchar_t type has been
slightly misused for the purpose of the UTF-32 strings. The newer
char32_t type is obviously a much more suitable option. The standard
defines char32_t as uint_least32_t, thus we can take the liberty to fix
it to uint32_t.

To maintain compatibility with the C Standard, the putwchar(wchar_t)
functions have been replaced by our custom putuchar(char32_t) functions
where appropriate.

Property mode set to 100644

File size: 11.5 KB

Rev	Line
[4872160]	1	/*
	2	* Copyright (c) 2001-2004 Jakub Jermar
[d066259]	3	* Copyright (c) 2005 Martin Decky
	4	* Copyright (c) 2008 Jiri Svoboda
	5	* Copyright (c) 2011 Martin Sucha
	6	* Copyright (c) 2011 Oleg Romanenko
[4872160]	7	* All rights reserved.
	8	*
	9	* Redistribution and use in source and binary forms, with or without
	10	* modification, are permitted provided that the following conditions
	11	* are met:
	12	*
	13	* - Redistributions of source code must retain the above copyright
	14	* notice, this list of conditions and the following disclaimer.
	15	* - Redistributions in binary form must reproduce the above copyright
	16	* notice, this list of conditions and the following disclaimer in the
	17	* documentation and/or other materials provided with the distribution.
	18	* - The name of the author may not be used to endorse or promote products
	19	* derived from this software without specific prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
	22	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
	23	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
	24	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
	25	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
	26	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	27	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	28	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	29	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	30	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	31	*/
	32
	33	/**
	34	* @file
	35	* @brief String functions.
	36	*
	37	* Strings and characters use the Universal Character Set (UCS). The standard
	38	* strings, called just strings are encoded in UTF-8. Wide strings (encoded
	39	* in UTF-32) are supported to a limited degree. A single character is
[28a5ebd]	40	* represented as char32_t.@n
[4872160]	41	*
	42	* Overview of the terminology:@n
	43	*
	44	* Term Meaning
	45	* -------------------- ----------------------------------------------------
	46	* byte 8 bits stored in uint8_t (unsigned 8 bit integer)
	47	*
[28a5ebd]	48	* character UTF-32 encoded Unicode character, stored in char32_t
	49	* (unsigned 32 bit integer), code points 0 .. 1114111
[4872160]	50	* are valid
	51	*
	52	* ASCII character 7 bit encoded ASCII character, stored in char
	53	* (usually signed 8 bit integer), code points 0 .. 127
	54	* are valid
	55	*
	56	* string UTF-8 encoded NULL-terminated Unicode string, char *
	57	*
	58	* wide string UTF-32 encoded NULL-terminated Unicode string,
[28a5ebd]	59	* char32_t *
[4872160]	60	*
	61	* [wide] string size number of BYTES in a [wide] string (excluding
	62	* the NULL-terminator), size_t
	63	*
	64	* [wide] string length number of CHARACTERS in a [wide] string (excluding
	65	* the NULL-terminator), size_t
	66	*
	67	* [wide] string width number of display cells on a monospace display taken
	68	* by a [wide] string, size_t
	69	*
	70	*
	71	* Overview of string metrics:@n
	72	*
	73	* Metric Abbrev. Type Meaning
	74	* ------ ------ ------ -------------------------------------------------
	75	* size n size_t number of BYTES in a string (excluding the
	76	* NULL-terminator)
	77	*
	78	* length l size_t number of CHARACTERS in a string (excluding the
	79	* null terminator)
	80	*
	81	* width w size_t number of display cells on a monospace display
	82	* taken by a string
	83	*
	84	*
	85	* Function naming prefixes:@n
	86	*
	87	* chr_ operate on characters
	88	* ascii_ operate on ASCII characters
	89	* str_ operate on strings
	90	* wstr_ operate on wide strings
	91	*
	92	* [w]str_[n|l|w] operate on a prefix limited by size, length
	93	* or width
	94	*
	95	*
	96	* A specific character inside a [wide] string can be referred to by:@n
	97	*
[28a5ebd]	98	* pointer (char *, char32_t *)
[4872160]	99	* byte offset (size_t)
	100	* character index (size_t)
	101	*
	102	*/
	103
[d066259]	104	#include <str.h>
	105
[4872160]	106	#include <errno.h>
[d735e2e]	107	#include <stdbool.h>
	108	#include <stddef.h>
[10d65d70]	109	#include <stdint.h>
[4872160]	110
	111	/** Byte mask consisting of lowest @n bits (out of 8) */
	112	#define LO_MASK_8(n) ((uint8_t) ((1 << (n)) - 1))
	113
	114	/** Byte mask consisting of lowest @n bits (out of 32) */
	115	#define LO_MASK_32(n) ((uint32_t) ((1 << (n)) - 1))
	116
	117	/** Byte mask consisting of highest @n bits (out of 8) */
	118	#define HI_MASK_8(n) (~LO_MASK_8(8 - (n)))
	119
	120	/** Number of data bits in a UTF-8 continuation byte */
	121	#define CONT_BITS 6
	122
	123	/** Decode a single character from a string.
	124	*
	125	* Decode a single character from a string of size @a size. Decoding starts
	126	* at @a offset and this offset is moved to the beginning of the next
	127	* character. In case of decoding error, offset generally advances at least
	128	* by one. However, offset is never moved beyond size.
	129	*
	130	* @param str String (not necessarily NULL-terminated).
	131	* @param offset Byte offset in string where to start decoding.
	132	* @param size Size of the string (in bytes).
	133	*
	134	* @return Value of decoded character, U_SPECIAL on decoding error or
	135	* NULL if attempt to decode beyond @a size.
	136	*
	137	*/
[28a5ebd]	138	char32_t str_decode(const char str, size_t offset, size_t size)
[4872160]	139	{
	140	if (*offset + 1 > size)
	141	return 0;
[a35b458]	142
[4872160]	143	/* First byte read from string */
	144	uint8_t b0 = (uint8_t) str[(*offset)++];
[a35b458]	145
[4872160]	146	/* Determine code length */
[a35b458]	147
[4872160]	148	unsigned int b0_bits; /* Data bits in first byte */
	149	unsigned int cbytes; /* Number of continuation bytes */
[a35b458]	150
[4872160]	151	if ((b0 & 0x80) == 0) {
	152	/* 0xxxxxxx (Plain ASCII) */
	153	b0_bits = 7;
	154	cbytes = 0;
	155	} else if ((b0 & 0xe0) == 0xc0) {
	156	/* 110xxxxx 10xxxxxx */
	157	b0_bits = 5;
	158	cbytes = 1;
	159	} else if ((b0 & 0xf0) == 0xe0) {
	160	/* 1110xxxx 10xxxxxx 10xxxxxx */
	161	b0_bits = 4;
	162	cbytes = 2;
	163	} else if ((b0 & 0xf8) == 0xf0) {
	164	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	165	b0_bits = 3;
	166	cbytes = 3;
	167	} else {
	168	/* 10xxxxxx -- unexpected continuation byte */
	169	return U_SPECIAL;
	170	}
[a35b458]	171
[4872160]	172	if (*offset + cbytes > size)
	173	return U_SPECIAL;
[a35b458]	174
[28a5ebd]	175	char32_t ch = b0 & LO_MASK_8(b0_bits);
[a35b458]	176
[4872160]	177	/* Decode continuation bytes */
	178	while (cbytes > 0) {
	179	uint8_t b = (uint8_t) str[(*offset)++];
[a35b458]	180
[4872160]	181	/* Must be 10xxxxxx */
	182	if ((b & 0xc0) != 0x80)
	183	return U_SPECIAL;
[a35b458]	184
[4872160]	185	/* Shift data bits to ch */
[28a5ebd]	186	ch = (ch << CONT_BITS) \| (char32_t) (b & LO_MASK_8(CONT_BITS));
[4872160]	187	cbytes--;
	188	}
[a35b458]	189
[4872160]	190	return ch;
	191	}
	192
	193	/** Encode a single character to string representation.
	194	*
	195	* Encode a single character to string representation (i.e. UTF-8) and store
	196	* it into a buffer at @a offset. Encoding starts at @a offset and this offset
	197	* is moved to the position where the next character can be written to.
	198	*
	199	* @param ch Input character.
	200	* @param str Output buffer.
	201	* @param offset Byte offset where to start writing.
	202	* @param size Size of the output buffer (in bytes).
	203	*
	204	* @return EOK if the character was encoded successfully, EOVERFLOW if there
	205	* was not enough space in the output buffer or EINVAL if the character
	206	* code was invalid.
	207	*/
[28a5ebd]	208	errno_t chr_encode(const char32_t ch, char str, size_t offset, size_t size)
[4872160]	209	{
	210	if (*offset >= size)
	211	return EOVERFLOW;
[a35b458]	212
[4872160]	213	if (!chr_check(ch))
	214	return EINVAL;
[a35b458]	215
[7c3fb9b]	216	/*
	217	* Unsigned version of ch (bit operations should only be done
	218	* on unsigned types).
	219	*/
[4872160]	220	uint32_t cc = (uint32_t) ch;
[a35b458]	221
[4872160]	222	/* Determine how many continuation bytes are needed */
[a35b458]	223
[4872160]	224	unsigned int b0_bits; /* Data bits in first byte */
	225	unsigned int cbytes; /* Number of continuation bytes */
[a35b458]	226
[4872160]	227	if ((cc & ~LO_MASK_32(7)) == 0) {
	228	b0_bits = 7;
	229	cbytes = 0;
	230	} else if ((cc & ~LO_MASK_32(11)) == 0) {
	231	b0_bits = 5;
	232	cbytes = 1;
	233	} else if ((cc & ~LO_MASK_32(16)) == 0) {
	234	b0_bits = 4;
	235	cbytes = 2;
	236	} else if ((cc & ~LO_MASK_32(21)) == 0) {
	237	b0_bits = 3;
	238	cbytes = 3;
	239	} else {
	240	/* Codes longer than 21 bits are not supported */
	241	return EINVAL;
	242	}
[a35b458]	243
[4872160]	244	/* Check for available space in buffer */
	245	if (*offset + cbytes >= size)
	246	return EOVERFLOW;
[a35b458]	247
[4872160]	248	/* Encode continuation bytes */
	249	unsigned int i;
	250	for (i = cbytes; i > 0; i--) {
	251	str[*offset + i] = 0x80 \| (cc & LO_MASK_32(CONT_BITS));
	252	cc = cc >> CONT_BITS;
	253	}
[a35b458]	254
[4872160]	255	/* Encode first byte */
	256	str[*offset] = (cc & LO_MASK_32(b0_bits)) \| HI_MASK_8(8 - b0_bits - 1);
[a35b458]	257
[4872160]	258	/* Advance offset */
	259	*offset += cbytes + 1;
[a35b458]	260
[4872160]	261	return EOK;
	262	}
	263
	264	/** Get size of string.
	265	*
	266	* Get the number of bytes which are used by the string @a str (excluding the
	267	* NULL-terminator).
	268	*
	269	* @param str String to consider.
	270	*
	271	* @return Number of bytes used by the string
	272	*
	273	*/
	274	size_t str_size(const char *str)
	275	{
	276	size_t size = 0;
[a35b458]	277
[4872160]	278	while (*str++ != 0)
	279	size++;
[a35b458]	280
[4872160]	281	return size;
	282	}
	283
	284	/** Get size of string with length limit.
	285	*
	286	* Get the number of bytes which are used by up to @a max_len first
	287	* characters in the string @a str. If @a max_len is greater than
	288	* the length of @a str, the entire string is measured (excluding the
	289	* NULL-terminator).
	290	*
	291	* @param str String to consider.
	292	* @param max_len Maximum number of characters to measure.
	293	*
	294	* @return Number of bytes used by the characters.
	295	*
	296	*/
	297	size_t str_lsize(const char *str, size_t max_len)
	298	{
	299	size_t len = 0;
	300	size_t offset = 0;
[a35b458]	301
[4872160]	302	while (len < max_len) {
	303	if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
	304	break;
[a35b458]	305
[4872160]	306	len++;
	307	}
[a35b458]	308
[4872160]	309	return offset;
	310	}
	311
	312	/** Get number of characters in a string.
	313	*
	314	* @param str NULL-terminated string.
	315	*
	316	* @return Number of characters in string.
	317	*
	318	*/
	319	size_t str_length(const char *str)
	320	{
	321	size_t len = 0;
	322	size_t offset = 0;
[a35b458]	323
[4872160]	324	while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
	325	len++;
[a35b458]	326
[4872160]	327	return len;
	328	}
	329
	330	/** Check whether character is plain ASCII.
	331	*
	332	* @return True if character is plain ASCII.
	333	*
	334	*/
[28a5ebd]	335	bool ascii_check(char32_t ch)
[4872160]	336	{
[28a5ebd]	337	if (ch <= 127)
[4872160]	338	return true;
[a35b458]	339
[4872160]	340	return false;
	341	}
	342
	343	/** Check whether character is valid
	344	*
	345	* @return True if character is a valid Unicode code point.
	346	*
	347	*/
[28a5ebd]	348	bool chr_check(char32_t ch)
[4872160]	349	{
[28a5ebd]	350	if (ch <= 1114111)
[4872160]	351	return true;
[a35b458]	352
[4872160]	353	return false;
	354	}
	355
	356	/** Compare two NULL terminated strings.
	357	*
	358	* Do a char-by-char comparison of two NULL-terminated strings.
[4efeab5]	359	* The strings are considered equal iff their length is equal
	360	* and both strings consist of the same sequence of characters.
	361	*
[1772e6d]	362	* A string S1 is less than another string S2 if it has a character with
	363	* lower value at the first character position where the strings differ.
	364	* If the strings differ in length, the shorter one is treated as if
	365	* padded by characters with a value of zero.
[4872160]	366	*
	367	* @param s1 First string to compare.
	368	* @param s2 Second string to compare.
	369	*
[1772e6d]	370	* @return 0 if the strings are equal, -1 if the first is less than the second,
	371	* 1 if the second is less than the first.
[4872160]	372	*
	373	*/
	374	int str_cmp(const char s1, const char s2)
	375	{
[28a5ebd]	376	char32_t c1 = 0;
	377	char32_t c2 = 0;
[a35b458]	378
[4872160]	379	size_t off1 = 0;
	380	size_t off2 = 0;
[a35b458]	381
[4872160]	382	while (true) {
	383	c1 = str_decode(s1, &off1, STR_NO_LIMIT);
	384	c2 = str_decode(s2, &off2, STR_NO_LIMIT);
[a35b458]	385
[4872160]	386	if (c1 < c2)
	387	return -1;
[a35b458]	388
[4872160]	389	if (c1 > c2)
	390	return 1;
[a35b458]	391
[d066259]	392	if (c1 == 0 \|\| c2 == 0)
[4872160]	393	break;
	394	}
[a35b458]	395
[4872160]	396	return 0;
	397	}
	398
	399	/** Copy string.
	400	*
	401	* Copy source string @a src to destination buffer @a dest.
	402	* No more than @a size bytes are written. If the size of the output buffer
	403	* is at least one byte, the output string will always be well-formed, i.e.
	404	* null-terminated and containing only complete characters.
	405	*
	406	* @param dest Destination buffer.
	407	* @param count Size of the destination buffer (must be > 0).
	408	* @param src Source string.
	409	*
	410	*/
	411	void str_cpy(char dest, size_t size, const char src)
	412	{
	413	size_t src_off = 0;
	414	size_t dest_off = 0;
[a35b458]	415
[28a5ebd]	416	char32_t ch;
[4872160]	417	while ((ch = str_decode(src, &src_off, STR_NO_LIMIT)) != 0) {
	418	if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
	419	break;
	420	}
[a35b458]	421
[4872160]	422	dest[dest_off] = '\0';
	423	}
	424
	425	/** @}
	426	*/

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: mainline/boot/generic/src/str.c@ de38873

Download in other formats: