Context Navigation

source: mainline/common/str.c@ ae787807

Visit:

Last change on this file since ae787807 was b31323f, checked in by Jiří Zárevúcky <zarevucky.jiri@…>, 2 months ago
Test, fix and extend string sanitization
Property mode set to `100644`
File size: 45.6 KB

Rev	Line
[936351c1]	1	/*
[d066259]	2	* Copyright (c) 2001-2004 Jakub Jermar
[df4ed85]	3	* Copyright (c) 2005 Martin Decky
[576845ec]	4	* Copyright (c) 2008 Jiri Svoboda
[22cf42d9]	5	* Copyright (c) 2011 Martin Sucha
[c4bbca8]	6	* Copyright (c) 2011 Oleg Romanenko
[65bf084]	7	* Copyright (c) 2025 Jiří Zárevúcky
[936351c1]	8	* All rights reserved.
	9	*
	10	* Redistribution and use in source and binary forms, with or without
	11	* modification, are permitted provided that the following conditions
	12	* are met:
	13	*
	14	* - Redistributions of source code must retain the above copyright
	15	* notice, this list of conditions and the following disclaimer.
	16	* - Redistributions in binary form must reproduce the above copyright
	17	* notice, this list of conditions and the following disclaimer in the
	18	* documentation and/or other materials provided with the distribution.
	19	* - The name of the author may not be used to endorse or promote products
	20	* derived from this software without specific prior written permission.
	21	*
	22	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
	23	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
	24	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
	25	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
	27	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	28	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	29	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	30	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	31	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	32	*/
	33
[a46da63]	34	/** @addtogroup libc
[b2951e2]	35	* @{
	36	*/
[d066259]	37
	38	/**
	39	* @file
	40	* @brief String functions.
	41	*
	42	* Strings and characters use the Universal Character Set (UCS). The standard
	43	* strings, called just strings are encoded in UTF-8. Wide strings (encoded
	44	* in UTF-32) are supported to a limited degree. A single character is
[28a5ebd]	45	* represented as char32_t.@n
[d066259]	46	*
	47	* Overview of the terminology:@n
	48	*
	49	* Term Meaning
	50	* -------------------- ----------------------------------------------------
	51	* byte 8 bits stored in uint8_t (unsigned 8 bit integer)
	52	*
[28a5ebd]	53	* character UTF-32 encoded Unicode character, stored in char32_t
	54	* (unsigned 32 bit integer), code points 0 .. 1114111
[d066259]	55	* are valid
	56	*
[28c39f3]	57	* Note that Unicode characters do not match
	58	* one-to-one with displayed characters or glyphs on
	59	* screen. For that level of precision, look up
	60	* Grapheme Clusters.
	61	*
[d066259]	62	* ASCII character 7 bit encoded ASCII character, stored in char
	63	* (usually signed 8 bit integer), code points 0 .. 127
	64	* are valid
	65	*
	66	* string UTF-8 encoded NULL-terminated Unicode string, char *
	67	*
	68	* wide string UTF-32 encoded NULL-terminated Unicode string,
[28a5ebd]	69	* char32_t *
[d066259]	70	*
	71	* [wide] string size number of BYTES in a [wide] string (excluding
	72	* the NULL-terminator), size_t
	73	*
	74	* [wide] string length number of CHARACTERS in a [wide] string (excluding
	75	* the NULL-terminator), size_t
	76	*
	77	* [wide] string width number of display cells on a monospace display taken
	78	* by a [wide] string, size_t
	79	*
[28c39f3]	80	* This is virtually impossible to determine exactly for
	81	* all strings without knowing specifics of the display
	82	* device, due to various factors affecting text output.
	83	* If you have the option to query the terminal for
	84	* position change caused by outputting the string,
	85	* it is preferable to determine width that way.
	86	*
[d066259]	87	*
	88	* Overview of string metrics:@n
	89	*
	90	* Metric Abbrev. Type Meaning
	91	* ------ ------ ------ -------------------------------------------------
	92	* size n size_t number of BYTES in a string (excluding the
	93	* NULL-terminator)
	94	*
	95	* length l size_t number of CHARACTERS in a string (excluding the
	96	* null terminator)
	97	*
	98	* width w size_t number of display cells on a monospace display
	99	* taken by a string
	100	*
	101	*
	102	* Function naming prefixes:@n
	103	*
	104	* chr_ operate on characters
	105	* ascii_ operate on ASCII characters
	106	* str_ operate on strings
	107	* wstr_ operate on wide strings
	108	*
	109	* [w]str_[n|l|w] operate on a prefix limited by size, length
	110	* or width
	111	*
	112	*
	113	* A specific character inside a [wide] string can be referred to by:@n
	114	*
[28a5ebd]	115	* pointer (char *, char32_t *)
[d066259]	116	* byte offset (size_t)
	117	* character index (size_t)
	118	*
[b2951e2]	119	*/
	120
[19f857a]	121	#include <str.h>
[d066259]	122
[28c39f3]	123	#include <align.h>
[38d150e]	124	#include <assert.h>
[e64c4b2]	125	#include <ctype.h>
[171f9a1]	126	#include <errno.h>
[65bf084]	127	#include <limits.h>
[28c39f3]	128	#include <macros.h>
	129	#include <mem.h>
[d066259]	130	#include <stdbool.h>
	131	#include <stddef.h>
	132	#include <stdint.h>
	133	#include <stdlib.h>
[28c39f3]	134	#include <uchar.h>
[171f9a1]	135
[65bf084]	136	#if __STDC_HOSTED__
	137	#include <fibril.h>
	138	#endif
	139
	140	static void _set_ilseq()
	141	{
	142	#ifdef errno
	143	errno = EILSEQ;
	144	#endif
	145	}
	146
/** Byte mask consisting of lowest @n bits (out of 8) */
#define LO_MASK_8(n)  ((uint8_t) ((1u << (n)) - 1))

/**
 * Mask consisting of lowest @n bits (out of 32), valid for @n < 32.
 *
 * The shift is done on an unsigned 32-bit one so that no signed
 * overflow can occur for large @n.
 */
#define LO_MASK_32(n)  ((uint32_t) ((UINT32_C(1) << (n)) - 1))

/**
 * Byte mask consisting of highest @n bits (out of 8).
 *
 * The result is cast back to uint8_t, because the complement operator
 * promotes its operand to int and would otherwise yield a negative value.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6

/* Payload bits of the initial byte of a 2, 3 and 4 byte sequence. */
#define UTF8_MASK_INITIAL2  0b00011111
#define UTF8_MASK_INITIAL3  0b00001111
#define UTF8_MASK_INITIAL4  0b00000111
/* Payload bits of a continuation byte. */
#define UTF8_MASK_CONT      0b00111111

/* Internal marker returned by the decoder for an invalid sequence. */
#define CHAR_INVALID  ((char32_t) UINT_MAX)
	165
/** Tell whether a byte is a single-byte (7-bit) UTF-8 character: 0xxxxxxx. */
static inline bool _is_ascii(uint8_t b)
{
	return (b & 0x80) == 0;
}
	170
/** Tell whether a byte is a UTF-8 continuation byte: 10xxxxxx. */
static inline bool _is_continuation(uint8_t b)
{
	return (b >> 6) == 0x02;
}
	175
	176	static inline bool _is_2_byte(uint8_t c)
	177	{
	178	return (c & 0xE0) == 0xC0;
	179	}
	180
	181	static inline bool _is_3_byte(uint8_t c)
	182	{
	183	return (c & 0xF0) == 0xE0;
	184	}
	185
	186	static inline bool _is_4_byte(uint8_t c)
	187	{
	188	return (c & 0xF8) == 0xF0;
[28c39f3]	189	}
	190
	191	static inline int _char_continuation_bytes(char32_t c)
	192	{
[6120b7b]	193	if ((c & ~LO_MASK_32(7)) == 0)
	194	return 0;
	195
[28c39f3]	196	if ((c & ~LO_MASK_32(11)) == 0)
	197	return 1;
	198
	199	if ((c & ~LO_MASK_32(16)) == 0)
	200	return 2;
	201
	202	if ((c & ~LO_MASK_32(21)) == 0)
	203	return 3;
	204
	205	/* Codes longer than 21 bits are not supported */
	206	return -1;
	207	}
	208
	209	static inline int _continuation_bytes(uint8_t b)
	210	{
	211	/* 0xxxxxxx */
	212	if (_is_ascii(b))
	213	return 0;
	214
	215	/* 110xxxxx 10xxxxxx */
[65bf084]	216	if (_is_2_byte(b))
[28c39f3]	217	return 1;
	218
	219	/* 1110xxxx 10xxxxxx 10xxxxxx */
[65bf084]	220	if (_is_3_byte(b))
[28c39f3]	221	return 2;
	222
	223	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
[65bf084]	224	if (_is_4_byte(b))
[28c39f3]	225	return 3;
	226
	227	return -1;
	228	}
	229
[65bf084]	230	static bool _is_non_shortest(const mbstate_t *mb, uint8_t b)
	231	{
	232	return (mb->state == 0b1111110000000000 && !(b & 0b00100000)) \|\|
	233	(mb->state == 0b1111111111110000 && !(b & 0b00110000));
	234	}
	235
[b31323f]	236	static bool _is_surrogate(const mbstate_t *mb, uint8_t b)
	237	{
	238	return (mb->state == 0b1111110000001101 && b >= 0xa0);
	239	}
	240
/*
 * Branch prediction hints. The expression is normalised to 0/1 so that
 * any non-zero value matches the expected "true" value.
 */
#define _likely(expr)    __builtin_expect(!!(expr), 1)
#define _unlikely(expr)  __builtin_expect(!!(expr), 0)

/* Enables the single-pass shortcuts in _str_decode(). */
#define FAST_PATHS  1
	245
	246	static char32_t _str_decode(const char s, size_t offset, size_t size, mbstate_t *mb)
	247	{
	248	assert(s);
	249	assert(offset);
	250	assert(*offset <= size);
	251	assert(size == STR_NO_LIMIT \|\| s + size >= s);
	252	assert(mb);
	253
	254	if (*offset == size)
	255	return 0;
	256
	257	if (_likely(!mb->state)) {
	258	/* Clean slate, read initial byte. */
	259	uint8_t b = s[(*offset)++];
	260
	261	/* Fast exit for the most common case. */
	262	if (_likely(_is_ascii(b)))
	263	return b;
	264
	265	/* unexpected continuation byte */
	266	if (_unlikely(_is_continuation(b)))
	267	return CHAR_INVALID;
	268
	269	/*
	270	* The value stored into `continuation` is designed to have
	271	* just enough leading ones that after shifting in one less than
	272	* the expected number of continuation bytes, the most significant
	273	* bit becomes zero. (The field is 16b wide.)
	274	*/
	275
	276	if (_is_2_byte(b)) {
	277	/* Reject non-shortest form. */
	278	if (_unlikely(!(b & 0b00011110)))
	279	return CHAR_INVALID;
	280
	281	#if FAST_PATHS
	282	/* We can usually take this exit. */
	283	if (_likely(offset < size && _is_continuation(s[offset])))
	284	return (b & UTF8_MASK_INITIAL2) << 6 \|
	285	(s[(*offset)++] & UTF8_MASK_CONT);
	286	#endif
	287
	288	/* 2 byte continuation 110xxxxx */
	289	mb->state = b ^ 0b0000000011000000;
	290
	291	} else if (_is_3_byte(b)) {
	292	#if FAST_PATHS
	293	/* We can usually take this exit. */
	294	if (_likely(offset + 1 < size && _is_continuation(s[offset]) && _is_continuation(s[*offset + 1]))) {
	295
	296	char32_t ch = (b & UTF8_MASK_INITIAL3) << 12 \|
	297	(s[(*offset)] & UTF8_MASK_CONT) << 6 \|
	298	(s[(*offset) + 1] & UTF8_MASK_CONT);
	299
	300	*offset += 2;
	301
	302	/* Reject non-shortest form. */
	303	if (_unlikely(!(ch & 0xFFFFF800)))
	304	return CHAR_INVALID;
	305
[b31323f]	306	/* Reject surrogates */
	307	if (_unlikely(ch >= 0xD800 && ch < 0xE000))
	308	return CHAR_INVALID;
	309
[65bf084]	310	return ch;
	311	}
	312	#endif
	313
	314	/* 3 byte continuation 1110xxxx */
	315	mb->state = b ^ 0b1111110011100000;
	316
	317	} else if (_is_4_byte(b)) {
	318	#if FAST_PATHS
	319	/* We can usually take this exit. */
	320	if (_likely(offset + 2 < size && _is_continuation(s[offset]) &&
	321	_is_continuation(s[offset + 1]) && _is_continuation(s[offset + 2]))) {
	322
	323	char32_t ch = (b & UTF8_MASK_INITIAL4) << 18 \|
	324	(s[(*offset)] & UTF8_MASK_CONT) << 12 \|
	325	(s[(*offset) + 1] & UTF8_MASK_CONT) << 6 \|
	326	(s[(*offset) + 2] & UTF8_MASK_CONT);
	327
	328	*offset += 3;
	329
	330	/* Reject non-shortest form. */
	331	if (_unlikely(!(ch & 0xFFFF0000)))
	332	return CHAR_INVALID;
	333
[b31323f]	334	/* Reject out-of-range characters. */
	335	if (_unlikely(ch >= 0x110000))
	336	return CHAR_INVALID;
	337
[65bf084]	338	return ch;
	339	}
	340	#endif
	341
	342	/* 4 byte continuation 11110xxx */
	343	mb->state = b ^ 0b1111111100000000;
	344	} else {
	345	return CHAR_INVALID;
	346	}
	347	}
	348
	349	/* Deal with the remaining edge and invalid cases. */
	350	for (; offset < size; (offset)++) {
	351	/* Read continuation bytes. */
	352	uint8_t b = s[*offset];
	353
[b31323f]	354	if (!_is_continuation(b) \|\| _is_non_shortest(mb, b) \|\| _is_surrogate(mb, b)) {
[65bf084]	355	mb->state = 0;
	356	return CHAR_INVALID;
	357	}
	358
	359	/* Top bit becomes zero when shifting in the second to last byte. */
	360	if (!(mb->state & 0x8000)) {
	361	char32_t c = ((char32_t) mb->state) << 6 \| (b & UTF8_MASK_CONT);
	362	mb->state = 0;
	363	(*offset)++;
	364	return c;
	365	}
	366
	367	mb->state = mb->state << 6 \| (b & UTF8_MASK_CONT);
	368	}
	369
	370	/* Incomplete character. */
	371	assert(mb->state);
	372	return 0;
	373	}
	374
	375	/** Standard <uchar.h> function since C11. */
	376	size_t mbrtoc32(char32_t c, const char s, size_t n, mbstate_t *mb)
	377	{
	378	#if __STDC_HOSTED__
	379	static fibril_local mbstate_t global_state = { };
	380
	381	if (!mb)
	382	mb = &global_state;
	383	#endif
	384
	385	if (!s) {
	386	/* Equivalent to mbrtoc32(NULL, "", 1, mb); */
	387	c = NULL;
	388	s = "";
	389	n = 1;
	390	}
	391
	392	size_t offset = 0;
	393	char32_t ret = _str_decode(s, &offset, n, mb);
	394	if (ret == CHAR_INVALID) {
	395	assert(!mb->state);
	396	_set_ilseq();
	397	return UCHAR_ILSEQ;
	398	}
	399	if (mb->state) {
	400	assert(ret == 0);
	401	return UCHAR_INCOMPLETE;
	402	}
	403
	404	if (c)
	405	*c = ret;
	406	return ret ? offset : 0;
	407	}
	408
[171f9a1]	409	/** Decode a single character from a string.
	410	*
	411	* Decode a single character from a string of size @a size. Decoding starts
	412	* at @a offset and this offset is moved to the beginning of the next
	413	* character. In case of decoding error, offset generally advances at least
	414	* by one. However, offset is never moved beyond size.
	415	*
	416	* @param str String (not necessarily NULL-terminated).
	417	* @param offset Byte offset in string where to start decoding.
	418	* @param size Size of the string (in bytes).
	419	*
	420	* @return Value of decoded character, U_SPECIAL on decoding error or
	421	* NULL if attempt to decode beyond @a size.
	422	*
	423	*/
[28a5ebd]	424	char32_t str_decode(const char str, size_t offset, size_t size)
[171f9a1]	425	{
[65bf084]	426	mbstate_t mb = { };
	427	char32_t ch = _str_decode(str, offset, size, &mb);
[28c39f3]	428
[65bf084]	429	if (ch == CHAR_INVALID)
[171f9a1]	430	return U_SPECIAL;
[a35b458]	431
[65bf084]	432	if (mb.state)
[0600976]	433	return U_SPECIAL;
	434
[171f9a1]	435	return ch;
	436	}
	437
[568693b]	438	/** Decode a single character from a string to the left.
	439	*
	440	* Decode a single character from a string of size @a size. Decoding starts
	441	* at @a offset and this offset is moved to the beginning of the previous
	442	* character. In case of decoding error, offset generally decreases at least
	443	* by one. However, offset is never moved before 0.
	444	*
	445	* @param str String (not necessarily NULL-terminated).
	446	* @param offset Byte offset in string where to start decoding.
	447	* @param size Size of the string (in bytes).
	448	*
	449	* @return Value of decoded character, U_SPECIAL on decoding error or
	450	* NULL if attempt to decode beyond @a start of str.
	451	*
	452	*/
[28a5ebd]	453	char32_t str_decode_reverse(const char str, size_t offset, size_t size)
[568693b]	454	{
	455	if (*offset == 0)
	456	return 0;
[a35b458]	457
[28c39f3]	458	int cbytes = 0;
[568693b]	459	/* Continue while continuation bytes found */
[28c39f3]	460	while (*offset > 0 && cbytes < 4) {
[568693b]	461	uint8_t b = (uint8_t) str[--(*offset)];
[a35b458]	462
[65bf084]	463	if (_is_continuation(b)) {
[28c39f3]	464	cbytes++;
	465	continue;
[568693b]	466	}
[28c39f3]	467
[65bf084]	468	/* Reject non-shortest form encoding. */
[28c39f3]	469	if (cbytes != _continuation_bytes(b))
	470	return U_SPECIAL;
	471
	472	/* Start byte */
	473	size_t start_offset = *offset;
	474	return str_decode(str, &start_offset, size);
[568693b]	475	}
[28c39f3]	476
[568693b]	477	/* Too many continuation bytes */
	478	return U_SPECIAL;
	479	}
	480
[171f9a1]	481	/** Encode a single character to string representation.
	482	*
	483	* Encode a single character to string representation (i.e. UTF-8) and store
	484	* it into a buffer at @a offset. Encoding starts at @a offset and this offset
	485	* is moved to the position where the next character can be written to.
	486	*
	487	* @param ch Input character.
	488	* @param str Output buffer.
	489	* @param offset Byte offset where to start writing.
	490	* @param size Size of the output buffer (in bytes).
	491	*
	492	* @return EOK if the character was encoded successfully, EOVERFLOW if there
[d4a3ee5]	493	* was not enough space in the output buffer or EINVAL if the character
	494	* code was invalid.
[171f9a1]	495	*/
[28c39f3]	496	errno_t chr_encode(char32_t ch, char str, size_t offset, size_t size)
[171f9a1]	497	{
[65bf084]	498	// TODO: merge with c32rtomb()
	499
[171f9a1]	500	if (*offset >= size)
	501	return EOVERFLOW;
[a35b458]	502
[28c39f3]	503	/* Fast exit for the most common case. */
	504	if (ch < 0x80) {
	505	str[(*offset)++] = (char) ch;
	506	return EOK;
	507	}
	508
	509	/* Codes longer than 21 bits are not supported */
[171f9a1]	510	if (!chr_check(ch))
	511	return EINVAL;
[a35b458]	512
[171f9a1]	513	/* Determine how many continuation bytes are needed */
[a35b458]	514
[28c39f3]	515	unsigned int cbytes = _char_continuation_bytes(ch);
	516	unsigned int b0_bits = 6 - cbytes; /* Data bits in first byte */
[a35b458]	517
[171f9a1]	518	/* Check for available space in buffer */
	519	if (*offset + cbytes >= size)
	520	return EOVERFLOW;
[a35b458]	521
[171f9a1]	522	/* Encode continuation bytes */
	523	unsigned int i;
	524	for (i = cbytes; i > 0; i--) {
[28c39f3]	525	str[*offset + i] = 0x80 \| (ch & LO_MASK_32(CONT_BITS));
	526	ch >>= CONT_BITS;
[171f9a1]	527	}
[a35b458]	528
[171f9a1]	529	/* Encode first byte */
[28c39f3]	530	str[*offset] = (ch & LO_MASK_32(b0_bits)) \| HI_MASK_8(8 - b0_bits - 1);
[a35b458]	531
[171f9a1]	532	/* Advance offset */
	533	*offset += cbytes + 1;
[a35b458]	534
[171f9a1]	535	return EOK;
	536	}
	537
/** Convert in place any bytes that don't form a valid character into replacement.
 *
 * Scanning stops at the first zero byte or after @a n bytes, whichever
 * comes first. Only the lead byte of a bad sequence is replaced; the bytes
 * following it are then examined on their own.
 *
 * @param str         Buffer to sanitize.
 * @param n           Maximum number of bytes to examine.
 * @param replacement Byte written over every offending byte.
 *
 * @return Number of bytes that were replaced.
 */
static size_t _str_sanitize(char *str, size_t n, uint8_t replacement)
{
	uint8_t *b = (uint8_t *) str;
	size_t count = 0;

	for (; n > 0 && b[0]; b++, n--) {
		int cont = _continuation_bytes(b[0]);
		if (__builtin_expect(cont, 0) == 0)
			continue;

		/* Bad lead byte, or the sequence would not fit into n bytes. */
		bool valid = cont > 0 && n > (size_t) cont;

		/* Check continuation bytes. */
		for (int i = 1; valid && i <= cont; i++)
			valid = _is_continuation(b[i]);

		if (valid) {
			switch (cont) {
			case 1:
				/*
				 * Check for non-shortest form encoding.
				 * See https://www.unicode.org/versions/corrigendum1.html
				 *
				 * 0b110!!!!x 0b10xxxxxx
				 */
				valid = (b[0] & 0b00011110) != 0;
				break;
			case 2:
				/* 0b1110!!!! 0b10!xxxxx 0b10xxxxxx */
				if (!(b[0] & 0b00001111) && !(b[1] & 0b00100000))
					valid = false;

				/* Check for surrogate character encoding. */
				if (b[0] == 0xED && b[1] >= 0xA0)
					valid = false;
				break;
			case 3:
				/* 0b11110!!! 0b10!!xxxx 0b10xxxxxx 0b10xxxxxx */
				if (!(b[0] & 0b00000111) && !(b[1] & 0b00110000))
					valid = false;

				/* Check for out-of-range code points. */
				if (b[0] > 0xF4 || (b[0] == 0xF4 && b[1] >= 0x90))
					valid = false;
				break;
			}
		}

		if (!valid) {
			b[0] = replacement;
			count++;
			continue;
		}

		/* Skip the continuation bytes of a valid character. */
		b += cont;
		n -= cont;
	}

	return count;
}
	616
	617	size_t str_sanitize(char *str, size_t n, uint8_t replacement)
	618	{
	619	return _str_sanitize(str, n, replacement);
[28c39f3]	620	}
	621
	622	static size_t _str_size(const char *str)
	623	{
	624	size_t size = 0;
	625
	626	while (*str++ != 0)
	627	size++;
	628
	629	return size;
	630	}
	631
/** Get size of string.
 *
 * Return how many bytes the string @a str occupies, not counting the
 * terminating zero byte.
 *
 * @param str String to consider.
 *
 * @return Number of bytes used by the string
 *
 */
size_t str_size(const char *str)
{
	size_t bytes = _str_size(str);

	return bytes;
}
	646
	647	/** Get size of wide string.
	648	*
	649	* Get the number of bytes which are used by the wide string @a str (excluding the
	650	* NULL-terminator).
	651	*
	652	* @param str Wide string to consider.
	653	*
	654	* @return Number of bytes used by the wide string
	655	*
	656	*/
[28a5ebd]	657	size_t wstr_size(const char32_t *str)
[f2b8cdc]	658	{
[28a5ebd]	659	return (wstr_length(str) * sizeof(char32_t));
[f2b8cdc]	660	}
	661
	662	/** Get size of string with length limit.
	663	*
	664	* Get the number of bytes which are used by up to @a max_len first
	665	* characters in the string @a str. If @a max_len is greater than
	666	* the length of @a str, the entire string is measured (excluding the
	667	* NULL-terminator).
	668	*
	669	* @param str String to consider.
	670	* @param max_len Maximum number of characters to measure.
	671	*
	672	* @return Number of bytes used by the characters.
	673	*
	674	*/
[d4a3ee5]	675	size_t str_lsize(const char *str, size_t max_len)
[f2b8cdc]	676	{
[d4a3ee5]	677	size_t len = 0;
[f2b8cdc]	678	size_t offset = 0;
[a35b458]	679
[f2b8cdc]	680	while (len < max_len) {
	681	if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
	682	break;
[a35b458]	683
[f2b8cdc]	684	len++;
	685	}
[a35b458]	686
[f2b8cdc]	687	return offset;
	688	}
	689
/** Count the bytes of a string up to its terminator or @a max_size.
 *
 * The bound is tested before each byte is read, so at most @a max_size
 * bytes of @a str are ever accessed.
 */
static size_t _str_nsize(const char *str, size_t max_size)
{
	size_t size = 0;

	while (size < max_size && str[size] != 0)
		size++;

	return size;
}
	699
/** Get size of string with size limit.
 *
 * Return how many bytes the string @a str occupies (not counting the
 * terminating zero byte), capped at @a max_size bytes.
 *
 * @param str      String to consider.
 * @param max_size Maximum number of bytes to measure.
 *
 * @return Number of bytes used by the string
 *
 */
size_t str_nsize(const char *str, size_t max_size)
{
	size_t bytes = _str_nsize(str, max_size);

	return bytes;
}
	715
	716	/** Get size of wide string with size limit.
	717	*
	718	* Get the number of bytes which are used by the wide string @a str
	719	* (excluding the NULL-terminator), but no more than @max_size bytes.
	720	*
	721	* @param str Wide string to consider.
	722	* @param max_size Maximum number of bytes to measure.
	723	*
	724	* @return Number of bytes used by the wide string
	725	*
	726	*/
[28a5ebd]	727	size_t wstr_nsize(const char32_t *str, size_t max_size)
[560d79f]	728	{
[28a5ebd]	729	return (wstr_nlength(str, max_size) * sizeof(char32_t));
[560d79f]	730	}
	731
/** Get size of wide string with length limit.
 *
 * Get the number of bytes which are used by up to @a max_len first
 * wide characters in the wide string @a str. If @a max_len is greater than
 * the length of @a str, the entire wide string is measured (excluding the
 * NULL-terminator).
 *
 * @param str     Wide string to consider.
 * @param max_len Maximum number of wide characters to measure.
 *
 * @return Number of bytes used by the wide characters.
 *
 */
size_t wstr_lsize(const char32_t *str, size_t max_len)
{
	size_t len = 0;

	/*
	 * Count characters directly. Converting max_len to a byte count first
	 * would wrap around for very large limits.
	 */
	while (len < max_len && str[len] != 0)
		len++;

	return len * sizeof(char32_t);
}
	749
	750	/** Get number of characters in a string.
	751	*
	752	* @param str NULL-terminated string.
	753	*
	754	* @return Number of characters in string.
	755	*
	756	*/
[d4a3ee5]	757	size_t str_length(const char *str)
[f2b8cdc]	758	{
[d4a3ee5]	759	size_t len = 0;
[f2b8cdc]	760	size_t offset = 0;
[a35b458]	761
[f2b8cdc]	762	while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
	763	len++;
[a35b458]	764
[f2b8cdc]	765	return len;
	766	}
	767
	768	/** Get number of characters in a wide string.
	769	*
	770	* @param str NULL-terminated wide string.
	771	*
	772	* @return Number of characters in @a str.
	773	*
	774	*/
[28a5ebd]	775	size_t wstr_length(const char32_t *wstr)
[f2b8cdc]	776	{
[d4a3ee5]	777	size_t len = 0;
[a35b458]	778
[f2b8cdc]	779	while (*wstr++ != 0)
	780	len++;
[a35b458]	781
[f2b8cdc]	782	return len;
	783	}
	784
	785	/** Get number of characters in a string with size limit.
	786	*
	787	* @param str NULL-terminated string.
	788	* @param size Maximum number of bytes to consider.
	789	*
	790	* @return Number of characters in string.
	791	*
	792	*/
[d4a3ee5]	793	size_t str_nlength(const char *str, size_t size)
[f2b8cdc]	794	{
[d4a3ee5]	795	size_t len = 0;
[f2b8cdc]	796	size_t offset = 0;
[a35b458]	797
[f2b8cdc]	798	while (str_decode(str, &offset, size) != 0)
	799	len++;
[a35b458]	800
[f2b8cdc]	801	return len;
	802	}
	803
	804	/** Get number of characters in a string with size limit.
	805	*
	806	* @param str NULL-terminated string.
	807	* @param size Maximum number of bytes to consider.
	808	*
	809	* @return Number of characters in string.
	810	*
	811	*/
[28a5ebd]	812	size_t wstr_nlength(const char32_t *str, size_t size)
[f2b8cdc]	813	{
[d4a3ee5]	814	size_t len = 0;
[28a5ebd]	815	size_t limit = ALIGN_DOWN(size, sizeof(char32_t));
[d4a3ee5]	816	size_t offset = 0;
[a35b458]	817
[f2b8cdc]	818	while ((offset < limit) && (*str++ != 0)) {
	819	len++;
[28a5ebd]	820	offset += sizeof(char32_t);
[f2b8cdc]	821	}
[a35b458]	822
[f2b8cdc]	823	return len;
	824	}
	825
/** Get character display width on a character cell display.
 *
 * The current implementation reports one cell for every character.
 *
 * @param ch Character
 * @return Width of character in cells.
 */
size_t chr_width(char32_t ch)
{
	(void) ch;

	return 1;
}
	835
	836	/** Get string display width on a character cell display.
	837	*
	838	* @param str String
	839	* @return Width of string in cells.
	840	*/
	841	size_t str_width(const char *str)
	842	{
	843	size_t width = 0;
	844	size_t offset = 0;
[28a5ebd]	845	char32_t ch;
[a35b458]	846
[be2a38ad]	847	while ((ch = str_decode(str, &offset, STR_NO_LIMIT)) != 0)
	848	width += chr_width(ch);
[a35b458]	849
[be2a38ad]	850	return width;
	851	}
	852
/** Check whether character is plain ASCII.
 *
 * @param ch Character to test.
 *
 * @return True if character is plain ASCII (code point 0 .. 127).
 *
 */
bool ascii_check(char32_t ch)
{
	return ch < 0x80;
}
	865
/** Check whether character is valid
 *
 * @param ch Character to test.
 *
 * @return True if character is a valid Unicode code point
 *         (0 .. 0x10FFFF).
 *
 */
bool chr_check(char32_t ch)
{
	return ch < 0x110000;
}
[936351c1]	878
/** Compare two NULL terminated strings.
 *
 * Do a char-by-char comparison of two NULL-terminated strings.
 * The strings are considered equal iff their length is equal
 * and both strings consist of the same sequence of characters.
 *
 * A string S1 is less than another string S2 if it has a character with
 * lower value at the first character position where the strings differ.
 * If the strings differ in length, the shorter one is treated as if
 * padded by characters with a value of zero.
 *
 * @param s1 First string to compare.
 * @param s2 Second string to compare.
 *
 * @return 0 if the strings are equal, -1 if the first is less than the second,
 *         1 if the second is less than the first.
 *
 */
int str_cmp(const char *s1, const char *s2)
{
	/*
	 * UTF-8 has the nice property that lexicographic ordering on bytes is
	 * the same as the lexicographic ordering of the character sequences.
	 *
	 * That only holds for unsigned byte values, so the comparison must
	 * not be done on plain char, which may be signed.
	 */
	const unsigned char *a = (const unsigned char *) s1;
	const unsigned char *b = (const unsigned char *) s2;

	while (*a == *b && *a != 0) {
		a++;
		b++;
	}

	if (*a == *b)
		return 0;

	return (*a < *b) ? -1 : 1;
}
	913
	914	/** Compare two NULL terminated strings with length limit.
	915	*
	916	* Do a char-by-char comparison of two NULL-terminated strings.
[4efeab5]	917	* The strings are considered equal iff
	918	* min(str_length(s1), max_len) == min(str_length(s2), max_len)
	919	* and both strings consist of the same sequence of characters,
	920	* up to max_len characters.
	921	*
[1772e6d]	922	* A string S1 is less than another string S2 if it has a character with
	923	* lower value at the first character position where the strings differ.
	924	* If the strings differ in length, the shorter one is treated as if
	925	* padded by characters with a value of zero. Only the first max_len
	926	* characters are considered.
[f2b8cdc]	927	*
	928	* @param s1 First string to compare.
	929	* @param s2 Second string to compare.
	930	* @param max_len Maximum number of characters to consider.
	931	*
[1772e6d]	932	* @return 0 if the strings are equal, -1 if the first is less than the second,
	933	* 1 if the second is less than the first.
[f2b8cdc]	934	*
	935	*/
[d4a3ee5]	936	int str_lcmp(const char s1, const char s2, size_t max_len)
[f2b8cdc]	937	{
[28a5ebd]	938	char32_t c1 = 0;
	939	char32_t c2 = 0;
[8227d63]	940
[f2b8cdc]	941	size_t off1 = 0;
	942	size_t off2 = 0;
[8227d63]	943
[d4a3ee5]	944	size_t len = 0;
[f2b8cdc]	945
	946	while (true) {
	947	if (len >= max_len)
	948	break;
	949
	950	c1 = str_decode(s1, &off1, STR_NO_LIMIT);
	951	c2 = str_decode(s2, &off2, STR_NO_LIMIT);
	952
[8227d63]	953	if (c1 < c2)
	954	return -1;
	955
	956	if (c1 > c2)
	957	return 1;
	958
	959	if (c1 == 0 \|\| c2 == 0)
	960	break;
	961
	962	++len;
	963	}
	964
	965	return 0;
	966
	967	}
	968
	969	/** Compare two NULL terminated strings in case-insensitive manner.
	970	*
	971	* Do a char-by-char comparison of two NULL-terminated strings.
	972	* The strings are considered equal iff their length is equal
	973	* and both strings consist of the same sequence of characters
	974	* when converted to lower case.
	975	*
	976	* A string S1 is less than another string S2 if it has a character with
	977	* lower value at the first character position where the strings differ.
	978	* If the strings differ in length, the shorter one is treated as if
	979	* padded by characters with a value of zero.
	980	*
	981	* @param s1 First string to compare.
	982	* @param s2 Second string to compare.
	983	*
	984	* @return 0 if the strings are equal, -1 if the first is less than the second,
	985	* 1 if the second is less than the first.
	986	*
	987	*/
	988	int str_casecmp(const char s1, const char s2)
	989	{
[28c39f3]	990	// FIXME: doesn't work for non-ASCII caseful characters
	991
[28a5ebd]	992	char32_t c1 = 0;
	993	char32_t c2 = 0;
[8227d63]	994
	995	size_t off1 = 0;
	996	size_t off2 = 0;
	997
	998	while (true) {
	999	c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
	1000	c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
	1001
	1002	if (c1 < c2)
	1003	return -1;
	1004
	1005	if (c1 > c2)
	1006	return 1;
	1007
	1008	if (c1 == 0 \|\| c2 == 0)
	1009	break;
	1010	}
	1011
	1012	return 0;
	1013	}
	1014
	1015	/** Compare two NULL terminated strings with length limit in case-insensitive
	1016	* manner.
	1017	*
	1018	* Do a char-by-char comparison of two NULL-terminated strings.
	1019	* The strings are considered equal iff
	1020	* min(str_length(s1), max_len) == min(str_length(s2), max_len)
	1021	* and both strings consist of the same sequence of characters,
	1022	* up to max_len characters.
	1023	*
	1024	* A string S1 is less than another string S2 if it has a character with
	1025	* lower value at the first character position where the strings differ.
	1026	* If the strings differ in length, the shorter one is treated as if
	1027	* padded by characters with a value of zero. Only the first max_len
	1028	* characters are considered.
	1029	*
	1030	* @param s1 First string to compare.
	1031	* @param s2 Second string to compare.
	1032	* @param max_len Maximum number of characters to consider.
	1033	*
	1034	* @return 0 if the strings are equal, -1 if the first is less than the second,
	1035	* 1 if the second is less than the first.
	1036	*
	1037	*/
	1038	int str_lcasecmp(const char s1, const char s2, size_t max_len)
	1039	{
[28c39f3]	1040	// FIXME: doesn't work for non-ASCII caseful characters
	1041
[28a5ebd]	1042	char32_t c1 = 0;
	1043	char32_t c2 = 0;
[a35b458]	1044
[8227d63]	1045	size_t off1 = 0;
	1046	size_t off2 = 0;
[a35b458]	1047
[8227d63]	1048	size_t len = 0;
	1049
	1050	while (true) {
	1051	if (len >= max_len)
	1052	break;
	1053
	1054	c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
	1055	c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
	1056
[f2b8cdc]	1057	if (c1 < c2)
	1058	return -1;
	1059
	1060	if (c1 > c2)
	1061	return 1;
	1062
	1063	if (c1 == 0 \|\| c2 == 0)
	1064	break;
	1065
[1b20da0]	1066	++len;
[f2b8cdc]	1067	}
	1068
	1069	return 0;
	1070
	1071	}
	1072
/** Byte-wise test whether @a p is a prefix of @a s.
 *
 * Both strings are walked in lockstep for as long as the bytes match and
 * @a s has not ended. @a p is a prefix exactly when the walk stopped at
 * the terminator of @a p.
 */
static bool _test_prefix(const char *s, const char *p)
{
	for (; *s != 0 && *s == *p; s++, p++)
		;

	return *p == 0;
}
	1082
/** Test whether p is a prefix of s.
 *
 * Public entry point; the byte-by-byte comparison of the two
 * NULL-terminated strings is delegated to _test_prefix().
 *
 * @param s The string in which to look.
 * @param p The candidate prefix.
 *
 * @return true iff @a p is a prefix of @a s.
 *
 */
bool str_test_prefix(const char *s, const char *p)
{
	const bool is_prefix = _test_prefix(s, p);

	return is_prefix;
}
	1098
[086cab0]	1099	/** Get a string suffix.
	1100	*
	1101	* Return a string suffix defined by the prefix length.
	1102	*
	1103	* @param s The string to get the suffix from.
	1104	* @param prefix_length Number of prefix characters to ignore.
	1105	*
	1106	* @return String suffix.
	1107	*
	1108	*/
	1109	const char str_suffix(const char s, size_t prefix_length)
	1110	{
	1111	size_t off = 0;
	1112	size_t i = 0;
	1113
	1114	while (true) {
	1115	str_decode(s, &off, STR_NO_LIMIT);
	1116	i++;
	1117
	1118	if (i >= prefix_length)
	1119	break;
	1120	}
	1121
	1122	return s + off;
	1123	}
	1124
/** Copy a string byte by byte, terminator included.
 *
 * Bytes are copied front to back; no bound is applied to @a dest.
 */
static void _str_cpy(char *dest, const char *src)
{
	char byte;

	do {
		byte = *src++;
		*dest++ = byte;
	} while (byte != 0);
}
	1133
	1134	/** Copy string as a sequence of bytes. */
	1135	static void _str_cpyn(char dest, size_t size, const char src)
	1136	{
[0600976]	1137	assert(dest && src && size);
	1138
	1139	if (!dest \|\| !src \|\| !size)
	1140	return;
	1141
	1142	if (size == STR_NO_LIMIT)
	1143	return _str_cpy(dest, src);
	1144
[28c39f3]	1145	char *dest_top = dest + size - 1;
[0600976]	1146	assert(size == 1 \|\| dest < dest_top);
[28c39f3]	1147
	1148	while (*src && dest < dest_top)
	1149	(dest++) = (src++);
	1150
	1151	*dest = 0;
	1152	}
	1153
[6eb2e96]	1154	/** Copy string.
[f2b8cdc]	1155	*
[6eb2e96]	1156	* Copy source string @a src to destination buffer @a dest.
	1157	* No more than @a size bytes are written. If the size of the output buffer
	1158	* is at least one byte, the output string will always be well-formed, i.e.
	1159	* null-terminated and containing only complete characters.
[f2b8cdc]	1160	*
[abf09311]	1161	* @param dest Destination buffer.
[6700ee2]	1162	* @param count Size of the destination buffer (must be > 0).
[6eb2e96]	1163	* @param src Source string.
[8e893ae]	1164	*
[f2b8cdc]	1165	*/
[6eb2e96]	1166	void str_cpy(char dest, size_t size, const char src)
[f2b8cdc]	1167	{
[6700ee2]	1168	/* There must be space for a null terminator in the buffer. */
	1169	assert(size > 0);
[d066259]	1170	assert(src != NULL);
[28c39f3]	1171	assert(dest != NULL);
[0600976]	1172	assert(size == STR_NO_LIMIT \|\| dest + size > dest);
[a35b458]	1173
[28c39f3]	1174	/* Copy data. */
	1175	_str_cpyn(dest, size, src);
[a35b458]	1176
[28c39f3]	1177	/* In-place translate invalid bytes to U_SPECIAL. */
[b31323f]	1178	_str_sanitize(dest, size, U_SPECIAL);
[6eb2e96]	1179	}
	1180
	1181	/** Copy size-limited substring.
	1182	*
[6700ee2]	1183	* Copy prefix of string @a src of max. size @a size to destination buffer
	1184	* @a dest. No more than @a size bytes are written. The output string will
	1185	* always be well-formed, i.e. null-terminated and containing only complete
	1186	* characters.
[6eb2e96]	1187	*
	1188	* No more than @a n bytes are read from the input string, so it does not
	1189	* have to be null-terminated.
	1190	*
[abf09311]	1191	* @param dest Destination buffer.
[6700ee2]	1192	* @param count Size of the destination buffer (must be > 0).
[6eb2e96]	1193	* @param src Source string.
[abf09311]	1194	* @param n Maximum number of bytes to read from @a src.
[8e893ae]	1195	*
[6eb2e96]	1196	*/
	1197	void str_ncpy(char dest, size_t size, const char src, size_t n)
	1198	{
[6700ee2]	1199	/* There must be space for a null terminator in the buffer. */
	1200	assert(size > 0);
[28c39f3]	1201	assert(src != NULL);
[a35b458]	1202
[28c39f3]	1203	/* Copy data. */
	1204	_str_cpyn(dest, min(size, n + 1), src);
[a35b458]	1205
[28c39f3]	1206	/* In-place translate invalid bytes to U_SPECIAL. */
[b31323f]	1207	_str_sanitize(dest, size, U_SPECIAL);
[f2b8cdc]	1208	}
	1209
[4482bc7]	1210	/** Append one string to another.
	1211	*
	1212	* Append source string @a src to string in destination buffer @a dest.
	1213	* Size of the destination buffer is @a dest. If the size of the output buffer
	1214	* is at least one byte, the output string will always be well-formed, i.e.
	1215	* null-terminated and containing only complete characters.
	1216	*
[0f06dbc]	1217	* @param dest Destination buffer.
[4482bc7]	1218	* @param count Size of the destination buffer.
	1219	* @param src Source string.
	1220	*/
	1221	void str_append(char dest, size_t size, const char src)
	1222	{
[28c39f3]	1223	assert(src != NULL);
	1224	assert(dest != NULL);
	1225	assert(size > 0);
[0600976]	1226	assert(size == STR_NO_LIMIT \|\| dest + size > dest);
[a35b458]	1227
[28c39f3]	1228	size_t dstr_size = _str_nsize(dest, size);
[0600976]	1229	if (dstr_size < size) {
	1230	_str_cpyn(dest + dstr_size, size - dstr_size, src);
[b31323f]	1231	_str_sanitize(dest + dstr_size, size - dstr_size, U_SPECIAL);
[0600976]	1232	}
[4482bc7]	1233	}
	1234
[dcb74c0a]	1235	/** Convert space-padded ASCII to string.
	1236	*
	1237	* Common legacy text encoding in hardware is 7-bit ASCII fitted into
[c3d19ac]	1238	* a fixed-width byte buffer (bit 7 always zero), right-padded with spaces
[dcb74c0a]	1239	* (ASCII 0x20). Convert space-padded ascii to string representation.
	1240	*
	1241	* If the text does not fit into the destination buffer, the function converts
	1242	* as many characters as possible and returns EOVERFLOW.
	1243	*
	1244	* If the text contains non-ASCII bytes (with bit 7 set), the whole string is
	1245	* converted anyway and invalid characters are replaced with question marks
	1246	* (U_SPECIAL) and the function returns EIO.
	1247	*
	1248	* Regardless of return value upon return @a dest will always be well-formed.
	1249	*
	1250	* @param dest Destination buffer
	1251	* @param size Size of destination buffer
	1252	* @param src Space-padded ASCII.
	1253	* @param n Size of the source buffer in bytes.
	1254	*
	1255	* @return EOK on success, EOVERFLOW if the text does not fit
	1256	* destination buffer, EIO if the text contains
	1257	* non-ASCII bytes.
	1258	*/
[b7fd2a0]	1259	errno_t spascii_to_str(char dest, size_t size, const uint8_t src, size_t n)
[dcb74c0a]	1260	{
[28c39f3]	1261	size_t len = 0;
[dcb74c0a]	1262
[28c39f3]	1263	/* Determine the length of the source string. */
	1264	for (size_t i = 0; i < n; i++) {
	1265	if (src[i] == 0)
	1266	break;
	1267
	1268	if (src[i] != ' ')
	1269	len = i + 1;
	1270	}
	1271
	1272	errno_t result = EOK;
	1273	size_t out_len = min(len, size - 1);
	1274
	1275	/* Copy characters */
	1276	for (size_t i = 0; i < out_len; i++) {
	1277	dest[i] = src[i];
	1278
	1279	if (dest[i] < 0) {
	1280	dest[i] = U_SPECIAL;
[dcb74c0a]	1281	result = EIO;
	1282	}
[28c39f3]	1283	}
[dcb74c0a]	1284
[28c39f3]	1285	dest[out_len] = 0;
[dcb74c0a]	1286
[28c39f3]	1287	if (out_len < len)
	1288	return EOVERFLOW;
[dcb74c0a]	1289
	1290	return result;
	1291	}
	1292
[0f06dbc]	1293	/** Convert wide string to string.
[f2b8cdc]	1294	*
[0f06dbc]	1295	* Convert wide string @a src to string. The output is written to the buffer
	1296	* specified by @a dest and @a size. @a size must be non-zero and the string
	1297	* written will always be well-formed.
[f2b8cdc]	1298	*
[0f06dbc]	1299	* @param dest Destination buffer.
	1300	* @param size Size of the destination buffer.
	1301	* @param src Source wide string.
[f2b8cdc]	1302	*/
[28a5ebd]	1303	void wstr_to_str(char dest, size_t size, const char32_t src)
[f2b8cdc]	1304	{
[28a5ebd]	1305	char32_t ch;
[0f06dbc]	1306	size_t src_idx;
	1307	size_t dest_off;
	1308
	1309	/* There must be space for a null terminator in the buffer. */
	1310	assert(size > 0);
[a35b458]	1311
[0f06dbc]	1312	src_idx = 0;
	1313	dest_off = 0;
	1314
[f2b8cdc]	1315	while ((ch = src[src_idx++]) != 0) {
[81e9cb3]	1316	if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
[f2b8cdc]	1317	break;
	1318	}
[0f06dbc]	1319
	1320	dest[dest_off] = '\0';
[f2b8cdc]	1321	}
	1322
[82374b2]	1323	/** Convert UTF16 string to string.
	1324	*
	1325	* Convert utf16 string @a src to string. The output is written to the buffer
	1326	* specified by @a dest and @a size. @a size must be non-zero and the string
	1327	* written will always be well-formed. Surrogate pairs also supported.
	1328	*
	1329	* @param dest Destination buffer.
	1330	* @param size Size of the destination buffer.
	1331	* @param src Source utf16 string.
	1332	*
[cde999a]	1333	* @return EOK, if success, an error code otherwise.
[82374b2]	1334	*/
[b7fd2a0]	1335	errno_t utf16_to_str(char dest, size_t size, const uint16_t src)
[82374b2]	1336	{
[abb7491c]	1337	size_t idx = 0, dest_off = 0;
[28a5ebd]	1338	char32_t ch;
[b7fd2a0]	1339	errno_t rc = EOK;
[82374b2]	1340
	1341	/* There must be space for a null terminator in the buffer. */
	1342	assert(size > 0);
	1343
	1344	while (src[idx]) {
	1345	if ((src[idx] & 0xfc00) == 0xd800) {
[abb7491c]	1346	if (src[idx + 1] && (src[idx + 1] & 0xfc00) == 0xdc00) {
[82374b2]	1347	ch = 0x10000;
	1348	ch += (src[idx] & 0x03FF) << 10;
[abb7491c]	1349	ch += (src[idx + 1] & 0x03FF);
[82374b2]	1350	idx += 2;
[1433ecda]	1351	} else
[82374b2]	1352	break;
	1353	} else {
	1354	ch = src[idx];
	1355	idx++;
	1356	}
[abb7491c]	1357	rc = chr_encode(ch, dest, &dest_off, size - 1);
[82374b2]	1358	if (rc != EOK)
	1359	break;
	1360	}
	1361	dest[dest_off] = '\0';
	1362	return rc;
	1363	}
	1364
[b06414f]	1365	/** Convert string to UTF16 string.
	1366	*
	1367	* Convert string @a src to utf16 string. The output is written to the buffer
	1368	* specified by @a dest and @a dlen. @a dlen must be non-zero and the string
	1369	* written will always be well-formed. Surrogate pairs also supported.
	1370	*
	1371	* @param dest Destination buffer.
	1372	* @param dlen Number of utf16 characters that fit in the destination buffer.
	1373	* @param src Source string.
	1374	*
[cde999a]	1375	* @return EOK, if success, an error code otherwise.
[b06414f]	1376	*/
[b7fd2a0]	1377	errno_t str_to_utf16(uint16_t dest, size_t dlen, const char src)
[fc97128]	1378	{
[b7fd2a0]	1379	errno_t rc = EOK;
[abb7491c]	1380	size_t offset = 0;
	1381	size_t idx = 0;
[28a5ebd]	1382	char32_t c;
[fc97128]	1383
[b06414f]	1384	assert(dlen > 0);
[a35b458]	1385
[fc97128]	1386	while ((c = str_decode(src, &offset, STR_NO_LIMIT)) != 0) {
	1387	if (c > 0x10000) {
[b06414f]	1388	if (idx + 2 >= dlen - 1) {
[abb7491c]	1389	rc = EOVERFLOW;
[fc97128]	1390	break;
	1391	}
	1392	c = (c - 0x10000);
	1393	dest[idx] = 0xD800 \| (c >> 10);
[abb7491c]	1394	dest[idx + 1] = 0xDC00 \| (c & 0x3FF);
[fc97128]	1395	idx++;
	1396	} else {
[1433ecda]	1397	dest[idx] = c;
[fc97128]	1398	}
	1399
	1400	idx++;
[b06414f]	1401	if (idx >= dlen - 1) {
[abb7491c]	1402	rc = EOVERFLOW;
[fc97128]	1403	break;
	1404	}
	1405	}
	1406
	1407	dest[idx] = '\0';
	1408	return rc;
[f2b8cdc]	1409	}
	1410
/** Get size of UTF-16 string.
 *
 * Count the 16-bit words of the UTF-16 string @a ustr that precede its
 * zero terminator. A surrogate pair counts as two words.
 *
 * @param ustr UTF-16 string to consider.
 *
 * @return Number of words used by the string, terminator excluded.
 *
 */
size_t utf16_wsize(const uint16_t *ustr)
{
	const uint16_t *end = ustr;

	while (*end != 0)
		end++;

	return (size_t) (end - ustr);
}
	1430
[b67c7d64]	1431	/** Convert wide string to new string.
	1432	*
	1433	* Convert wide string @a src to string. Space for the new string is allocated
	1434	* on the heap.
	1435	*
	1436	* @param src Source wide string.
	1437	* @return New string.
	1438	*/
[28a5ebd]	1439	char wstr_to_astr(const char32_t src)
[b67c7d64]	1440	{
	1441	char dbuf[STR_BOUNDS(1)];
	1442	char *str;
[28a5ebd]	1443	char32_t ch;
[b67c7d64]	1444
	1445	size_t src_idx;
	1446	size_t dest_off;
	1447	size_t dest_size;
	1448
	1449	/* Compute size of encoded string. */
	1450
	1451	src_idx = 0;
	1452	dest_size = 0;
	1453
	1454	while ((ch = src[src_idx++]) != 0) {
	1455	dest_off = 0;
	1456	if (chr_encode(ch, dbuf, &dest_off, STR_BOUNDS(1)) != EOK)
	1457	break;
	1458	dest_size += dest_off;
	1459	}
	1460
	1461	str = malloc(dest_size + 1);
	1462	if (str == NULL)
	1463	return NULL;
	1464
	1465	/* Encode string. */
	1466
	1467	src_idx = 0;
	1468	dest_off = 0;
	1469
	1470	while ((ch = src[src_idx++]) != 0) {
	1471	if (chr_encode(ch, str, &dest_off, dest_size) != EOK)
	1472	break;
	1473	}
	1474
	1475	str[dest_size] = '\0';
	1476	return str;
	1477	}
	1478
[da2bd08]	1479	/** Convert string to wide string.
	1480	*
	1481	* Convert string @a src to wide string. The output is written to the
[0f06dbc]	1482	* buffer specified by @a dest and @a dlen. @a dlen must be non-zero
	1483	* and the wide string written will always be null-terminated.
[da2bd08]	1484	*
	1485	* @param dest Destination buffer.
	1486	* @param dlen Length of destination buffer (number of wchars).
	1487	* @param src Source string.
	1488	*/
[28a5ebd]	1489	void str_to_wstr(char32_t dest, size_t dlen, const char src)
[da2bd08]	1490	{
	1491	size_t offset;
	1492	size_t di;
[28a5ebd]	1493	char32_t c;
[da2bd08]	1494
	1495	assert(dlen > 0);
	1496
	1497	offset = 0;
	1498	di = 0;
	1499
	1500	do {
[81e9cb3]	1501	if (di >= dlen - 1)
[da2bd08]	1502	break;
	1503
	1504	c = str_decode(src, &offset, STR_NO_LIMIT);
	1505	dest[di++] = c;
	1506	} while (c != '\0');
	1507
	1508	dest[dlen - 1] = '\0';
	1509	}
	1510
/** Convert string to newly allocated wide string.
 *
 * Allocate a zero-filled buffer with calloc(), sized for str_length() of
 * @a str plus a terminator, and fill it using str_to_wstr().
 *
 * @param str Source string.
 *
 * @return New NULL-terminated wide string, or NULL if the allocation
 *         failed.
 */
char32_t *str_to_awstr(const char *str)
{
	const size_t slots = str_length(str) + 1;

	char32_t *wstr = calloc(slots, sizeof(char32_t));
	if (wstr != NULL)
		str_to_wstr(wstr, slots, str);

	return wstr;
}
	1529
/** Find the first byte equal to @a c in @a str.
 *
 * Searching for the zero byte yields a pointer to the terminator.
 *
 * @return Pointer to the matching byte, or NULL if there is none.
 */
static char *_strchr(const char *str, char c)
{
	for (; *str != 0; str++) {
		if (*str == c)
			return (char *) str;
	}

	/* At the terminator: it is a match only when c itself is zero. */
	return (c == 0) ? (char *) str : NULL;
}
	1537
[f2b8cdc]	1538	/** Find first occurence of character in string.
	1539	*
	1540	* @param str String to search.
	1541	* @param ch Character to look for.
	1542	*
	1543	* @return Pointer to character in @a str or NULL if not found.
	1544	*/
[28a5ebd]	1545	char str_chr(const char str, char32_t ch)
[f2b8cdc]	1546	{
[28c39f3]	1547	/* Fast path for an ASCII character. */
	1548	if (ascii_check(ch))
	1549	return _strchr(str, ch);
[a35b458]	1550
[28c39f3]	1551	/* Convert character to UTF-8. */
	1552	char utf8[STR_BOUNDS(1) + 1];
	1553	size_t offset = 0;
	1554
	1555	if (chr_encode(ch, utf8, &offset, sizeof(utf8)) != EOK \|\| offset == 0)
	1556	return NULL;
	1557
	1558	utf8[offset] = '\0';
	1559
	1560	/* Find the first byte, then check if all of them are correct. */
	1561	while (*str != 0) {
	1562	str = _strchr(str, utf8[0]);
	1563	if (!str)
	1564	return NULL;
	1565
	1566	if (_test_prefix(str, utf8))
	1567	return (char *) str;
	1568
	1569	str++;
[f2b8cdc]	1570	}
[a35b458]	1571
[f2b8cdc]	1572	return NULL;
	1573	}
	1574
/** Find first occurrence of substring in string.
 *
 * Each position of the haystack is tried in turn for as long as the
 * remaining haystack is at least as large as the needle.
 *
 * @param hs Haystack (string).
 * @param n  Needle (substring to look for).
 *
 * @return Pointer to the first match in @a hs or @c NULL if not found.
 */
char *str_str(const char *hs, const char *n)
{
	size_t remaining = _str_size(hs);
	const size_t needle_size = _str_size(n);

	for (; remaining >= needle_size; hs++, remaining--) {
		if (_test_prefix(hs, n))
			return (char *) hs;
	}

	return NULL;
}
	1597
/** Remove all trailing bytes equal to @a c from @a str, in place.
 *
 * The string is cut right after its last byte that differs from @a c.
 * A string that is empty or consists only of @a c becomes empty; no byte
 * beyond the original terminator is ever written.
 *
 * @param str String to trim.
 * @param c   Byte to remove from the end.
 */
static void _str_rtrim(char *str, char c)
{
	/* One past the last byte that has to be kept. */
	char *end = str;

	for (char *p = str; *p != 0; p++) {
		if (*p != c)
			end = p + 1;
	}

	/* Truncate string. */
	*end = 0;
}
	1612
[1737bfb]	1613	/** Removes specified trailing characters from a string.
	1614	*
	1615	* @param str String to remove from.
	1616	* @param ch Character to remove.
	1617	*/
[28a5ebd]	1618	void str_rtrim(char *str, char32_t ch)
[1737bfb]	1619	{
[28c39f3]	1620	/* Fast path for the ASCII case. */
	1621	if (ascii_check(ch)) {
	1622	_str_rtrim(str, ch);
	1623	return;
	1624	}
	1625
[1737bfb]	1626	size_t off = 0;
	1627	size_t pos = 0;
[28a5ebd]	1628	char32_t c;
[1737bfb]	1629	bool update_last_chunk = true;
	1630	char *last_chunk = NULL;
	1631
	1632	while ((c = str_decode(str, &off, STR_NO_LIMIT))) {
	1633	if (c != ch) {
	1634	update_last_chunk = true;
	1635	last_chunk = NULL;
	1636	} else if (update_last_chunk) {
	1637	update_last_chunk = false;
	1638	last_chunk = (str + pos);
	1639	}
	1640	pos = off;
	1641	}
	1642
	1643	if (last_chunk)
	1644	*last_chunk = '\0';
	1645	}
	1646
/** Remove all leading bytes equal to @a c from @a str, in place.
 *
 * The rest of the string, terminator included, is moved to the front.
 * The scan never passes the terminator, so asking for @a c == 0 leaves
 * the string unchanged.
 *
 * @param str String to trim.
 * @param c   Byte to remove from the start.
 */
static void _str_ltrim(char *str, char c)
{
	const char *from = str;

	/* Stop at the terminator even if c happens to be zero. */
	while (*from != 0 && *from == c)
		from++;

	if (from == str)
		return;

	/* Source lies ahead of destination, so a forward copy is safe. */
	char *to = str;
	while (*from != 0)
		*to++ = *from++;

	*to = 0;
}
	1657
[1737bfb]	1658	/** Removes specified leading characters from a string.
	1659	*
	1660	* @param str String to remove from.
	1661	* @param ch Character to remove.
	1662	*/
[28a5ebd]	1663	void str_ltrim(char *str, char32_t ch)
[1737bfb]	1664	{
[28c39f3]	1665	/* Fast path for the ASCII case. */
	1666	if (ascii_check(ch)) {
	1667	_str_ltrim(str, ch);
	1668	return;
	1669	}
	1670
[28a5ebd]	1671	char32_t acc;
[1737bfb]	1672	size_t off = 0;
	1673	size_t pos = 0;
	1674	size_t str_sz = str_size(str);
	1675
	1676	while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
	1677	if (acc != ch)
	1678	break;
	1679	else
	1680	pos = off;
	1681	}
	1682
	1683	if (pos > 0) {
	1684	memmove(str, &str[pos], str_sz - pos);
	1685	pos = str_sz - pos;
[a18a8b9]	1686	str[pos] = '\0';
[1737bfb]	1687	}
	1688	}
	1689
/** Find the last byte equal to @a c in @a str.
 *
 * The terminator itself is never considered, so searching for the zero
 * byte yields NULL.
 *
 * @return Pointer to the matching byte, or NULL if there is none.
 */
static char *_str_rchr(const char *str, char c)
{
	const char *p = str;

	/* Walk to the terminator, then search backwards from there. */
	while (*p != 0)
		p++;

	while (p != str) {
		p--;
		if (*p == c)
			return (char *) p;
	}

	return NULL;
}
	1703
[7afb4a5]	1704	/** Find last occurence of character in string.
	1705	*
	1706	* @param str String to search.
	1707	* @param ch Character to look for.
	1708	*
	1709	* @return Pointer to character in @a str or NULL if not found.
	1710	*/
[28a5ebd]	1711	char str_rchr(const char str, char32_t ch)
[7afb4a5]	1712	{
[28c39f3]	1713	if (ascii_check(ch))
	1714	return _str_rchr(str, ch);
	1715
[28a5ebd]	1716	char32_t acc;
[7afb4a5]	1717	size_t off = 0;
[f2d2c7ba]	1718	size_t last = 0;
[d4a3ee5]	1719	const char *res = NULL;
[a35b458]	1720
[7afb4a5]	1721	while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
	1722	if (acc == ch)
[f2d2c7ba]	1723	res = (str + last);
	1724	last = off;
[7afb4a5]	1725	}
[a35b458]	1726
[dd2cfa7]	1727	return (char *) res;
[7afb4a5]	1728	}
	1729
/** Insert a wide character into a wide string.
 *
 * Insert wide character @a ch at position @a pos. The characters from
 * that position on, terminator included, are shifted one place towards
 * the end.
 *
 * @param str     String to insert to.
 * @param ch      Character to insert.
 * @param pos     Character index where to insert.
 * @param max_pos Characters in the buffer.
 *
 * @return True if the insertion was successful, false if @a pos is
 *         beyond the end of the string or @a pos + 1 exceeds @a max_pos.
 *
 */
bool wstr_linsert(char32_t *str, char32_t ch, size_t pos, size_t max_pos)
{
	const size_t len = wstr_length(str);

	if (pos > len || pos + 1 > max_pos)
		return false;

	/* Move [pos, len] one slot to the right, starting from the end. */
	for (size_t i = len + 1; i > pos; i--)
		str[i] = str[i - 1];

	str[pos] = ch;

	return true;
}
	1759
	1760	/** Remove a wide character from a wide string.
	1761	*
	1762	* Remove a wide character from a wide string at position
	1763	* @a pos. The characters after the position are shifted.
	1764	*
	1765	* @param str String to remove from.
	1766	* @param pos Character index to remove.
	1767	*
	1768	* @return True if the removal was sucessful, false if the position
	1769	* is out of bounds.
	1770	*
	1771	*/
[28a5ebd]	1772	bool wstr_remove(char32_t *str, size_t pos)
[f2b8cdc]	1773	{
[d4a3ee5]	1774	size_t len = wstr_length(str);
[a35b458]	1775
[f2b8cdc]	1776	if (pos >= len)
	1777	return false;
[a35b458]	1778
[d4a3ee5]	1779	size_t i;
[f2b8cdc]	1780	for (i = pos + 1; i <= len; i++)
	1781	str[i - 1] = str[i];
[a35b458]	1782
[f2b8cdc]	1783	return true;
	1784	}
	1785
[abf09311]	1786	/** Duplicate string.
	1787	*
	1788	* Allocate a new string and copy characters from the source
	1789	* string into it. The duplicate string is allocated via sleeping
	1790	* malloc(), thus this function can sleep in no memory conditions.
	1791	*
	1792	* The allocation cannot fail and the return value is always
	1793	* a valid pointer. The duplicate string is always a well-formed
	1794	* null-terminated UTF-8 string, but it can differ from the source
	1795	* string on the byte level.
	1796	*
	1797	* @param src Source string.
	1798	*
	1799	* @return Duplicate string.
	1800	*
	1801	*/
[fc6dd18]	1802	char str_dup(const char src)
	1803	{
[28c39f3]	1804	size_t size = _str_size(src) + 1;
[d066259]	1805	char *dest = malloc(size);
	1806	if (!dest)
	1807	return NULL;
[a35b458]	1808
[0600976]	1809	memcpy(dest, src, size);
[b31323f]	1810	_str_sanitize(dest, size, U_SPECIAL);
[abf09311]	1811	return dest;
[fc6dd18]	1812	}
	1813
[abf09311]	1814	/** Duplicate string with size limit.
	1815	*
	1816	* Allocate a new string and copy up to @max_size bytes from the source
	1817	* string into it. The duplicate string is allocated via sleeping
	1818	* malloc(), thus this function can sleep in no memory conditions.
	1819	* No more than @max_size + 1 bytes is allocated, but if the size
	1820	* occupied by the source string is smaller than @max_size + 1,
	1821	* less is allocated.
	1822	*
	1823	* The allocation cannot fail and the return value is always
	1824	* a valid pointer. The duplicate string is always a well-formed
	1825	* null-terminated UTF-8 string, but it can differ from the source
	1826	* string on the byte level.
	1827	*
	1828	* @param src Source string.
	1829	* @param n Maximum number of bytes to duplicate.
	1830	*
	1831	* @return Duplicate string.
	1832	*
	1833	*/
	1834	char str_ndup(const char src, size_t n)
[fc6dd18]	1835	{
[0600976]	1836	size_t size = _str_nsize(src, n);
[a35b458]	1837
[0600976]	1838	char *dest = malloc(size + 1);
[d066259]	1839	if (!dest)
	1840	return NULL;
[a35b458]	1841
[0600976]	1842	memcpy(dest, src, size);
[b31323f]	1843	_str_sanitize(dest, size, U_SPECIAL);
[0600976]	1844	dest[size] = 0;
[fc6dd18]	1845	return dest;
	1846	}
	1847
/** Split string by delimiters.
 *
 * Skip the delimiter characters at the start of @a s, then collect
 * characters up to the next delimiter or the end of the string. The
 * token is terminated in place by overwriting the byte that follows it.
 *
 * @param s     String to be tokenized. If NULL, NULL is returned and
 *              @a next is left untouched.
 * @param delim String with the delimiters.
 * @param next  Variable which will receive the pointer to the
 *              continuation of the string: just past the delimiter that
 *              ended the token, or the end of the string if no
 *              delimiter followed. May be NULL.
 *
 * @return Pointer to the token inside @a s, or NULL if no token was
 *         found.
 */
char *str_tok(char *s, const char *delim, char **next)
{
	if (s == NULL)
		return NULL;

	const size_t size = str_size(s);
	size_t probe = 0;
	size_t tok_begin = 0;
	char32_t ch;

	/* Skip over leading delimiters. */
	while ((ch = str_decode(s, &probe, size)) != 0 &&
	    str_chr(delim, ch) != NULL)
		tok_begin = probe;

	/* Skip over token characters. */
	size_t tok_end = tok_begin;
	probe = tok_begin;
	while ((ch = str_decode(s, &probe, size)) != 0 &&
	    str_chr(delim, ch) == NULL)
		tok_end = probe;

	/* ch is non-zero only if a delimiter ended the token. */
	if (next != NULL)
		*next = &s[(ch != 0) ? probe : tok_end];

	if (tok_begin == tok_end)
		return NULL;	/* No more tokens. */

	/* Overwrite delimiter with NULL terminator. */
	s[tok_end] = '\0';
	return &s[tok_begin];
}
	1894
/** Scale a value for display with a decimal (SI) unit prefix.
 *
 * Values up to 10^6 are returned unchanged with a space as the suffix.
 * Larger values are divided by a power of 1000 and the matching prefix
 * letter is stored: k (10^3), M (10^6), G (10^9), T (10^12), P (10^15)
 * or E (10^18). The division truncates.
 *
 * @param val    Value to scale.
 * @param rv     Receives the scaled value.
 * @param suffix Receives the prefix letter, or ' ' if @a val was not
 *               scaled.
 */
void order_suffix(const uint64_t val, uint64_t *rv, char *suffix)
{
	if (val > UINT64_C(10000000000000000000)) {
		/* Divided by 10^18, i.e. expressed in exa units. */
		*rv = val / UINT64_C(1000000000000000000);
		*suffix = 'E';
	} else if (val > UINT64_C(1000000000000000000)) {
		/* Divided by 10^15, i.e. expressed in peta units. */
		*rv = val / UINT64_C(1000000000000000);
		*suffix = 'P';
	} else if (val > UINT64_C(1000000000000000)) {
		*rv = val / UINT64_C(1000000000000);
		*suffix = 'T';
	} else if (val > UINT64_C(1000000000000)) {
		*rv = val / UINT64_C(1000000000);
		*suffix = 'G';
	} else if (val > UINT64_C(1000000000)) {
		*rv = val / UINT64_C(1000000);
		*suffix = 'M';
	} else if (val > UINT64_C(1000000)) {
		*rv = val / UINT64_C(1000);
		*suffix = 'k';
	} else {
		*rv = val;
		*suffix = ' ';
	}
}
	1920
/** Scale a byte count for display with a binary (IEC) unit.
 *
 * Values up to 2^20 are returned unchanged with the unit "B". Larger
 * values are divided by a power of 1024 and the matching unit string is
 * stored: KiB (2^10), MiB (2^20), GiB (2^30), TiB (2^40) or PiB (2^50).
 * The division truncates.
 *
 * @param val    Value to scale.
 * @param rv     Receives the scaled value.
 * @param suffix Receives a pointer to a constant unit string.
 * @param fixed  If true, the plain byte unit is returned padded with
 *               a trailing space ("B ") instead of "B".
 */
void bin_order_suffix(const uint64_t val, uint64_t *rv, const char **suffix,
    bool fixed)
{
	if (val > UINT64_C(1152921504606846976)) {
		/* Divided by 2^50, i.e. expressed in pebibytes. */
		*rv = val / UINT64_C(1125899906842624);
		*suffix = "PiB";
	} else if (val > UINT64_C(1125899906842624)) {
		*rv = val / UINT64_C(1099511627776);
		*suffix = "TiB";
	} else if (val > UINT64_C(1099511627776)) {
		*rv = val / UINT64_C(1073741824);
		*suffix = "GiB";
	} else if (val > UINT64_C(1073741824)) {
		*rv = val / UINT64_C(1048576);
		*suffix = "MiB";
	} else if (val > UINT64_C(1048576)) {
		*rv = val / UINT64_C(1024);
		*suffix = "KiB";
	} else {
		*rv = val;
		if (fixed)
			*suffix = "B ";
		else
			*suffix = "B";
	}
}
	1947
[a46da63]	1948	/** @}
[b2951e2]	1949	*/

Note: See TracBrowser for help on using the repository browser.

Download in other formats: