Context Navigation

source: mainline/common/str.c@ 65bf084

Visit:

Last change on this file since 65bf084 was 65bf084, checked in by Jiří Zárevúcky <zarevucky.jiri@…>, 3 months ago
Implement both str_decode() and mbrtoc32() using one function
Property mode set to `100644`
File size: 44.6 KB

Rev	Line
[936351c1]	1	/*
[d066259]	2	* Copyright (c) 2001-2004 Jakub Jermar
[df4ed85]	3	* Copyright (c) 2005 Martin Decky
[576845ec]	4	* Copyright (c) 2008 Jiri Svoboda
[22cf42d9]	5	* Copyright (c) 2011 Martin Sucha
[c4bbca8]	6	* Copyright (c) 2011 Oleg Romanenko
[65bf084]	7	* Copyright (c) 2025 Jiří Zárevúcky
[936351c1]	8	* All rights reserved.
	9	*
	10	* Redistribution and use in source and binary forms, with or without
	11	* modification, are permitted provided that the following conditions
	12	* are met:
	13	*
	14	* - Redistributions of source code must retain the above copyright
	15	* notice, this list of conditions and the following disclaimer.
	16	* - Redistributions in binary form must reproduce the above copyright
	17	* notice, this list of conditions and the following disclaimer in the
	18	* documentation and/or other materials provided with the distribution.
	19	* - The name of the author may not be used to endorse or promote products
	20	* derived from this software without specific prior written permission.
	21	*
	22	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
	23	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
	24	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
	25	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
	27	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	28	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	29	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	30	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	31	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	32	*/
	33
[a46da63]	34	/** @addtogroup libc
[b2951e2]	35	* @{
	36	*/
[d066259]	37
	38	/**
	39	* @file
	40	* @brief String functions.
	41	*
	42	* Strings and characters use the Universal Character Set (UCS). The standard
	43	* strings, called just strings are encoded in UTF-8. Wide strings (encoded
	44	* in UTF-32) are supported to a limited degree. A single character is
[28a5ebd]	45	* represented as char32_t.@n
[d066259]	46	*
	47	* Overview of the terminology:@n
	48	*
	49	* Term Meaning
	50	* -------------------- ----------------------------------------------------
	51	* byte 8 bits stored in uint8_t (unsigned 8 bit integer)
	52	*
[28a5ebd]	53	* character UTF-32 encoded Unicode character, stored in char32_t
	54	* (unsigned 32 bit integer), code points 0 .. 1114111
[d066259]	55	* are valid
	56	*
[28c39f3]	57	* Note that Unicode characters do not match
	58	* one-to-one with displayed characters or glyphs on
	59	* screen. For that level of precision, look up
	60	* Grapheme Clusters.
	61	*
[d066259]	62	* ASCII character 7 bit encoded ASCII character, stored in char
	63	* (usually signed 8 bit integer), code points 0 .. 127
	64	* are valid
	65	*
	66	* string UTF-8 encoded NULL-terminated Unicode string, char *
	67	*
	68	* wide string UTF-32 encoded NULL-terminated Unicode string,
[28a5ebd]	69	* char32_t *
[d066259]	70	*
	71	* [wide] string size number of BYTES in a [wide] string (excluding
	72	* the NULL-terminator), size_t
	73	*
	74	* [wide] string length number of CHARACTERS in a [wide] string (excluding
	75	* the NULL-terminator), size_t
	76	*
	77	* [wide] string width number of display cells on a monospace display taken
	78	* by a [wide] string, size_t
	79	*
[28c39f3]	80	* This is virtually impossible to determine exactly for
	81	* all strings without knowing specifics of the display
	82	* device, due to various factors affecting text output.
	83	* If you have the option to query the terminal for
	84	* position change caused by outputting the string,
	85	* it is preferable to determine width that way.
	86	*
[d066259]	87	*
	88	* Overview of string metrics:@n
	89	*
	90	* Metric Abbrev. Type Meaning
	91	* ------ ------ ------ -------------------------------------------------
	92	* size n size_t number of BYTES in a string (excluding the
	93	* NULL-terminator)
	94	*
	95	* length l size_t number of CHARACTERS in a string (excluding the
	96	* null terminator)
	97	*
	98	* width w size_t number of display cells on a monospace display
	99	* taken by a string
	100	*
	101	*
	102	* Function naming prefixes:@n
	103	*
	104	* chr_ operate on characters
	105	* ascii_ operate on ASCII characters
	106	* str_ operate on strings
	107	* wstr_ operate on wide strings
	108	*
	109	* [w]str_[n|l|w] operate on a prefix limited by size, length
	110	* or width
	111	*
	112	*
	113	* A specific character inside a [wide] string can be referred to by:@n
	114	*
[28a5ebd]	115	* pointer (char *, char32_t *)
[d066259]	116	* byte offset (size_t)
	117	* character index (size_t)
	118	*
[b2951e2]	119	*/
	120
[19f857a]	121	#include <str.h>
[d066259]	122
[28c39f3]	123	#include <align.h>
[38d150e]	124	#include <assert.h>
[e64c4b2]	125	#include <ctype.h>
[171f9a1]	126	#include <errno.h>
[65bf084]	127	#include <limits.h>
[28c39f3]	128	#include <macros.h>
	129	#include <mem.h>
[d066259]	130	#include <stdbool.h>
	131	#include <stddef.h>
	132	#include <stdint.h>
	133	#include <stdlib.h>
[28c39f3]	134	#include <uchar.h>
[171f9a1]	135
[65bf084]	136	#if __STDC_HOSTED__
	137	#include <fibril.h>
	138	#endif
	139
	140	static void _set_ilseq()
	141	{
	142	#ifdef errno
	143	errno = EILSEQ;
	144	#endif
	145	}
	146
/** Byte mask consisting of lowest @a n bits (out of 8) */
#define LO_MASK_8(n)  ((uint8_t) ((1u << (n)) - 1))

/**
 * Mask consisting of lowest @a n bits (out of 32).
 * The shift is done on an unsigned 32-bit value, so n may be 0 .. 31.
 */
#define LO_MASK_32(n)  ((uint32_t) ((UINT32_C(1) << (n)) - 1))

/**
 * Byte mask consisting of highest @a n bits (out of 8).
 * The result is cast back to uint8_t because `~` operates on the promoted
 * int and would otherwise produce a negative value.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6

/* Data bits carried by the lead byte of a 2, 3 and 4 byte sequence. */
#define UTF8_MASK_INITIAL2  0x1F  /* 0b00011111 */
#define UTF8_MASK_INITIAL3  0x0F  /* 0b00001111 */
#define UTF8_MASK_INITIAL4  0x07  /* 0b00000111 */
/* Data bits carried by a continuation byte. */
#define UTF8_MASK_CONT      0x3F  /* 0b00111111 */

/** Sentinel returned by _str_decode() for a malformed sequence. */
#define CHAR_INVALID  ((char32_t) UINT_MAX)
	165
/** Test whether @a b is a single-byte (ASCII) UTF-8 unit: 0xxxxxxx. */
static inline bool _is_ascii(uint8_t b)
{
	return (b & 0x80) == 0;
}
	170
/** Test whether @a b is a UTF-8 continuation byte: 10xxxxxx. */
static inline bool _is_continuation(uint8_t b)
{
	return (b >> 6) == 0x2;
}
	175
	176	static inline bool _is_2_byte(uint8_t c)
	177	{
	178	return (c & 0xE0) == 0xC0;
	179	}
	180
	181	static inline bool _is_3_byte(uint8_t c)
	182	{
	183	return (c & 0xF0) == 0xE0;
	184	}
	185
	186	static inline bool _is_4_byte(uint8_t c)
	187	{
	188	return (c & 0xF8) == 0xF0;
[28c39f3]	189	}
	190
	191	static inline int _char_continuation_bytes(char32_t c)
	192	{
[6120b7b]	193	if ((c & ~LO_MASK_32(7)) == 0)
	194	return 0;
	195
[28c39f3]	196	if ((c & ~LO_MASK_32(11)) == 0)
	197	return 1;
	198
	199	if ((c & ~LO_MASK_32(16)) == 0)
	200	return 2;
	201
	202	if ((c & ~LO_MASK_32(21)) == 0)
	203	return 3;
	204
	205	/* Codes longer than 21 bits are not supported */
	206	return -1;
	207	}
	208
	209	static inline int _continuation_bytes(uint8_t b)
	210	{
	211	/* 0xxxxxxx */
	212	if (_is_ascii(b))
	213	return 0;
	214
	215	/* 110xxxxx 10xxxxxx */
[65bf084]	216	if (_is_2_byte(b))
[28c39f3]	217	return 1;
	218
	219	/* 1110xxxx 10xxxxxx 10xxxxxx */
[65bf084]	220	if (_is_3_byte(b))
[28c39f3]	221	return 2;
	222
	223	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
[65bf084]	224	if (_is_4_byte(b))
[28c39f3]	225	return 3;
	226
	227	return -1;
	228	}
	229
[65bf084]	230	static bool _is_non_shortest(const mbstate_t *mb, uint8_t b)
	231	{
	232	return (mb->state == 0b1111110000000000 && !(b & 0b00100000)) \|\|
	233	(mb->state == 0b1111111111110000 && !(b & 0b00110000));
	234	}
	235
	236	#define _likely(expr) __builtin_expect((expr), true)
	237	#define _unlikely(expr) __builtin_expect((expr), false)
	238
	239	#define FAST_PATHS 1
	240
	241	static char32_t _str_decode(const char s, size_t offset, size_t size, mbstate_t *mb)
	242	{
	243	assert(s);
	244	assert(offset);
	245	assert(*offset <= size);
	246	assert(size == STR_NO_LIMIT \|\| s + size >= s);
	247	assert(mb);
	248
	249	if (*offset == size)
	250	return 0;
	251
	252	if (_likely(!mb->state)) {
	253	/* Clean slate, read initial byte. */
	254	uint8_t b = s[(*offset)++];
	255
	256	/* Fast exit for the most common case. */
	257	if (_likely(_is_ascii(b)))
	258	return b;
	259
	260	/* unexpected continuation byte */
	261	if (_unlikely(_is_continuation(b)))
	262	return CHAR_INVALID;
	263
	264	/*
	265	* The value stored into `continuation` is designed to have
	266	* just enough leading ones that after shifting in one less than
	267	* the expected number of continuation bytes, the most significant
	268	* bit becomes zero. (The field is 16b wide.)
	269	*/
	270
	271	if (_is_2_byte(b)) {
	272	/* Reject non-shortest form. */
	273	if (_unlikely(!(b & 0b00011110)))
	274	return CHAR_INVALID;
	275
	276	#if FAST_PATHS
	277	/* We can usually take this exit. */
	278	if (_likely(offset < size && _is_continuation(s[offset])))
	279	return (b & UTF8_MASK_INITIAL2) << 6 \|
	280	(s[(*offset)++] & UTF8_MASK_CONT);
	281	#endif
	282
	283	/* 2 byte continuation 110xxxxx */
	284	mb->state = b ^ 0b0000000011000000;
	285
	286	} else if (_is_3_byte(b)) {
	287	#if FAST_PATHS
	288	/* We can usually take this exit. */
	289	if (_likely(offset + 1 < size && _is_continuation(s[offset]) && _is_continuation(s[*offset + 1]))) {
	290
	291	char32_t ch = (b & UTF8_MASK_INITIAL3) << 12 \|
	292	(s[(*offset)] & UTF8_MASK_CONT) << 6 \|
	293	(s[(*offset) + 1] & UTF8_MASK_CONT);
	294
	295	*offset += 2;
	296
	297	/* Reject non-shortest form. */
	298	if (_unlikely(!(ch & 0xFFFFF800)))
	299	return CHAR_INVALID;
	300
	301	return ch;
	302	}
	303	#endif
	304
	305	/* 3 byte continuation 1110xxxx */
	306	mb->state = b ^ 0b1111110011100000;
	307
	308	} else if (_is_4_byte(b)) {
	309	#if FAST_PATHS
	310	/* We can usually take this exit. */
	311	if (_likely(offset + 2 < size && _is_continuation(s[offset]) &&
	312	_is_continuation(s[offset + 1]) && _is_continuation(s[offset + 2]))) {
	313
	314	char32_t ch = (b & UTF8_MASK_INITIAL4) << 18 \|
	315	(s[(*offset)] & UTF8_MASK_CONT) << 12 \|
	316	(s[(*offset) + 1] & UTF8_MASK_CONT) << 6 \|
	317	(s[(*offset) + 2] & UTF8_MASK_CONT);
	318
	319	*offset += 3;
	320
	321	/* Reject non-shortest form. */
	322	if (_unlikely(!(ch & 0xFFFF0000)))
	323	return CHAR_INVALID;
	324
	325	return ch;
	326	}
	327	#endif
	328
	329	/* 4 byte continuation 11110xxx */
	330	mb->state = b ^ 0b1111111100000000;
	331	} else {
	332	return CHAR_INVALID;
	333	}
	334	}
	335
	336	/* Deal with the remaining edge and invalid cases. */
	337	for (; offset < size; (offset)++) {
	338	/* Read continuation bytes. */
	339	uint8_t b = s[*offset];
	340
	341	if (!_is_continuation(b) \|\| _is_non_shortest(mb, b)) {
	342	mb->state = 0;
	343	return CHAR_INVALID;
	344	}
	345
	346	/* Top bit becomes zero when shifting in the second to last byte. */
	347	if (!(mb->state & 0x8000)) {
	348	char32_t c = ((char32_t) mb->state) << 6 \| (b & UTF8_MASK_CONT);
	349	mb->state = 0;
	350	(*offset)++;
	351	return c;
	352	}
	353
	354	mb->state = mb->state << 6 \| (b & UTF8_MASK_CONT);
	355	}
	356
	357	/* Incomplete character. */
	358	assert(mb->state);
	359	return 0;
	360	}
	361
	362	/** Standard <uchar.h> function since C11. */
	363	size_t mbrtoc32(char32_t c, const char s, size_t n, mbstate_t *mb)
	364	{
	365	#if __STDC_HOSTED__
	366	static fibril_local mbstate_t global_state = { };
	367
	368	if (!mb)
	369	mb = &global_state;
	370	#endif
	371
	372	if (!s) {
	373	/* Equivalent to mbrtoc32(NULL, "", 1, mb); */
	374	c = NULL;
	375	s = "";
	376	n = 1;
	377	}
	378
	379	size_t offset = 0;
	380	char32_t ret = _str_decode(s, &offset, n, mb);
	381	if (ret == CHAR_INVALID) {
	382	assert(!mb->state);
	383	_set_ilseq();
	384	return UCHAR_ILSEQ;
	385	}
	386	if (mb->state) {
	387	assert(ret == 0);
	388	return UCHAR_INCOMPLETE;
	389	}
	390
	391	if (c)
	392	*c = ret;
	393	return ret ? offset : 0;
	394	}
	395
[171f9a1]	396	/** Decode a single character from a string.
	397	*
	398	* Decode a single character from a string of size @a size. Decoding starts
	399	* at @a offset and this offset is moved to the beginning of the next
	400	* character. In case of decoding error, offset generally advances at least
	401	* by one. However, offset is never moved beyond size.
	402	*
	403	* @param str String (not necessarily NULL-terminated).
	404	* @param offset Byte offset in string where to start decoding.
	405	* @param size Size of the string (in bytes).
	406	*
	407	* @return Value of decoded character, U_SPECIAL on decoding error or
	408	* NULL if attempt to decode beyond @a size.
	409	*
	410	*/
[28a5ebd]	411	char32_t str_decode(const char str, size_t offset, size_t size)
[171f9a1]	412	{
[65bf084]	413	mbstate_t mb = { };
	414	char32_t ch = _str_decode(str, offset, size, &mb);
[28c39f3]	415
[65bf084]	416	if (ch == CHAR_INVALID)
[171f9a1]	417	return U_SPECIAL;
[a35b458]	418
[65bf084]	419	if (mb.state)
[0600976]	420	return U_SPECIAL;
	421
[171f9a1]	422	return ch;
	423	}
	424
[568693b]	425	/** Decode a single character from a string to the left.
	426	*
	427	* Decode a single character from a string of size @a size. Decoding starts
	428	* at @a offset and this offset is moved to the beginning of the previous
	429	* character. In case of decoding error, offset generally decreases at least
	430	* by one. However, offset is never moved before 0.
	431	*
	432	* @param str String (not necessarily NULL-terminated).
	433	* @param offset Byte offset in string where to start decoding.
	434	* @param size Size of the string (in bytes).
	435	*
	436	* @return Value of decoded character, U_SPECIAL on decoding error or
	437	* NULL if attempt to decode beyond @a start of str.
	438	*
	439	*/
[28a5ebd]	440	char32_t str_decode_reverse(const char str, size_t offset, size_t size)
[568693b]	441	{
	442	if (*offset == 0)
	443	return 0;
[a35b458]	444
[28c39f3]	445	int cbytes = 0;
[568693b]	446	/* Continue while continuation bytes found */
[28c39f3]	447	while (*offset > 0 && cbytes < 4) {
[568693b]	448	uint8_t b = (uint8_t) str[--(*offset)];
[a35b458]	449
[65bf084]	450	if (_is_continuation(b)) {
[28c39f3]	451	cbytes++;
	452	continue;
[568693b]	453	}
[28c39f3]	454
[65bf084]	455	/* Reject non-shortest form encoding. */
[28c39f3]	456	if (cbytes != _continuation_bytes(b))
	457	return U_SPECIAL;
	458
	459	/* Start byte */
	460	size_t start_offset = *offset;
	461	return str_decode(str, &start_offset, size);
[568693b]	462	}
[28c39f3]	463
[568693b]	464	/* Too many continuation bytes */
	465	return U_SPECIAL;
	466	}
	467
[171f9a1]	468	/** Encode a single character to string representation.
	469	*
	470	* Encode a single character to string representation (i.e. UTF-8) and store
	471	* it into a buffer at @a offset. Encoding starts at @a offset and this offset
	472	* is moved to the position where the next character can be written to.
	473	*
	474	* @param ch Input character.
	475	* @param str Output buffer.
	476	* @param offset Byte offset where to start writing.
	477	* @param size Size of the output buffer (in bytes).
	478	*
	479	* @return EOK if the character was encoded successfully, EOVERFLOW if there
[d4a3ee5]	480	* was not enough space in the output buffer or EINVAL if the character
	481	* code was invalid.
[171f9a1]	482	*/
[28c39f3]	483	errno_t chr_encode(char32_t ch, char str, size_t offset, size_t size)
[171f9a1]	484	{
[65bf084]	485	// TODO: merge with c32rtomb()
	486
[171f9a1]	487	if (*offset >= size)
	488	return EOVERFLOW;
[a35b458]	489
[28c39f3]	490	/* Fast exit for the most common case. */
	491	if (ch < 0x80) {
	492	str[(*offset)++] = (char) ch;
	493	return EOK;
	494	}
	495
	496	/* Codes longer than 21 bits are not supported */
[171f9a1]	497	if (!chr_check(ch))
	498	return EINVAL;
[a35b458]	499
[171f9a1]	500	/* Determine how many continuation bytes are needed */
[a35b458]	501
[28c39f3]	502	unsigned int cbytes = _char_continuation_bytes(ch);
	503	unsigned int b0_bits = 6 - cbytes; /* Data bits in first byte */
[a35b458]	504
[171f9a1]	505	/* Check for available space in buffer */
	506	if (*offset + cbytes >= size)
	507	return EOVERFLOW;
[a35b458]	508
[171f9a1]	509	/* Encode continuation bytes */
	510	unsigned int i;
	511	for (i = cbytes; i > 0; i--) {
[28c39f3]	512	str[*offset + i] = 0x80 \| (ch & LO_MASK_32(CONT_BITS));
	513	ch >>= CONT_BITS;
[171f9a1]	514	}
[a35b458]	515
[171f9a1]	516	/* Encode first byte */
[28c39f3]	517	str[*offset] = (ch & LO_MASK_32(b0_bits)) \| HI_MASK_8(8 - b0_bits - 1);
[a35b458]	518
[171f9a1]	519	/* Advance offset */
	520	*offset += cbytes + 1;
[a35b458]	521
[171f9a1]	522	return EOK;
	523	}
	524
[28c39f3]	525	/* Convert in place any bytes that don't form a valid character into U_SPECIAL. */
[0600976]	526	static void _sanitize_string(char *str, size_t n)
[28c39f3]	527	{
[0600976]	528	uint8_t b = (uint8_t ) str;
	529
	530	for (; *b && n > 0; b++, n--) {
	531	int cont = _continuation_bytes(b[0]);
	532	if (__builtin_expect(cont, 0) == 0)
[28c39f3]	533	continue;
	534
	535	if (cont < 0 \|\| n <= (size_t) cont) {
[0600976]	536	b[0] = U_SPECIAL;
[28c39f3]	537	continue;
	538	}
	539
[0600976]	540	/* Check continuation bytes. */
[28c39f3]	541	for (int i = 1; i <= cont; i++) {
[65bf084]	542	if (!_is_continuation(b[i])) {
[0600976]	543	b[0] = U_SPECIAL;
[28c39f3]	544	continue;
	545	}
	546	}
[0600976]	547
	548	/*
	549	* Check for non-shortest form encoding.
	550	* See https://www.unicode.org/versions/corrigendum1.html
	551	*/
	552
	553	switch (cont) {
	554	case 1:
	555	/* 0b110!!!!x 0b10xxxxxx */
	556	if (!(b[0] & 0b00011110))
	557	b[0] = U_SPECIAL;
	558
	559	continue;
	560	case 2:
	561	/* 0b1110!!!! 0b10!xxxxx 0b10xxxxxx */
	562	if (!(b[0] & 0b00001111) && !(b[1] & 0b00100000))
	563	b[0] = U_SPECIAL;
	564
	565	continue;
	566	case 3:
	567	/* 0b11110!!! 0b10!!xxxx 0b10xxxxxx 0b10xxxxxx */
	568	if (!(b[0] & 0b00000111) && !(b[1] & 0b00110000))
	569	b[0] = U_SPECIAL;
	570
	571	continue;
	572	}
[28c39f3]	573	}
	574	}
	575
	576	static size_t _str_size(const char *str)
	577	{
	578	size_t size = 0;
	579
	580	while (*str++ != 0)
	581	size++;
	582
	583	return size;
	584	}
	585
/** Get size of string.
 *
 * Get the number of bytes which are used by the string @a str (excluding the
 * NUL terminator).
 *
 * @param str String to consider.
 *
 * @return Number of bytes used by the string
 *
 */
size_t str_size(const char *str)
{
	return _str_size(str);
}
	600
	601	/** Get size of wide string.
	602	*
	603	* Get the number of bytes which are used by the wide string @a str (excluding the
	604	* NULL-terminator).
	605	*
	606	* @param str Wide string to consider.
	607	*
	608	* @return Number of bytes used by the wide string
	609	*
	610	*/
[28a5ebd]	611	size_t wstr_size(const char32_t *str)
[f2b8cdc]	612	{
[28a5ebd]	613	return (wstr_length(str) * sizeof(char32_t));
[f2b8cdc]	614	}
	615
	616	/** Get size of string with length limit.
	617	*
	618	* Get the number of bytes which are used by up to @a max_len first
	619	* characters in the string @a str. If @a max_len is greater than
	620	* the length of @a str, the entire string is measured (excluding the
	621	* NULL-terminator).
	622	*
	623	* @param str String to consider.
	624	* @param max_len Maximum number of characters to measure.
	625	*
	626	* @return Number of bytes used by the characters.
	627	*
	628	*/
[d4a3ee5]	629	size_t str_lsize(const char *str, size_t max_len)
[f2b8cdc]	630	{
[d4a3ee5]	631	size_t len = 0;
[f2b8cdc]	632	size_t offset = 0;
[a35b458]	633
[f2b8cdc]	634	while (len < max_len) {
	635	if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
	636	break;
[a35b458]	637
[f2b8cdc]	638	len++;
	639	}
[a35b458]	640
[f2b8cdc]	641	return offset;
	642	}
	643
/** Count the bytes of @a str before the NUL terminator, up to @a max_size.
 *
 * The limit is tested before each byte is read, so no byte at or beyond
 * @a max_size is ever accessed.
 */
static size_t _str_nsize(const char *str, size_t max_size)
{
	size_t size = 0;

	while (size < max_size && str[size] != 0)
		size++;

	return size;
}
	653
/** Get size of string with size limit.
 *
 * Get the number of bytes which are used by the string @a str
 * (excluding the NUL terminator), but no more than @a max_size bytes.
 *
 * @param str      String to consider.
 * @param max_size Maximum number of bytes to measure.
 *
 * @return Number of bytes used by the string
 *
 */
size_t str_nsize(const char *str, size_t max_size)
{
	return _str_nsize(str, max_size);
}
	669
	670	/** Get size of wide string with size limit.
	671	*
	672	* Get the number of bytes which are used by the wide string @a str
	673	* (excluding the NULL-terminator), but no more than @max_size bytes.
	674	*
	675	* @param str Wide string to consider.
	676	* @param max_size Maximum number of bytes to measure.
	677	*
	678	* @return Number of bytes used by the wide string
	679	*
	680	*/
[28a5ebd]	681	size_t wstr_nsize(const char32_t *str, size_t max_size)
[560d79f]	682	{
[28a5ebd]	683	return (wstr_nlength(str, max_size) * sizeof(char32_t));
[560d79f]	684	}
	685
/** Get size of wide string with length limit.
 *
 * Get the number of bytes which are used by up to @a max_len first
 * wide characters in the wide string @a str. If @a max_len is greater than
 * the length of @a str, the entire wide string is measured (excluding the
 * NUL terminator).
 *
 * @param str     Wide string to consider.
 * @param max_len Maximum number of wide characters to measure.
 *
 * @return Number of bytes used by the wide characters.
 *
 */
size_t wstr_lsize(const char32_t *str, size_t max_len)
{
	/*
	 * Count characters directly rather than converting max_len to a byte
	 * limit, which could wrap around for very large max_len.
	 */
	size_t len = 0;

	while (len < max_len && str[len] != 0)
		len++;

	return len * sizeof(char32_t);
}
	703
	704	/** Get number of characters in a string.
	705	*
	706	* @param str NULL-terminated string.
	707	*
	708	* @return Number of characters in string.
	709	*
	710	*/
[d4a3ee5]	711	size_t str_length(const char *str)
[f2b8cdc]	712	{
[d4a3ee5]	713	size_t len = 0;
[f2b8cdc]	714	size_t offset = 0;
[a35b458]	715
[f2b8cdc]	716	while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
	717	len++;
[a35b458]	718
[f2b8cdc]	719	return len;
	720	}
	721
	722	/** Get number of characters in a wide string.
	723	*
	724	* @param str NULL-terminated wide string.
	725	*
	726	* @return Number of characters in @a str.
	727	*
	728	*/
[28a5ebd]	729	size_t wstr_length(const char32_t *wstr)
[f2b8cdc]	730	{
[d4a3ee5]	731	size_t len = 0;
[a35b458]	732
[f2b8cdc]	733	while (*wstr++ != 0)
	734	len++;
[a35b458]	735
[f2b8cdc]	736	return len;
	737	}
	738
	739	/** Get number of characters in a string with size limit.
	740	*
	741	* @param str NULL-terminated string.
	742	* @param size Maximum number of bytes to consider.
	743	*
	744	* @return Number of characters in string.
	745	*
	746	*/
[d4a3ee5]	747	size_t str_nlength(const char *str, size_t size)
[f2b8cdc]	748	{
[d4a3ee5]	749	size_t len = 0;
[f2b8cdc]	750	size_t offset = 0;
[a35b458]	751
[f2b8cdc]	752	while (str_decode(str, &offset, size) != 0)
	753	len++;
[a35b458]	754
[f2b8cdc]	755	return len;
	756	}
	757
	758	/** Get number of characters in a string with size limit.
	759	*
	760	* @param str NULL-terminated string.
	761	* @param size Maximum number of bytes to consider.
	762	*
	763	* @return Number of characters in string.
	764	*
	765	*/
[28a5ebd]	766	size_t wstr_nlength(const char32_t *str, size_t size)
[f2b8cdc]	767	{
[d4a3ee5]	768	size_t len = 0;
[28a5ebd]	769	size_t limit = ALIGN_DOWN(size, sizeof(char32_t));
[d4a3ee5]	770	size_t offset = 0;
[a35b458]	771
[f2b8cdc]	772	while ((offset < limit) && (*str++ != 0)) {
	773	len++;
[28a5ebd]	774	offset += sizeof(char32_t);
[f2b8cdc]	775	}
[a35b458]	776
[f2b8cdc]	777	return len;
	778	}
	779
/** Get character display width on a character cell display.
 *
 * The current implementation reports one cell for every character.
 *
 * @param ch Character
 * @return Width of character in cells.
 */
size_t chr_width(char32_t ch)
{
	(void) ch;
	return 1;
}
	789
	790	/** Get string display width on a character cell display.
	791	*
	792	* @param str String
	793	* @return Width of string in cells.
	794	*/
	795	size_t str_width(const char *str)
	796	{
	797	size_t width = 0;
	798	size_t offset = 0;
[28a5ebd]	799	char32_t ch;
[a35b458]	800
[be2a38ad]	801	while ((ch = str_decode(str, &offset, STR_NO_LIMIT)) != 0)
	802	width += chr_width(ch);
[a35b458]	803
[be2a38ad]	804	return width;
	805	}
	806
/** Check whether character is plain ASCII.
 *
 * @param ch Character to test.
 *
 * @return True if character is plain ASCII (code point 0 .. 127).
 *
 */
bool ascii_check(char32_t ch)
{
	return ch <= 127;
}
	819
/** Check whether character is valid
 *
 * @param ch Character to test.
 *
 * @return True if character is a valid Unicode code point
 *         (0 .. 1114111, i.e. up to U+10FFFF).
 *
 */
bool chr_check(char32_t ch)
{
	return ch <= 1114111;
}
[936351c1]	832
/** Compare two NUL-terminated strings.
 *
 * Do a char-by-char comparison of two NUL-terminated strings.
 * The strings are considered equal iff their length is equal
 * and both strings consist of the same sequence of characters.
 *
 * A string S1 is less than another string S2 if it has a character with
 * lower value at the first character position where the strings differ.
 * If the strings differ in length, the shorter one is treated as if
 * padded by characters with a value of zero.
 *
 * @param s1 First string to compare.
 * @param s2 Second string to compare.
 *
 * @return 0 if the strings are equal, -1 if the first is less than the second,
 *         1 if the second is less than the first.
 *
 */
int str_cmp(const char *s1, const char *s2)
{
	/*
	 * UTF-8 has the nice property that lexicographic ordering on bytes is
	 * the same as the lexicographic ordering of the character sequences.
	 * That only holds for unsigned bytes, so compare as unsigned char;
	 * with a signed char type bytes >= 0x80 would sort before ASCII.
	 */
	const unsigned char *a = (const unsigned char *) s1;
	const unsigned char *b = (const unsigned char *) s2;

	while (*a == *b && *a != 0) {
		a++;
		b++;
	}

	if (*a == *b)
		return 0;

	return (*a < *b) ? -1 : 1;
}
	867
	868	/** Compare two NULL terminated strings with length limit.
	869	*
	870	* Do a char-by-char comparison of two NULL-terminated strings.
[4efeab5]	871	* The strings are considered equal iff
	872	* min(str_length(s1), max_len) == min(str_length(s2), max_len)
	873	* and both strings consist of the same sequence of characters,
	874	* up to max_len characters.
	875	*
[1772e6d]	876	* A string S1 is less than another string S2 if it has a character with
	877	* lower value at the first character position where the strings differ.
	878	* If the strings differ in length, the shorter one is treated as if
	879	* padded by characters with a value of zero. Only the first max_len
	880	* characters are considered.
[f2b8cdc]	881	*
	882	* @param s1 First string to compare.
	883	* @param s2 Second string to compare.
	884	* @param max_len Maximum number of characters to consider.
	885	*
[1772e6d]	886	* @return 0 if the strings are equal, -1 if the first is less than the second,
	887	* 1 if the second is less than the first.
[f2b8cdc]	888	*
	889	*/
[d4a3ee5]	890	int str_lcmp(const char s1, const char s2, size_t max_len)
[f2b8cdc]	891	{
[28a5ebd]	892	char32_t c1 = 0;
	893	char32_t c2 = 0;
[8227d63]	894
[f2b8cdc]	895	size_t off1 = 0;
	896	size_t off2 = 0;
[8227d63]	897
[d4a3ee5]	898	size_t len = 0;
[f2b8cdc]	899
	900	while (true) {
	901	if (len >= max_len)
	902	break;
	903
	904	c1 = str_decode(s1, &off1, STR_NO_LIMIT);
	905	c2 = str_decode(s2, &off2, STR_NO_LIMIT);
	906
[8227d63]	907	if (c1 < c2)
	908	return -1;
	909
	910	if (c1 > c2)
	911	return 1;
	912
	913	if (c1 == 0 \|\| c2 == 0)
	914	break;
	915
	916	++len;
	917	}
	918
	919	return 0;
	920
	921	}
	922
	923	/** Compare two NULL terminated strings in case-insensitive manner.
	924	*
	925	* Do a char-by-char comparison of two NULL-terminated strings.
	926	* The strings are considered equal iff their length is equal
	927	* and both strings consist of the same sequence of characters
	928	* when converted to lower case.
	929	*
	930	* A string S1 is less than another string S2 if it has a character with
	931	* lower value at the first character position where the strings differ.
	932	* If the strings differ in length, the shorter one is treated as if
	933	* padded by characters with a value of zero.
	934	*
	935	* @param s1 First string to compare.
	936	* @param s2 Second string to compare.
	937	*
	938	* @return 0 if the strings are equal, -1 if the first is less than the second,
	939	* 1 if the second is less than the first.
	940	*
	941	*/
	942	int str_casecmp(const char s1, const char s2)
	943	{
[28c39f3]	944	// FIXME: doesn't work for non-ASCII caseful characters
	945
[28a5ebd]	946	char32_t c1 = 0;
	947	char32_t c2 = 0;
[8227d63]	948
	949	size_t off1 = 0;
	950	size_t off2 = 0;
	951
	952	while (true) {
	953	c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
	954	c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
	955
	956	if (c1 < c2)
	957	return -1;
	958
	959	if (c1 > c2)
	960	return 1;
	961
	962	if (c1 == 0 \|\| c2 == 0)
	963	break;
	964	}
	965
	966	return 0;
	967	}
	968
	969	/** Compare two NULL terminated strings with length limit in case-insensitive
	970	* manner.
	971	*
	972	* Do a char-by-char comparison of two NULL-terminated strings.
	973	* The strings are considered equal iff
	974	* min(str_length(s1), max_len) == min(str_length(s2), max_len)
	975	* and both strings consist of the same sequence of characters,
	976	* up to max_len characters.
	977	*
	978	* A string S1 is less than another string S2 if it has a character with
	979	* lower value at the first character position where the strings differ.
	980	* If the strings differ in length, the shorter one is treated as if
	981	* padded by characters with a value of zero. Only the first max_len
	982	* characters are considered.
	983	*
	984	* @param s1 First string to compare.
	985	* @param s2 Second string to compare.
	986	* @param max_len Maximum number of characters to consider.
	987	*
	988	* @return 0 if the strings are equal, -1 if the first is less than the second,
	989	* 1 if the second is less than the first.
	990	*
	991	*/
	992	int str_lcasecmp(const char s1, const char s2, size_t max_len)
	993	{
[28c39f3]	994	// FIXME: doesn't work for non-ASCII caseful characters
	995
[28a5ebd]	996	char32_t c1 = 0;
	997	char32_t c2 = 0;
[a35b458]	998
[8227d63]	999	size_t off1 = 0;
	1000	size_t off2 = 0;
[a35b458]	1001
[8227d63]	1002	size_t len = 0;
	1003
	1004	while (true) {
	1005	if (len >= max_len)
	1006	break;
	1007
	1008	c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
	1009	c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
	1010
[f2b8cdc]	1011	if (c1 < c2)
	1012	return -1;
	1013
	1014	if (c1 > c2)
	1015	return 1;
	1016
	1017	if (c1 == 0 \|\| c2 == 0)
	1018	break;
	1019
[1b20da0]	1020	++len;
[f2b8cdc]	1021	}
	1022
	1023	return 0;
	1024
	1025	}
	1026
/** Byte-wise test whether @a p is a prefix of @a s.
 *
 * Walks both strings while their bytes match; @a p is a prefix exactly when
 * the walk stops at the terminator of @a p.
 */
static bool _test_prefix(const char *s, const char *p)
{
	while (*s == *p && *s != 0) {
		s++;
		p++;
	}

	return *p == 0;
}
	1036
/** Test whether p is a prefix of s.
 *
 * Do a char-by-char comparison of two NUL-terminated strings
 * and determine if p is a prefix of s.
 *
 * @param s The string in which to look
 * @param p The string to check if it is a prefix of s
 *
 * @return true iff p is prefix of s else false
 *
 */
bool str_test_prefix(const char *s, const char *p)
{
	return _test_prefix(s, p);
}
	1052
[086cab0]	1053	/** Get a string suffix.
	1054	*
	1055	* Return a string suffix defined by the prefix length.
	1056	*
	1057	* @param s The string to get the suffix from.
	1058	* @param prefix_length Number of prefix characters to ignore.
	1059	*
	1060	* @return String suffix.
	1061	*
	1062	*/
	1063	const char str_suffix(const char s, size_t prefix_length)
	1064	{
	1065	size_t off = 0;
	1066	size_t i = 0;
	1067
	1068	while (true) {
	1069	str_decode(s, &off, STR_NO_LIMIT);
	1070	i++;
	1071
	1072	if (i >= prefix_length)
	1073	break;
	1074	}
	1075
	1076	return s + off;
	1077	}
	1078
/** Copy string as a sequence of bytes.
 *
 * Bytes are copied front to back, so the copy also works when @a dest
 * points into the same buffer at a lower address than @a src.
 *
 * @param dest Destination buffer, assumed large enough for @a src
 *             including its terminator.
 * @param src  Null-terminated source string.
 */
static void _str_cpy(char *dest, const char *src)
{
	while (*src)
		*(dest++) = *(src++);

	*dest = 0;
}
	1087
	1088	/** Copy string as a sequence of bytes. */
	1089	static void _str_cpyn(char dest, size_t size, const char src)
	1090	{
[0600976]	1091	assert(dest && src && size);
	1092
	1093	if (!dest \|\| !src \|\| !size)
	1094	return;
	1095
	1096	if (size == STR_NO_LIMIT)
	1097	return _str_cpy(dest, src);
	1098
[28c39f3]	1099	char *dest_top = dest + size - 1;
[0600976]	1100	assert(size == 1 \|\| dest < dest_top);
[28c39f3]	1101
	1102	while (*src && dest < dest_top)
	1103	(dest++) = (src++);
	1104
	1105	*dest = 0;
	1106	}
	1107
[6eb2e96]	1108	/** Copy string.
[f2b8cdc]	1109	*
[6eb2e96]	1110	* Copy source string @a src to destination buffer @a dest.
	1111	* No more than @a size bytes are written. If the size of the output buffer
	1112	* is at least one byte, the output string will always be well-formed, i.e.
	1113	* null-terminated and containing only complete characters.
[f2b8cdc]	1114	*
[abf09311]	1115	* @param dest Destination buffer.
[6700ee2]	1116	* @param count Size of the destination buffer (must be > 0).
[6eb2e96]	1117	* @param src Source string.
[8e893ae]	1118	*
[f2b8cdc]	1119	*/
[6eb2e96]	1120	void str_cpy(char dest, size_t size, const char src)
[f2b8cdc]	1121	{
[6700ee2]	1122	/* There must be space for a null terminator in the buffer. */
	1123	assert(size > 0);
[d066259]	1124	assert(src != NULL);
[28c39f3]	1125	assert(dest != NULL);
[0600976]	1126	assert(size == STR_NO_LIMIT \|\| dest + size > dest);
[a35b458]	1127
[28c39f3]	1128	/* Copy data. */
	1129	_str_cpyn(dest, size, src);
[a35b458]	1130
[28c39f3]	1131	/* In-place translate invalid bytes to U_SPECIAL. */
[0600976]	1132	_sanitize_string(dest, size);
[6eb2e96]	1133	}
	1134
	1135	/** Copy size-limited substring.
	1136	*
[6700ee2]	1137	* Copy prefix of string @a src of max. size @a size to destination buffer
	1138	* @a dest. No more than @a size bytes are written. The output string will
	1139	* always be well-formed, i.e. null-terminated and containing only complete
	1140	* characters.
[6eb2e96]	1141	*
	1142	* No more than @a n bytes are read from the input string, so it does not
	1143	* have to be null-terminated.
	1144	*
[abf09311]	1145	* @param dest Destination buffer.
[6700ee2]	1146	* @param count Size of the destination buffer (must be > 0).
[6eb2e96]	1147	* @param src Source string.
[abf09311]	1148	* @param n Maximum number of bytes to read from @a src.
[8e893ae]	1149	*
[6eb2e96]	1150	*/
	1151	void str_ncpy(char dest, size_t size, const char src, size_t n)
	1152	{
[6700ee2]	1153	/* There must be space for a null terminator in the buffer. */
	1154	assert(size > 0);
[28c39f3]	1155	assert(src != NULL);
[a35b458]	1156
[28c39f3]	1157	/* Copy data. */
	1158	_str_cpyn(dest, min(size, n + 1), src);
[a35b458]	1159
[28c39f3]	1160	/* In-place translate invalid bytes to U_SPECIAL. */
[0600976]	1161	_sanitize_string(dest, size);
[f2b8cdc]	1162	}
	1163
[4482bc7]	1164	/** Append one string to another.
	1165	*
	1166	* Append source string @a src to string in destination buffer @a dest.
	1167	* Size of the destination buffer is @a dest. If the size of the output buffer
	1168	* is at least one byte, the output string will always be well-formed, i.e.
	1169	* null-terminated and containing only complete characters.
	1170	*
[0f06dbc]	1171	* @param dest Destination buffer.
[4482bc7]	1172	* @param count Size of the destination buffer.
	1173	* @param src Source string.
	1174	*/
	1175	void str_append(char dest, size_t size, const char src)
	1176	{
[28c39f3]	1177	assert(src != NULL);
	1178	assert(dest != NULL);
	1179	assert(size > 0);
[0600976]	1180	assert(size == STR_NO_LIMIT \|\| dest + size > dest);
[a35b458]	1181
[28c39f3]	1182	size_t dstr_size = _str_nsize(dest, size);
[0600976]	1183	if (dstr_size < size) {
	1184	_str_cpyn(dest + dstr_size, size - dstr_size, src);
	1185	_sanitize_string(dest + dstr_size, size - dstr_size);
	1186	}
[4482bc7]	1187	}
	1188
[dcb74c0a]	1189	/** Convert space-padded ASCII to string.
	1190	*
	1191	* Common legacy text encoding in hardware is 7-bit ASCII fitted into
[c3d19ac]	1192	* a fixed-width byte buffer (bit 7 always zero), right-padded with spaces
[dcb74c0a]	1193	* (ASCII 0x20). Convert space-padded ascii to string representation.
	1194	*
	1195	* If the text does not fit into the destination buffer, the function converts
	1196	* as many characters as possible and returns EOVERFLOW.
	1197	*
	1198	* If the text contains non-ASCII bytes (with bit 7 set), the whole string is
	1199	* converted anyway and invalid characters are replaced with question marks
	1200	* (U_SPECIAL) and the function returns EIO.
	1201	*
	1202	* Regardless of return value upon return @a dest will always be well-formed.
	1203	*
	1204	* @param dest Destination buffer
	1205	* @param size Size of destination buffer
	1206	* @param src Space-padded ASCII.
	1207	* @param n Size of the source buffer in bytes.
	1208	*
	1209	* @return EOK on success, EOVERFLOW if the text does not fit
	1210	* destination buffer, EIO if the text contains
	1211	* non-ASCII bytes.
	1212	*/
[b7fd2a0]	1213	errno_t spascii_to_str(char dest, size_t size, const uint8_t src, size_t n)
[dcb74c0a]	1214	{
[28c39f3]	1215	size_t len = 0;
[dcb74c0a]	1216
[28c39f3]	1217	/* Determine the length of the source string. */
	1218	for (size_t i = 0; i < n; i++) {
	1219	if (src[i] == 0)
	1220	break;
	1221
	1222	if (src[i] != ' ')
	1223	len = i + 1;
	1224	}
	1225
	1226	errno_t result = EOK;
	1227	size_t out_len = min(len, size - 1);
	1228
	1229	/* Copy characters */
	1230	for (size_t i = 0; i < out_len; i++) {
	1231	dest[i] = src[i];
	1232
	1233	if (dest[i] < 0) {
	1234	dest[i] = U_SPECIAL;
[dcb74c0a]	1235	result = EIO;
	1236	}
[28c39f3]	1237	}
[dcb74c0a]	1238
[28c39f3]	1239	dest[out_len] = 0;
[dcb74c0a]	1240
[28c39f3]	1241	if (out_len < len)
	1242	return EOVERFLOW;
[dcb74c0a]	1243
	1244	return result;
	1245	}
	1246
[0f06dbc]	1247	/** Convert wide string to string.
[f2b8cdc]	1248	*
[0f06dbc]	1249	* Convert wide string @a src to string. The output is written to the buffer
	1250	* specified by @a dest and @a size. @a size must be non-zero and the string
	1251	* written will always be well-formed.
[f2b8cdc]	1252	*
[0f06dbc]	1253	* @param dest Destination buffer.
	1254	* @param size Size of the destination buffer.
	1255	* @param src Source wide string.
[f2b8cdc]	1256	*/
[28a5ebd]	1257	void wstr_to_str(char dest, size_t size, const char32_t src)
[f2b8cdc]	1258	{
[28a5ebd]	1259	char32_t ch;
[0f06dbc]	1260	size_t src_idx;
	1261	size_t dest_off;
	1262
	1263	/* There must be space for a null terminator in the buffer. */
	1264	assert(size > 0);
[a35b458]	1265
[0f06dbc]	1266	src_idx = 0;
	1267	dest_off = 0;
	1268
[f2b8cdc]	1269	while ((ch = src[src_idx++]) != 0) {
[81e9cb3]	1270	if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
[f2b8cdc]	1271	break;
	1272	}
[0f06dbc]	1273
	1274	dest[dest_off] = '\0';
[f2b8cdc]	1275	}
	1276
[82374b2]	1277	/** Convert UTF16 string to string.
	1278	*
	1279	* Convert utf16 string @a src to string. The output is written to the buffer
	1280	* specified by @a dest and @a size. @a size must be non-zero and the string
	1281	* written will always be well-formed. Surrogate pairs also supported.
	1282	*
	1283	* @param dest Destination buffer.
	1284	* @param size Size of the destination buffer.
	1285	* @param src Source utf16 string.
	1286	*
[cde999a]	1287	* @return EOK, if success, an error code otherwise.
[82374b2]	1288	*/
[b7fd2a0]	1289	errno_t utf16_to_str(char dest, size_t size, const uint16_t src)
[82374b2]	1290	{
[abb7491c]	1291	size_t idx = 0, dest_off = 0;
[28a5ebd]	1292	char32_t ch;
[b7fd2a0]	1293	errno_t rc = EOK;
[82374b2]	1294
	1295	/* There must be space for a null terminator in the buffer. */
	1296	assert(size > 0);
	1297
	1298	while (src[idx]) {
	1299	if ((src[idx] & 0xfc00) == 0xd800) {
[abb7491c]	1300	if (src[idx + 1] && (src[idx + 1] & 0xfc00) == 0xdc00) {
[82374b2]	1301	ch = 0x10000;
	1302	ch += (src[idx] & 0x03FF) << 10;
[abb7491c]	1303	ch += (src[idx + 1] & 0x03FF);
[82374b2]	1304	idx += 2;
[1433ecda]	1305	} else
[82374b2]	1306	break;
	1307	} else {
	1308	ch = src[idx];
	1309	idx++;
	1310	}
[abb7491c]	1311	rc = chr_encode(ch, dest, &dest_off, size - 1);
[82374b2]	1312	if (rc != EOK)
	1313	break;
	1314	}
	1315	dest[dest_off] = '\0';
	1316	return rc;
	1317	}
	1318
[b06414f]	1319	/** Convert string to UTF16 string.
	1320	*
	1321	* Convert string @a src to utf16 string. The output is written to the buffer
	1322	* specified by @a dest and @a dlen. @a dlen must be non-zero and the string
	1323	* written will always be well-formed. Surrogate pairs also supported.
	1324	*
	1325	* @param dest Destination buffer.
	1326	* @param dlen Number of utf16 characters that fit in the destination buffer.
	1327	* @param src Source string.
	1328	*
[cde999a]	1329	* @return EOK, if success, an error code otherwise.
[b06414f]	1330	*/
[b7fd2a0]	1331	errno_t str_to_utf16(uint16_t dest, size_t dlen, const char src)
[fc97128]	1332	{
[b7fd2a0]	1333	errno_t rc = EOK;
[abb7491c]	1334	size_t offset = 0;
	1335	size_t idx = 0;
[28a5ebd]	1336	char32_t c;
[fc97128]	1337
[b06414f]	1338	assert(dlen > 0);
[a35b458]	1339
[fc97128]	1340	while ((c = str_decode(src, &offset, STR_NO_LIMIT)) != 0) {
	1341	if (c > 0x10000) {
[b06414f]	1342	if (idx + 2 >= dlen - 1) {
[abb7491c]	1343	rc = EOVERFLOW;
[fc97128]	1344	break;
	1345	}
	1346	c = (c - 0x10000);
	1347	dest[idx] = 0xD800 \| (c >> 10);
[abb7491c]	1348	dest[idx + 1] = 0xDC00 \| (c & 0x3FF);
[fc97128]	1349	idx++;
	1350	} else {
[1433ecda]	1351	dest[idx] = c;
[fc97128]	1352	}
	1353
	1354	idx++;
[b06414f]	1355	if (idx >= dlen - 1) {
[abb7491c]	1356	rc = EOVERFLOW;
[fc97128]	1357	break;
	1358	}
	1359	}
	1360
	1361	dest[idx] = '\0';
	1362	return rc;
[f2b8cdc]	1363	}
	1364
/** Get size of UTF-16 string.
 *
 * Count the 16-bit words of the UTF-16 string @a ustr that precede
 * its terminating zero word.
 *
 * @param ustr UTF-16 string to consider.
 *
 * @return Number of words used by the UTF-16 string, terminator excluded.
 *
 */
size_t utf16_wsize(const uint16_t *ustr)
{
	const uint16_t *end = ustr;

	/* Walk to the terminating zero word. */
	while (*end != 0)
		end++;

	return (size_t) (end - ustr);
}
	1384
[b67c7d64]	1385	/** Convert wide string to new string.
	1386	*
	1387	* Convert wide string @a src to string. Space for the new string is allocated
	1388	* on the heap.
	1389	*
	1390	* @param src Source wide string.
	1391	* @return New string.
	1392	*/
[28a5ebd]	1393	char wstr_to_astr(const char32_t src)
[b67c7d64]	1394	{
	1395	char dbuf[STR_BOUNDS(1)];
	1396	char *str;
[28a5ebd]	1397	char32_t ch;
[b67c7d64]	1398
	1399	size_t src_idx;
	1400	size_t dest_off;
	1401	size_t dest_size;
	1402
	1403	/* Compute size of encoded string. */
	1404
	1405	src_idx = 0;
	1406	dest_size = 0;
	1407
	1408	while ((ch = src[src_idx++]) != 0) {
	1409	dest_off = 0;
	1410	if (chr_encode(ch, dbuf, &dest_off, STR_BOUNDS(1)) != EOK)
	1411	break;
	1412	dest_size += dest_off;
	1413	}
	1414
	1415	str = malloc(dest_size + 1);
	1416	if (str == NULL)
	1417	return NULL;
	1418
	1419	/* Encode string. */
	1420
	1421	src_idx = 0;
	1422	dest_off = 0;
	1423
	1424	while ((ch = src[src_idx++]) != 0) {
	1425	if (chr_encode(ch, str, &dest_off, dest_size) != EOK)
	1426	break;
	1427	}
	1428
	1429	str[dest_size] = '\0';
	1430	return str;
	1431	}
	1432
[da2bd08]	1433	/** Convert string to wide string.
	1434	*
	1435	* Convert string @a src to wide string. The output is written to the
[0f06dbc]	1436	* buffer specified by @a dest and @a dlen. @a dlen must be non-zero
	1437	* and the wide string written will always be null-terminated.
[da2bd08]	1438	*
	1439	* @param dest Destination buffer.
	1440	* @param dlen Length of destination buffer (number of wchars).
	1441	* @param src Source string.
	1442	*/
[28a5ebd]	1443	void str_to_wstr(char32_t dest, size_t dlen, const char src)
[da2bd08]	1444	{
	1445	size_t offset;
	1446	size_t di;
[28a5ebd]	1447	char32_t c;
[da2bd08]	1448
	1449	assert(dlen > 0);
	1450
	1451	offset = 0;
	1452	di = 0;
	1453
	1454	do {
[81e9cb3]	1455	if (di >= dlen - 1)
[da2bd08]	1456	break;
	1457
	1458	c = str_decode(src, &offset, STR_NO_LIMIT);
	1459	dest[di++] = c;
	1460	} while (c != '\0');
	1461
	1462	dest[dlen - 1] = '\0';
	1463	}
	1464
/** Convert string to new wide string.
 *
 * Convert string @a str to wide string. A new null-terminated wide
 * string is allocated on the heap.
 *
 * @param str Source string.
 *
 * @return New wide string, or NULL if the allocation fails.
 */
char32_t *str_to_awstr(const char *str)
{
	size_t len = str_length(str);

	/* One extra element for the terminator. */
	char32_t *wstr = calloc(len + 1, sizeof(char32_t));
	if (wstr == NULL)
		return NULL;

	str_to_wstr(wstr, len + 1, str);
	return wstr;
}
	1483
/** Find the first occurrence of a byte in a string.
 *
 * @param str Null-terminated string to search.
 * @param c   Byte to look for. If zero, the terminator itself matches.
 *
 * @return Pointer to the first matching byte, or NULL if there is none.
 */
static char *_strchr(const char *str, char c)
{
	while (*str != 0 && *str != c)
		str++;

	return (*str == c) ? (char *) str : NULL;
}
	1491
[f2b8cdc]	1492	/** Find first occurence of character in string.
	1493	*
	1494	* @param str String to search.
	1495	* @param ch Character to look for.
	1496	*
	1497	* @return Pointer to character in @a str or NULL if not found.
	1498	*/
[28a5ebd]	1499	char str_chr(const char str, char32_t ch)
[f2b8cdc]	1500	{
[28c39f3]	1501	/* Fast path for an ASCII character. */
	1502	if (ascii_check(ch))
	1503	return _strchr(str, ch);
[a35b458]	1504
[28c39f3]	1505	/* Convert character to UTF-8. */
	1506	char utf8[STR_BOUNDS(1) + 1];
	1507	size_t offset = 0;
	1508
	1509	if (chr_encode(ch, utf8, &offset, sizeof(utf8)) != EOK \|\| offset == 0)
	1510	return NULL;
	1511
	1512	utf8[offset] = '\0';
	1513
	1514	/* Find the first byte, then check if all of them are correct. */
	1515	while (*str != 0) {
	1516	str = _strchr(str, utf8[0]);
	1517	if (!str)
	1518	return NULL;
	1519
	1520	if (_test_prefix(str, utf8))
	1521	return (char *) str;
	1522
	1523	str++;
[f2b8cdc]	1524	}
[a35b458]	1525
[f2b8cdc]	1526	return NULL;
	1527	}
	1528
/** Find first occurrence of substring in string.
 *
 * @param hs Haystack (string)
 * @param n  Needle (substring to look for)
 *
 * @return Pointer to character in @a hs or @c NULL if not found.
 */
char *str_str(const char *hs, const char *n)
{
	size_t hsize = _str_size(hs);
	size_t nsize = _str_size(n);

	/* Stop as soon as the rest of the haystack is shorter than the needle. */
	while (hsize >= nsize) {
		if (_test_prefix(hs, n))
			return (char *) hs;

		hs++;
		hsize--;
	}

	return NULL;
}
	1551
/** Remove trailing occurrences of a byte from a string, in place.
 *
 * @param str Null-terminated string to trim.
 * @param c   Byte to remove from the end of @a str.
 */
static void _str_rtrim(char *str, char c)
{
	/*
	 * One past the last byte that must be kept. Starting at the string's
	 * first byte makes an empty or all-c string come out empty, and the
	 * terminator is never written beyond the existing one.
	 */
	char *end = str;

	while (*str) {
		if (*str != c)
			end = str + 1;

		str++;
	}

	/* Truncate string. */
	*end = 0;
}
	1566
[1737bfb]	1567	/** Removes specified trailing characters from a string.
	1568	*
	1569	* @param str String to remove from.
	1570	* @param ch Character to remove.
	1571	*/
[28a5ebd]	1572	void str_rtrim(char *str, char32_t ch)
[1737bfb]	1573	{
[28c39f3]	1574	/* Fast path for the ASCII case. */
	1575	if (ascii_check(ch)) {
	1576	_str_rtrim(str, ch);
	1577	return;
	1578	}
	1579
[1737bfb]	1580	size_t off = 0;
	1581	size_t pos = 0;
[28a5ebd]	1582	char32_t c;
[1737bfb]	1583	bool update_last_chunk = true;
	1584	char *last_chunk = NULL;
	1585
	1586	while ((c = str_decode(str, &off, STR_NO_LIMIT))) {
	1587	if (c != ch) {
	1588	update_last_chunk = true;
	1589	last_chunk = NULL;
	1590	} else if (update_last_chunk) {
	1591	update_last_chunk = false;
	1592	last_chunk = (str + pos);
	1593	}
	1594	pos = off;
	1595	}
	1596
	1597	if (last_chunk)
	1598	*last_chunk = '\0';
	1599	}
	1600
/** Remove leading occurrences of a byte from a string, in place.
 *
 * @param str Null-terminated string to trim.
 * @param c   Byte to remove from the start of @a str.
 */
static void _str_ltrim(char *str, char c)
{
	char *p = str;

	/*
	 * Never step over the terminator. Without the explicit test a zero
	 * c would match it and the scan would leave the string.
	 */
	while (*p != 0 && *p == c)
		p++;

	/* Shift the remainder to the front. The regions may overlap. */
	if (str != p)
		_str_cpy(str, p);
}
	1611
[1737bfb]	1612	/** Removes specified leading characters from a string.
	1613	*
	1614	* @param str String to remove from.
	1615	* @param ch Character to remove.
	1616	*/
[28a5ebd]	1617	void str_ltrim(char *str, char32_t ch)
[1737bfb]	1618	{
[28c39f3]	1619	/* Fast path for the ASCII case. */
	1620	if (ascii_check(ch)) {
	1621	_str_ltrim(str, ch);
	1622	return;
	1623	}
	1624
[28a5ebd]	1625	char32_t acc;
[1737bfb]	1626	size_t off = 0;
	1627	size_t pos = 0;
	1628	size_t str_sz = str_size(str);
	1629
	1630	while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
	1631	if (acc != ch)
	1632	break;
	1633	else
	1634	pos = off;
	1635	}
	1636
	1637	if (pos > 0) {
	1638	memmove(str, &str[pos], str_sz - pos);
	1639	pos = str_sz - pos;
[a18a8b9]	1640	str[pos] = '\0';
[1737bfb]	1641	}
	1642	}
	1643
/** Find the last occurrence of a byte in a string.
 *
 * The terminator is never examined, so searching for a zero byte
 * yields NULL.
 *
 * @param str Null-terminated string to search.
 * @param c   Byte to look for.
 *
 * @return Pointer to the last matching byte, or NULL if there is none.
 */
static char *_str_rchr(const char *str, char c)
{
	const char *last = NULL;

	while (*str) {
		if (*str == c)
			last = str;

		str++;
	}

	return (char *) last;
}
	1657
[7afb4a5]	1658	/** Find last occurence of character in string.
	1659	*
	1660	* @param str String to search.
	1661	* @param ch Character to look for.
	1662	*
	1663	* @return Pointer to character in @a str or NULL if not found.
	1664	*/
[28a5ebd]	1665	char str_rchr(const char str, char32_t ch)
[7afb4a5]	1666	{
[28c39f3]	1667	if (ascii_check(ch))
	1668	return _str_rchr(str, ch);
	1669
[28a5ebd]	1670	char32_t acc;
[7afb4a5]	1671	size_t off = 0;
[f2d2c7ba]	1672	size_t last = 0;
[d4a3ee5]	1673	const char *res = NULL;
[a35b458]	1674
[7afb4a5]	1675	while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
	1676	if (acc == ch)
[f2d2c7ba]	1677	res = (str + last);
	1678	last = off;
[7afb4a5]	1679	}
[a35b458]	1680
[dd2cfa7]	1681	return (char *) res;
[7afb4a5]	1682	}
	1683
/** Insert a wide character into a wide string.
 *
 * Insert a wide character into a wide string at position
 * @a pos. The characters from that position on, including the
 * terminator, are shifted up by one.
 *
 * NOTE(review): only @a pos is checked against @a max_pos, not the
 * length of the resulting string -- confirm that callers guarantee
 * room for the shifted tail.
 *
 * @param str     String to insert to.
 * @param ch      Character to insert.
 * @param pos     Character index where to insert.
 * @param max_pos Characters in the buffer.
 *
 * @return True if the insertion was successful, false if the position
 *         is out of bounds.
 *
 */
bool wstr_linsert(char32_t *str, char32_t ch, size_t pos, size_t max_pos)
{
	size_t len = wstr_length(str);

	if (pos > len || pos >= max_pos)
		return false;

	/* Shift the tail up by one, starting from the terminator. */
	for (size_t i = len + 1; i > pos; i--)
		str[i] = str[i - 1];

	str[pos] = ch;
	return true;
}
	1713
	1714	/** Remove a wide character from a wide string.
	1715	*
	1716	* Remove a wide character from a wide string at position
	1717	* @a pos. The characters after the position are shifted.
	1718	*
	1719	* @param str String to remove from.
	1720	* @param pos Character index to remove.
	1721	*
	1722	* @return True if the removal was sucessful, false if the position
	1723	* is out of bounds.
	1724	*
	1725	*/
[28a5ebd]	1726	bool wstr_remove(char32_t *str, size_t pos)
[f2b8cdc]	1727	{
[d4a3ee5]	1728	size_t len = wstr_length(str);
[a35b458]	1729
[f2b8cdc]	1730	if (pos >= len)
	1731	return false;
[a35b458]	1732
[d4a3ee5]	1733	size_t i;
[f2b8cdc]	1734	for (i = pos + 1; i <= len; i++)
	1735	str[i - 1] = str[i];
[a35b458]	1736
[f2b8cdc]	1737	return true;
	1738	}
	1739
/** Duplicate string.
 *
 * Allocate a new string with malloc() and copy the bytes of the source
 * string into it.
 *
 * The duplicate is passed through _sanitize_string(), so it can differ
 * from the source string on the byte level.
 *
 * @param src Source string.
 *
 * @return Duplicate string, or NULL if the allocation fails.
 *
 */
char *str_dup(const char *src)
{
	/* Size in bytes including the terminator. */
	size_t size = _str_size(src) + 1;
	char *dest = malloc(size);
	if (!dest)
		return NULL;

	memcpy(dest, src, size);
	_sanitize_string(dest, size);
	return dest;
}
	1767
/** Duplicate string with size limit.
 *
 * Allocate a new string with malloc() and copy up to @a n bytes from
 * the source string into it. No more than @a n + 1 bytes are allocated,
 * and fewer if the source string is shorter than that.
 *
 * The duplicate is passed through _sanitize_string(), so it can differ
 * from the source string on the byte level. It is always null-terminated.
 *
 * @param src Source string.
 * @param n   Maximum number of bytes to duplicate.
 *
 * @return Duplicate string, or NULL if the allocation fails.
 *
 */
char *str_ndup(const char *src, size_t n)
{
	size_t size = _str_nsize(src, n);

	/* One extra byte for the terminator. */
	char *dest = malloc(size + 1);
	if (!dest)
		return NULL;

	memcpy(dest, src, size);
	_sanitize_string(dest, size);
	dest[size] = 0;
	return dest;
}
	1801
/** Split string by delimiters.
 *
 * The delimiter that ends the token is overwritten with a null
 * terminator, so @a s is modified.
 *
 * @param s     String to be tokenized. If NULL, NULL is returned.
 * @param delim String with the delimiters.
 * @param next  Variable which will receive the pointer to the
 *              continuation of the string following the first
 *              occurrence of any of the delimiter characters.
 *              May be NULL.
 *
 * @return Pointer to the prefix of @a s before the first
 *         delimiter character. NULL if no such prefix
 *         exists.
 */
char *str_tok(char *s, const char *delim, char **next)
{
	char *start, *end;

	if (!s)
		return NULL;

	size_t len = str_size(s);
	size_t cur;
	size_t tmp;
	char32_t ch;

	/* Skip over leading delimiters. */
	tmp = 0;
	cur = 0;
	while ((ch = str_decode(s, &tmp, len)) && str_chr(delim, ch))
		cur = tmp;
	start = &s[cur];

	/* Skip over token characters. */
	tmp = cur;
	while ((ch = str_decode(s, &tmp, len)) && !str_chr(delim, ch))
		cur = tmp;
	end = &s[cur];

	/*
	 * If the scan stopped on a delimiter, continue just past it.
	 * Otherwise the string ended and the continuation is empty.
	 */
	if (next)
		*next = (ch ? &s[tmp] : &s[cur]);

	if (start == end)
		return NULL;	/* No more tokens. */

	/* Overwrite delimiter with null terminator. */
	*end = '\0';
	return start;
}
	1848
/** Scale a value to a decimal order of magnitude.
 *
 * The value is divided by the largest power of 1000 that still leaves
 * a quotient above 1000, and the matching SI prefix is reported.
 *
 * @param val    Value to scale.
 * @param rv     Receives the scaled value.
 * @param suffix Receives the SI prefix letter of the divisor used
 *               ('k', 'M', 'G', 'T', 'P' or 'E'), or a space if the
 *               value is reported unscaled.
 */
void order_suffix(const uint64_t val, uint64_t *rv, char *suffix)
{
	if (val > UINT64_C(10000000000000000000)) {
		/* Divisor 10^18 is exa. */
		*rv = val / UINT64_C(1000000000000000000);
		*suffix = 'E';
	} else if (val > UINT64_C(1000000000000000000)) {
		/* Divisor 10^15 is peta. */
		*rv = val / UINT64_C(1000000000000000);
		*suffix = 'P';
	} else if (val > UINT64_C(1000000000000000)) {
		*rv = val / UINT64_C(1000000000000);
		*suffix = 'T';
	} else if (val > UINT64_C(1000000000000)) {
		*rv = val / UINT64_C(1000000000);
		*suffix = 'G';
	} else if (val > UINT64_C(1000000000)) {
		*rv = val / UINT64_C(1000000);
		*suffix = 'M';
	} else if (val > UINT64_C(1000000)) {
		*rv = val / UINT64_C(1000);
		*suffix = 'k';
	} else {
		*rv = val;
		*suffix = ' ';
	}
}
	1874
/** Scale a byte count to a binary order of magnitude.
 *
 * The value is divided by the largest power of 1024 that still leaves
 * a quotient above 1024, and the matching IEC unit is reported.
 *
 * @param val    Value to scale.
 * @param rv     Receives the scaled value.
 * @param suffix Receives a pointer to a string literal naming the unit of
 *               the divisor used ("KiB", "MiB", "GiB", "TiB" or "PiB"),
 *               or the plain byte unit if the value is reported unscaled.
 * @param fixed  Selects the space-padded form of the plain byte unit.
 */
void bin_order_suffix(const uint64_t val, uint64_t *rv, const char **suffix,
    bool fixed)
{
	if (val > UINT64_C(1152921504606846976)) {
		/* Divisor 2^50 is pebi. */
		*rv = val / UINT64_C(1125899906842624);
		*suffix = "PiB";
	} else if (val > UINT64_C(1125899906842624)) {
		*rv = val / UINT64_C(1099511627776);
		*suffix = "TiB";
	} else if (val > UINT64_C(1099511627776)) {
		*rv = val / UINT64_C(1073741824);
		*suffix = "GiB";
	} else if (val > UINT64_C(1073741824)) {
		*rv = val / UINT64_C(1048576);
		*suffix = "MiB";
	} else if (val > UINT64_C(1048576)) {
		*rv = val / UINT64_C(1024);
		*suffix = "KiB";
	} else {
		*rv = val;
		/*
		 * NOTE(review): the padded form is shorter than the
		 * three-character units above -- confirm the intended width.
		 */
		if (fixed)
			*suffix = "B ";
		else
			*suffix = "B";
	}
}
	1901
[a46da63]	1902	/** @}
[b2951e2]	1903	*/

Note: See TracBrowser for help on using the repository browser.

Download in other formats: