Context Navigation

source: mainline/common/str.c@ f94a11f

Visit:

Last change on this file since f94a11f was 0600976, checked in by Jiří Zárevúcky <zarevucky.jiri@…>, 3 months ago
Reject invalid non-shortest UTF-8 forms and fix some other issues in str
Property mode set to `100644`
File size: 40.9 KB

Rev	Line
[936351c1]	1	/*
[d066259]	2	* Copyright (c) 2001-2004 Jakub Jermar
[df4ed85]	3	* Copyright (c) 2005 Martin Decky
[576845ec]	4	* Copyright (c) 2008 Jiri Svoboda
[22cf42d9]	5	* Copyright (c) 2011 Martin Sucha
[c4bbca8]	6	* Copyright (c) 2011 Oleg Romanenko
[936351c1]	7	* All rights reserved.
	8	*
	9	* Redistribution and use in source and binary forms, with or without
	10	* modification, are permitted provided that the following conditions
	11	* are met:
	12	*
	13	* - Redistributions of source code must retain the above copyright
	14	* notice, this list of conditions and the following disclaimer.
	15	* - Redistributions in binary form must reproduce the above copyright
	16	* notice, this list of conditions and the following disclaimer in the
	17	* documentation and/or other materials provided with the distribution.
	18	* - The name of the author may not be used to endorse or promote products
	19	* derived from this software without specific prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
	22	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
	23	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
	24	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
	25	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
	26	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	27	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	28	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	29	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	30	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	31	*/
	32
[a46da63]	33	/** @addtogroup libc
[b2951e2]	34	* @{
	35	*/
[d066259]	36
	37	/**
	38	* @file
	39	* @brief String functions.
	40	*
	41	* Strings and characters use the Universal Character Set (UCS). The standard
	42	* strings, called just strings are encoded in UTF-8. Wide strings (encoded
	43	* in UTF-32) are supported to a limited degree. A single character is
[28a5ebd]	44	* represented as char32_t.@n
[d066259]	45	*
	46	* Overview of the terminology:@n
	47	*
	48	* Term Meaning
	49	* -------------------- ----------------------------------------------------
	50	* byte 8 bits stored in uint8_t (unsigned 8 bit integer)
	51	*
[28a5ebd]	52	* character UTF-32 encoded Unicode character, stored in char32_t
	53	* (unsigned 32 bit integer), code points 0 .. 1114111
[d066259]	54	* are valid
	55	*
[28c39f3]	56	* Note that Unicode characters do not match
	57	* one-to-one with displayed characters or glyphs on
	58	* screen. For that level of precision, look up
	59	* Grapheme Clusters.
	60	*
[d066259]	61	* ASCII character 7 bit encoded ASCII character, stored in char
	62	* (usually signed 8 bit integer), code points 0 .. 127
	63	* are valid
	64	*
	65	* string UTF-8 encoded NULL-terminated Unicode string, char *
	66	*
	67	* wide string UTF-32 encoded NULL-terminated Unicode string,
[28a5ebd]	68	* char32_t *
[d066259]	69	*
	70	* [wide] string size number of BYTES in a [wide] string (excluding
	71	* the NULL-terminator), size_t
	72	*
	73	* [wide] string length number of CHARACTERS in a [wide] string (excluding
	74	* the NULL-terminator), size_t
	75	*
	76	* [wide] string width number of display cells on a monospace display taken
	77	* by a [wide] string, size_t
	78	*
[28c39f3]	79	* This is virtually impossible to determine exactly for
	80	* all strings without knowing specifics of the display
	81	* device, due to various factors affecting text output.
	82	* If you have the option to query the terminal for
	83	* position change caused by outputting the string,
	84	* it is preferable to determine width that way.
	85	*
[d066259]	86	*
	87	* Overview of string metrics:@n
	88	*
	89	* Metric Abbrev. Type Meaning
	90	* ------ ------ ------ -------------------------------------------------
	91	* size n size_t number of BYTES in a string (excluding the
	92	* NULL-terminator)
	93	*
	94	* length l size_t number of CHARACTERS in a string (excluding the
	95	* null terminator)
	96	*
	97	* width w size_t number of display cells on a monospace display
	98	* taken by a string
	99	*
	100	*
	101	* Function naming prefixes:@n
	102	*
	103	* chr_ operate on characters
	104	* ascii_ operate on ASCII characters
	105	* str_ operate on strings
	106	* wstr_ operate on wide strings
	107	*
	108	* [w]str_[n\|l\|w] operate on a prefix limited by size, length
	109	* or width
	110	*
	111	*
	112	* A specific character inside a [wide] string can be referred to by:@n
	113	*
[28a5ebd]	114	* pointer (char *, char32_t *)
[d066259]	115	* byte offset (size_t)
	116	* character index (size_t)
	117	*
[b2951e2]	118	*/
	119
[19f857a]	120	#include <str.h>
[d066259]	121
[28c39f3]	122	#include <align.h>
[38d150e]	123	#include <assert.h>
[e64c4b2]	124	#include <ctype.h>
[171f9a1]	125	#include <errno.h>
[28c39f3]	126	#include <macros.h>
	127	#include <mem.h>
[d066259]	128	#include <stdbool.h>
	129	#include <stddef.h>
	130	#include <stdint.h>
	131	#include <stdlib.h>
[28c39f3]	132	#include <uchar.h>
[171f9a1]	133
	134	/** Byte mask consisting of lowest @n bits (out of 8) */
	135	#define LO_MASK_8(n) ((uint8_t) ((1 << (n)) - 1))
	136
	137	/** Byte mask consisting of lowest @n bits (out of 32) */
	138	#define LO_MASK_32(n) ((uint32_t) ((1 << (n)) - 1))
	139
	140	/** Byte mask consisting of highest @n bits (out of 8) */
	141	#define HI_MASK_8(n) (~LO_MASK_8(8 - (n)))
	142
	143	/** Number of data bits in a UTF-8 continuation byte */
	144	#define CONT_BITS 6
	145
/** Tell whether byte @a b is a single-byte (7-bit ASCII) UTF-8 code unit. */
static inline bool _is_ascii(uint8_t b)
{
	/* ASCII bytes are exactly those with the top bit clear. */
	return (b & 0x80) == 0;
}
	150
	151	static inline bool _is_continuation_byte(uint8_t b)
	152	{
	153	return (b & 0xc0) == 0x80;
	154	}
	155
	156	static inline int _char_continuation_bytes(char32_t c)
	157	{
[6120b7b]	158	if ((c & ~LO_MASK_32(7)) == 0)
	159	return 0;
	160
[28c39f3]	161	if ((c & ~LO_MASK_32(11)) == 0)
	162	return 1;
	163
	164	if ((c & ~LO_MASK_32(16)) == 0)
	165	return 2;
	166
	167	if ((c & ~LO_MASK_32(21)) == 0)
	168	return 3;
	169
	170	/* Codes longer than 21 bits are not supported */
	171	return -1;
	172	}
	173
	174	static inline int _continuation_bytes(uint8_t b)
	175	{
	176	/* 0xxxxxxx */
	177	if (_is_ascii(b))
	178	return 0;
	179
	180	/* 110xxxxx 10xxxxxx */
	181	if ((b & 0xe0) == 0xc0)
	182	return 1;
	183
	184	/* 1110xxxx 10xxxxxx 10xxxxxx */
	185	if ((b & 0xf0) == 0xe0)
	186	return 2;
	187
	188	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	189	if ((b & 0xf8) == 0xf0)
	190	return 3;
	191
	192	return -1;
	193	}
	194
[171f9a1]	195	/** Decode a single character from a string.
	196	*
	197	* Decode a single character from a string of size @a size. Decoding starts
	198	* at @a offset and this offset is moved to the beginning of the next
	199	* character. In case of decoding error, offset generally advances at least
	200	* by one. However, offset is never moved beyond size.
	201	*
	202	* @param str String (not necessarily NULL-terminated).
	203	* @param offset Byte offset in string where to start decoding.
	204	* @param size Size of the string (in bytes).
	205	*
	206	* @return Value of decoded character, U_SPECIAL on decoding error or
	207	* NULL if attempt to decode beyond @a size.
	208	*
	209	*/
[28a5ebd]	210	char32_t str_decode(const char str, size_t offset, size_t size)
[171f9a1]	211	{
[0600976]	212	if (*offset >= size)
[171f9a1]	213	return 0;
[a35b458]	214
[171f9a1]	215	/* First byte read from string */
	216	uint8_t b0 = (uint8_t) str[(*offset)++];
[a35b458]	217
[28c39f3]	218	/* Fast exit for the most common case. */
	219	if (_is_ascii(b0))
	220	return b0;
[a35b458]	221
[28c39f3]	222	/* 10xxxxxx -- unexpected continuation byte */
	223	if (_is_continuation_byte(b0))
[171f9a1]	224	return U_SPECIAL;
[28c39f3]	225
	226	/* Determine code length */
	227
[6120b7b]	228	int cbytes = _continuation_bytes(b0);
	229	int b0_bits = 6 - cbytes; /* Data bits in first byte */
[a35b458]	230
[6d0e133]	231	if (cbytes < 0 \|\| *offset + cbytes > size)
[171f9a1]	232	return U_SPECIAL;
[a35b458]	233
[28a5ebd]	234	char32_t ch = b0 & LO_MASK_8(b0_bits);
[a35b458]	235
[171f9a1]	236	/* Decode continuation bytes */
[0600976]	237	for (int i = 0; i < cbytes; i++) {
[6d0e133]	238	uint8_t b = (uint8_t) str[*offset];
[a35b458]	239
[28c39f3]	240	if (!_is_continuation_byte(b))
[171f9a1]	241	return U_SPECIAL;
[a35b458]	242
[6d0e133]	243	(*offset)++;
	244
[171f9a1]	245	/* Shift data bits to ch */
[28a5ebd]	246	ch = (ch << CONT_BITS) \| (char32_t) (b & LO_MASK_8(CONT_BITS));
[171f9a1]	247	}
[a35b458]	248
[0600976]	249	/*
	250	* Reject non-shortest form encodings.
	251	* See https://www.unicode.org/versions/corrigendum1.html
	252	*/
	253	if (cbytes != _char_continuation_bytes(ch))
	254	return U_SPECIAL;
	255
[171f9a1]	256	return ch;
	257	}
	258
[568693b]	259	/** Decode a single character from a string to the left.
	260	*
	261	* Decode a single character from a string of size @a size. Decoding starts
	262	* at @a offset and this offset is moved to the beginning of the previous
	263	* character. In case of decoding error, offset generally decreases at least
	264	* by one. However, offset is never moved before 0.
	265	*
	266	* @param str String (not necessarily NULL-terminated).
	267	* @param offset Byte offset in string where to start decoding.
	268	* @param size Size of the string (in bytes).
	269	*
	270	* @return Value of decoded character, U_SPECIAL on decoding error or
	271	* NULL if attempt to decode beyond @a start of str.
	272	*
	273	*/
[28a5ebd]	274	char32_t str_decode_reverse(const char str, size_t offset, size_t size)
[568693b]	275	{
	276	if (*offset == 0)
	277	return 0;
[a35b458]	278
[28c39f3]	279	int cbytes = 0;
[568693b]	280	/* Continue while continuation bytes found */
[28c39f3]	281	while (*offset > 0 && cbytes < 4) {
[568693b]	282	uint8_t b = (uint8_t) str[--(*offset)];
[a35b458]	283
[28c39f3]	284	if (_is_continuation_byte(b)) {
	285	cbytes++;
	286	continue;
[568693b]	287	}
[28c39f3]	288
	289	/* Invalid byte. */
	290	if (cbytes != _continuation_bytes(b))
	291	return U_SPECIAL;
	292
	293	/* Start byte */
	294	size_t start_offset = *offset;
	295	return str_decode(str, &start_offset, size);
[568693b]	296	}
[28c39f3]	297
[568693b]	298	/* Too many continuation bytes */
	299	return U_SPECIAL;
	300	}
	301
[171f9a1]	302	/** Encode a single character to string representation.
	303	*
	304	* Encode a single character to string representation (i.e. UTF-8) and store
	305	* it into a buffer at @a offset. Encoding starts at @a offset and this offset
	306	* is moved to the position where the next character can be written to.
	307	*
	308	* @param ch Input character.
	309	* @param str Output buffer.
	310	* @param offset Byte offset where to start writing.
	311	* @param size Size of the output buffer (in bytes).
	312	*
	313	* @return EOK if the character was encoded successfully, EOVERFLOW if there
[d4a3ee5]	314	* was not enough space in the output buffer or EINVAL if the character
	315	* code was invalid.
[171f9a1]	316	*/
[28c39f3]	317	errno_t chr_encode(char32_t ch, char str, size_t offset, size_t size)
[171f9a1]	318	{
	319	if (*offset >= size)
	320	return EOVERFLOW;
[a35b458]	321
[28c39f3]	322	/* Fast exit for the most common case. */
	323	if (ch < 0x80) {
	324	str[(*offset)++] = (char) ch;
	325	return EOK;
	326	}
	327
	328	/* Codes longer than 21 bits are not supported */
[171f9a1]	329	if (!chr_check(ch))
	330	return EINVAL;
[a35b458]	331
[171f9a1]	332	/* Determine how many continuation bytes are needed */
[a35b458]	333
[28c39f3]	334	unsigned int cbytes = _char_continuation_bytes(ch);
	335	unsigned int b0_bits = 6 - cbytes; /* Data bits in first byte */
[a35b458]	336
[171f9a1]	337	/* Check for available space in buffer */
	338	if (*offset + cbytes >= size)
	339	return EOVERFLOW;
[a35b458]	340
[171f9a1]	341	/* Encode continuation bytes */
	342	unsigned int i;
	343	for (i = cbytes; i > 0; i--) {
[28c39f3]	344	str[*offset + i] = 0x80 \| (ch & LO_MASK_32(CONT_BITS));
	345	ch >>= CONT_BITS;
[171f9a1]	346	}
[a35b458]	347
[171f9a1]	348	/* Encode first byte */
[28c39f3]	349	str[*offset] = (ch & LO_MASK_32(b0_bits)) \| HI_MASK_8(8 - b0_bits - 1);
[a35b458]	350
[171f9a1]	351	/* Advance offset */
	352	*offset += cbytes + 1;
[a35b458]	353
[171f9a1]	354	return EOK;
	355	}
	356
[28c39f3]	357	/* Convert in place any bytes that don't form a valid character into U_SPECIAL. */
[0600976]	358	static void _sanitize_string(char *str, size_t n)
[28c39f3]	359	{
[0600976]	360	uint8_t b = (uint8_t ) str;
	361
	362	for (; *b && n > 0; b++, n--) {
	363	int cont = _continuation_bytes(b[0]);
	364	if (__builtin_expect(cont, 0) == 0)
[28c39f3]	365	continue;
	366
	367	if (cont < 0 \|\| n <= (size_t) cont) {
[0600976]	368	b[0] = U_SPECIAL;
[28c39f3]	369	continue;
	370	}
	371
[0600976]	372	/* Check continuation bytes. */
[28c39f3]	373	for (int i = 1; i <= cont; i++) {
[0600976]	374	if (!_is_continuation_byte(b[i])) {
	375	b[0] = U_SPECIAL;
[28c39f3]	376	continue;
	377	}
	378	}
[0600976]	379
	380	/*
	381	* Check for non-shortest form encoding.
	382	* See https://www.unicode.org/versions/corrigendum1.html
	383	*/
	384
	385	switch (cont) {
	386	case 1:
	387	/* 0b110!!!!x 0b10xxxxxx */
	388	if (!(b[0] & 0b00011110))
	389	b[0] = U_SPECIAL;
	390
	391	continue;
	392	case 2:
	393	/* 0b1110!!!! 0b10!xxxxx 0b10xxxxxx */
	394	if (!(b[0] & 0b00001111) && !(b[1] & 0b00100000))
	395	b[0] = U_SPECIAL;
	396
	397	continue;
	398	case 3:
	399	/* 0b11110!!! 0b10!!xxxx 0b10xxxxxx 0b10xxxxxx */
	400	if (!(b[0] & 0b00000111) && !(b[1] & 0b00110000))
	401	b[0] = U_SPECIAL;
	402
	403	continue;
	404	}
[28c39f3]	405	}
	406	}
	407
	408	static size_t _str_size(const char *str)
	409	{
	410	size_t size = 0;
	411
	412	while (*str++ != 0)
	413	size++;
	414
	415	return size;
	416	}
	417
/** Get size of string.
 *
 * Report how many bytes the string @a str occupies, the NULL-terminator
 * not being counted.
 *
 * @param str String to consider.
 *
 * @return Number of bytes used by the string
 *
 */
size_t str_size(const char *str)
{
	size_t nbytes = _str_size(str);

	return nbytes;
}
	432
	433	/** Get size of wide string.
	434	*
	435	* Get the number of bytes which are used by the wide string @a str (excluding the
	436	* NULL-terminator).
	437	*
	438	* @param str Wide string to consider.
	439	*
	440	* @return Number of bytes used by the wide string
	441	*
	442	*/
[28a5ebd]	443	size_t wstr_size(const char32_t *str)
[f2b8cdc]	444	{
[28a5ebd]	445	return (wstr_length(str) * sizeof(char32_t));
[f2b8cdc]	446	}
	447
	448	/** Get size of string with length limit.
	449	*
	450	* Get the number of bytes which are used by up to @a max_len first
	451	* characters in the string @a str. If @a max_len is greater than
	452	* the length of @a str, the entire string is measured (excluding the
	453	* NULL-terminator).
	454	*
	455	* @param str String to consider.
	456	* @param max_len Maximum number of characters to measure.
	457	*
	458	* @return Number of bytes used by the characters.
	459	*
	460	*/
[d4a3ee5]	461	size_t str_lsize(const char *str, size_t max_len)
[f2b8cdc]	462	{
[d4a3ee5]	463	size_t len = 0;
[f2b8cdc]	464	size_t offset = 0;
[a35b458]	465
[f2b8cdc]	466	while (len < max_len) {
	467	if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
	468	break;
[a35b458]	469
[f2b8cdc]	470	len++;
	471	}
[a35b458]	472
[f2b8cdc]	473	return offset;
	474	}
	475
/**
 * Count the bytes of @a str up to the NUL terminator, examining at most
 * @a max_size bytes.
 *
 * The limit is tested before each byte is read, so a buffer of exactly
 * @a max_size bytes without a terminator is never read out of bounds.
 *
 * @param str      String to measure (need not be NUL-terminated).
 * @param max_size Maximum number of bytes to examine.
 *
 * @return Number of bytes before the terminator, at most @a max_size.
 */
static size_t _str_nsize(const char *str, size_t max_size)
{
	size_t size = 0;

	while (size < max_size && str[size] != 0)
		size++;

	return size;
}
	485
/** Get size of string with size limit.
 *
 * Report how many bytes the string @a str occupies (the NULL-terminator
 * not being counted), capped at @a max_size bytes.
 *
 * @param str      String to consider.
 * @param max_size Maximum number of bytes to measure.
 *
 * @return Number of bytes used by the string
 *
 */
size_t str_nsize(const char *str, size_t max_size)
{
	size_t nbytes = _str_nsize(str, max_size);

	return nbytes;
}
	501
	502	/** Get size of wide string with size limit.
	503	*
	504	* Get the number of bytes which are used by the wide string @a str
	505	* (excluding the NULL-terminator), but no more than @max_size bytes.
	506	*
	507	* @param str Wide string to consider.
	508	* @param max_size Maximum number of bytes to measure.
	509	*
	510	* @return Number of bytes used by the wide string
	511	*
	512	*/
[28a5ebd]	513	size_t wstr_nsize(const char32_t *str, size_t max_size)
[560d79f]	514	{
[28a5ebd]	515	return (wstr_nlength(str, max_size) * sizeof(char32_t));
[560d79f]	516	}
	517
/** Get size of wide string with length limit.
 *
 * Get the number of bytes which are used by up to @a max_len first
 * wide characters in the wide string @a str. If @a max_len is greater than
 * the length of @a str, the entire wide string is measured (excluding the
 * NULL-terminator).
 *
 * @param str     Wide string to consider.
 * @param max_len Maximum number of wide characters to measure.
 *
 * @return Number of bytes used by the wide characters.
 *
 */
size_t wstr_lsize(const char32_t *str, size_t max_len)
{
	size_t len = 0;

	/*
	 * Count characters directly. Converting max_len to a byte limit
	 * first (max_len * sizeof(char32_t)) could wrap around for large
	 * limits and truncate the result.
	 */
	while (len < max_len && str[len] != 0)
		len++;

	return len * sizeof(char32_t);
}
	535
	536	/** Get number of characters in a string.
	537	*
	538	* @param str NULL-terminated string.
	539	*
	540	* @return Number of characters in string.
	541	*
	542	*/
[d4a3ee5]	543	size_t str_length(const char *str)
[f2b8cdc]	544	{
[d4a3ee5]	545	size_t len = 0;
[f2b8cdc]	546	size_t offset = 0;
[a35b458]	547
[f2b8cdc]	548	while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
	549	len++;
[a35b458]	550
[f2b8cdc]	551	return len;
	552	}
	553
	554	/** Get number of characters in a wide string.
	555	*
	556	* @param str NULL-terminated wide string.
	557	*
	558	* @return Number of characters in @a str.
	559	*
	560	*/
[28a5ebd]	561	size_t wstr_length(const char32_t *wstr)
[f2b8cdc]	562	{
[d4a3ee5]	563	size_t len = 0;
[a35b458]	564
[f2b8cdc]	565	while (*wstr++ != 0)
	566	len++;
[a35b458]	567
[f2b8cdc]	568	return len;
	569	}
	570
	571	/** Get number of characters in a string with size limit.
	572	*
	573	* @param str NULL-terminated string.
	574	* @param size Maximum number of bytes to consider.
	575	*
	576	* @return Number of characters in string.
	577	*
	578	*/
[d4a3ee5]	579	size_t str_nlength(const char *str, size_t size)
[f2b8cdc]	580	{
[d4a3ee5]	581	size_t len = 0;
[f2b8cdc]	582	size_t offset = 0;
[a35b458]	583
[f2b8cdc]	584	while (str_decode(str, &offset, size) != 0)
	585	len++;
[a35b458]	586
[f2b8cdc]	587	return len;
	588	}
	589
	590	/** Get number of characters in a string with size limit.
	591	*
	592	* @param str NULL-terminated string.
	593	* @param size Maximum number of bytes to consider.
	594	*
	595	* @return Number of characters in string.
	596	*
	597	*/
[28a5ebd]	598	size_t wstr_nlength(const char32_t *str, size_t size)
[f2b8cdc]	599	{
[d4a3ee5]	600	size_t len = 0;
[28a5ebd]	601	size_t limit = ALIGN_DOWN(size, sizeof(char32_t));
[d4a3ee5]	602	size_t offset = 0;
[a35b458]	603
[f2b8cdc]	604	while ((offset < limit) && (*str++ != 0)) {
	605	len++;
[28a5ebd]	606	offset += sizeof(char32_t);
[f2b8cdc]	607	}
[a35b458]	608
[f2b8cdc]	609	return len;
	610	}
	611
/** Get character display width on a character cell display.
 *
 * Every character is currently reported as occupying a single cell;
 * the character code is not inspected.
 *
 * @param ch Character
 * @return Width of character in cells.
 */
size_t chr_width(char32_t ch)
{
	(void) ch;

	const size_t cells = 1;
	return cells;
}
	621
	622	/** Get string display width on a character cell display.
	623	*
	624	* @param str String
	625	* @return Width of string in cells.
	626	*/
	627	size_t str_width(const char *str)
	628	{
	629	size_t width = 0;
	630	size_t offset = 0;
[28a5ebd]	631	char32_t ch;
[a35b458]	632
[be2a38ad]	633	while ((ch = str_decode(str, &offset, STR_NO_LIMIT)) != 0)
	634	width += chr_width(ch);
[a35b458]	635
[be2a38ad]	636	return width;
	637	}
	638
/** Check whether character is plain ASCII.
 *
 * @param ch Character to check.
 *
 * @return True if character is plain ASCII (code 0 .. 127).
 *
 */
bool ascii_check(char32_t ch)
{
	return ch < 0x80;
}
	651
/** Check whether character is valid
 *
 * @param ch Character to check.
 *
 * @return True if character is a valid Unicode code point
 *         (0 .. 1114111, i.e. up to U+10FFFF).
 *
 */
bool chr_check(char32_t ch)
{
	return ch < 0x110000;
}
[936351c1]	664
/** Compare two NULL terminated strings.
 *
 * Do a char-by-char comparison of two NULL-terminated strings.
 * The strings are considered equal iff their length is equal
 * and both strings consist of the same sequence of characters.
 *
 * A string S1 is less than another string S2 if it has a character with
 * lower value at the first character position where the strings differ.
 * If the strings differ in length, the shorter one is treated as if
 * padded by characters with a value of zero.
 *
 * @param s1 First string to compare.
 * @param s2 Second string to compare.
 *
 * @return 0 if the strings are equal, -1 if the first is less than the second,
 *         1 if the second is less than the first.
 *
 */
int str_cmp(const char *s1, const char *s2)
{
	/*
	 * UTF-8 has the nice property that lexicographic ordering on bytes is
	 * the same as the lexicographic ordering of the character sequences.
	 *
	 * That only holds when bytes are compared as unsigned values; with a
	 * signed char every non-ASCII byte would sort below ASCII.
	 */
	const unsigned char *a = (const unsigned char *) s1;
	const unsigned char *b = (const unsigned char *) s2;

	while (*a == *b && *a != 0) {
		a++;
		b++;
	}

	if (*a == *b)
		return 0;

	return (*a < *b) ? -1 : 1;
}
	699
	700	/** Compare two NULL terminated strings with length limit.
	701	*
	702	* Do a char-by-char comparison of two NULL-terminated strings.
[4efeab5]	703	* The strings are considered equal iff
	704	* min(str_length(s1), max_len) == min(str_length(s2), max_len)
	705	* and both strings consist of the same sequence of characters,
	706	* up to max_len characters.
	707	*
[1772e6d]	708	* A string S1 is less than another string S2 if it has a character with
	709	* lower value at the first character position where the strings differ.
	710	* If the strings differ in length, the shorter one is treated as if
	711	* padded by characters with a value of zero. Only the first max_len
	712	* characters are considered.
[f2b8cdc]	713	*
	714	* @param s1 First string to compare.
	715	* @param s2 Second string to compare.
	716	* @param max_len Maximum number of characters to consider.
	717	*
[1772e6d]	718	* @return 0 if the strings are equal, -1 if the first is less than the second,
	719	* 1 if the second is less than the first.
[f2b8cdc]	720	*
	721	*/
[d4a3ee5]	722	int str_lcmp(const char s1, const char s2, size_t max_len)
[f2b8cdc]	723	{
[28a5ebd]	724	char32_t c1 = 0;
	725	char32_t c2 = 0;
[8227d63]	726
[f2b8cdc]	727	size_t off1 = 0;
	728	size_t off2 = 0;
[8227d63]	729
[d4a3ee5]	730	size_t len = 0;
[f2b8cdc]	731
	732	while (true) {
	733	if (len >= max_len)
	734	break;
	735
	736	c1 = str_decode(s1, &off1, STR_NO_LIMIT);
	737	c2 = str_decode(s2, &off2, STR_NO_LIMIT);
	738
[8227d63]	739	if (c1 < c2)
	740	return -1;
	741
	742	if (c1 > c2)
	743	return 1;
	744
	745	if (c1 == 0 \|\| c2 == 0)
	746	break;
	747
	748	++len;
	749	}
	750
	751	return 0;
	752
	753	}
	754
	755	/** Compare two NULL terminated strings in case-insensitive manner.
	756	*
	757	* Do a char-by-char comparison of two NULL-terminated strings.
	758	* The strings are considered equal iff their length is equal
	759	* and both strings consist of the same sequence of characters
	760	* when converted to lower case.
	761	*
	762	* A string S1 is less than another string S2 if it has a character with
	763	* lower value at the first character position where the strings differ.
	764	* If the strings differ in length, the shorter one is treated as if
	765	* padded by characters with a value of zero.
	766	*
	767	* @param s1 First string to compare.
	768	* @param s2 Second string to compare.
	769	*
	770	* @return 0 if the strings are equal, -1 if the first is less than the second,
	771	* 1 if the second is less than the first.
	772	*
	773	*/
	774	int str_casecmp(const char s1, const char s2)
	775	{
[28c39f3]	776	// FIXME: doesn't work for non-ASCII caseful characters
	777
[28a5ebd]	778	char32_t c1 = 0;
	779	char32_t c2 = 0;
[8227d63]	780
	781	size_t off1 = 0;
	782	size_t off2 = 0;
	783
	784	while (true) {
	785	c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
	786	c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
	787
	788	if (c1 < c2)
	789	return -1;
	790
	791	if (c1 > c2)
	792	return 1;
	793
	794	if (c1 == 0 \|\| c2 == 0)
	795	break;
	796	}
	797
	798	return 0;
	799	}
	800
	801	/** Compare two NULL terminated strings with length limit in case-insensitive
	802	* manner.
	803	*
	804	* Do a char-by-char comparison of two NULL-terminated strings.
	805	* The strings are considered equal iff
	806	* min(str_length(s1), max_len) == min(str_length(s2), max_len)
	807	* and both strings consist of the same sequence of characters,
	808	* up to max_len characters.
	809	*
	810	* A string S1 is less than another string S2 if it has a character with
	811	* lower value at the first character position where the strings differ.
	812	* If the strings differ in length, the shorter one is treated as if
	813	* padded by characters with a value of zero. Only the first max_len
	814	* characters are considered.
	815	*
	816	* @param s1 First string to compare.
	817	* @param s2 Second string to compare.
	818	* @param max_len Maximum number of characters to consider.
	819	*
	820	* @return 0 if the strings are equal, -1 if the first is less than the second,
	821	* 1 if the second is less than the first.
	822	*
	823	*/
	824	int str_lcasecmp(const char s1, const char s2, size_t max_len)
	825	{
[28c39f3]	826	// FIXME: doesn't work for non-ASCII caseful characters
	827
[28a5ebd]	828	char32_t c1 = 0;
	829	char32_t c2 = 0;
[a35b458]	830
[8227d63]	831	size_t off1 = 0;
	832	size_t off2 = 0;
[a35b458]	833
[8227d63]	834	size_t len = 0;
	835
	836	while (true) {
	837	if (len >= max_len)
	838	break;
	839
	840	c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
	841	c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
	842
[f2b8cdc]	843	if (c1 < c2)
	844	return -1;
	845
	846	if (c1 > c2)
	847	return 1;
	848
	849	if (c1 == 0 \|\| c2 == 0)
	850	break;
	851
[1b20da0]	852	++len;
[f2b8cdc]	853	}
	854
	855	return 0;
	856
	857	}
	858
/**
 * Byte-wise test whether @a p is a prefix of @a s.
 *
 * @param s NUL-terminated string to look in.
 * @param p NUL-terminated candidate prefix.
 *
 * @return true iff every byte of @a p matches the byte of @a s at the same
 *         position (an empty @a p is a prefix of any string).
 */
static bool _test_prefix(const char *s, const char *p)
{
	while (*p != 0) {
		/* Also catches @a s ending first: its NUL differs from *p. */
		if (*s != *p)
			return false;

		s++;
		p++;
	}

	return true;
}
	868
/** Test whether p is a prefix of s.
 *
 * Do a char-by-char comparison of two NULL-terminated strings
 * and determine if p is a prefix of s.
 *
 * @param s The string in which to look
 * @param p The string to check if it is a prefix of s
 *
 * @return true iff p is prefix of s else false
 *
 */
bool str_test_prefix(const char *s, const char *p)
{
	return _test_prefix(s, p);
}
	884
[086cab0]	885	/** Get a string suffix.
	886	*
	887	* Return a string suffix defined by the prefix length.
	888	*
	889	* @param s The string to get the suffix from.
	890	* @param prefix_length Number of prefix characters to ignore.
	891	*
	892	* @return String suffix.
	893	*
	894	*/
	895	const char str_suffix(const char s, size_t prefix_length)
	896	{
	897	size_t off = 0;
	898	size_t i = 0;
	899
	900	while (true) {
	901	str_decode(s, &off, STR_NO_LIMIT);
	902	i++;
	903
	904	if (i >= prefix_length)
	905	break;
	906	}
	907
	908	return s + off;
	909	}
	910
/**
 * Copy string as a sequence of bytes.
 *
 * Copies @a src including its NUL terminator; no size limit is applied,
 * so @a dest must be large enough.
 *
 * @param dest Destination buffer.
 * @param src  NUL-terminated source string.
 */
static void _str_cpy(char *dest, const char *src)
{
	while (*src != 0)
		*dest++ = *src++;

	*dest = 0;
}
	919
	920	/** Copy string as a sequence of bytes. */
	921	static void _str_cpyn(char dest, size_t size, const char src)
	922	{
[0600976]	923	assert(dest && src && size);
	924
	925	if (!dest \|\| !src \|\| !size)
	926	return;
	927
	928	if (size == STR_NO_LIMIT)
	929	return _str_cpy(dest, src);
	930
[28c39f3]	931	char *dest_top = dest + size - 1;
[0600976]	932	assert(size == 1 \|\| dest < dest_top);
[28c39f3]	933
	934	while (*src && dest < dest_top)
	935	(dest++) = (src++);
	936
	937	*dest = 0;
	938	}
	939
[6eb2e96]	940	/** Copy string.
[f2b8cdc]	941	*
[6eb2e96]	942	* Copy source string @a src to destination buffer @a dest.
	943	* No more than @a size bytes are written. If the size of the output buffer
	944	* is at least one byte, the output string will always be well-formed, i.e.
	945	* null-terminated and containing only complete characters.
[f2b8cdc]	946	*
[abf09311]	947	* @param dest Destination buffer.
[6700ee2]	948	* @param count Size of the destination buffer (must be > 0).
[6eb2e96]	949	* @param src Source string.
[8e893ae]	950	*
[f2b8cdc]	951	*/
[6eb2e96]	952	void str_cpy(char dest, size_t size, const char src)
[f2b8cdc]	953	{
[6700ee2]	954	/* There must be space for a null terminator in the buffer. */
	955	assert(size > 0);
[d066259]	956	assert(src != NULL);
[28c39f3]	957	assert(dest != NULL);
[0600976]	958	assert(size == STR_NO_LIMIT \|\| dest + size > dest);
[a35b458]	959
[28c39f3]	960	/* Copy data. */
	961	_str_cpyn(dest, size, src);
[a35b458]	962
[28c39f3]	963	/* In-place translate invalid bytes to U_SPECIAL. */
[0600976]	964	_sanitize_string(dest, size);
[6eb2e96]	965	}
	966
	967	/** Copy size-limited substring.
	968	*
[6700ee2]	969	* Copy prefix of string @a src of max. size @a size to destination buffer
	970	* @a dest. No more than @a size bytes are written. The output string will
	971	* always be well-formed, i.e. null-terminated and containing only complete
	972	* characters.
[6eb2e96]	973	*
	974	* No more than @a n bytes are read from the input string, so it does not
	975	* have to be null-terminated.
	976	*
[abf09311]	977	* @param dest Destination buffer.
[6700ee2]	978	* @param count Size of the destination buffer (must be > 0).
[6eb2e96]	979	* @param src Source string.
[abf09311]	980	* @param n Maximum number of bytes to read from @a src.
[8e893ae]	981	*
[6eb2e96]	982	*/
	983	void str_ncpy(char dest, size_t size, const char src, size_t n)
	984	{
[6700ee2]	985	/* There must be space for a null terminator in the buffer. */
	986	assert(size > 0);
[28c39f3]	987	assert(src != NULL);
[a35b458]	988
[28c39f3]	989	/* Copy data. */
	990	_str_cpyn(dest, min(size, n + 1), src);
[a35b458]	991
[28c39f3]	992	/* In-place translate invalid bytes to U_SPECIAL. */
[0600976]	993	_sanitize_string(dest, size);
[f2b8cdc]	994	}
	995
[4482bc7]	996	/** Append one string to another.
	997	*
	998	* Append source string @a src to string in destination buffer @a dest.
	999	* Size of the destination buffer is @a dest. If the size of the output buffer
	1000	* is at least one byte, the output string will always be well-formed, i.e.
	1001	* null-terminated and containing only complete characters.
	1002	*
[0f06dbc]	1003	* @param dest Destination buffer.
[4482bc7]	1004	* @param count Size of the destination buffer.
	1005	* @param src Source string.
	1006	*/
	1007	void str_append(char dest, size_t size, const char src)
	1008	{
[28c39f3]	1009	assert(src != NULL);
	1010	assert(dest != NULL);
	1011	assert(size > 0);
[0600976]	1012	assert(size == STR_NO_LIMIT \|\| dest + size > dest);
[a35b458]	1013
[28c39f3]	1014	size_t dstr_size = _str_nsize(dest, size);
[0600976]	1015	if (dstr_size < size) {
	1016	_str_cpyn(dest + dstr_size, size - dstr_size, src);
	1017	_sanitize_string(dest + dstr_size, size - dstr_size);
	1018	}
[4482bc7]	1019	}
	1020
[dcb74c0a]	1021	/** Convert space-padded ASCII to string.
	1022	*
	1023	* Common legacy text encoding in hardware is 7-bit ASCII fitted into
[c3d19ac]	1024	* a fixed-width byte buffer (bit 7 always zero), right-padded with spaces
[dcb74c0a]	1025	* (ASCII 0x20). Convert space-padded ascii to string representation.
	1026	*
	1027	* If the text does not fit into the destination buffer, the function converts
	1028	* as many characters as possible and returns EOVERFLOW.
	1029	*
	1030	* If the text contains non-ASCII bytes (with bit 7 set), the whole string is
	1031	* converted anyway and invalid characters are replaced with question marks
	1032	* (U_SPECIAL) and the function returns EIO.
	1033	*
	1034	* Regardless of return value upon return @a dest will always be well-formed.
	1035	*
	1036	* @param dest Destination buffer
	1037	* @param size Size of destination buffer
	1038	* @param src Space-padded ASCII.
	1039	* @param n Size of the source buffer in bytes.
	1040	*
	1041	* @return EOK on success, EOVERFLOW if the text does not fit
	1042	* destination buffer, EIO if the text contains
	1043	* non-ASCII bytes.
	1044	*/
[b7fd2a0]	1045	errno_t spascii_to_str(char dest, size_t size, const uint8_t src, size_t n)
[dcb74c0a]	1046	{
[28c39f3]	1047	size_t len = 0;
[dcb74c0a]	1048
[28c39f3]	1049	/* Determine the length of the source string. */
	1050	for (size_t i = 0; i < n; i++) {
	1051	if (src[i] == 0)
	1052	break;
	1053
	1054	if (src[i] != ' ')
	1055	len = i + 1;
	1056	}
	1057
	1058	errno_t result = EOK;
	1059	size_t out_len = min(len, size - 1);
	1060
	1061	/* Copy characters */
	1062	for (size_t i = 0; i < out_len; i++) {
	1063	dest[i] = src[i];
	1064
	1065	if (dest[i] < 0) {
	1066	dest[i] = U_SPECIAL;
[dcb74c0a]	1067	result = EIO;
	1068	}
[28c39f3]	1069	}
[dcb74c0a]	1070
[28c39f3]	1071	dest[out_len] = 0;
[dcb74c0a]	1072
[28c39f3]	1073	if (out_len < len)
	1074	return EOVERFLOW;
[dcb74c0a]	1075
	1076	return result;
	1077	}
	1078
[0f06dbc]	1079	/** Convert wide string to string.
[f2b8cdc]	1080	*
[0f06dbc]	1081	* Convert wide string @a src to string. The output is written to the buffer
	1082	* specified by @a dest and @a size. @a size must be non-zero and the string
	1083	* written will always be well-formed.
[f2b8cdc]	1084	*
[0f06dbc]	1085	* @param dest Destination buffer.
	1086	* @param size Size of the destination buffer.
	1087	* @param src Source wide string.
[f2b8cdc]	1088	*/
[28a5ebd]	1089	void wstr_to_str(char dest, size_t size, const char32_t src)
[f2b8cdc]	1090	{
[28a5ebd]	1091	char32_t ch;
[0f06dbc]	1092	size_t src_idx;
	1093	size_t dest_off;
	1094
	1095	/* There must be space for a null terminator in the buffer. */
	1096	assert(size > 0);
[a35b458]	1097
[0f06dbc]	1098	src_idx = 0;
	1099	dest_off = 0;
	1100
[f2b8cdc]	1101	while ((ch = src[src_idx++]) != 0) {
[81e9cb3]	1102	if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
[f2b8cdc]	1103	break;
	1104	}
[0f06dbc]	1105
	1106	dest[dest_off] = '\0';
[f2b8cdc]	1107	}
	1108
[82374b2]	1109	/** Convert UTF16 string to string.
	1110	*
	1111	* Convert utf16 string @a src to string. The output is written to the buffer
	1112	* specified by @a dest and @a size. @a size must be non-zero and the string
	1113	* written will always be well-formed. Surrogate pairs also supported.
	1114	*
	1115	* @param dest Destination buffer.
	1116	* @param size Size of the destination buffer.
	1117	* @param src Source utf16 string.
	1118	*
[cde999a]	1119	* @return EOK, if success, an error code otherwise.
[82374b2]	1120	*/
[b7fd2a0]	1121	errno_t utf16_to_str(char dest, size_t size, const uint16_t src)
[82374b2]	1122	{
[abb7491c]	1123	size_t idx = 0, dest_off = 0;
[28a5ebd]	1124	char32_t ch;
[b7fd2a0]	1125	errno_t rc = EOK;
[82374b2]	1126
	1127	/* There must be space for a null terminator in the buffer. */
	1128	assert(size > 0);
	1129
	1130	while (src[idx]) {
	1131	if ((src[idx] & 0xfc00) == 0xd800) {
[abb7491c]	1132	if (src[idx + 1] && (src[idx + 1] & 0xfc00) == 0xdc00) {
[82374b2]	1133	ch = 0x10000;
	1134	ch += (src[idx] & 0x03FF) << 10;
[abb7491c]	1135	ch += (src[idx + 1] & 0x03FF);
[82374b2]	1136	idx += 2;
[1433ecda]	1137	} else
[82374b2]	1138	break;
	1139	} else {
	1140	ch = src[idx];
	1141	idx++;
	1142	}
[abb7491c]	1143	rc = chr_encode(ch, dest, &dest_off, size - 1);
[82374b2]	1144	if (rc != EOK)
	1145	break;
	1146	}
	1147	dest[dest_off] = '\0';
	1148	return rc;
	1149	}
	1150
[b06414f]	1151	/** Convert string to UTF16 string.
	1152	*
	1153	* Convert string @a src to utf16 string. The output is written to the buffer
	1154	* specified by @a dest and @a dlen. @a dlen must be non-zero and the string
	1155	* written will always be well-formed. Surrogate pairs also supported.
	1156	*
	1157	* @param dest Destination buffer.
	1158	* @param dlen Number of utf16 characters that fit in the destination buffer.
	1159	* @param src Source string.
	1160	*
[cde999a]	1161	* @return EOK, if success, an error code otherwise.
[b06414f]	1162	*/
[b7fd2a0]	1163	errno_t str_to_utf16(uint16_t dest, size_t dlen, const char src)
[fc97128]	1164	{
[b7fd2a0]	1165	errno_t rc = EOK;
[abb7491c]	1166	size_t offset = 0;
	1167	size_t idx = 0;
[28a5ebd]	1168	char32_t c;
[fc97128]	1169
[b06414f]	1170	assert(dlen > 0);
[a35b458]	1171
[fc97128]	1172	while ((c = str_decode(src, &offset, STR_NO_LIMIT)) != 0) {
	1173	if (c > 0x10000) {
[b06414f]	1174	if (idx + 2 >= dlen - 1) {
[abb7491c]	1175	rc = EOVERFLOW;
[fc97128]	1176	break;
	1177	}
	1178	c = (c - 0x10000);
	1179	dest[idx] = 0xD800 \| (c >> 10);
[abb7491c]	1180	dest[idx + 1] = 0xDC00 \| (c & 0x3FF);
[fc97128]	1181	idx++;
	1182	} else {
[1433ecda]	1183	dest[idx] = c;
[fc97128]	1184	}
	1185
	1186	idx++;
[b06414f]	1187	if (idx >= dlen - 1) {
[abb7491c]	1188	rc = EOVERFLOW;
[fc97128]	1189	break;
	1190	}
	1191	}
	1192
	1193	dest[idx] = '\0';
	1194	return rc;
[f2b8cdc]	1195	}
	1196
/** Get size of UTF-16 string.
 *
 * Count the 16-bit words stored in the UTF-16 string @a ustr before its
 * zero terminator. The terminator itself is not counted, and the two halves
 * of a surrogate pair count as two words.
 *
 * @param ustr UTF-16 string to consider.
 *
 * @return Number of words used by the UTF-16 string
 *
 */
size_t utf16_wsize(const uint16_t *ustr)
{
	const uint16_t *end = ustr;

	/* Walk to the terminating zero word. */
	while (*end != 0)
		end++;

	return (size_t) (end - ustr);
}
	1216
[b67c7d64]	1217	/** Convert wide string to new string.
	1218	*
	1219	* Convert wide string @a src to string. Space for the new string is allocated
	1220	* on the heap.
	1221	*
	1222	* @param src Source wide string.
	1223	* @return New string.
	1224	*/
[28a5ebd]	1225	char wstr_to_astr(const char32_t src)
[b67c7d64]	1226	{
	1227	char dbuf[STR_BOUNDS(1)];
	1228	char *str;
[28a5ebd]	1229	char32_t ch;
[b67c7d64]	1230
	1231	size_t src_idx;
	1232	size_t dest_off;
	1233	size_t dest_size;
	1234
	1235	/* Compute size of encoded string. */
	1236
	1237	src_idx = 0;
	1238	dest_size = 0;
	1239
	1240	while ((ch = src[src_idx++]) != 0) {
	1241	dest_off = 0;
	1242	if (chr_encode(ch, dbuf, &dest_off, STR_BOUNDS(1)) != EOK)
	1243	break;
	1244	dest_size += dest_off;
	1245	}
	1246
	1247	str = malloc(dest_size + 1);
	1248	if (str == NULL)
	1249	return NULL;
	1250
	1251	/* Encode string. */
	1252
	1253	src_idx = 0;
	1254	dest_off = 0;
	1255
	1256	while ((ch = src[src_idx++]) != 0) {
	1257	if (chr_encode(ch, str, &dest_off, dest_size) != EOK)
	1258	break;
	1259	}
	1260
	1261	str[dest_size] = '\0';
	1262	return str;
	1263	}
	1264
[da2bd08]	1265	/** Convert string to wide string.
	1266	*
	1267	* Convert string @a src to wide string. The output is written to the
[0f06dbc]	1268	* buffer specified by @a dest and @a dlen. @a dlen must be non-zero
	1269	* and the wide string written will always be null-terminated.
[da2bd08]	1270	*
	1271	* @param dest Destination buffer.
	1272	* @param dlen Length of destination buffer (number of wchars).
	1273	* @param src Source string.
	1274	*/
[28a5ebd]	1275	void str_to_wstr(char32_t dest, size_t dlen, const char src)
[da2bd08]	1276	{
	1277	size_t offset;
	1278	size_t di;
[28a5ebd]	1279	char32_t c;
[da2bd08]	1280
	1281	assert(dlen > 0);
	1282
	1283	offset = 0;
	1284	di = 0;
	1285
	1286	do {
[81e9cb3]	1287	if (di >= dlen - 1)
[da2bd08]	1288	break;
	1289
	1290	c = str_decode(src, &offset, STR_NO_LIMIT);
	1291	dest[di++] = c;
	1292	} while (c != '\0');
	1293
	1294	dest[dlen - 1] = '\0';
	1295	}
	1296
/** Convert string to newly allocated wide string.
 *
 * Convert string @a str to a wide string. A new null-terminated wide
 * string with room for str_length(str) characters plus the terminator is
 * allocated on the heap with calloc().
 *
 * @param str Source string.
 *
 * @return New wide string, or NULL if the allocation failed.
 */
char32_t *str_to_awstr(const char *str)
{
	/* One extra slot for the terminator. */
	size_t slots = str_length(str) + 1;

	char32_t *wstr = calloc(slots, sizeof(char32_t));
	if (wstr != NULL)
		str_to_wstr(wstr, slots, str);

	return wstr;
}
	1315
/** Find the first occurrence of a byte in a string.
 *
 * The terminating null byte is considered part of the string, so searching
 * for 0 yields a pointer to the terminator.
 *
 * @param str String to search.
 * @param c   Byte to look for.
 *
 * @return Pointer to the first matching byte or NULL if there is none.
 */
static char *_strchr(const char *str, char c)
{
	for (;; str++) {
		if (*str == c)
			return (char *) str;

		if (*str == 0)
			return NULL;
	}
}
	1323
[f2b8cdc]	1324	/** Find first occurence of character in string.
	1325	*
	1326	* @param str String to search.
	1327	* @param ch Character to look for.
	1328	*
	1329	* @return Pointer to character in @a str or NULL if not found.
	1330	*/
[28a5ebd]	1331	char str_chr(const char str, char32_t ch)
[f2b8cdc]	1332	{
[28c39f3]	1333	/* Fast path for an ASCII character. */
	1334	if (ascii_check(ch))
	1335	return _strchr(str, ch);
[a35b458]	1336
[28c39f3]	1337	/* Convert character to UTF-8. */
	1338	char utf8[STR_BOUNDS(1) + 1];
	1339	size_t offset = 0;
	1340
	1341	if (chr_encode(ch, utf8, &offset, sizeof(utf8)) != EOK \|\| offset == 0)
	1342	return NULL;
	1343
	1344	utf8[offset] = '\0';
	1345
	1346	/* Find the first byte, then check if all of them are correct. */
	1347	while (*str != 0) {
	1348	str = _strchr(str, utf8[0]);
	1349	if (!str)
	1350	return NULL;
	1351
	1352	if (_test_prefix(str, utf8))
	1353	return (char *) str;
	1354
	1355	str++;
[f2b8cdc]	1356	}
[a35b458]	1357
[f2b8cdc]	1358	return NULL;
	1359	}
	1360
/** Find first occurrence of substring in string.
 *
 * Every byte offset of the haystack that still leaves room for the whole
 * needle is tried in turn.
 *
 * @param hs Haystack (string)
 * @param n  Needle (substring to look for)
 *
 * @return Pointer to character in @a hs or @c NULL if not found.
 */
char *str_str(const char *hs, const char *n)
{
	size_t remaining = _str_size(hs);
	size_t needle_size = _str_size(n);

	for (; remaining >= needle_size; hs++, remaining--) {
		if (_test_prefix(hs, n))
			return (char *) hs;
	}

	return NULL;
}
	1383
/** Remove trailing occurrences of a byte from a string, in place.
 *
 * A string consisting only of @a c becomes empty; an empty string is left
 * untouched.
 *
 * @param str String to trim.
 * @param c   Byte to remove from the end of @a str.
 */
static void _str_rtrim(char *str, char c)
{
	/* One past the last byte that has to be kept. */
	char *end = str;

	for (char *p = str; *p != 0; p++) {
		if (*p != c)
			end = p + 1;
	}

	/*
	 * Truncate string. end never points past the original terminator,
	 * so this write always stays inside the string.
	 */
	*end = 0;
}
	1398
[1737bfb]	1399	/** Removes specified trailing characters from a string.
	1400	*
	1401	* @param str String to remove from.
	1402	* @param ch Character to remove.
	1403	*/
[28a5ebd]	1404	void str_rtrim(char *str, char32_t ch)
[1737bfb]	1405	{
[28c39f3]	1406	/* Fast path for the ASCII case. */
	1407	if (ascii_check(ch)) {
	1408	_str_rtrim(str, ch);
	1409	return;
	1410	}
	1411
[1737bfb]	1412	size_t off = 0;
	1413	size_t pos = 0;
[28a5ebd]	1414	char32_t c;
[1737bfb]	1415	bool update_last_chunk = true;
	1416	char *last_chunk = NULL;
	1417
	1418	while ((c = str_decode(str, &off, STR_NO_LIMIT))) {
	1419	if (c != ch) {
	1420	update_last_chunk = true;
	1421	last_chunk = NULL;
	1422	} else if (update_last_chunk) {
	1423	update_last_chunk = false;
	1424	last_chunk = (str + pos);
	1425	}
	1426	pos = off;
	1427	}
	1428
	1429	if (last_chunk)
	1430	*last_chunk = '\0';
	1431	}
	1432
/** Remove leading occurrences of a byte from a string, in place.
 *
 * The remainder of the string is moved to the front using _str_cpy().
 *
 * @param str String to trim.
 * @param c   Byte to remove from the beginning of @a str.
 */
static void _str_ltrim(char *str, char c)
{
	size_t skip = 0;

	/* Count the leading bytes equal to c. */
	while (str[skip] == c)
		skip++;

	if (skip > 0)
		_str_cpy(str, str + skip);
}
	1443
[1737bfb]	1444	/** Removes specified leading characters from a string.
	1445	*
	1446	* @param str String to remove from.
	1447	* @param ch Character to remove.
	1448	*/
[28a5ebd]	1449	void str_ltrim(char *str, char32_t ch)
[1737bfb]	1450	{
[28c39f3]	1451	/* Fast path for the ASCII case. */
	1452	if (ascii_check(ch)) {
	1453	_str_ltrim(str, ch);
	1454	return;
	1455	}
	1456
[28a5ebd]	1457	char32_t acc;
[1737bfb]	1458	size_t off = 0;
	1459	size_t pos = 0;
	1460	size_t str_sz = str_size(str);
	1461
	1462	while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
	1463	if (acc != ch)
	1464	break;
	1465	else
	1466	pos = off;
	1467	}
	1468
	1469	if (pos > 0) {
	1470	memmove(str, &str[pos], str_sz - pos);
	1471	pos = str_sz - pos;
[a18a8b9]	1472	str[pos] = '\0';
[1737bfb]	1473	}
	1474	}
	1475
/** Find the last occurrence of a byte in a string.
 *
 * Only bytes before the terminator are examined, so searching for 0
 * yields NULL.
 *
 * @param str String to search.
 * @param c   Byte to look for.
 *
 * @return Pointer to the last matching byte or NULL if there is none.
 */
static char *_str_rchr(const char *str, char c)
{
	const char *hit = NULL;

	for (const char *p = str; *p != 0; p++) {
		if (*p == c)
			hit = p;
	}

	return (char *) hit;
}
	1489
[7afb4a5]	1490	/** Find last occurence of character in string.
	1491	*
	1492	* @param str String to search.
	1493	* @param ch Character to look for.
	1494	*
	1495	* @return Pointer to character in @a str or NULL if not found.
	1496	*/
[28a5ebd]	1497	char str_rchr(const char str, char32_t ch)
[7afb4a5]	1498	{
[28c39f3]	1499	if (ascii_check(ch))
	1500	return _str_rchr(str, ch);
	1501
[28a5ebd]	1502	char32_t acc;
[7afb4a5]	1503	size_t off = 0;
[f2d2c7ba]	1504	size_t last = 0;
[d4a3ee5]	1505	const char *res = NULL;
[a35b458]	1506
[7afb4a5]	1507	while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
	1508	if (acc == ch)
[f2d2c7ba]	1509	res = (str + last);
	1510	last = off;
[7afb4a5]	1511	}
[a35b458]	1512
[dd2cfa7]	1513	return (char *) res;
[7afb4a5]	1514	}
	1515
/** Insert a wide character into a wide string.
 *
 * Insert a wide character into a wide string at position
 * @a pos. The characters from that position on, including the terminator,
 * are shifted one element towards the end.
 *
 * NOTE(review): only @a pos is compared against @a max_pos, not the string
 * length; presumably callers guarantee the buffer can hold one more
 * character - verify against callers.
 *
 * @param str     String to insert to.
 * @param ch      Character to insert to.
 * @param pos     Character index where to insert.
 * @param max_pos Characters in the buffer.
 *
 * @return True if the insertion was successful, false if the position
 *         is out of bounds.
 *
 */
bool wstr_linsert(char32_t *str, char32_t ch, size_t pos, size_t max_pos)
{
	size_t len = wstr_length(str);

	if (pos > len)
		return false;

	if (pos + 1 > max_pos)
		return false;

	/* Shift the tail, terminator first, working backwards. */
	for (size_t i = len + 1; i > pos; i--)
		str[i] = str[i - 1];

	str[pos] = ch;

	return true;
}
	1545
	1546	/** Remove a wide character from a wide string.
	1547	*
	1548	* Remove a wide character from a wide string at position
	1549	* @a pos. The characters after the position are shifted.
	1550	*
	1551	* @param str String to remove from.
	1552	* @param pos Character index to remove.
	1553	*
	1554	* @return True if the removal was sucessful, false if the position
	1555	* is out of bounds.
	1556	*
	1557	*/
[28a5ebd]	1558	bool wstr_remove(char32_t *str, size_t pos)
[f2b8cdc]	1559	{
[d4a3ee5]	1560	size_t len = wstr_length(str);
[a35b458]	1561
[f2b8cdc]	1562	if (pos >= len)
	1563	return false;
[a35b458]	1564
[d4a3ee5]	1565	size_t i;
[f2b8cdc]	1566	for (i = pos + 1; i <= len; i++)
	1567	str[i - 1] = str[i];
[a35b458]	1568
[f2b8cdc]	1569	return true;
	1570	}
	1571
/** Duplicate string.
 *
 * Allocate a new string with malloc() and copy the source string,
 * including its terminator, into it. The copy is then passed through
 * _sanitize_string(), so it can differ from the source string on the
 * byte level.
 *
 * @param src Source string.
 *
 * @return Duplicate string, or NULL if the allocation failed.
 *
 */
char *str_dup(const char *src)
{
	/* Size of the string including the terminator. */
	size_t bytes = _str_size(src) + 1;

	char *copy = malloc(bytes);
	if (copy == NULL)
		return NULL;

	memcpy(copy, src, bytes);
	_sanitize_string(copy, bytes);

	return copy;
}
	1599
/** Duplicate string with size limit.
 *
 * Allocate a new string with malloc() and copy up to @a n bytes from the
 * source string into it. No more than @a n + 1 bytes are allocated, but if
 * the size occupied by the source string is smaller than @a n + 1, less is
 * allocated.
 *
 * The copy is null-terminated and passed through _sanitize_string(), so it
 * can differ from the source string on the byte level.
 *
 * @param src Source string.
 * @param n   Maximum number of bytes to duplicate.
 *
 * @return Duplicate string, or NULL if the allocation failed.
 *
 */
char *str_ndup(const char *src, size_t n)
{
	size_t size = _str_nsize(src, n);

	char *dest = malloc(size + 1);
	if (!dest)
		return NULL;

	memcpy(dest, src, size);

	/*
	 * Terminate before sanitizing, so that the helper is never handed
	 * a buffer whose last byte is still uninitialized.
	 */
	dest[size] = 0;
	_sanitize_string(dest, size);

	return dest;
}
	1633
/** Split string by delimiters.
 *
 * Leading delimiters are skipped, then the token is cut off in place by
 * overwriting the byte at its end with a null terminator.
 *
 * @param s     String to be tokenized. If NULL, NULL is returned and
 *              @a next is left untouched.
 * @param delim String with the delimiters.
 * @param next  Variable which will receive the pointer to the
 *              continuation of the string following the first
 *              occurrence of any of the delimiter characters.
 *              May be NULL.
 * @return      Pointer to the prefix of @a s before the first
 *              delimiter character. NULL if no such prefix
 *              exists.
 */
char *str_tok(char *s, const char *delim, char **next)
{
	if (s == NULL)
		return NULL;

	size_t len = str_size(s);
	size_t probe = 0;    /* Offset just past the decoded character. */
	size_t mark = 0;     /* Offset of the last accepted boundary. */
	char32_t ch;

	/* Skip over leading delimiters. */
	while ((ch = str_decode(s, &probe, len)) != 0 &&
	    str_chr(delim, ch) != NULL)
		mark = probe;

	char *start = &s[mark];

	/* Skip over token characters. */
	probe = mark;
	while ((ch = str_decode(s, &probe, len)) != 0 &&
	    str_chr(delim, ch) == NULL)
		mark = probe;

	char *end = &s[mark];

	/*
	 * If a delimiter ended the token, continue right after it,
	 * otherwise at the end of the string.
	 */
	if (next != NULL)
		*next = (ch != 0) ? &s[probe] : &s[mark];

	/* No more tokens. */
	if (start == end)
		return NULL;

	/* Overwrite delimiter with NULL terminator. */
	*end = '\0';
	return start;
}
	1680
/** Scale a value to a decimal order of magnitude.
 *
 * Pick the largest decimal unit such that the scaled value still exceeds
 * 1000 (the comparisons are strict), divide @a val by that unit and report
 * the matching SI prefix letter. Values up to 1000000 are returned
 * unscaled with a space as the suffix.
 *
 * @param val    Value to scale.
 * @param rv     Receives the scaled value (integer division).
 * @param suffix Receives the SI prefix letter: 'k', 'M', 'G', 'T', 'P',
 *               'E', or ' ' when no scaling was applied.
 */
void order_suffix(const uint64_t val, uint64_t *rv, char *suffix)
{
	if (val > UINT64_C(10000000000000000000)) {
		/* Units of 10^18 are exa. */
		*rv = val / UINT64_C(1000000000000000000);
		*suffix = 'E';
	} else if (val > UINT64_C(1000000000000000000)) {
		/* Units of 10^15 are peta. */
		*rv = val / UINT64_C(1000000000000000);
		*suffix = 'P';
	} else if (val > UINT64_C(1000000000000000)) {
		*rv = val / UINT64_C(1000000000000);
		*suffix = 'T';
	} else if (val > UINT64_C(1000000000000)) {
		*rv = val / UINT64_C(1000000000);
		*suffix = 'G';
	} else if (val > UINT64_C(1000000000)) {
		*rv = val / UINT64_C(1000000);
		*suffix = 'M';
	} else if (val > UINT64_C(1000000)) {
		*rv = val / UINT64_C(1000);
		*suffix = 'k';
	} else {
		*rv = val;
		*suffix = ' ';
	}
}
	1706
/** Scale a byte count to a binary order of magnitude.
 *
 * Pick the largest binary unit such that the scaled value still exceeds
 * 1024 (the comparisons are strict), divide @a val by that unit and report
 * the matching IEC unit name. Values up to 1048576 are returned unscaled.
 *
 * @param val    Value to scale.
 * @param rv     Receives the scaled value (integer division).
 * @param suffix Receives a pointer to a constant unit string: "KiB", "MiB",
 *               "GiB", "TiB", "PiB", or the plain byte unit.
 * @param fixed  Selects the padded variant of the plain byte unit.
 */
void bin_order_suffix(const uint64_t val, uint64_t *rv, const char **suffix,
    bool fixed)
{
	if (val > UINT64_C(1152921504606846976)) {
		/* Units of 2^50 bytes are pebibytes. */
		*rv = val / UINT64_C(1125899906842624);
		*suffix = "PiB";
	} else if (val > UINT64_C(1125899906842624)) {
		*rv = val / UINT64_C(1099511627776);
		*suffix = "TiB";
	} else if (val > UINT64_C(1099511627776)) {
		*rv = val / UINT64_C(1073741824);
		*suffix = "GiB";
	} else if (val > UINT64_C(1073741824)) {
		*rv = val / UINT64_C(1048576);
		*suffix = "MiB";
	} else if (val > UINT64_C(1048576)) {
		*rv = val / UINT64_C(1024);
		*suffix = "KiB";
	} else {
		*rv = val;
		/*
		 * NOTE(review): the padded literal is presumably meant to be as
		 * wide as the other unit names - verify its whitespace.
		 */
		if (fixed)
			*suffix = "B ";
		else
			*suffix = "B";
	}
}
	1733
[a46da63]	1734	/** @}
[b2951e2]	1735	*/

Note: See TracBrowser for help on using the repository browser.

Download in other formats: