Context Navigation

source: mainline/common/str.c@ 45adeeb

Visit:

Last change on this file since 45adeeb was 45adeeb, checked in by Jiří Zárevúcky <zarevucky.jiri@…>, 2 months ago
Expose restartable version of str_decode() as str_decode_r()
Property mode set to `100644`
File size: 45.8 KB

Line
1	/*
2	* Copyright (c) 2001-2004 Jakub Jermar
3	* Copyright (c) 2005 Martin Decky
4	* Copyright (c) 2008 Jiri Svoboda
5	* Copyright (c) 2011 Martin Sucha
6	* Copyright (c) 2011 Oleg Romanenko
7	* Copyright (c) 2025 Jiří Zárevúcky
8	* All rights reserved.
9	*
10	* Redistribution and use in source and binary forms, with or without
11	* modification, are permitted provided that the following conditions
12	* are met:
13	*
14	* - Redistributions of source code must retain the above copyright
15	* notice, this list of conditions and the following disclaimer.
16	* - Redistributions in binary form must reproduce the above copyright
17	* notice, this list of conditions and the following disclaimer in the
18	* documentation and/or other materials provided with the distribution.
19	* - The name of the author may not be used to endorse or promote products
20	* derived from this software without specific prior written permission.
21	*
22	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
23	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
24	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
25	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
26	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
27	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
31	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32	*/
33
34	/** @addtogroup libc
35	* @{
36	*/
37
38	/**
39	* @file
40	* @brief String functions.
41	*
42	* Strings and characters use the Universal Character Set (UCS). The standard
43	* strings, called just strings are encoded in UTF-8. Wide strings (encoded
44	* in UTF-32) are supported to a limited degree. A single character is
45	* represented as char32_t.@n
46	*
47	* Overview of the terminology:@n
48	*
49	* Term Meaning
50	* -------------------- ----------------------------------------------------
51	* byte 8 bits stored in uint8_t (unsigned 8 bit integer)
52	*
53	* character UTF-32 encoded Unicode character, stored in char32_t
54	* (unsigned 32 bit integer), code points 0 .. 1114111
55	* are valid
56	*
57	* Note that Unicode characters do not match
58	* one-to-one with displayed characters or glyphs on
59	* screen. For that level of precision, look up
60	* Grapheme Clusters.
61	*
62	* ASCII character 7 bit encoded ASCII character, stored in char
63	* (usually signed 8 bit integer), code points 0 .. 127
64	* are valid
65	*
66	* string UTF-8 encoded NULL-terminated Unicode string, char *
67	*
68	* wide string UTF-32 encoded NULL-terminated Unicode string,
69	* char32_t *
70	*
71	* [wide] string size number of BYTES in a [wide] string (excluding
72	* the NULL-terminator), size_t
73	*
74	* [wide] string length number of CHARACTERS in a [wide] string (excluding
75	* the NULL-terminator), size_t
76	*
77	* [wide] string width number of display cells on a monospace display taken
78	* by a [wide] string, size_t
79	*
80	* This is virtually impossible to determine exactly for
81	* all strings without knowing specifics of the display
82	* device, due to various factors affecting text output.
83	* If you have the option to query the terminal for
84	* position change caused by outputting the string,
85	* it is preferable to determine width that way.
86	*
87	*
88	* Overview of string metrics:@n
89	*
90	* Metric Abbrev. Type Meaning
91	* ------ ------ ------ -------------------------------------------------
92	* size n size_t number of BYTES in a string (excluding the
93	* NULL-terminator)
94	*
95	* length l size_t number of CHARACTERS in a string (excluding the
96	* null terminator)
97	*
98	* width w size_t number of display cells on a monospace display
99	* taken by a string
100	*
101	*
102	* Function naming prefixes:@n
103	*
104	* chr_ operate on characters
105	* ascii_ operate on ASCII characters
106	* str_ operate on strings
107	* wstr_ operate on wide strings
108	*
109	* [w]str_[n|l|w] operate on a prefix limited by size, length
110	* or width
111	*
112	*
113	* A specific character inside a [wide] string can be referred to by:@n
114	*
115	* pointer (char *, char32_t *)
116	* byte offset (size_t)
117	* character index (size_t)
118	*
119	*/
120
121	#include <str.h>
122
123	#include <align.h>
124	#include <assert.h>
125	#include <ctype.h>
126	#include <errno.h>
127	#include <limits.h>
128	#include <macros.h>
129	#include <mem.h>
130	#include <stdbool.h>
131	#include <stddef.h>
132	#include <stdint.h>
133	#include <stdlib.h>
134	#include <uchar.h>
135
136	#if __STDC_HOSTED__
137	#include <fibril.h>
138	#endif
139
/**
 * Record an "illegal byte sequence" error in errno.
 *
 * Compiles to nothing when the environment provides no `errno` macro.
 */
static void _set_ilseq(void)
{
#ifdef errno
	errno = EILSEQ;
#endif
}
146
/** Byte mask consisting of lowest @n bits (out of 8) */
#define LO_MASK_8(n)  ((uint8_t) ((1U << (n)) - 1))

/**
 * Mask consisting of lowest @n bits (out of 32), for n in 0..31.
 * The shift is done on an unsigned 32-bit one so that n == 31 is defined.
 */
#define LO_MASK_32(n)  ((uint32_t) ((UINT32_C(1) << (n)) - 1))

/**
 * Byte mask consisting of highest @n bits (out of 8).
 * The result is cast back to uint8_t because `~` promotes its operand to
 * int, which would otherwise yield a negative value with all upper bits set.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6

/* Payload masks for the initial byte of 2-, 3- and 4-byte sequences. */
#define UTF8_MASK_INITIAL2  0b00011111
#define UTF8_MASK_INITIAL3  0b00001111
#define UTF8_MASK_INITIAL4  0b00000111
/* Payload mask for a continuation byte. */
#define UTF8_MASK_CONT      0b00111111

/** Internal marker returned by the decoder for a malformed sequence. */
#define CHAR_INVALID  ((char32_t) UINT_MAX)
165
/** True if @a b is a single-byte (7-bit ASCII) UTF-8 code unit: 0xxxxxxx. */
static inline bool _is_ascii(uint8_t b)
{
	return (b & 0x80) == 0;
}
170
/** True if @a b is a UTF-8 continuation byte: 10xxxxxx. */
static inline bool _is_continuation(uint8_t b)
{
	return (b >> 6) == 0x2;
}
175
/** True if @a c is the initial byte of a 2-byte sequence: 110xxxxx. */
static inline bool _is_2_byte(uint8_t c)
{
	return (c >> 5) == 0x6;
}
180
/** True if @a c is the initial byte of a 3-byte sequence: 1110xxxx. */
static inline bool _is_3_byte(uint8_t c)
{
	return (c >> 4) == 0xE;
}
185
/** True if @a c is the initial byte of a 4-byte sequence: 11110xxx. */
static inline bool _is_4_byte(uint8_t c)
{
	return (c >> 3) == 0x1E;
}
190
/**
 * Number of continuation bytes needed to encode @a c in UTF-8.
 *
 * @return 0 for values that fit in 7 bits, 1 for 11 bits, 2 for 16 bits,
 *         3 for 21 bits, and -1 for anything wider (not supported).
 */
static inline int _char_continuation_bytes(char32_t c)
{
	if (c < 0x80)
		return 0;       /* fits in 7 bits */

	if (c < 0x800)
		return 1;       /* fits in 11 bits */

	if (c < 0x10000)
		return 2;       /* fits in 16 bits */

	if (c < 0x200000)
		return 3;       /* fits in 21 bits */

	/* Codes longer than 21 bits are not supported */
	return -1;
}
208
/**
 * Number of continuation bytes announced by the initial byte @a b.
 *
 * @return 0..3 for a valid initial byte, -1 if @a b is a continuation byte
 *         or otherwise cannot start a sequence.
 */
static inline int _continuation_bytes(uint8_t b)
{
	if ((b & 0x80) == 0x00)
		return 0;       /* 0xxxxxxx */

	if ((b & 0xE0) == 0xC0)
		return 1;       /* 110xxxxx 10xxxxxx */

	if ((b & 0xF0) == 0xE0)
		return 2;       /* 1110xxxx 10xxxxxx 10xxxxxx */

	if ((b & 0xF8) == 0xF0)
		return 3;       /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */

	return -1;
}
229
230	static bool _is_non_shortest(const mbstate_t *mb, uint8_t b)
231	{
232	return (mb->state == 0b1111110000000000 && !(b & 0b00100000)) \|\|
233	(mb->state == 0b1111111111110000 && !(b & 0b00110000));
234	}
235
236	static bool _is_surrogate(const mbstate_t *mb, uint8_t b)
237	{
238	return (mb->state == 0b1111110000001101 && b >= 0xa0);
239	}
240
/*
 * Branch prediction hints. The expression is normalised with `!!` so that
 * any non-zero value matches the expected value 1 (or 0 for _unlikely).
 */
#define _likely(expr)    __builtin_expect(!!(expr), 1)
#define _unlikely(expr)  __builtin_expect(!!(expr), 0)

/* Set to 0 to disable the whole-sequence fast paths in _str_decode(). */
#define FAST_PATHS 1
245
246	static char32_t _str_decode(const char s, size_t offset, size_t size, mbstate_t *mb)
247	{
248	assert(s);
249	assert(offset);
250	assert(*offset <= size);
251	assert(size == STR_NO_LIMIT \|\| s + size >= s);
252	assert(mb);
253
254	if (*offset == size)
255	return 0;
256
257	if (_likely(!mb->state)) {
258	/* Clean slate, read initial byte. */
259	uint8_t b = s[(*offset)++];
260
261	/* Fast exit for the most common case. */
262	if (_likely(_is_ascii(b)))
263	return b;
264
265	/* unexpected continuation byte */
266	if (_unlikely(_is_continuation(b)))
267	return CHAR_INVALID;
268
269	/*
270	* The value stored into `continuation` is designed to have
271	* just enough leading ones that after shifting in one less than
272	* the expected number of continuation bytes, the most significant
273	* bit becomes zero. (The field is 16b wide.)
274	*/
275
276	if (_is_2_byte(b)) {
277	/* Reject non-shortest form. */
278	if (_unlikely(!(b & 0b00011110)))
279	return CHAR_INVALID;
280
281	#if FAST_PATHS
282	/* We can usually take this exit. */
283	if (_likely(offset < size && _is_continuation(s[offset])))
284	return (b & UTF8_MASK_INITIAL2) << 6 \|
285	(s[(*offset)++] & UTF8_MASK_CONT);
286	#endif
287
288	/* 2 byte continuation 110xxxxx */
289	mb->state = b ^ 0b0000000011000000;
290
291	} else if (_is_3_byte(b)) {
292	#if FAST_PATHS
293	/* We can usually take this exit. */
294	if (_likely(offset + 1 < size && _is_continuation(s[offset]) && _is_continuation(s[*offset + 1]))) {
295
296	char32_t ch = (b & UTF8_MASK_INITIAL3) << 12 \|
297	(s[(*offset)] & UTF8_MASK_CONT) << 6 \|
298	(s[(*offset) + 1] & UTF8_MASK_CONT);
299
300	*offset += 2;
301
302	/* Reject non-shortest form. */
303	if (_unlikely(!(ch & 0xFFFFF800)))
304	return CHAR_INVALID;
305
306	/* Reject surrogates */
307	if (_unlikely(ch >= 0xD800 && ch < 0xE000))
308	return CHAR_INVALID;
309
310	return ch;
311	}
312	#endif
313
314	/* 3 byte continuation 1110xxxx */
315	mb->state = b ^ 0b1111110011100000;
316
317	} else if (_is_4_byte(b)) {
318	#if FAST_PATHS
319	/* We can usually take this exit. */
320	if (_likely(offset + 2 < size && _is_continuation(s[offset]) &&
321	_is_continuation(s[offset + 1]) && _is_continuation(s[offset + 2]))) {
322
323	char32_t ch = (b & UTF8_MASK_INITIAL4) << 18 \|
324	(s[(*offset)] & UTF8_MASK_CONT) << 12 \|
325	(s[(*offset) + 1] & UTF8_MASK_CONT) << 6 \|
326	(s[(*offset) + 2] & UTF8_MASK_CONT);
327
328	*offset += 3;
329
330	/* Reject non-shortest form. */
331	if (_unlikely(!(ch & 0xFFFF0000)))
332	return CHAR_INVALID;
333
334	/* Reject out-of-range characters. */
335	if (_unlikely(ch >= 0x110000))
336	return CHAR_INVALID;
337
338	return ch;
339	}
340	#endif
341
342	/* 4 byte continuation 11110xxx */
343	mb->state = b ^ 0b1111111100000000;
344	} else {
345	return CHAR_INVALID;
346	}
347	}
348
349	/* Deal with the remaining edge and invalid cases. */
350	for (; offset < size; (offset)++) {
351	/* Read continuation bytes. */
352	uint8_t b = s[*offset];
353
354	if (!_is_continuation(b) \|\| _is_non_shortest(mb, b) \|\| _is_surrogate(mb, b)) {
355	mb->state = 0;
356	return CHAR_INVALID;
357	}
358
359	/* Top bit becomes zero when shifting in the second to last byte. */
360	if (!(mb->state & 0x8000)) {
361	char32_t c = ((char32_t) mb->state) << 6 \| (b & UTF8_MASK_CONT);
362	mb->state = 0;
363	(*offset)++;
364	return c;
365	}
366
367	mb->state = mb->state << 6 \| (b & UTF8_MASK_CONT);
368	}
369
370	/* Incomplete character. */
371	assert(mb->state);
372	return 0;
373	}
374
375	/** Standard <uchar.h> function since C11. */
376	size_t mbrtoc32(char32_t c, const char s, size_t n, mbstate_t *mb)
377	{
378	#if __STDC_HOSTED__
379	static fibril_local mbstate_t global_state = { };
380
381	if (!mb)
382	mb = &global_state;
383	#endif
384
385	if (!s) {
386	/* Equivalent to mbrtoc32(NULL, "", 1, mb); */
387	c = NULL;
388	s = "";
389	n = 1;
390	}
391
392	size_t offset = 0;
393	char32_t ret = _str_decode(s, &offset, n, mb);
394	if (ret == CHAR_INVALID) {
395	assert(!mb->state);
396	_set_ilseq();
397	return UCHAR_ILSEQ;
398	}
399	if (mb->state) {
400	assert(ret == 0);
401	return UCHAR_INCOMPLETE;
402	}
403
404	if (c)
405	*c = ret;
406	return ret ? offset : 0;
407	}
408
409	/** Decode a single character from a string.
410	*
411	* Decode a single character from a string of size @a size. Decoding starts
412	* at @a offset and this offset is moved to the beginning of the next
413	* character. In case of decoding error, offset generally advances at least
414	* by one. However, offset is never moved beyond size.
415	*
416	* @param str String (not necessarily NULL-terminated).
417	* @param offset Byte offset in string where to start decoding.
418	* @param size Size of the string (in bytes).
419	*
420	* @return Value of decoded character, U_SPECIAL on decoding error or
421	* NULL if attempt to decode beyond @a size.
422	*
423	*/
424	char32_t str_decode(const char str, size_t offset, size_t size)
425	{
426	mbstate_t mb = { };
427	char32_t ch = _str_decode(str, offset, size, &mb);
428
429	if (ch == CHAR_INVALID \|\| mb.state)
430	return U_SPECIAL;
431
432	return ch;
433	}
434
435	char32_t str_decode_r(const char str, size_t offset, size_t size,
436	char32_t replacement, mbstate_t *mb)
437	{
438	char32_t ch = _str_decode(str, offset, size, mb);
439	return (ch == CHAR_INVALID) ? replacement : ch;
440	}
441
442	/** Decode a single character from a string to the left.
443	*
444	* Decode a single character from a string of size @a size. Decoding starts
445	* at @a offset and this offset is moved to the beginning of the previous
446	* character. In case of decoding error, offset generally decreases at least
447	* by one. However, offset is never moved before 0.
448	*
449	* @param str String (not necessarily NULL-terminated).
450	* @param offset Byte offset in string where to start decoding.
451	* @param size Size of the string (in bytes).
452	*
453	* @return Value of decoded character, U_SPECIAL on decoding error or
454	* NULL if attempt to decode beyond @a start of str.
455	*
456	*/
457	char32_t str_decode_reverse(const char str, size_t offset, size_t size)
458	{
459	if (*offset == 0)
460	return 0;
461
462	int cbytes = 0;
463	/* Continue while continuation bytes found */
464	while (*offset > 0 && cbytes < 4) {
465	uint8_t b = (uint8_t) str[--(*offset)];
466
467	if (_is_continuation(b)) {
468	cbytes++;
469	continue;
470	}
471
472	/* Reject non-shortest form encoding. */
473	if (cbytes != _continuation_bytes(b))
474	return U_SPECIAL;
475
476	/* Start byte */
477	size_t start_offset = *offset;
478	return str_decode(str, &start_offset, size);
479	}
480
481	/* Too many continuation bytes */
482	return U_SPECIAL;
483	}
484
485	/** Encode a single character to string representation.
486	*
487	* Encode a single character to string representation (i.e. UTF-8) and store
488	* it into a buffer at @a offset. Encoding starts at @a offset and this offset
489	* is moved to the position where the next character can be written to.
490	*
491	* @param ch Input character.
492	* @param str Output buffer.
493	* @param offset Byte offset where to start writing.
494	* @param size Size of the output buffer (in bytes).
495	*
496	* @return EOK if the character was encoded successfully, EOVERFLOW if there
497	* was not enough space in the output buffer or EINVAL if the character
498	* code was invalid.
499	*/
500	errno_t chr_encode(char32_t ch, char str, size_t offset, size_t size)
501	{
502	// TODO: merge with c32rtomb()
503
504	if (*offset >= size)
505	return EOVERFLOW;
506
507	/* Fast exit for the most common case. */
508	if (ch < 0x80) {
509	str[(*offset)++] = (char) ch;
510	return EOK;
511	}
512
513	/* Codes longer than 21 bits are not supported */
514	if (!chr_check(ch))
515	return EINVAL;
516
517	/* Determine how many continuation bytes are needed */
518
519	unsigned int cbytes = _char_continuation_bytes(ch);
520	unsigned int b0_bits = 6 - cbytes; /* Data bits in first byte */
521
522	/* Check for available space in buffer */
523	if (*offset + cbytes >= size)
524	return EOVERFLOW;
525
526	/* Encode continuation bytes */
527	unsigned int i;
528	for (i = cbytes; i > 0; i--) {
529	str[*offset + i] = 0x80 \| (ch & LO_MASK_32(CONT_BITS));
530	ch >>= CONT_BITS;
531	}
532
533	/* Encode first byte */
534	str[*offset] = (ch & LO_MASK_32(b0_bits)) \| HI_MASK_8(8 - b0_bits - 1);
535
536	/* Advance offset */
537	*offset += cbytes + 1;
538
539	return EOK;
540	}
541
/**
 * Convert in place any bytes that don't form a valid character into
 * replacement.
 *
 * Scans at most @a n bytes of @a str, stopping early at a NUL byte. Only the
 * initial byte of a rejected sequence is overwritten per step; the scan then
 * resumes at the following byte.
 *
 * @param str         Buffer to sanitize.
 * @param n           Maximum number of bytes to examine.
 * @param replacement Byte written over every offending byte.
 *
 * @return Number of bytes that were replaced.
 */
static size_t _str_sanitize(char *str, size_t n, uint8_t replacement)
{
	uint8_t *b = (uint8_t *) str;
	size_t count = 0;

	for (; n > 0 && b[0]; b++, n--) {
		int cont = _continuation_bytes(b[0]);
		if (__builtin_expect(cont, 0) == 0)
			continue;

		/* Bad initial byte, or sequence would not fit in the buffer. */
		bool valid = cont > 0 && n > (size_t) cont;

		/* Check continuation bytes. */
		for (int i = 1; valid && i <= cont; i++)
			valid = _is_continuation(b[i]);

		if (valid) {
			/*
			 * Check for non-shortest form encoding.
			 * See https://www.unicode.org/versions/corrigendum1.html
			 */
			switch (cont) {
			case 1:
				/* 0b110!!!!x 0b10xxxxxx */
				valid = (b[0] & 0b00011110) != 0;
				break;
			case 2:
				/* 0b1110!!!! 0b10!xxxxx 0b10xxxxxx */
				if (!(b[0] & 0b00001111) && !(b[1] & 0b00100000))
					valid = false;
				/* Surrogate character encoding. */
				if (b[0] == 0xED && b[1] >= 0xA0)
					valid = false;
				break;
			case 3:
				/* 0b11110!!! 0b10!!xxxx 0b10xxxxxx 0b10xxxxxx */
				if (!(b[0] & 0b00000111) && !(b[1] & 0b00110000))
					valid = false;
				/* Out-of-range code points. */
				if (b[0] > 0xF4 || (b[0] == 0xF4 && b[1] >= 0x90))
					valid = false;
				break;
			}
		}

		if (!valid) {
			b[0] = replacement;
			count++;
			continue;
		}

		/* Skip the continuation bytes of the accepted sequence. */
		b += cont;
		n -= cont;
	}

	return count;
}
620
/**
 * Public entry point for in-place sanitizing; forwards to _str_sanitize().
 *
 * @return Whatever _str_sanitize() returns for the same arguments.
 */
size_t str_sanitize(char *str, size_t n, uint8_t replacement)
{
	size_t replaced = _str_sanitize(str, n, replacement);

	return replaced;
}
625
/** Count the bytes of @a str up to, but not including, the terminator. */
static size_t _str_size(const char *str)
{
	const char *end = str;

	while (*end != 0)
		end++;

	return (size_t) (end - str);
}
635
/** Get size of string.
 *
 * Returns how many bytes the string @a str occupies, not counting the
 * terminating zero byte.
 *
 * @param str String to measure.
 *
 * @return Size of the string in bytes.
 *
 */
size_t str_size(const char *str)
{
	size_t bytes = _str_size(str);

	return bytes;
}
650
/** Get size of wide string.
 *
 * Returns how many bytes the wide string @a str occupies, not counting the
 * terminating zero character.
 *
 * @param str Wide string to measure.
 *
 * @return Size of the wide string in bytes.
 *
 */
size_t wstr_size(const char32_t *str)
{
	size_t chars = wstr_length(str);

	return chars * sizeof(char32_t);
}
665
666	/** Get size of string with length limit.
667	*
668	* Get the number of bytes which are used by up to @a max_len first
669	* characters in the string @a str. If @a max_len is greater than
670	* the length of @a str, the entire string is measured (excluding the
671	* NULL-terminator).
672	*
673	* @param str String to consider.
674	* @param max_len Maximum number of characters to measure.
675	*
676	* @return Number of bytes used by the characters.
677	*
678	*/
679	size_t str_lsize(const char *str, size_t max_len)
680	{
681	size_t len = 0;
682	size_t offset = 0;
683
684	while (len < max_len) {
685	if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
686	break;
687
688	len++;
689	}
690
691	return offset;
692	}
693
/**
 * Count the bytes of @a str before the terminator, looking at no more than
 * @a max_size bytes.
 *
 * The limit is tested before each byte is read, so a buffer of exactly
 * @a max_size bytes without a terminator is never read out of bounds.
 *
 * @return Number of bytes before the terminator, capped at @a max_size.
 */
static size_t _str_nsize(const char *str, size_t max_size)
{
	size_t size = 0;

	while (size < max_size && str[size] != 0)
		size++;

	return size;
}
703
/** Get size of string with size limit.
 *
 * Returns how many bytes the string @a str occupies (not counting the
 * terminator), capped at @a max_size bytes.
 *
 * @param str      String to measure.
 * @param max_size Upper bound on the number of bytes measured.
 *
 * @return Size of the string in bytes, at most @a max_size.
 *
 */
size_t str_nsize(const char *str, size_t max_size)
{
	size_t bytes = _str_nsize(str, max_size);

	return bytes;
}
719
/** Get size of wide string with size limit.
 *
 * Returns how many bytes the wide string @a str occupies (not counting the
 * terminator), looking at no more than @a max_size bytes.
 *
 * @param str      Wide string to measure.
 * @param max_size Upper bound on the number of bytes measured.
 *
 * @return Size of the wide string in bytes.
 *
 */
size_t wstr_nsize(const char32_t *str, size_t max_size)
{
	size_t chars = wstr_nlength(str, max_size);

	return chars * sizeof(char32_t);
}
735
/** Get size of wide string with length limit.
 *
 * Get the number of bytes which are used by up to @a max_len first
 * wide characters in the wide string @a str. If @a max_len is greater than
 * the length of @a str, the entire wide string is measured (excluding the
 * NULL-terminator).
 *
 * @param str     Wide string to consider.
 * @param max_len Maximum number of wide characters to measure.
 *
 * @return Number of bytes used by the wide characters.
 *
 */
size_t wstr_lsize(const char32_t *str, size_t max_len)
{
	/*
	 * Saturate instead of letting the multiplication wrap around, which
	 * would turn a very large character limit into a tiny byte limit.
	 */
	size_t max_size = (max_len > SIZE_MAX / sizeof(char32_t)) ?
	    SIZE_MAX : max_len * sizeof(char32_t);

	return wstr_nlength(str, max_size) * sizeof(char32_t);
}
753
754	/** Get number of characters in a string.
755	*
756	* @param str NULL-terminated string.
757	*
758	* @return Number of characters in string.
759	*
760	*/
761	size_t str_length(const char *str)
762	{
763	size_t len = 0;
764	size_t offset = 0;
765
766	while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
767	len++;
768
769	return len;
770	}
771
/** Get number of characters in a wide string.
 *
 * @param wstr NULL-terminated wide string.
 *
 * @return Count of characters before the terminator.
 *
 */
size_t wstr_length(const char32_t *wstr)
{
	const char32_t *cur = wstr;

	while (*cur != 0)
		cur++;

	return (size_t) (cur - wstr);
}
788
/** Get number of characters in a string with size limit.
 *
 * @param str  NULL-terminated string.
 * @param size Upper bound on the number of bytes examined.
 *
 * @return Count of characters decoded before the terminator or the limit.
 *
 */
size_t str_nlength(const char *str, size_t size)
{
	size_t pos = 0;
	size_t count = 0;

	for (;;) {
		if (str_decode(str, &pos, size) == 0)
			break;

		count++;
	}

	return count;
}
807
/** Get number of characters in a wide string with size limit.
 *
 * @param str  NULL-terminated wide string.
 * @param size Upper bound on the number of bytes examined; a trailing
 *             partial character is ignored.
 *
 * @return Count of characters before the terminator or the limit.
 *
 */
size_t wstr_nlength(const char32_t *str, size_t size)
{
	/* Number of whole characters that fit into the byte limit. */
	size_t max_chars = size / sizeof(char32_t);
	size_t count = 0;

	while (count < max_chars && str[count] != 0)
		count++;

	return count;
}
829
/** Get character display width on a character cell display.
 *
 * The current implementation reports one cell for every character.
 *
 * @param ch Character (currently not inspected).
 * @return Width of character in cells.
 */
size_t chr_width(char32_t ch)
{
	(void) ch;

	return 1;
}
839
840	/** Get string display width on a character cell display.
841	*
842	* @param str String
843	* @return Width of string in cells.
844	*/
845	size_t str_width(const char *str)
846	{
847	size_t width = 0;
848	size_t offset = 0;
849	char32_t ch;
850
851	while ((ch = str_decode(str, &offset, STR_NO_LIMIT)) != 0)
852	width += chr_width(ch);
853
854	return width;
855	}
856
/** Check whether character is plain ASCII.
 *
 * @return True if @a ch lies in the range 0 .. 127.
 *
 */
bool ascii_check(char32_t ch)
{
	return ch < 0x80;
}
869
/** Check whether character is valid
 *
 * @return True if @a ch does not exceed the highest Unicode code point
 *         (U+10FFFF, i.e. 1114111).
 *
 */
bool chr_check(char32_t ch)
{
	return ch <= 0x10FFFF;
}
882
/** Compare two NULL terminated strings.
 *
 * Do a char-by-char comparison of two NULL-terminated strings.
 * The strings are considered equal iff their length is equal
 * and both strings consist of the same sequence of characters.
 *
 * A string S1 is less than another string S2 if it has a character with
 * lower value at the first character position where the strings differ.
 * If the strings differ in length, the shorter one is treated as if
 * padded by characters with a value of zero.
 *
 * @param s1 First string to compare.
 * @param s2 Second string to compare.
 *
 * @return 0 if the strings are equal, -1 if the first is less than the second,
 *         1 if the second is less than the first.
 *
 */
int str_cmp(const char *s1, const char *s2)
{
	/*
	 * UTF-8 has the nice property that lexicographic ordering on bytes is
	 * the same as the lexicographic ordering of the character sequences.
	 * That only holds for unsigned bytes; plain char may be signed, which
	 * would sort every non-ASCII byte before the ASCII ones.
	 */
	const unsigned char *a = (const unsigned char *) s1;
	const unsigned char *b = (const unsigned char *) s2;

	while (*a == *b && *a != 0) {
		a++;
		b++;
	}

	if (*a == *b)
		return 0;

	return (*a < *b) ? -1 : 1;
}
917
918	/** Compare two NULL terminated strings with length limit.
919	*
920	* Do a char-by-char comparison of two NULL-terminated strings.
921	* The strings are considered equal iff
922	* min(str_length(s1), max_len) == min(str_length(s2), max_len)
923	* and both strings consist of the same sequence of characters,
924	* up to max_len characters.
925	*
926	* A string S1 is less than another string S2 if it has a character with
927	* lower value at the first character position where the strings differ.
928	* If the strings differ in length, the shorter one is treated as if
929	* padded by characters with a value of zero. Only the first max_len
930	* characters are considered.
931	*
932	* @param s1 First string to compare.
933	* @param s2 Second string to compare.
934	* @param max_len Maximum number of characters to consider.
935	*
936	* @return 0 if the strings are equal, -1 if the first is less than the second,
937	* 1 if the second is less than the first.
938	*
939	*/
940	int str_lcmp(const char s1, const char s2, size_t max_len)
941	{
942	char32_t c1 = 0;
943	char32_t c2 = 0;
944
945	size_t off1 = 0;
946	size_t off2 = 0;
947
948	size_t len = 0;
949
950	while (true) {
951	if (len >= max_len)
952	break;
953
954	c1 = str_decode(s1, &off1, STR_NO_LIMIT);
955	c2 = str_decode(s2, &off2, STR_NO_LIMIT);
956
957	if (c1 < c2)
958	return -1;
959
960	if (c1 > c2)
961	return 1;
962
963	if (c1 == 0 \|\| c2 == 0)
964	break;
965
966	++len;
967	}
968
969	return 0;
970
971	}
972
973	/** Compare two NULL terminated strings in case-insensitive manner.
974	*
975	* Do a char-by-char comparison of two NULL-terminated strings.
976	* The strings are considered equal iff their length is equal
977	* and both strings consist of the same sequence of characters
978	* when converted to lower case.
979	*
980	* A string S1 is less than another string S2 if it has a character with
981	* lower value at the first character position where the strings differ.
982	* If the strings differ in length, the shorter one is treated as if
983	* padded by characters with a value of zero.
984	*
985	* @param s1 First string to compare.
986	* @param s2 Second string to compare.
987	*
988	* @return 0 if the strings are equal, -1 if the first is less than the second,
989	* 1 if the second is less than the first.
990	*
991	*/
992	int str_casecmp(const char s1, const char s2)
993	{
994	// FIXME: doesn't work for non-ASCII caseful characters
995
996	char32_t c1 = 0;
997	char32_t c2 = 0;
998
999	size_t off1 = 0;
1000	size_t off2 = 0;
1001
1002	while (true) {
1003	c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
1004	c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
1005
1006	if (c1 < c2)
1007	return -1;
1008
1009	if (c1 > c2)
1010	return 1;
1011
1012	if (c1 == 0 \|\| c2 == 0)
1013	break;
1014	}
1015
1016	return 0;
1017	}
1018
1019	/** Compare two NULL terminated strings with length limit in case-insensitive
1020	* manner.
1021	*
1022	* Do a char-by-char comparison of two NULL-terminated strings.
1023	* The strings are considered equal iff
1024	* min(str_length(s1), max_len) == min(str_length(s2), max_len)
1025	* and both strings consist of the same sequence of characters,
1026	* up to max_len characters.
1027	*
1028	* A string S1 is less than another string S2 if it has a character with
1029	* lower value at the first character position where the strings differ.
1030	* If the strings differ in length, the shorter one is treated as if
1031	* padded by characters with a value of zero. Only the first max_len
1032	* characters are considered.
1033	*
1034	* @param s1 First string to compare.
1035	* @param s2 Second string to compare.
1036	* @param max_len Maximum number of characters to consider.
1037	*
1038	* @return 0 if the strings are equal, -1 if the first is less than the second,
1039	* 1 if the second is less than the first.
1040	*
1041	*/
1042	int str_lcasecmp(const char s1, const char s2, size_t max_len)
1043	{
1044	// FIXME: doesn't work for non-ASCII caseful characters
1045
1046	char32_t c1 = 0;
1047	char32_t c2 = 0;
1048
1049	size_t off1 = 0;
1050	size_t off2 = 0;
1051
1052	size_t len = 0;
1053
1054	while (true) {
1055	if (len >= max_len)
1056	break;
1057
1058	c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
1059	c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
1060
1061	if (c1 < c2)
1062	return -1;
1063
1064	if (c1 > c2)
1065	return 1;
1066
1067	if (c1 == 0 \|\| c2 == 0)
1068	break;
1069
1070	++len;
1071	}
1072
1073	return 0;
1074
1075	}
1076
/**
 * Byte-wise prefix test.
 *
 * Walks both strings while their bytes agree; @a p is a prefix of @a s
 * exactly when the walk stops at the terminator of @a p.
 */
static bool _test_prefix(const char *s, const char *p)
{
	while (*s == *p && *s != 0) {
		s++;
		p++;
	}

	return *p == 0;
}
1086
/** Test whether p is a prefix of s.
 *
 * Do a char-by-char comparison of two NULL-terminated strings
 * and determine if p is a prefix of s.
 *
 * @param s The string in which to look
 * @param p The string to check if it is a prefix of s
 *
 * @return true iff p is prefix of s else false
 *
 */
bool str_test_prefix(const char *s, const char *p)
{
	return _test_prefix(s, p);
}
1102
1103	/** Get a string suffix.
1104	*
1105	* Return a string suffix defined by the prefix length.
1106	*
1107	* @param s The string to get the suffix from.
1108	* @param prefix_length Number of prefix characters to ignore.
1109	*
1110	* @return String suffix.
1111	*
1112	*/
1113	const char str_suffix(const char s, size_t prefix_length)
1114	{
1115	size_t off = 0;
1116	size_t i = 0;
1117
1118	while (true) {
1119	str_decode(s, &off, STR_NO_LIMIT);
1120	i++;
1121
1122	if (i >= prefix_length)
1123	break;
1124	}
1125
1126	return s + off;
1127	}
1128
/**
 * Copy string as a sequence of bytes, including the terminator.
 *
 * No bounds are checked; @a dest must be large enough for @a src.
 */
static void _str_cpy(char *dest, const char *src)
{
	while (*src)
		*(dest++) = *(src++);

	*dest = 0;
}
1137
1138	/** Copy string as a sequence of bytes. */
1139	static void _str_cpyn(char dest, size_t size, const char src)
1140	{
1141	assert(dest && src && size);
1142
1143	if (!dest \|\| !src \|\| !size)
1144	return;
1145
1146	if (size == STR_NO_LIMIT)
1147	return _str_cpy(dest, src);
1148
1149	char *dest_top = dest + size - 1;
1150	assert(size == 1 \|\| dest < dest_top);
1151
1152	while (*src && dest < dest_top)
1153	(dest++) = (src++);
1154
1155	*dest = 0;
1156	}
1157
1158	/** Copy string.
1159	*
1160	* Copy source string @a src to destination buffer @a dest.
1161	* No more than @a size bytes are written. If the size of the output buffer
1162	* is at least one byte, the output string will always be well-formed, i.e.
1163	* null-terminated and containing only complete characters.
1164	*
1165	* @param dest Destination buffer.
1166	* @param count Size of the destination buffer (must be > 0).
1167	* @param src Source string.
1168	*
1169	*/
1170	void str_cpy(char dest, size_t size, const char src)
1171	{
1172	/* There must be space for a null terminator in the buffer. */
1173	assert(size > 0);
1174	assert(src != NULL);
1175	assert(dest != NULL);
1176	assert(size == STR_NO_LIMIT \|\| dest + size > dest);
1177
1178	/* Copy data. */
1179	_str_cpyn(dest, size, src);
1180
1181	/* In-place translate invalid bytes to U_SPECIAL. */
1182	_str_sanitize(dest, size, U_SPECIAL);
1183	}
1184
1185	/** Copy size-limited substring.
1186	*
1187	* Copy prefix of string @a src of max. size @a size to destination buffer
1188	* @a dest. No more than @a size bytes are written. The output string will
1189	* always be well-formed, i.e. null-terminated and containing only complete
1190	* characters.
1191	*
1192	* No more than @a n bytes are read from the input string, so it does not
1193	* have to be null-terminated.
1194	*
1195	* @param dest Destination buffer.
1196	* @param count Size of the destination buffer (must be > 0).
1197	* @param src Source string.
1198	* @param n Maximum number of bytes to read from @a src.
1199	*
1200	*/
1201	void str_ncpy(char dest, size_t size, const char src, size_t n)
1202	{
1203	/* There must be space for a null terminator in the buffer. */
1204	assert(size > 0);
1205	assert(src != NULL);
1206
1207	/* Copy data. */
1208	_str_cpyn(dest, min(size, n + 1), src);
1209
1210	/* In-place translate invalid bytes to U_SPECIAL. */
1211	_str_sanitize(dest, size, U_SPECIAL);
1212	}
1213
1214	/** Append one string to another.
1215	*
1216	* Append source string @a src to string in destination buffer @a dest.
1217	* Size of the destination buffer is @a dest. If the size of the output buffer
1218	* is at least one byte, the output string will always be well-formed, i.e.
1219	* null-terminated and containing only complete characters.
1220	*
1221	* @param dest Destination buffer.
1222	* @param count Size of the destination buffer.
1223	* @param src Source string.
1224	*/
1225	void str_append(char dest, size_t size, const char src)
1226	{
1227	assert(src != NULL);
1228	assert(dest != NULL);
1229	assert(size > 0);
1230	assert(size == STR_NO_LIMIT \|\| dest + size > dest);
1231
1232	size_t dstr_size = _str_nsize(dest, size);
1233	if (dstr_size < size) {
1234	_str_cpyn(dest + dstr_size, size - dstr_size, src);
1235	_str_sanitize(dest + dstr_size, size - dstr_size, U_SPECIAL);
1236	}
1237	}
1238
1239	/** Convert space-padded ASCII to string.
1240	*
1241	* Common legacy text encoding in hardware is 7-bit ASCII fitted into
1242	* a fixed-width byte buffer (bit 7 always zero), right-padded with spaces
1243	* (ASCII 0x20). Convert space-padded ascii to string representation.
1244	*
1245	* If the text does not fit into the destination buffer, the function converts
1246	* as many characters as possible and returns EOVERFLOW.
1247	*
1248	* If the text contains non-ASCII bytes (with bit 7 set), the whole string is
1249	* converted anyway and invalid characters are replaced with question marks
1250	* (U_SPECIAL) and the function returns EIO.
1251	*
1252	* Regardless of return value upon return @a dest will always be well-formed.
1253	*
1254	* @param dest Destination buffer
1255	* @param size Size of destination buffer
1256	* @param src Space-padded ASCII.
1257	* @param n Size of the source buffer in bytes.
1258	*
1259	* @return EOK on success, EOVERFLOW if the text does not fit
1260	* destination buffer, EIO if the text contains
1261	* non-ASCII bytes.
1262	*/
1263	errno_t spascii_to_str(char dest, size_t size, const uint8_t src, size_t n)
1264	{
1265	size_t len = 0;
1266
1267	/* Determine the length of the source string. */
1268	for (size_t i = 0; i < n; i++) {
1269	if (src[i] == 0)
1270	break;
1271
1272	if (src[i] != ' ')
1273	len = i + 1;
1274	}
1275
1276	errno_t result = EOK;
1277	size_t out_len = min(len, size - 1);
1278
1279	/* Copy characters */
1280	for (size_t i = 0; i < out_len; i++) {
1281	dest[i] = src[i];
1282
1283	if (dest[i] < 0) {
1284	dest[i] = U_SPECIAL;
1285	result = EIO;
1286	}
1287	}
1288
1289	dest[out_len] = 0;
1290
1291	if (out_len < len)
1292	return EOVERFLOW;
1293
1294	return result;
1295	}
1296
1297	/** Convert wide string to string.
1298	*
1299	* Convert wide string @a src to string. The output is written to the buffer
1300	* specified by @a dest and @a size. @a size must be non-zero and the string
1301	* written will always be well-formed.
1302	*
1303	* @param dest Destination buffer.
1304	* @param size Size of the destination buffer.
1305	* @param src Source wide string.
1306	*/
1307	void wstr_to_str(char dest, size_t size, const char32_t src)
1308	{
1309	char32_t ch;
1310	size_t src_idx;
1311	size_t dest_off;
1312
1313	/* There must be space for a null terminator in the buffer. */
1314	assert(size > 0);
1315
1316	src_idx = 0;
1317	dest_off = 0;
1318
1319	while ((ch = src[src_idx++]) != 0) {
1320	if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
1321	break;
1322	}
1323
1324	dest[dest_off] = '\0';
1325	}
1326
1327	/** Convert UTF16 string to string.
1328	*
1329	* Convert utf16 string @a src to string. The output is written to the buffer
1330	* specified by @a dest and @a size. @a size must be non-zero and the string
1331	* written will always be well-formed. Surrogate pairs also supported.
1332	*
1333	* @param dest Destination buffer.
1334	* @param size Size of the destination buffer.
1335	* @param src Source utf16 string.
1336	*
1337	* @return EOK, if success, an error code otherwise.
1338	*/
1339	errno_t utf16_to_str(char dest, size_t size, const uint16_t src)
1340	{
1341	size_t idx = 0, dest_off = 0;
1342	char32_t ch;
1343	errno_t rc = EOK;
1344
1345	/* There must be space for a null terminator in the buffer. */
1346	assert(size > 0);
1347
1348	while (src[idx]) {
1349	if ((src[idx] & 0xfc00) == 0xd800) {
1350	if (src[idx + 1] && (src[idx + 1] & 0xfc00) == 0xdc00) {
1351	ch = 0x10000;
1352	ch += (src[idx] & 0x03FF) << 10;
1353	ch += (src[idx + 1] & 0x03FF);
1354	idx += 2;
1355	} else
1356	break;
1357	} else {
1358	ch = src[idx];
1359	idx++;
1360	}
1361	rc = chr_encode(ch, dest, &dest_off, size - 1);
1362	if (rc != EOK)
1363	break;
1364	}
1365	dest[dest_off] = '\0';
1366	return rc;
1367	}
1368
1369	/** Convert string to UTF16 string.
1370	*
1371	* Convert string @a src to utf16 string. The output is written to the buffer
1372	* specified by @a dest and @a dlen. @a dlen must be non-zero and the string
1373	* written will always be well-formed. Surrogate pairs also supported.
1374	*
1375	* @param dest Destination buffer.
1376	* @param dlen Number of utf16 characters that fit in the destination buffer.
1377	* @param src Source string.
1378	*
1379	* @return EOK, if success, an error code otherwise.
1380	*/
1381	errno_t str_to_utf16(uint16_t dest, size_t dlen, const char src)
1382	{
1383	errno_t rc = EOK;
1384	size_t offset = 0;
1385	size_t idx = 0;
1386	char32_t c;
1387
1388	assert(dlen > 0);
1389
1390	while ((c = str_decode(src, &offset, STR_NO_LIMIT)) != 0) {
1391	if (c > 0x10000) {
1392	if (idx + 2 >= dlen - 1) {
1393	rc = EOVERFLOW;
1394	break;
1395	}
1396	c = (c - 0x10000);
1397	dest[idx] = 0xD800 \| (c >> 10);
1398	dest[idx + 1] = 0xDC00 \| (c & 0x3FF);
1399	idx++;
1400	} else {
1401	dest[idx] = c;
1402	}
1403
1404	idx++;
1405	if (idx >= dlen - 1) {
1406	rc = EOVERFLOW;
1407	break;
1408	}
1409	}
1410
1411	dest[idx] = '\0';
1412	return rc;
1413	}
1414
/** Get size of UTF-16 string.
 *
 * Count the 16-bit words that precede the zero terminator of @a ustr.
 * A surrogate pair therefore counts as two words.
 *
 * @param ustr UTF-16 string to consider.
 *
 * @return Number of words used by the UTF-16 string, terminator excluded.
 *
 */
size_t utf16_wsize(const uint16_t *ustr)
{
	const uint16_t *end = ustr;

	/* Walk to the terminator and measure the distance covered. */
	while (*end != 0)
		end++;

	return (size_t) (end - ustr);
}
1434
1435	/** Convert wide string to new string.
1436	*
1437	* Convert wide string @a src to string. Space for the new string is allocated
1438	* on the heap.
1439	*
1440	* @param src Source wide string.
1441	* @return New string.
1442	*/
1443	char wstr_to_astr(const char32_t src)
1444	{
1445	char dbuf[STR_BOUNDS(1)];
1446	char *str;
1447	char32_t ch;
1448
1449	size_t src_idx;
1450	size_t dest_off;
1451	size_t dest_size;
1452
1453	/* Compute size of encoded string. */
1454
1455	src_idx = 0;
1456	dest_size = 0;
1457
1458	while ((ch = src[src_idx++]) != 0) {
1459	dest_off = 0;
1460	if (chr_encode(ch, dbuf, &dest_off, STR_BOUNDS(1)) != EOK)
1461	break;
1462	dest_size += dest_off;
1463	}
1464
1465	str = malloc(dest_size + 1);
1466	if (str == NULL)
1467	return NULL;
1468
1469	/* Encode string. */
1470
1471	src_idx = 0;
1472	dest_off = 0;
1473
1474	while ((ch = src[src_idx++]) != 0) {
1475	if (chr_encode(ch, str, &dest_off, dest_size) != EOK)
1476	break;
1477	}
1478
1479	str[dest_size] = '\0';
1480	return str;
1481	}
1482
1483	/** Convert string to wide string.
1484	*
1485	* Convert string @a src to wide string. The output is written to the
1486	* buffer specified by @a dest and @a dlen. @a dlen must be non-zero
1487	* and the wide string written will always be null-terminated.
1488	*
1489	* @param dest Destination buffer.
1490	* @param dlen Length of destination buffer (number of wchars).
1491	* @param src Source string.
1492	*/
1493	void str_to_wstr(char32_t dest, size_t dlen, const char src)
1494	{
1495	size_t offset;
1496	size_t di;
1497	char32_t c;
1498
1499	assert(dlen > 0);
1500
1501	offset = 0;
1502	di = 0;
1503
1504	do {
1505	if (di >= dlen - 1)
1506	break;
1507
1508	c = str_decode(src, &offset, STR_NO_LIMIT);
1509	dest[di++] = c;
1510	} while (c != '\0');
1511
1512	dest[dlen - 1] = '\0';
1513	}
1514
/** Convert string to newly allocated wide string.
 *
 * Convert string @a str to wide string. A new null-terminated wide
 * string is allocated on the heap with calloc().
 *
 * @param str Source string.
 *
 * @return New wide string, or NULL if the allocation fails.
 */
char32_t *str_to_awstr(const char *str)
{
	size_t len = str_length(str);

	/* One extra element for the terminator. */
	char32_t *wstr = calloc(len + 1, sizeof(char32_t));
	if (wstr == NULL)
		return NULL;

	str_to_wstr(wstr, len + 1, str);
	return wstr;
}
1533
/** Find the first byte equal to @a c in a string.
 *
 * The terminator takes part in the comparison, so searching for 0
 * returns a pointer to the end of the string.
 *
 * @return Pointer to the matching byte or NULL if not found.
 */
static char *_strchr(const char *str, char c)
{
	while (*str != 0 && *str != c)
		str++;

	return (*str == c) ? (char *) str : NULL;
}
1541
1542	/** Find first occurence of character in string.
1543	*
1544	* @param str String to search.
1545	* @param ch Character to look for.
1546	*
1547	* @return Pointer to character in @a str or NULL if not found.
1548	*/
1549	char str_chr(const char str, char32_t ch)
1550	{
1551	/* Fast path for an ASCII character. */
1552	if (ascii_check(ch))
1553	return _strchr(str, ch);
1554
1555	/* Convert character to UTF-8. */
1556	char utf8[STR_BOUNDS(1) + 1];
1557	size_t offset = 0;
1558
1559	if (chr_encode(ch, utf8, &offset, sizeof(utf8)) != EOK \|\| offset == 0)
1560	return NULL;
1561
1562	utf8[offset] = '\0';
1563
1564	/* Find the first byte, then check if all of them are correct. */
1565	while (*str != 0) {
1566	str = _strchr(str, utf8[0]);
1567	if (!str)
1568	return NULL;
1569
1570	if (_test_prefix(str, utf8))
1571	return (char *) str;
1572
1573	str++;
1574	}
1575
1576	return NULL;
1577	}
1578
/** Find first occurrence of substring in string.
 *
 * @param hs Haystack (string)
 * @param n  Needle (substring to look for)
 *
 * @return Pointer to the start of the match in @a hs or @c NULL if not
 *         found. An empty needle matches at the start of @a hs.
 */
char *str_str(const char *hs, const char *n)
{
	size_t hsize = _str_size(hs);
	size_t nsize = _str_size(n);

	/* Stop as soon as the rest of the haystack is shorter than the needle. */
	while (hsize >= nsize) {
		if (_test_prefix(hs, n))
			return (char *) hs;

		hs++;
		hsize--;
	}

	return NULL;
}
1601
/** Remove all trailing bytes equal to @a c from a string, in place.
 *
 * An empty string is left untouched and a string consisting only of
 * @a c becomes empty.
 */
static void _str_rtrim(char *str, char c)
{
	/* One past the last byte that must be kept. */
	char *end = str;

	for (char *p = str; *p != '\0'; p++) {
		if (*p != c)
			end = p + 1;
	}

	/* Truncate string. */
	*end = '\0';
}
1616
1617	/** Removes specified trailing characters from a string.
1618	*
1619	* @param str String to remove from.
1620	* @param ch Character to remove.
1621	*/
1622	void str_rtrim(char *str, char32_t ch)
1623	{
1624	/* Fast path for the ASCII case. */
1625	if (ascii_check(ch)) {
1626	_str_rtrim(str, ch);
1627	return;
1628	}
1629
1630	size_t off = 0;
1631	size_t pos = 0;
1632	char32_t c;
1633	bool update_last_chunk = true;
1634	char *last_chunk = NULL;
1635
1636	while ((c = str_decode(str, &off, STR_NO_LIMIT))) {
1637	if (c != ch) {
1638	update_last_chunk = true;
1639	last_chunk = NULL;
1640	} else if (update_last_chunk) {
1641	update_last_chunk = false;
1642	last_chunk = (str + pos);
1643	}
1644	pos = off;
1645	}
1646
1647	if (last_chunk)
1648	*last_chunk = '\0';
1649	}
1650
/** Remove all leading bytes equal to @a c from a string, in place. */
static void _str_ltrim(char *str, char c)
{
	const char *p = str;

	/* Never step over the terminator, even when c is 0. */
	while (*p != '\0' && *p == c)
		p++;

	if (p == str)
		return;

	/* Shift the remainder to the front; forward copy is overlap-safe. */
	char *dst = str;
	while (*p != '\0')
		*dst++ = *p++;

	*dst = '\0';
}
1661
1662	/** Removes specified leading characters from a string.
1663	*
1664	* @param str String to remove from.
1665	* @param ch Character to remove.
1666	*/
1667	void str_ltrim(char *str, char32_t ch)
1668	{
1669	/* Fast path for the ASCII case. */
1670	if (ascii_check(ch)) {
1671	_str_ltrim(str, ch);
1672	return;
1673	}
1674
1675	char32_t acc;
1676	size_t off = 0;
1677	size_t pos = 0;
1678	size_t str_sz = str_size(str);
1679
1680	while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
1681	if (acc != ch)
1682	break;
1683	else
1684	pos = off;
1685	}
1686
1687	if (pos > 0) {
1688	memmove(str, &str[pos], str_sz - pos);
1689	pos = str_sz - pos;
1690	str[pos] = '\0';
1691	}
1692	}
1693
/** Find the last byte equal to @a c in a string.
 *
 * The terminator is not examined, so searching for 0 yields NULL.
 *
 * @return Pointer to the matching byte or NULL if not found.
 */
static char *_str_rchr(const char *str, char c)
{
	const char *last = NULL;

	while (*str) {
		if (*str == c)
			last = str;

		str++;
	}

	return (char *) last;
}
1707
1708	/** Find last occurence of character in string.
1709	*
1710	* @param str String to search.
1711	* @param ch Character to look for.
1712	*
1713	* @return Pointer to character in @a str or NULL if not found.
1714	*/
1715	char str_rchr(const char str, char32_t ch)
1716	{
1717	if (ascii_check(ch))
1718	return _str_rchr(str, ch);
1719
1720	char32_t acc;
1721	size_t off = 0;
1722	size_t last = 0;
1723	const char *res = NULL;
1724
1725	while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
1726	if (acc == ch)
1727	res = (str + last);
1728	last = off;
1729	}
1730
1731	return (char *) res;
1732	}
1733
/** Insert a wide character into a wide string.
 *
 * Put @a ch at character index @a pos of @a str, moving the characters
 * from that index on (terminator included) one position towards the end.
 *
 * @param str     String to insert to.
 * @param ch      Character to insert.
 * @param pos     Character index where to insert.
 * @param max_pos Characters in the buffer.
 *
 * @return True if the insertion was successful, false if the position
 *         is out of bounds.
 *
 */
bool wstr_linsert(char32_t *str, char32_t ch, size_t pos, size_t max_pos)
{
	size_t len = wstr_length(str);

	if (pos > len)
		return false;

	if (pos + 1 > max_pos)
		return false;

	/*
	 * Open a gap at pos by copying backwards from the terminator.
	 * NOTE(review): only pos is checked against max_pos, not len + 1;
	 * confirm that callers size the buffer accordingly.
	 */
	for (size_t j = len + 1; j > pos; j--)
		str[j] = str[j - 1];

	str[pos] = ch;

	return true;
}
1763
/** Remove a wide character from a wide string.
 *
 * Delete the character at index @a pos of @a str by moving everything
 * behind it (terminator included) one position towards the start.
 *
 * @param str String to remove from.
 * @param pos Character index to remove.
 *
 * @return True if the removal was successful, false if the position
 *         is out of bounds.
 *
 */
bool wstr_remove(char32_t *str, size_t pos)
{
	size_t len = wstr_length(str);

	if (pos >= len)
		return false;

	/* The final iteration copies the terminator. */
	for (size_t j = pos; j < len; j++)
		str[j] = str[j + 1];

	return true;
}
1789
1790	/** Duplicate string.
1791	*
1792	* Allocate a new string and copy characters from the source
1793	* string into it. The duplicate string is allocated via sleeping
1794	* malloc(), thus this function can sleep in no memory conditions.
1795	*
1796	* The allocation cannot fail and the return value is always
1797	* a valid pointer. The duplicate string is always a well-formed
1798	* null-terminated UTF-8 string, but it can differ from the source
1799	* string on the byte level.
1800	*
1801	* @param src Source string.
1802	*
1803	* @return Duplicate string.
1804	*
1805	*/
1806	char str_dup(const char src)
1807	{
1808	size_t size = _str_size(src) + 1;
1809	char *dest = malloc(size);
1810	if (!dest)
1811	return NULL;
1812
1813	memcpy(dest, src, size);
1814	_str_sanitize(dest, size, U_SPECIAL);
1815	return dest;
1816	}
1817
1818	/** Duplicate string with size limit.
1819	*
1820	* Allocate a new string and copy up to @max_size bytes from the source
1821	* string into it. The duplicate string is allocated via sleeping
1822	* malloc(), thus this function can sleep in no memory conditions.
1823	* No more than @max_size + 1 bytes is allocated, but if the size
1824	* occupied by the source string is smaller than @max_size + 1,
1825	* less is allocated.
1826	*
1827	* The allocation cannot fail and the return value is always
1828	* a valid pointer. The duplicate string is always a well-formed
1829	* null-terminated UTF-8 string, but it can differ from the source
1830	* string on the byte level.
1831	*
1832	* @param src Source string.
1833	* @param n Maximum number of bytes to duplicate.
1834	*
1835	* @return Duplicate string.
1836	*
1837	*/
1838	char str_ndup(const char src, size_t n)
1839	{
1840	size_t size = _str_nsize(src, n);
1841
1842	char *dest = malloc(size + 1);
1843	if (!dest)
1844	return NULL;
1845
1846	memcpy(dest, src, size);
1847	_str_sanitize(dest, size, U_SPECIAL);
1848	dest[size] = 0;
1849	return dest;
1850	}
1851
/** Split string by delimiters.
 *
 * The input string is modified: the delimiter that ends the token is
 * overwritten with a null terminator.
 *
 * @param s     String to be tokenized. If NULL, NULL is returned.
 * @param delim String with the delimiters.
 * @param next  Variable which will receive the pointer to the
 *              continuation of the string following the first
 *              occurrence of any of the delimiter characters.
 *              May be NULL.
 *
 * @return Pointer to the first token in @a s (leading delimiters are
 *         skipped). NULL if there is no token.
 */
char *str_tok(char *s, const char *delim, char **next)
{
	char *start, *end;

	if (!s)
		return NULL;

	size_t len = str_size(s);
	size_t cur;
	size_t tmp;
	char32_t ch;

	/* Skip over leading delimiters. */
	tmp = 0;
	cur = 0;
	while ((ch = str_decode(s, &tmp, len)) && str_chr(delim, ch))
		cur = tmp;
	start = &s[cur];

	/* Skip over token characters. */
	tmp = cur;
	while ((ch = str_decode(s, &tmp, len)) && !str_chr(delim, ch))
		cur = tmp;
	end = &s[cur];

	/*
	 * If a delimiter ended the token, continue just behind it;
	 * otherwise the string is exhausted and next points at its end.
	 */
	if (next)
		*next = (ch ? &s[tmp] : &s[cur]);

	if (start == end)
		return NULL;	/* No more tokens. */

	/* Overwrite delimiter with NULL terminator. */
	*end = '\0';
	return start;
}
1898
/** Scale a value down and pick the matching decimal (SI) order suffix.
 *
 * The value is divided by the largest listed power of 1000 that still
 * leaves a result above 1000 (the topmost step, 10^18, is used for
 * values above 10^19).
 *
 * @param val    Value to scale.
 * @param rv     Receives the scaled value.
 * @param suffix Receives the prefix letter matching the divisor
 *               ('k', 'M', 'G', 'T', 'P', 'E') or a space if the value
 *               was not scaled.
 */
void order_suffix(const uint64_t val, uint64_t *rv, char *suffix)
{
	/* Ordered from the largest scale down; the first match wins. */
	static const struct {
		uint64_t above;
		uint64_t divisor;
		char suffix;
	} scales[] = {
		{ UINT64_C(10000000000000000000), UINT64_C(1000000000000000000), 'E' },
		{ UINT64_C(1000000000000000000), UINT64_C(1000000000000000), 'P' },
		{ UINT64_C(1000000000000000), UINT64_C(1000000000000), 'T' },
		{ UINT64_C(1000000000000), UINT64_C(1000000000), 'G' },
		{ UINT64_C(1000000000), UINT64_C(1000000), 'M' },
		{ UINT64_C(1000000), UINT64_C(1000), 'k' },
	};

	for (size_t i = 0; i < sizeof(scales) / sizeof(scales[0]); i++) {
		if (val > scales[i].above) {
			*rv = val / scales[i].divisor;
			*suffix = scales[i].suffix;
			return;
		}
	}

	*rv = val;
	*suffix = ' ';
}
1924
/** Scale a byte count down and pick the matching binary (IEC) unit.
 *
 * The value is divided by the largest listed power of 1024 that still
 * leaves a result above 1024.
 *
 * @param val    Value to scale.
 * @param rv     Receives the scaled value.
 * @param suffix Receives a pointer to a constant unit string
 *               ("KiB", "MiB", "GiB", "TiB", "PiB" or "B").
 * @param fixed  If true, the unscaled unit is "B " (padded with
 *               spaces) instead of "B".
 */
void bin_order_suffix(const uint64_t val, uint64_t *rv, const char **suffix,
    bool fixed)
{
	if (val > UINT64_C(1152921504606846976)) {
		/* Divisor is 2^50, i.e. pebibytes. */
		*rv = val / UINT64_C(1125899906842624);
		*suffix = "PiB";
	} else if (val > UINT64_C(1125899906842624)) {
		*rv = val / UINT64_C(1099511627776);
		*suffix = "TiB";
	} else if (val > UINT64_C(1099511627776)) {
		*rv = val / UINT64_C(1073741824);
		*suffix = "GiB";
	} else if (val > UINT64_C(1073741824)) {
		*rv = val / UINT64_C(1048576);
		*suffix = "MiB";
	} else if (val > UINT64_C(1048576)) {
		*rv = val / UINT64_C(1024);
		*suffix = "KiB";
	} else {
		*rv = val;
		if (fixed)
			*suffix = "B  ";
		else
			*suffix = "B";
	}
}
1951
1952	/** @}
1953	*/

Note: See TracBrowser for help on using the repository browser.

Download in other formats: