Context Navigation

source: mainline/common/str.c@ ae787807

Visit:

Last change on this file since ae787807 was b31323f, checked in by Jiří Zárevúcky <zarevucky.jiri@…>, 2 months ago
Test, fix and extend string sanitization
Property mode set to `100644`
File size: 45.6 KB

Line
1	/*
2	* Copyright (c) 2001-2004 Jakub Jermar
3	* Copyright (c) 2005 Martin Decky
4	* Copyright (c) 2008 Jiri Svoboda
5	* Copyright (c) 2011 Martin Sucha
6	* Copyright (c) 2011 Oleg Romanenko
7	* Copyright (c) 2025 Jiří Zárevúcky
8	* All rights reserved.
9	*
10	* Redistribution and use in source and binary forms, with or without
11	* modification, are permitted provided that the following conditions
12	* are met:
13	*
14	* - Redistributions of source code must retain the above copyright
15	* notice, this list of conditions and the following disclaimer.
16	* - Redistributions in binary form must reproduce the above copyright
17	* notice, this list of conditions and the following disclaimer in the
18	* documentation and/or other materials provided with the distribution.
19	* - The name of the author may not be used to endorse or promote products
20	* derived from this software without specific prior written permission.
21	*
22	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
23	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
24	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
25	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
26	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
27	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
31	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32	*/
33
34	/** @addtogroup libc
35	* @{
36	*/
37
38	/**
39	* @file
40	* @brief String functions.
41	*
42	* Strings and characters use the Universal Character Set (UCS). The standard
43	* strings, called just strings are encoded in UTF-8. Wide strings (encoded
44	* in UTF-32) are supported to a limited degree. A single character is
45	* represented as char32_t.@n
46	*
47	* Overview of the terminology:@n
48	*
49	* Term Meaning
50	* -------------------- ----------------------------------------------------
51	* byte 8 bits stored in uint8_t (unsigned 8 bit integer)
52	*
53	* character UTF-32 encoded Unicode character, stored in char32_t
54	* (unsigned 32 bit integer), code points 0 .. 1114111
55	* are valid
56	*
57	* Note that Unicode characters do not match
58	* one-to-one with displayed characters or glyphs on
59	* screen. For that level of precision, look up
60	* Grapheme Clusters.
61	*
62	* ASCII character 7 bit encoded ASCII character, stored in char
63	* (usually signed 8 bit integer), code points 0 .. 127
64	* are valid
65	*
66	* string UTF-8 encoded NULL-terminated Unicode string, char *
67	*
68	* wide string UTF-32 encoded NULL-terminated Unicode string,
69	* char32_t *
70	*
71	* [wide] string size number of BYTES in a [wide] string (excluding
72	* the NULL-terminator), size_t
73	*
74	* [wide] string length number of CHARACTERS in a [wide] string (excluding
75	* the NULL-terminator), size_t
76	*
77	* [wide] string width number of display cells on a monospace display taken
78	* by a [wide] string, size_t
79	*
80	* This is virtually impossible to determine exactly for
81	* all strings without knowing specifics of the display
82	* device, due to various factors affecting text output.
83	* If you have the option to query the terminal for
84	* position change caused by outputting the string,
85	* it is preferable to determine width that way.
86	*
87	*
88	* Overview of string metrics:@n
89	*
90	* Metric Abbrev. Type Meaning
91	* ------ ------ ------ -------------------------------------------------
92	* size n size_t number of BYTES in a string (excluding the
93	* NULL-terminator)
94	*
95	* length l size_t number of CHARACTERS in a string (excluding the
96	* null terminator)
97	*
98	* width w size_t number of display cells on a monospace display
99	* taken by a string
100	*
101	*
102	* Function naming prefixes:@n
103	*
104	* chr_ operate on characters
105	* ascii_ operate on ASCII characters
106	* str_ operate on strings
107	* wstr_ operate on wide strings
108	*
109	* [w]str_[n|l|w] operate on a prefix limited by size, length
110	* or width
111	*
112	*
113	* A specific character inside a [wide] string can be referred to by:@n
114	*
115	* pointer (char *, char32_t *)
116	* byte offset (size_t)
117	* character index (size_t)
118	*
119	*/
120
121	#include <str.h>
122
123	#include <align.h>
124	#include <assert.h>
125	#include <ctype.h>
126	#include <errno.h>
127	#include <limits.h>
128	#include <macros.h>
129	#include <mem.h>
130	#include <stdbool.h>
131	#include <stddef.h>
132	#include <stdint.h>
133	#include <stdlib.h>
134	#include <uchar.h>
135
136	#if __STDC_HOSTED__
137	#include <fibril.h>
138	#endif
139
/** Report an illegal byte sequence through errno.
 *
 * Stores EILSEQ into errno. The store is compiled only when errno is
 * available as a macro; otherwise the function does nothing.
 */
static void _set_ilseq(void)
{
#ifdef errno
	errno = EILSEQ;
#endif
}
146
/** Byte mask consisting of the lowest @a n bits (out of 8), n in 0..8 */
#define LO_MASK_8(n)  ((uint8_t) ((1U << (n)) - 1))

/**
 * Mask consisting of the lowest @a n bits (out of 32), n in 0..31.
 * The shift is done on an unsigned 32-bit value so that n == 31 does not
 * overflow a signed int.
 */
#define LO_MASK_32(n)  ((uint32_t) ((UINT32_C(1) << (n)) - 1))

/**
 * Byte mask consisting of the highest @a n bits (out of 8).
 * The result is cast back to uint8_t because the complement operator
 * promotes its operand to int.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6

/** Data bits of the initial byte of a 2-, 3- and 4-byte sequence */
#define UTF8_MASK_INITIAL2  0b00011111
#define UTF8_MASK_INITIAL3  0b00001111
#define UTF8_MASK_INITIAL4  0b00000111

/** Data bits of a continuation byte */
#define UTF8_MASK_CONT  0b00111111

/** Internal marker returned by the decoder for an invalid sequence */
#define CHAR_INVALID  ((char32_t) UINT_MAX)
165
/** True if @a b is a single-byte (ASCII) UTF-8 unit, i.e. 0xxxxxxx. */
static inline bool _is_ascii(uint8_t b)
{
	return (b & 0x80) == 0;
}
170
/** True if @a b is a UTF-8 continuation byte, i.e. 10xxxxxx. */
static inline bool _is_continuation(uint8_t b)
{
	return (b >> 6) == 0x2;
}
175
/** True if @a c is the initial byte of a 2-byte sequence, i.e. 110xxxxx. */
static inline bool _is_2_byte(uint8_t c)
{
	return (c >> 5) == 0x6;
}
180
/** True if @a c is the initial byte of a 3-byte sequence, i.e. 1110xxxx. */
static inline bool _is_3_byte(uint8_t c)
{
	return (c >> 4) == 0xE;
}
185
/** True if @a c is the initial byte of a 4-byte sequence, i.e. 11110xxx. */
static inline bool _is_4_byte(uint8_t c)
{
	return (c >> 3) == 0x1E;
}
190
/** Number of continuation bytes needed to encode a character in UTF-8.
 *
 * @param c Character code.
 *
 * @return 0 for codes of up to 7 bits, 1 for up to 11 bits, 2 for up to
 *         16 bits, 3 for up to 21 bits, and -1 for anything wider.
 */
static inline int _char_continuation_bytes(char32_t c)
{
	if (c < 0x80)
		return 0;

	if (c < 0x800)
		return 1;

	if (c < 0x10000)
		return 2;

	if (c < 0x200000)
		return 3;

	/* Codes longer than 21 bits are not supported */
	return -1;
}
208
/** Number of continuation bytes announced by an initial UTF-8 byte.
 *
 * @param b First byte of a sequence.
 *
 * @return 0..3 for a valid initial byte, -1 if @a b cannot start a
 *         sequence (a continuation byte or an unsupported prefix).
 */
static inline int _continuation_bytes(uint8_t b)
{
	/* Prefix patterns indexed by the number of continuation bytes. */
	static const struct {
		uint8_t mask;
		uint8_t prefix;
	} forms[] = {
		{ 0x80, 0x00 },  /* 0xxxxxxx */
		{ 0xE0, 0xC0 },  /* 110xxxxx 10xxxxxx */
		{ 0xF0, 0xE0 },  /* 1110xxxx 10xxxxxx 10xxxxxx */
		{ 0xF8, 0xF0 },  /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	};

	for (int cont = 0; cont < 4; cont++) {
		if ((b & forms[cont].mask) == forms[cont].prefix)
			return cont;
	}

	return -1;
}
229
230	static bool _is_non_shortest(const mbstate_t *mb, uint8_t b)
231	{
232	return (mb->state == 0b1111110000000000 && !(b & 0b00100000)) \|\|
233	(mb->state == 0b1111111111110000 && !(b & 0b00110000));
234	}
235
236	static bool _is_surrogate(const mbstate_t *mb, uint8_t b)
237	{
238	return (mb->state == 0b1111110000001101 && b >= 0xa0);
239	}
240
/* Branch prediction hints: the condition is expected to be true / false. */
#define _likely(expr)    __builtin_expect((expr), true)
#define _unlikely(expr)  __builtin_expect((expr), false)

/* Controls the `#if FAST_PATHS` sections of _str_decode(). */
#define FAST_PATHS  1
245
246	static char32_t _str_decode(const char s, size_t offset, size_t size, mbstate_t *mb)
247	{
248	assert(s);
249	assert(offset);
250	assert(*offset <= size);
251	assert(size == STR_NO_LIMIT \|\| s + size >= s);
252	assert(mb);
253
254	if (*offset == size)
255	return 0;
256
257	if (_likely(!mb->state)) {
258	/* Clean slate, read initial byte. */
259	uint8_t b = s[(*offset)++];
260
261	/* Fast exit for the most common case. */
262	if (_likely(_is_ascii(b)))
263	return b;
264
265	/* unexpected continuation byte */
266	if (_unlikely(_is_continuation(b)))
267	return CHAR_INVALID;
268
269	/*
270	* The value stored into `continuation` is designed to have
271	* just enough leading ones that after shifting in one less than
272	* the expected number of continuation bytes, the most significant
273	* bit becomes zero. (The field is 16b wide.)
274	*/
275
276	if (_is_2_byte(b)) {
277	/* Reject non-shortest form. */
278	if (_unlikely(!(b & 0b00011110)))
279	return CHAR_INVALID;
280
281	#if FAST_PATHS
282	/* We can usually take this exit. */
283	if (_likely(offset < size && _is_continuation(s[offset])))
284	return (b & UTF8_MASK_INITIAL2) << 6 \|
285	(s[(*offset)++] & UTF8_MASK_CONT);
286	#endif
287
288	/* 2 byte continuation 110xxxxx */
289	mb->state = b ^ 0b0000000011000000;
290
291	} else if (_is_3_byte(b)) {
292	#if FAST_PATHS
293	/* We can usually take this exit. */
294	if (_likely(offset + 1 < size && _is_continuation(s[offset]) && _is_continuation(s[*offset + 1]))) {
295
296	char32_t ch = (b & UTF8_MASK_INITIAL3) << 12 \|
297	(s[(*offset)] & UTF8_MASK_CONT) << 6 \|
298	(s[(*offset) + 1] & UTF8_MASK_CONT);
299
300	*offset += 2;
301
302	/* Reject non-shortest form. */
303	if (_unlikely(!(ch & 0xFFFFF800)))
304	return CHAR_INVALID;
305
306	/* Reject surrogates */
307	if (_unlikely(ch >= 0xD800 && ch < 0xE000))
308	return CHAR_INVALID;
309
310	return ch;
311	}
312	#endif
313
314	/* 3 byte continuation 1110xxxx */
315	mb->state = b ^ 0b1111110011100000;
316
317	} else if (_is_4_byte(b)) {
318	#if FAST_PATHS
319	/* We can usually take this exit. */
320	if (_likely(offset + 2 < size && _is_continuation(s[offset]) &&
321	_is_continuation(s[offset + 1]) && _is_continuation(s[offset + 2]))) {
322
323	char32_t ch = (b & UTF8_MASK_INITIAL4) << 18 \|
324	(s[(*offset)] & UTF8_MASK_CONT) << 12 \|
325	(s[(*offset) + 1] & UTF8_MASK_CONT) << 6 \|
326	(s[(*offset) + 2] & UTF8_MASK_CONT);
327
328	*offset += 3;
329
330	/* Reject non-shortest form. */
331	if (_unlikely(!(ch & 0xFFFF0000)))
332	return CHAR_INVALID;
333
334	/* Reject out-of-range characters. */
335	if (_unlikely(ch >= 0x110000))
336	return CHAR_INVALID;
337
338	return ch;
339	}
340	#endif
341
342	/* 4 byte continuation 11110xxx */
343	mb->state = b ^ 0b1111111100000000;
344	} else {
345	return CHAR_INVALID;
346	}
347	}
348
349	/* Deal with the remaining edge and invalid cases. */
350	for (; offset < size; (offset)++) {
351	/* Read continuation bytes. */
352	uint8_t b = s[*offset];
353
354	if (!_is_continuation(b) \|\| _is_non_shortest(mb, b) \|\| _is_surrogate(mb, b)) {
355	mb->state = 0;
356	return CHAR_INVALID;
357	}
358
359	/* Top bit becomes zero when shifting in the second to last byte. */
360	if (!(mb->state & 0x8000)) {
361	char32_t c = ((char32_t) mb->state) << 6 \| (b & UTF8_MASK_CONT);
362	mb->state = 0;
363	(*offset)++;
364	return c;
365	}
366
367	mb->state = mb->state << 6 \| (b & UTF8_MASK_CONT);
368	}
369
370	/* Incomplete character. */
371	assert(mb->state);
372	return 0;
373	}
374
375	/** Standard <uchar.h> function since C11. */
376	size_t mbrtoc32(char32_t c, const char s, size_t n, mbstate_t *mb)
377	{
378	#if __STDC_HOSTED__
379	static fibril_local mbstate_t global_state = { };
380
381	if (!mb)
382	mb = &global_state;
383	#endif
384
385	if (!s) {
386	/* Equivalent to mbrtoc32(NULL, "", 1, mb); */
387	c = NULL;
388	s = "";
389	n = 1;
390	}
391
392	size_t offset = 0;
393	char32_t ret = _str_decode(s, &offset, n, mb);
394	if (ret == CHAR_INVALID) {
395	assert(!mb->state);
396	_set_ilseq();
397	return UCHAR_ILSEQ;
398	}
399	if (mb->state) {
400	assert(ret == 0);
401	return UCHAR_INCOMPLETE;
402	}
403
404	if (c)
405	*c = ret;
406	return ret ? offset : 0;
407	}
408
409	/** Decode a single character from a string.
410	*
411	* Decode a single character from a string of size @a size. Decoding starts
412	* at @a offset and this offset is moved to the beginning of the next
413	* character. In case of decoding error, offset generally advances at least
414	* by one. However, offset is never moved beyond size.
415	*
416	* @param str String (not necessarily NULL-terminated).
417	* @param offset Byte offset in string where to start decoding.
418	* @param size Size of the string (in bytes).
419	*
420	* @return Value of decoded character, U_SPECIAL on decoding error or
421	* NULL if attempt to decode beyond @a size.
422	*
423	*/
424	char32_t str_decode(const char str, size_t offset, size_t size)
425	{
426	mbstate_t mb = { };
427	char32_t ch = _str_decode(str, offset, size, &mb);
428
429	if (ch == CHAR_INVALID)
430	return U_SPECIAL;
431
432	if (mb.state)
433	return U_SPECIAL;
434
435	return ch;
436	}
437
438	/** Decode a single character from a string to the left.
439	*
440	* Decode a single character from a string of size @a size. Decoding starts
441	* at @a offset and this offset is moved to the beginning of the previous
442	* character. In case of decoding error, offset generally decreases at least
443	* by one. However, offset is never moved before 0.
444	*
445	* @param str String (not necessarily NULL-terminated).
446	* @param offset Byte offset in string where to start decoding.
447	* @param size Size of the string (in bytes).
448	*
449	* @return Value of decoded character, U_SPECIAL on decoding error or
450	* NULL if attempt to decode beyond @a start of str.
451	*
452	*/
453	char32_t str_decode_reverse(const char str, size_t offset, size_t size)
454	{
455	if (*offset == 0)
456	return 0;
457
458	int cbytes = 0;
459	/* Continue while continuation bytes found */
460	while (*offset > 0 && cbytes < 4) {
461	uint8_t b = (uint8_t) str[--(*offset)];
462
463	if (_is_continuation(b)) {
464	cbytes++;
465	continue;
466	}
467
468	/* Reject non-shortest form encoding. */
469	if (cbytes != _continuation_bytes(b))
470	return U_SPECIAL;
471
472	/* Start byte */
473	size_t start_offset = *offset;
474	return str_decode(str, &start_offset, size);
475	}
476
477	/* Too many continuation bytes */
478	return U_SPECIAL;
479	}
480
481	/** Encode a single character to string representation.
482	*
483	* Encode a single character to string representation (i.e. UTF-8) and store
484	* it into a buffer at @a offset. Encoding starts at @a offset and this offset
485	* is moved to the position where the next character can be written to.
486	*
487	* @param ch Input character.
488	* @param str Output buffer.
489	* @param offset Byte offset where to start writing.
490	* @param size Size of the output buffer (in bytes).
491	*
492	* @return EOK if the character was encoded successfully, EOVERFLOW if there
493	* was not enough space in the output buffer or EINVAL if the character
494	* code was invalid.
495	*/
496	errno_t chr_encode(char32_t ch, char str, size_t offset, size_t size)
497	{
498	// TODO: merge with c32rtomb()
499
500	if (*offset >= size)
501	return EOVERFLOW;
502
503	/* Fast exit for the most common case. */
504	if (ch < 0x80) {
505	str[(*offset)++] = (char) ch;
506	return EOK;
507	}
508
509	/* Codes longer than 21 bits are not supported */
510	if (!chr_check(ch))
511	return EINVAL;
512
513	/* Determine how many continuation bytes are needed */
514
515	unsigned int cbytes = _char_continuation_bytes(ch);
516	unsigned int b0_bits = 6 - cbytes; /* Data bits in first byte */
517
518	/* Check for available space in buffer */
519	if (*offset + cbytes >= size)
520	return EOVERFLOW;
521
522	/* Encode continuation bytes */
523	unsigned int i;
524	for (i = cbytes; i > 0; i--) {
525	str[*offset + i] = 0x80 \| (ch & LO_MASK_32(CONT_BITS));
526	ch >>= CONT_BITS;
527	}
528
529	/* Encode first byte */
530	str[*offset] = (ch & LO_MASK_32(b0_bits)) \| HI_MASK_8(8 - b0_bits - 1);
531
532	/* Advance offset */
533	*offset += cbytes + 1;
534
535	return EOK;
536	}
537
/** Convert in place any bytes that don't form a valid character into replacement.
 *
 * Scanning stops at the first NUL byte or after @a n bytes, whichever
 * comes first. Each offending byte is replaced individually, so the size
 * of the string does not change.
 *
 * @param str         Buffer to sanitize.
 * @param n           Maximum number of bytes to examine.
 * @param replacement Byte written over every invalid byte.
 *
 * @return Number of bytes that were replaced.
 */
static size_t _str_sanitize(char *str, size_t n, uint8_t replacement)
{
	uint8_t *b = (uint8_t *) str;
	size_t count = 0;

	for (; n > 0 && b[0]; b++, n--) {
		int cont = _continuation_bytes(b[0]);
		if (__builtin_expect(cont, 0) == 0)
			continue;

		/* Not an initial byte, or the sequence would not fit in n. */
		if (cont < 0 || n <= (size_t) cont) {
			b[0] = replacement;
			count++;
			continue;
		}

		/* Check continuation bytes. */
		bool valid = true;
		for (int i = 1; i <= cont; i++) {
			if (!_is_continuation(b[i])) {
				valid = false;
				break;
			}
		}

		if (!valid) {
			b[0] = replacement;
			count++;
			continue;
		}

		/*
		 * Check for non-shortest form encoding.
		 * See https://www.unicode.org/versions/corrigendum1.html
		 */

		/* 0b110!!!!x 0b10xxxxxx */
		if (cont == 1 && !(b[0] & 0b00011110)) {
			b[0] = replacement;
			count++;
			continue;
		}

		/* 0b1110!!!! 0b10!xxxxx 0b10xxxxxx */
		if (cont == 2 && !(b[0] & 0b00001111) && !(b[1] & 0b00100000)) {
			b[0] = replacement;
			count++;
			continue;
		}

		/* 0b11110!!! 0b10!!xxxx 0b10xxxxxx 0b10xxxxxx */
		if (cont == 3 && !(b[0] & 0b00000111) && !(b[1] & 0b00110000)) {
			b[0] = replacement;
			count++;
			continue;
		}

		/* Check for surrogate character encoding. */
		if (cont == 2 && b[0] == 0xED && b[1] >= 0xA0) {
			b[0] = replacement;
			count++;
			continue;
		}

		/* Check for out-of-range code points. */
		if (cont == 3 && (b[0] > 0xF4 || (b[0] == 0xF4 && b[1] >= 0x90))) {
			b[0] = replacement;
			count++;
			continue;
		}

		/* Valid sequence: skip its continuation bytes. */
		b += cont;
		n -= cont;
	}

	return count;
}
616
/** Replace invalid bytes of a string in place.
 *
 * Public entry point that forwards to _str_sanitize().
 *
 * @param str         Buffer to sanitize.
 * @param n           Maximum number of bytes to examine.
 * @param replacement Byte written over every invalid byte.
 *
 * @return Number of bytes that were replaced.
 */
size_t str_sanitize(char *str, size_t n, uint8_t replacement)
{
	size_t replaced = _str_sanitize(str, n, replacement);

	return replaced;
}
621
/** Count the bytes of a NUL-terminated string, excluding the terminator. */
static size_t _str_size(const char *str)
{
	size_t n = 0;

	while (str[n] != 0)
		n++;

	return n;
}
631
/** Get size of string.
 *
 * Returns the number of bytes occupied by the string @a str, not counting
 * the NULL-terminator.
 *
 * @param str String to consider.
 *
 * @return Number of bytes used by the string
 *
 */
size_t str_size(const char *str)
{
	size_t bytes = _str_size(str);

	return bytes;
}
646
/** Get size of wide string.
 *
 * Returns the number of bytes occupied by the wide string @a str, not
 * counting the NULL-terminator.
 *
 * @param str Wide string to consider.
 *
 * @return Number of bytes used by the wide string
 *
 */
size_t wstr_size(const char32_t *str)
{
	size_t chars = wstr_length(str);

	return chars * sizeof(char32_t);
}
661
662	/** Get size of string with length limit.
663	*
664	* Get the number of bytes which are used by up to @a max_len first
665	* characters in the string @a str. If @a max_len is greater than
666	* the length of @a str, the entire string is measured (excluding the
667	* NULL-terminator).
668	*
669	* @param str String to consider.
670	* @param max_len Maximum number of characters to measure.
671	*
672	* @return Number of bytes used by the characters.
673	*
674	*/
675	size_t str_lsize(const char *str, size_t max_len)
676	{
677	size_t len = 0;
678	size_t offset = 0;
679
680	while (len < max_len) {
681	if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
682	break;
683
684	len++;
685	}
686
687	return offset;
688	}
689
/** Count the bytes of a string, looking at no more than @a max_size bytes.
 *
 * The limit is tested before each byte is read, so a buffer of exactly
 * @a max_size bytes without a terminator is never read past its end.
 *
 * @param str      String to consider (not necessarily NULL-terminated).
 * @param max_size Maximum number of bytes to examine.
 *
 * @return Number of bytes before the terminator, at most @a max_size.
 */
static size_t _str_nsize(const char *str, size_t max_size)
{
	size_t size = 0;

	while ((size < max_size) && (str[size] != 0))
		size++;

	return size;
}
699
/** Get size of string with size limit.
 *
 * Returns the number of bytes occupied by the string @a str (not counting
 * the NULL-terminator), capped at @a max_size bytes.
 *
 * @param str      String to consider.
 * @param max_size Maximum number of bytes to measure.
 *
 * @return Number of bytes used by the string
 *
 */
size_t str_nsize(const char *str, size_t max_size)
{
	size_t bytes = _str_nsize(str, max_size);

	return bytes;
}
715
/** Get size of wide string with size limit.
 *
 * Returns the number of bytes occupied by the wide string @a str (not
 * counting the NULL-terminator), capped at @a max_size bytes.
 *
 * @param str      Wide string to consider.
 * @param max_size Maximum number of bytes to measure.
 *
 * @return Number of bytes used by the wide string
 *
 */
size_t wstr_nsize(const char32_t *str, size_t max_size)
{
	size_t chars = wstr_nlength(str, max_size);

	return chars * sizeof(char32_t);
}
731
/** Get size of wide string with length limit.
 *
 * Get the number of bytes which are used by up to @a max_len first
 * wide characters in the wide string @a str. If @a max_len is greater than
 * the length of @a str, the entire wide string is measured (excluding the
 * NULL-terminator).
 *
 * @param str     Wide string to consider.
 * @param max_len Maximum number of wide characters to measure.
 *
 * @return Number of bytes used by the wide characters.
 *
 */
size_t wstr_lsize(const char32_t *str, size_t max_len)
{
	/*
	 * Count characters directly rather than converting max_len to a
	 * byte limit: that multiplication can overflow for large max_len.
	 */
	size_t len = 0;

	while ((len < max_len) && (str[len] != 0))
		len++;

	return len * sizeof(char32_t);
}
749
750	/** Get number of characters in a string.
751	*
752	* @param str NULL-terminated string.
753	*
754	* @return Number of characters in string.
755	*
756	*/
757	size_t str_length(const char *str)
758	{
759	size_t len = 0;
760	size_t offset = 0;
761
762	while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
763	len++;
764
765	return len;
766	}
767
/** Get number of characters in a wide string.
 *
 * @param wstr NULL-terminated wide string.
 *
 * @return Number of characters in @a wstr.
 *
 */
size_t wstr_length(const char32_t *wstr)
{
	size_t count = 0;

	while (wstr[count] != 0)
		count++;

	return count;
}
784
/** Get number of characters in a string with size limit.
 *
 * Decodes characters until the terminator or until @a size bytes have
 * been consumed.
 *
 * @param str  NULL-terminated string.
 * @param size Maximum number of bytes to consider.
 *
 * @return Number of characters in string.
 *
 */
size_t str_nlength(const char *str, size_t size)
{
	size_t count = 0;
	size_t pos = 0;

	for (;;) {
		if (str_decode(str, &pos, size) == 0)
			break;

		count++;
	}

	return count;
}
803
/** Get number of characters in a wide string with size limit.
 *
 * Only whole char32_t units that fit into @a size bytes are examined.
 *
 * @param str  NULL-terminated wide string.
 * @param size Maximum number of bytes to consider.
 *
 * @return Number of characters in the wide string.
 *
 */
size_t wstr_nlength(const char32_t *str, size_t size)
{
	/* Number of complete characters that fit into the byte limit. */
	size_t max_chars = size / sizeof(char32_t);
	size_t count = 0;

	while ((count < max_chars) && (str[count] != 0))
		count++;

	return count;
}
825
/** Get character display width on a character cell display.
 *
 * The current implementation reports one cell for every character.
 *
 * @param ch Character
 * @return Width of character in cells.
 */
size_t chr_width(char32_t ch)
{
	(void) ch;

	return 1;
}
835
836	/** Get string display width on a character cell display.
837	*
838	* @param str String
839	* @return Width of string in cells.
840	*/
841	size_t str_width(const char *str)
842	{
843	size_t width = 0;
844	size_t offset = 0;
845	char32_t ch;
846
847	while ((ch = str_decode(str, &offset, STR_NO_LIMIT)) != 0)
848	width += chr_width(ch);
849
850	return width;
851	}
852
/** Check whether character is plain ASCII.
 *
 * @param ch Character to check.
 *
 * @return True if character is plain ASCII (code 0 .. 127).
 *
 */
bool ascii_check(char32_t ch)
{
	return ch < 0x80;
}
865
/** Check whether character is valid
 *
 * @param ch Character to check.
 *
 * @return True if character is a valid Unicode code point, i.e. not
 *         greater than 0x10FFFF.
 *
 */
bool chr_check(char32_t ch)
{
	return ch <= 0x10FFFF;
}
878
/** Compare two NULL terminated strings.
 *
 * Do a char-by-char comparison of two NULL-terminated strings.
 * The strings are considered equal iff their length is equal
 * and both strings consist of the same sequence of characters.
 *
 * A string S1 is less than another string S2 if it has a character with
 * lower value at the first character position where the strings differ.
 * If the strings differ in length, the shorter one is treated as if
 * padded by characters with a value of zero.
 *
 * @param s1 First string to compare.
 * @param s2 Second string to compare.
 *
 * @return 0 if the strings are equal, -1 if the first is less than the second,
 *         1 if the second is less than the first.
 *
 */
int str_cmp(const char *s1, const char *s2)
{
	/*
	 * UTF-8 has the nice property that lexicographic ordering on bytes is
	 * the same as the lexicographic ordering of the character sequences.
	 * That only holds for unsigned byte values, so compare as
	 * unsigned char regardless of the signedness of plain char.
	 */
	const unsigned char *a = (const unsigned char *) s1;
	const unsigned char *b = (const unsigned char *) s2;

	while (*a == *b && *a != 0) {
		a++;
		b++;
	}

	if (*a == *b)
		return 0;

	return (*a < *b) ? -1 : 1;
}
913
914	/** Compare two NULL terminated strings with length limit.
915	*
916	* Do a char-by-char comparison of two NULL-terminated strings.
917	* The strings are considered equal iff
918	* min(str_length(s1), max_len) == min(str_length(s2), max_len)
919	* and both strings consist of the same sequence of characters,
920	* up to max_len characters.
921	*
922	* A string S1 is less than another string S2 if it has a character with
923	* lower value at the first character position where the strings differ.
924	* If the strings differ in length, the shorter one is treated as if
925	* padded by characters with a value of zero. Only the first max_len
926	* characters are considered.
927	*
928	* @param s1 First string to compare.
929	* @param s2 Second string to compare.
930	* @param max_len Maximum number of characters to consider.
931	*
932	* @return 0 if the strings are equal, -1 if the first is less than the second,
933	* 1 if the second is less than the first.
934	*
935	*/
936	int str_lcmp(const char s1, const char s2, size_t max_len)
937	{
938	char32_t c1 = 0;
939	char32_t c2 = 0;
940
941	size_t off1 = 0;
942	size_t off2 = 0;
943
944	size_t len = 0;
945
946	while (true) {
947	if (len >= max_len)
948	break;
949
950	c1 = str_decode(s1, &off1, STR_NO_LIMIT);
951	c2 = str_decode(s2, &off2, STR_NO_LIMIT);
952
953	if (c1 < c2)
954	return -1;
955
956	if (c1 > c2)
957	return 1;
958
959	if (c1 == 0 \|\| c2 == 0)
960	break;
961
962	++len;
963	}
964
965	return 0;
966
967	}
968
969	/** Compare two NULL terminated strings in case-insensitive manner.
970	*
971	* Do a char-by-char comparison of two NULL-terminated strings.
972	* The strings are considered equal iff their length is equal
973	* and both strings consist of the same sequence of characters
974	* when converted to lower case.
975	*
976	* A string S1 is less than another string S2 if it has a character with
977	* lower value at the first character position where the strings differ.
978	* If the strings differ in length, the shorter one is treated as if
979	* padded by characters with a value of zero.
980	*
981	* @param s1 First string to compare.
982	* @param s2 Second string to compare.
983	*
984	* @return 0 if the strings are equal, -1 if the first is less than the second,
985	* 1 if the second is less than the first.
986	*
987	*/
988	int str_casecmp(const char s1, const char s2)
989	{
990	// FIXME: doesn't work for non-ASCII caseful characters
991
992	char32_t c1 = 0;
993	char32_t c2 = 0;
994
995	size_t off1 = 0;
996	size_t off2 = 0;
997
998	while (true) {
999	c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
1000	c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
1001
1002	if (c1 < c2)
1003	return -1;
1004
1005	if (c1 > c2)
1006	return 1;
1007
1008	if (c1 == 0 \|\| c2 == 0)
1009	break;
1010	}
1011
1012	return 0;
1013	}
1014
1015	/** Compare two NULL terminated strings with length limit in case-insensitive
1016	* manner.
1017	*
1018	* Do a char-by-char comparison of two NULL-terminated strings.
1019	* The strings are considered equal iff
1020	* min(str_length(s1), max_len) == min(str_length(s2), max_len)
1021	* and both strings consist of the same sequence of characters,
1022	* up to max_len characters.
1023	*
1024	* A string S1 is less than another string S2 if it has a character with
1025	* lower value at the first character position where the strings differ.
1026	* If the strings differ in length, the shorter one is treated as if
1027	* padded by characters with a value of zero. Only the first max_len
1028	* characters are considered.
1029	*
1030	* @param s1 First string to compare.
1031	* @param s2 Second string to compare.
1032	* @param max_len Maximum number of characters to consider.
1033	*
1034	* @return 0 if the strings are equal, -1 if the first is less than the second,
1035	* 1 if the second is less than the first.
1036	*
1037	*/
1038	int str_lcasecmp(const char s1, const char s2, size_t max_len)
1039	{
1040	// FIXME: doesn't work for non-ASCII caseful characters
1041
1042	char32_t c1 = 0;
1043	char32_t c2 = 0;
1044
1045	size_t off1 = 0;
1046	size_t off2 = 0;
1047
1048	size_t len = 0;
1049
1050	while (true) {
1051	if (len >= max_len)
1052	break;
1053
1054	c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
1055	c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
1056
1057	if (c1 < c2)
1058	return -1;
1059
1060	if (c1 > c2)
1061	return 1;
1062
1063	if (c1 == 0 \|\| c2 == 0)
1064	break;
1065
1066	++len;
1067	}
1068
1069	return 0;
1070
1071	}
1072
/** Byte-wise test whether @a p is a prefix of @a s.
 *
 * Walks both strings while their bytes match; @a p is a prefix exactly
 * when the walk stops at the terminator of @a p.
 */
static bool _test_prefix(const char *s, const char *p)
{
	while (*s == *p && *s != 0) {
		s++;
		p++;
	}

	return *p == 0;
}
1082
/** Test whether p is a prefix of s.
 *
 * Do a char-by-char comparison of two NULL-terminated strings
 * and determine if p is a prefix of s.
 *
 * @param s The string in which to look
 * @param p The string to check if it is a prefix of s
 *
 * @return true iff p is prefix of s else false
 *
 */
bool str_test_prefix(const char *s, const char *p)
{
	return _test_prefix(s, p);
}
1098
1099	/** Get a string suffix.
1100	*
1101	* Return a string suffix defined by the prefix length.
1102	*
1103	* @param s The string to get the suffix from.
1104	* @param prefix_length Number of prefix characters to ignore.
1105	*
1106	* @return String suffix.
1107	*
1108	*/
1109	const char str_suffix(const char s, size_t prefix_length)
1110	{
1111	size_t off = 0;
1112	size_t i = 0;
1113
1114	while (true) {
1115	str_decode(s, &off, STR_NO_LIMIT);
1116	i++;
1117
1118	if (i >= prefix_length)
1119	break;
1120	}
1121
1122	return s + off;
1123	}
1124
/** Copy string as a sequence of bytes.
 *
 * Copies bytes from @a src up to and including the terminator. No bounds
 * are checked; @a dest must be large enough.
 */
static void _str_cpy(char *dest, const char *src)
{
	while (*src)
		*(dest++) = *(src++);

	*dest = 0;
}
1133
1134	/** Copy string as a sequence of bytes. */
1135	static void _str_cpyn(char dest, size_t size, const char src)
1136	{
1137	assert(dest && src && size);
1138
1139	if (!dest \|\| !src \|\| !size)
1140	return;
1141
1142	if (size == STR_NO_LIMIT)
1143	return _str_cpy(dest, src);
1144
1145	char *dest_top = dest + size - 1;
1146	assert(size == 1 \|\| dest < dest_top);
1147
1148	while (*src && dest < dest_top)
1149	(dest++) = (src++);
1150
1151	*dest = 0;
1152	}
1153
1154	/** Copy string.
1155	*
1156	* Copy source string @a src to destination buffer @a dest.
1157	* No more than @a size bytes are written. If the size of the output buffer
1158	* is at least one byte, the output string will always be well-formed, i.e.
1159	* null-terminated and containing only complete characters.
1160	*
1161	* @param dest Destination buffer.
1162	* @param count Size of the destination buffer (must be > 0).
1163	* @param src Source string.
1164	*
1165	*/
1166	void str_cpy(char dest, size_t size, const char src)
1167	{
1168	/* There must be space for a null terminator in the buffer. */
1169	assert(size > 0);
1170	assert(src != NULL);
1171	assert(dest != NULL);
1172	assert(size == STR_NO_LIMIT \|\| dest + size > dest);
1173
1174	/* Copy data. */
1175	_str_cpyn(dest, size, src);
1176
1177	/* In-place translate invalid bytes to U_SPECIAL. */
1178	_str_sanitize(dest, size, U_SPECIAL);
1179	}
1180
1181	/** Copy size-limited substring.
1182	*
1183	* Copy prefix of string @a src of max. size @a size to destination buffer
1184	* @a dest. No more than @a size bytes are written. The output string will
1185	* always be well-formed, i.e. null-terminated and containing only complete
1186	* characters.
1187	*
1188	* No more than @a n bytes are read from the input string, so it does not
1189	* have to be null-terminated.
1190	*
1191	* @param dest Destination buffer.
1192	* @param count Size of the destination buffer (must be > 0).
1193	* @param src Source string.
1194	* @param n Maximum number of bytes to read from @a src.
1195	*
1196	*/
1197	void str_ncpy(char dest, size_t size, const char src, size_t n)
1198	{
1199	/* There must be space for a null terminator in the buffer. */
1200	assert(size > 0);
1201	assert(src != NULL);
1202
1203	/* Copy data. */
1204	_str_cpyn(dest, min(size, n + 1), src);
1205
1206	/* In-place translate invalid bytes to U_SPECIAL. */
1207	_str_sanitize(dest, size, U_SPECIAL);
1208	}
1209
1210	/** Append one string to another.
1211	*
1212	* Append source string @a src to string in destination buffer @a dest.
1213	* Size of the destination buffer is @a dest. If the size of the output buffer
1214	* is at least one byte, the output string will always be well-formed, i.e.
1215	* null-terminated and containing only complete characters.
1216	*
1217	* @param dest Destination buffer.
1218	* @param count Size of the destination buffer.
1219	* @param src Source string.
1220	*/
1221	void str_append(char dest, size_t size, const char src)
1222	{
1223	assert(src != NULL);
1224	assert(dest != NULL);
1225	assert(size > 0);
1226	assert(size == STR_NO_LIMIT \|\| dest + size > dest);
1227
1228	size_t dstr_size = _str_nsize(dest, size);
1229	if (dstr_size < size) {
1230	_str_cpyn(dest + dstr_size, size - dstr_size, src);
1231	_str_sanitize(dest + dstr_size, size - dstr_size, U_SPECIAL);
1232	}
1233	}
1234
1235	/** Convert space-padded ASCII to string.
1236	*
1237	* Common legacy text encoding in hardware is 7-bit ASCII fitted into
1238	* a fixed-width byte buffer (bit 7 always zero), right-padded with spaces
1239	* (ASCII 0x20). Convert space-padded ascii to string representation.
1240	*
1241	* If the text does not fit into the destination buffer, the function converts
1242	* as many characters as possible and returns EOVERFLOW.
1243	*
1244	* If the text contains non-ASCII bytes (with bit 7 set), the whole string is
1245	* converted anyway and invalid characters are replaced with question marks
1246	* (U_SPECIAL) and the function returns EIO.
1247	*
1248	* Regardless of return value upon return @a dest will always be well-formed.
1249	*
1250	* @param dest Destination buffer
1251	* @param size Size of destination buffer
1252	* @param src Space-padded ASCII.
1253	* @param n Size of the source buffer in bytes.
1254	*
1255	* @return EOK on success, EOVERFLOW if the text does not fit
1256	* destination buffer, EIO if the text contains
1257	* non-ASCII bytes.
1258	*/
1259	errno_t spascii_to_str(char dest, size_t size, const uint8_t src, size_t n)
1260	{
1261	size_t len = 0;
1262
1263	/* Determine the length of the source string. */
1264	for (size_t i = 0; i < n; i++) {
1265	if (src[i] == 0)
1266	break;
1267
1268	if (src[i] != ' ')
1269	len = i + 1;
1270	}
1271
1272	errno_t result = EOK;
1273	size_t out_len = min(len, size - 1);
1274
1275	/* Copy characters */
1276	for (size_t i = 0; i < out_len; i++) {
1277	dest[i] = src[i];
1278
1279	if (dest[i] < 0) {
1280	dest[i] = U_SPECIAL;
1281	result = EIO;
1282	}
1283	}
1284
1285	dest[out_len] = 0;
1286
1287	if (out_len < len)
1288	return EOVERFLOW;
1289
1290	return result;
1291	}
1292
1293	/** Convert wide string to string.
1294	*
1295	* Convert wide string @a src to string. The output is written to the buffer
1296	* specified by @a dest and @a size. @a size must be non-zero and the string
1297	* written will always be well-formed.
1298	*
1299	* @param dest Destination buffer.
1300	* @param size Size of the destination buffer.
1301	* @param src Source wide string.
1302	*/
1303	void wstr_to_str(char dest, size_t size, const char32_t src)
1304	{
1305	char32_t ch;
1306	size_t src_idx;
1307	size_t dest_off;
1308
1309	/* There must be space for a null terminator in the buffer. */
1310	assert(size > 0);
1311
1312	src_idx = 0;
1313	dest_off = 0;
1314
1315	while ((ch = src[src_idx++]) != 0) {
1316	if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
1317	break;
1318	}
1319
1320	dest[dest_off] = '\0';
1321	}
1322
1323	/** Convert UTF16 string to string.
1324	*
1325	* Convert utf16 string @a src to string. The output is written to the buffer
1326	* specified by @a dest and @a size. @a size must be non-zero and the string
1327	* written will always be well-formed. Surrogate pairs also supported.
1328	*
1329	* @param dest Destination buffer.
1330	* @param size Size of the destination buffer.
1331	* @param src Source utf16 string.
1332	*
1333	* @return EOK, if success, an error code otherwise.
1334	*/
1335	errno_t utf16_to_str(char dest, size_t size, const uint16_t src)
1336	{
1337	size_t idx = 0, dest_off = 0;
1338	char32_t ch;
1339	errno_t rc = EOK;
1340
1341	/* There must be space for a null terminator in the buffer. */
1342	assert(size > 0);
1343
1344	while (src[idx]) {
1345	if ((src[idx] & 0xfc00) == 0xd800) {
1346	if (src[idx + 1] && (src[idx + 1] & 0xfc00) == 0xdc00) {
1347	ch = 0x10000;
1348	ch += (src[idx] & 0x03FF) << 10;
1349	ch += (src[idx + 1] & 0x03FF);
1350	idx += 2;
1351	} else
1352	break;
1353	} else {
1354	ch = src[idx];
1355	idx++;
1356	}
1357	rc = chr_encode(ch, dest, &dest_off, size - 1);
1358	if (rc != EOK)
1359	break;
1360	}
1361	dest[dest_off] = '\0';
1362	return rc;
1363	}
1364
1365	/** Convert string to UTF16 string.
1366	*
1367	* Convert string @a src to utf16 string. The output is written to the buffer
1368	* specified by @a dest and @a dlen. @a dlen must be non-zero and the string
1369	* written will always be well-formed. Surrogate pairs also supported.
1370	*
1371	* @param dest Destination buffer.
1372	* @param dlen Number of utf16 characters that fit in the destination buffer.
1373	* @param src Source string.
1374	*
1375	* @return EOK, if success, an error code otherwise.
1376	*/
1377	errno_t str_to_utf16(uint16_t dest, size_t dlen, const char src)
1378	{
1379	errno_t rc = EOK;
1380	size_t offset = 0;
1381	size_t idx = 0;
1382	char32_t c;
1383
1384	assert(dlen > 0);
1385
1386	while ((c = str_decode(src, &offset, STR_NO_LIMIT)) != 0) {
1387	if (c > 0x10000) {
1388	if (idx + 2 >= dlen - 1) {
1389	rc = EOVERFLOW;
1390	break;
1391	}
1392	c = (c - 0x10000);
1393	dest[idx] = 0xD800 \| (c >> 10);
1394	dest[idx + 1] = 0xDC00 \| (c & 0x3FF);
1395	idx++;
1396	} else {
1397	dest[idx] = c;
1398	}
1399
1400	idx++;
1401	if (idx >= dlen - 1) {
1402	rc = EOVERFLOW;
1403	break;
1404	}
1405	}
1406
1407	dest[idx] = '\0';
1408	return rc;
1409	}
1410
/** Get size of UTF-16 string.
 *
 * Count the 16-bit words occupied by the UTF-16 string @a ustr, not
 * including the terminating zero word.
 *
 * @param ustr UTF-16 string to consider.
 *
 * @return Number of words used by the UTF-16 string
 *
 */
size_t utf16_wsize(const uint16_t *ustr)
{
	const uint16_t *end = ustr;

	/* Walk to the terminating zero word. */
	while (*end != 0)
		end++;

	return (size_t) (end - ustr);
}
1430
1431	/** Convert wide string to new string.
1432	*
1433	* Convert wide string @a src to string. Space for the new string is allocated
1434	* on the heap.
1435	*
1436	* @param src Source wide string.
1437	* @return New string.
1438	*/
1439	char wstr_to_astr(const char32_t src)
1440	{
1441	char dbuf[STR_BOUNDS(1)];
1442	char *str;
1443	char32_t ch;
1444
1445	size_t src_idx;
1446	size_t dest_off;
1447	size_t dest_size;
1448
1449	/* Compute size of encoded string. */
1450
1451	src_idx = 0;
1452	dest_size = 0;
1453
1454	while ((ch = src[src_idx++]) != 0) {
1455	dest_off = 0;
1456	if (chr_encode(ch, dbuf, &dest_off, STR_BOUNDS(1)) != EOK)
1457	break;
1458	dest_size += dest_off;
1459	}
1460
1461	str = malloc(dest_size + 1);
1462	if (str == NULL)
1463	return NULL;
1464
1465	/* Encode string. */
1466
1467	src_idx = 0;
1468	dest_off = 0;
1469
1470	while ((ch = src[src_idx++]) != 0) {
1471	if (chr_encode(ch, str, &dest_off, dest_size) != EOK)
1472	break;
1473	}
1474
1475	str[dest_size] = '\0';
1476	return str;
1477	}
1478
1479	/** Convert string to wide string.
1480	*
1481	* Convert string @a src to wide string. The output is written to the
1482	* buffer specified by @a dest and @a dlen. @a dlen must be non-zero
1483	* and the wide string written will always be null-terminated.
1484	*
1485	* @param dest Destination buffer.
1486	* @param dlen Length of destination buffer (number of wchars).
1487	* @param src Source string.
1488	*/
1489	void str_to_wstr(char32_t dest, size_t dlen, const char src)
1490	{
1491	size_t offset;
1492	size_t di;
1493	char32_t c;
1494
1495	assert(dlen > 0);
1496
1497	offset = 0;
1498	di = 0;
1499
1500	do {
1501	if (di >= dlen - 1)
1502	break;
1503
1504	c = str_decode(src, &offset, STR_NO_LIMIT);
1505	dest[di++] = c;
1506	} while (c != '\0');
1507
1508	dest[dlen - 1] = '\0';
1509	}
1510
/** Convert string to wide string.
 *
 * Convert string @a src to wide string. A new wide null-terminated
 * string will be allocated on the heap.
 *
 * @param str Source string.
 *
 * @return New wide string, or NULL if the allocation failed.
 */
char32_t *str_to_awstr(const char *str)
{
	/* One slot per character plus one for the terminator. */
	size_t len = str_length(str);

	char32_t *wstr = calloc(len + 1, sizeof(char32_t));
	if (wstr == NULL)
		return NULL;

	str_to_wstr(wstr, len + 1, str);
	return wstr;
}
1529
/** Find first occurrence of a byte in a string.
 *
 * @param str Null-terminated string to search.
 * @param c   Byte to look for. Searching for 0 finds the terminator.
 *
 * @return Pointer to the first matching byte or NULL if not found.
 */
static char *_strchr(const char *str, char c)
{
	while (*str != 0 && *str != c)
		str++;

	/* The scan stopped either on a match or on the terminator. */
	return (*str == c) ? (char *) str : NULL;
}
1537
1538	/** Find first occurence of character in string.
1539	*
1540	* @param str String to search.
1541	* @param ch Character to look for.
1542	*
1543	* @return Pointer to character in @a str or NULL if not found.
1544	*/
1545	char str_chr(const char str, char32_t ch)
1546	{
1547	/* Fast path for an ASCII character. */
1548	if (ascii_check(ch))
1549	return _strchr(str, ch);
1550
1551	/* Convert character to UTF-8. */
1552	char utf8[STR_BOUNDS(1) + 1];
1553	size_t offset = 0;
1554
1555	if (chr_encode(ch, utf8, &offset, sizeof(utf8)) != EOK \|\| offset == 0)
1556	return NULL;
1557
1558	utf8[offset] = '\0';
1559
1560	/* Find the first byte, then check if all of them are correct. */
1561	while (*str != 0) {
1562	str = _strchr(str, utf8[0]);
1563	if (!str)
1564	return NULL;
1565
1566	if (_test_prefix(str, utf8))
1567	return (char *) str;
1568
1569	str++;
1570	}
1571
1572	return NULL;
1573	}
1574
/** Find first occurrence of substring in string.
 *
 * @param hs Haystack (string)
 * @param n  Needle (substring to look for)
 *
 * @return Pointer to character in @a hs or @c NULL if not found.
 */
char *str_str(const char *hs, const char *n)
{
	size_t hsize = _str_size(hs);
	size_t nsize = _str_size(n);

	/* Stop once the rest of the haystack is shorter than the needle. */
	while (hsize >= nsize) {
		if (_test_prefix(hs, n))
			return (char *) hs;

		hs++;
		hsize--;
	}

	return NULL;
}
1597
/** Remove trailing occurrences of a byte from a string, in place.
 *
 * A string consisting only of @a c becomes empty. A string with nothing
 * to trim, including the empty string, is not written to.
 *
 * @param str Null-terminated string to trim.
 * @param c   Byte to remove from the end of the string.
 */
static void _str_rtrim(char *str, char c)
{
	/* Position just past the last byte that differs from c. */
	char *end = str;

	for (; *str != '\0'; str++) {
		if (*str != c)
			end = str + 1;
	}

	/* Truncate only if there is a trailing run to cut off. */
	if (*end != '\0')
		*end = '\0';
}
1612
1613	/** Removes specified trailing characters from a string.
1614	*
1615	* @param str String to remove from.
1616	* @param ch Character to remove.
1617	*/
1618	void str_rtrim(char *str, char32_t ch)
1619	{
1620	/* Fast path for the ASCII case. */
1621	if (ascii_check(ch)) {
1622	_str_rtrim(str, ch);
1623	return;
1624	}
1625
1626	size_t off = 0;
1627	size_t pos = 0;
1628	char32_t c;
1629	bool update_last_chunk = true;
1630	char *last_chunk = NULL;
1631
1632	while ((c = str_decode(str, &off, STR_NO_LIMIT))) {
1633	if (c != ch) {
1634	update_last_chunk = true;
1635	last_chunk = NULL;
1636	} else if (update_last_chunk) {
1637	update_last_chunk = false;
1638	last_chunk = (str + pos);
1639	}
1640	pos = off;
1641	}
1642
1643	if (last_chunk)
1644	*last_chunk = '\0';
1645	}
1646
/** Remove leading occurrences of a byte from a string, in place.
 *
 * @param str Null-terminated string to trim.
 * @param c   Byte to remove from the start of the string.
 */
static void _str_ltrim(char *str, char c)
{
	const char *first = str;

	/* Never step over the terminator, even when c itself is zero. */
	while (*first != '\0' && *first == c)
		first++;

	/* Shift the remainder to the front; the copy runs front to back. */
	if (first != str)
		_str_cpy(str, first);
}
1657
1658	/** Removes specified leading characters from a string.
1659	*
1660	* @param str String to remove from.
1661	* @param ch Character to remove.
1662	*/
1663	void str_ltrim(char *str, char32_t ch)
1664	{
1665	/* Fast path for the ASCII case. */
1666	if (ascii_check(ch)) {
1667	_str_ltrim(str, ch);
1668	return;
1669	}
1670
1671	char32_t acc;
1672	size_t off = 0;
1673	size_t pos = 0;
1674	size_t str_sz = str_size(str);
1675
1676	while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
1677	if (acc != ch)
1678	break;
1679	else
1680	pos = off;
1681	}
1682
1683	if (pos > 0) {
1684	memmove(str, &str[pos], str_sz - pos);
1685	pos = str_sz - pos;
1686	str[pos] = '\0';
1687	}
1688	}
1689
/** Find last occurrence of a byte in a string.
 *
 * The terminator is not searched, so looking for 0 yields NULL.
 *
 * @param str Null-terminated string to search.
 * @param c   Byte to look for.
 *
 * @return Pointer to the last matching byte or NULL if not found.
 */
static char *_str_rchr(const char *str, char c)
{
	const char *last = NULL;

	for (; *str != '\0'; str++) {
		if (*str == c)
			last = str;
	}

	return (char *) last;
}
1703
1704	/** Find last occurence of character in string.
1705	*
1706	* @param str String to search.
1707	* @param ch Character to look for.
1708	*
1709	* @return Pointer to character in @a str or NULL if not found.
1710	*/
1711	char str_rchr(const char str, char32_t ch)
1712	{
1713	if (ascii_check(ch))
1714	return _str_rchr(str, ch);
1715
1716	char32_t acc;
1717	size_t off = 0;
1718	size_t last = 0;
1719	const char *res = NULL;
1720
1721	while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
1722	if (acc == ch)
1723	res = (str + last);
1724	last = off;
1725	}
1726
1727	return (char *) res;
1728	}
1729
/** Insert a wide character into a wide string.
 *
 * Insert a wide character into a wide string at position
 * @a pos. The characters after the position are shifted.
 *
 * @param str     String to insert to.
 * @param ch      Character to insert to.
 * @param pos     Character index where to insert.
 * @param max_pos Characters in the buffer.
 *
 * @return True if the insertion was successful, false if the position
 *         is out of bounds or the string already holds @a max_pos
 *         characters.
 *
 */
bool wstr_linsert(char32_t *str, char32_t ch, size_t pos, size_t max_pos)
{
	size_t len = wstr_length(str);

	/*
	 * Refuse a position past the end of the string, and a string that
	 * is already full: shifting would push its terminator beyond index
	 * max_pos. Together the two checks also imply pos < max_pos.
	 *
	 * NOTE(review): this assumes the buffer holds max_pos characters
	 * plus the terminator - confirm against the callers.
	 */
	if ((pos > len) || (len >= max_pos))
		return false;

	/* Shift the tail, terminator included, one slot to the right. */
	for (size_t i = len + 1; i > pos; i--)
		str[i] = str[i - 1];

	str[pos] = ch;

	return true;
}
1759
/** Remove a wide character from a wide string.
 *
 * Delete the wide character at index @a pos; everything behind it,
 * the terminator included, moves one slot towards the start.
 *
 * @param str String to remove from.
 * @param pos Character index to remove.
 *
 * @return True if the removal was successful, false if the position
 *         is out of bounds.
 *
 */
bool wstr_remove(char32_t *str, size_t pos)
{
	const size_t len = wstr_length(str);

	if (pos >= len)
		return false;

	/* The final step copies the terminator from index len. */
	for (char32_t *p = str + pos; p < str + len; p++)
		p[0] = p[1];

	return true;
}
1785
1786	/** Duplicate string.
1787	*
1788	* Allocate a new string and copy characters from the source
1789	* string into it. The duplicate string is allocated via sleeping
1790	* malloc(), thus this function can sleep in no memory conditions.
1791	*
1792	* The allocation cannot fail and the return value is always
1793	* a valid pointer. The duplicate string is always a well-formed
1794	* null-terminated UTF-8 string, but it can differ from the source
1795	* string on the byte level.
1796	*
1797	* @param src Source string.
1798	*
1799	* @return Duplicate string.
1800	*
1801	*/
1802	char str_dup(const char src)
1803	{
1804	size_t size = _str_size(src) + 1;
1805	char *dest = malloc(size);
1806	if (!dest)
1807	return NULL;
1808
1809	memcpy(dest, src, size);
1810	_str_sanitize(dest, size, U_SPECIAL);
1811	return dest;
1812	}
1813
1814	/** Duplicate string with size limit.
1815	*
1816	* Allocate a new string and copy up to @max_size bytes from the source
1817	* string into it. The duplicate string is allocated via sleeping
1818	* malloc(), thus this function can sleep in no memory conditions.
1819	* No more than @max_size + 1 bytes is allocated, but if the size
1820	* occupied by the source string is smaller than @max_size + 1,
1821	* less is allocated.
1822	*
1823	* The allocation cannot fail and the return value is always
1824	* a valid pointer. The duplicate string is always a well-formed
1825	* null-terminated UTF-8 string, but it can differ from the source
1826	* string on the byte level.
1827	*
1828	* @param src Source string.
1829	* @param n Maximum number of bytes to duplicate.
1830	*
1831	* @return Duplicate string.
1832	*
1833	*/
1834	char str_ndup(const char src, size_t n)
1835	{
1836	size_t size = _str_nsize(src, n);
1837
1838	char *dest = malloc(size + 1);
1839	if (!dest)
1840	return NULL;
1841
1842	memcpy(dest, src, size);
1843	_str_sanitize(dest, size, U_SPECIAL);
1844	dest[size] = 0;
1845	return dest;
1846	}
1847
/** Split string by delimiters.
 *
 * The input string is modified: the delimiter that ends the token is
 * overwritten with a null terminator.
 *
 * @param s     String to be tokenized. If NULL, NULL is returned and
 *              @a next is left untouched.
 * @param delim String with the delimiters.
 * @param next  Variable which will receive the pointer to the
 *              continuation of the string following the first
 *              occurrence of any of the delimiter characters.
 *              May be NULL.
 * @return      Pointer to the prefix of @a s before the first
 *              delimiter character. NULL if no such prefix
 *              exists.
 */
char *str_tok(char *s, const char *delim, char **next)
{
	char *start, *end;

	if (!s)
		return NULL;

	size_t len = str_size(s);
	size_t cur;
	size_t tmp;
	char32_t ch;

	/* Skip over leading delimiters. */
	tmp = 0;
	cur = 0;
	while ((ch = str_decode(s, &tmp, len)) && str_chr(delim, ch))
		cur = tmp;
	start = &s[cur];

	/* Skip over token characters. */
	tmp = cur;
	while ((ch = str_decode(s, &tmp, len)) && !str_chr(delim, ch))
		cur = tmp;
	end = &s[cur];

	/*
	 * Continue after the delimiter that ended the token, or at the
	 * terminator when the end of the string was reached.
	 */
	if (next)
		*next = (ch ? &s[tmp] : &s[cur]);

	if (start == end)
		return NULL;	/* No more tokens. */

	/* Overwrite delimiter with NULL terminator. */
	*end = '\0';
	return start;
}
1894
/** Scale a value down to a decimal (power of 1000) unit.
 *
 * A larger unit is selected only once the value exceeds 1000 of that
 * unit expressed in the next smaller one, so the scaled value can be
 * greater than 1000. Values up to 1000000 are returned unscaled.
 *
 * @param val    Value to scale.
 * @param rv     Receives the scaled value (integer division).
 * @param suffix Receives the unit prefix character: 'E', 'P', 'T', 'G',
 *               'M', 'k', or a space when the value is not scaled.
 */
void order_suffix(const uint64_t val, uint64_t *rv, char *suffix)
{
	if (val > UINT64_C(10000000000000000000)) {
		/* Divided by 10^18, i.e. exa. */
		*rv = val / UINT64_C(1000000000000000000);
		*suffix = 'E';
	} else if (val > UINT64_C(1000000000000000000)) {
		/* Divided by 10^15, i.e. peta. */
		*rv = val / UINT64_C(1000000000000000);
		*suffix = 'P';
	} else if (val > UINT64_C(1000000000000000)) {
		*rv = val / UINT64_C(1000000000000);
		*suffix = 'T';
	} else if (val > UINT64_C(1000000000000)) {
		*rv = val / UINT64_C(1000000000);
		*suffix = 'G';
	} else if (val > UINT64_C(1000000000)) {
		*rv = val / UINT64_C(1000000);
		*suffix = 'M';
	} else if (val > UINT64_C(1000000)) {
		*rv = val / UINT64_C(1000);
		*suffix = 'k';
	} else {
		*rv = val;
		*suffix = ' ';
	}
}
1920
/** Scale a byte count down to a binary (power of 1024) unit.
 *
 * A larger unit is selected only once the value exceeds 1024 of that
 * unit expressed in the next smaller one, so the scaled value can be
 * greater than 1024. Values up to 1048576 are returned unscaled.
 *
 * @param val    Value to scale.
 * @param rv     Receives the scaled value (integer division).
 * @param suffix Receives a pointer to a constant unit string: "PiB",
 *               "TiB", "GiB", "MiB", "KiB" or "B".
 * @param fixed  If true, the plain byte suffix is padded with spaces to
 *               the width of the other suffixes.
 */
void bin_order_suffix(const uint64_t val, uint64_t *rv, const char **suffix,
    bool fixed)
{
	if (val > UINT64_C(1152921504606846976)) {
		/* Divided by 2^50, i.e. pebibytes. */
		*rv = val / UINT64_C(1125899906842624);
		*suffix = "PiB";
	} else if (val > UINT64_C(1125899906842624)) {
		*rv = val / UINT64_C(1099511627776);
		*suffix = "TiB";
	} else if (val > UINT64_C(1099511627776)) {
		*rv = val / UINT64_C(1073741824);
		*suffix = "GiB";
	} else if (val > UINT64_C(1073741824)) {
		*rv = val / UINT64_C(1048576);
		*suffix = "MiB";
	} else if (val > UINT64_C(1048576)) {
		*rv = val / UINT64_C(1024);
		*suffix = "KiB";
	} else {
		*rv = val;
		if (fixed)
			*suffix = "B  ";
		else
			*suffix = "B";
	}
}
1947
1948	/** @}
1949	*/

Note: See TracBrowser for help on using the repository browser.

Download in other formats: