Context Navigation

source: mainline/common/str.c@ 65bf084

Visit:

Last change on this file since 65bf084 was 65bf084, checked in by Jiří Zárevúcky <zarevucky.jiri@…>, 2 months ago
Implement both str_decode() and mbrtoc32() using one function
Property mode set to `100644`
File size: 44.6 KB

Line
1	/*
2	* Copyright (c) 2001-2004 Jakub Jermar
3	* Copyright (c) 2005 Martin Decky
4	* Copyright (c) 2008 Jiri Svoboda
5	* Copyright (c) 2011 Martin Sucha
6	* Copyright (c) 2011 Oleg Romanenko
7	* Copyright (c) 2025 Jiří Zárevúcky
8	* All rights reserved.
9	*
10	* Redistribution and use in source and binary forms, with or without
11	* modification, are permitted provided that the following conditions
12	* are met:
13	*
14	* - Redistributions of source code must retain the above copyright
15	* notice, this list of conditions and the following disclaimer.
16	* - Redistributions in binary form must reproduce the above copyright
17	* notice, this list of conditions and the following disclaimer in the
18	* documentation and/or other materials provided with the distribution.
19	* - The name of the author may not be used to endorse or promote products
20	* derived from this software without specific prior written permission.
21	*
22	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
23	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
24	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
25	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
26	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
27	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
31	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32	*/
33
34	/** @addtogroup libc
35	* @{
36	*/
37
38	/**
39	* @file
40	* @brief String functions.
41	*
42	* Strings and characters use the Universal Character Set (UCS). The standard
43	* strings, called just strings are encoded in UTF-8. Wide strings (encoded
44	* in UTF-32) are supported to a limited degree. A single character is
45	* represented as char32_t.@n
46	*
47	* Overview of the terminology:@n
48	*
49	* Term Meaning
50	* -------------------- ----------------------------------------------------
51	* byte 8 bits stored in uint8_t (unsigned 8 bit integer)
52	*
53	* character UTF-32 encoded Unicode character, stored in char32_t
54	* (unsigned 32 bit integer), code points 0 .. 1114111
55	* are valid
56	*
57	* Note that Unicode characters do not match
58	* one-to-one with displayed characters or glyphs on
59	* screen. For that level of precision, look up
60	* Grapheme Clusters.
61	*
62	* ASCII character 7 bit encoded ASCII character, stored in char
63	* (usually signed 8 bit integer), code points 0 .. 127
64	* are valid
65	*
66	* string UTF-8 encoded NULL-terminated Unicode string, char *
67	*
68	* wide string UTF-32 encoded NULL-terminated Unicode string,
69	* char32_t *
70	*
71	* [wide] string size number of BYTES in a [wide] string (excluding
72	* the NULL-terminator), size_t
73	*
74	* [wide] string length number of CHARACTERS in a [wide] string (excluding
75	* the NULL-terminator), size_t
76	*
77	* [wide] string width number of display cells on a monospace display taken
78	* by a [wide] string, size_t
79	*
80	* This is virtually impossible to determine exactly for
81	* all strings without knowing specifics of the display
82	* device, due to various factors affecting text output.
83	* If you have the option to query the terminal for
84	* position change caused by outputting the string,
85	* it is preferable to determine width that way.
86	*
87	*
88	* Overview of string metrics:@n
89	*
90	* Metric Abbrev. Type Meaning
91	* ------ ------ ------ -------------------------------------------------
92	* size n size_t number of BYTES in a string (excluding the
93	* NULL-terminator)
94	*
95	* length l size_t number of CHARACTERS in a string (excluding the
96	* null terminator)
97	*
98	* width w size_t number of display cells on a monospace display
99	* taken by a string
100	*
101	*
102	* Function naming prefixes:@n
103	*
104	* chr_ operate on characters
105	* ascii_ operate on ASCII characters
106	* str_ operate on strings
107	* wstr_ operate on wide strings
108	*
109	* [w]str_[n|l|w] operate on a prefix limited by size, length
110	* or width
111	*
112	*
113	* A specific character inside a [wide] string can be referred to by:@n
114	*
115	* pointer (char *, char32_t *)
116	* byte offset (size_t)
117	* character index (size_t)
118	*
119	*/
120
121	#include <str.h>
122
123	#include <align.h>
124	#include <assert.h>
125	#include <ctype.h>
126	#include <errno.h>
127	#include <limits.h>
128	#include <macros.h>
129	#include <mem.h>
130	#include <stdbool.h>
131	#include <stddef.h>
132	#include <stdint.h>
133	#include <stdlib.h>
134	#include <uchar.h>
135
136	#if __STDC_HOSTED__
137	#include <fibril.h>
138	#endif
139
/** Record an illegal-byte-sequence error.
 *
 * Stores EILSEQ into errno. The store is compiled in only when errno is
 * defined as a macro; otherwise the function does nothing.
 */
static void _set_ilseq(void)
{
#ifdef errno
	errno = EILSEQ;
#endif
}
146
/** Byte mask consisting of lowest @n bits (out of 8) */
#define LO_MASK_8(n)  ((uint8_t) ((1U << (n)) - 1))

/** Mask consisting of lowest @n bits (out of 32), valid for n in 0 .. 31 */
#define LO_MASK_32(n)  ((uint32_t) ((UINT32_C(1) << (n)) - 1))

/**
 * Byte mask consisting of highest @n bits (out of 8).
 *
 * The complement is cast back to uint8_t: operator ~ promotes its operand
 * to int, so without the cast the result would have all upper bits set.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6

/* Payload bits of the initial byte of a 2-, 3- and 4-byte sequence. */
#define UTF8_MASK_INITIAL2  0b00011111
#define UTF8_MASK_INITIAL3  0b00001111
#define UTF8_MASK_INITIAL4  0b00000111
/* Payload bits of a continuation byte. */
#define UTF8_MASK_CONT      0b00111111

/** Internal marker returned by the decoder for malformed input. */
#define CHAR_INVALID ((char32_t) UINT_MAX)
165
/** True if @a b is a single-byte (7-bit) character, i.e. its top bit is clear. */
static inline bool _is_ascii(uint8_t b)
{
	return (b & 0x80) == 0;
}
170
/** True if @a b has the form 10xxxxxx (UTF-8 continuation byte). */
static inline bool _is_continuation(uint8_t b)
{
	return (b >> 6) == 0x02;
}
175
/** True if @a c has the form 110xxxxx (initial byte of a 2-byte sequence). */
static inline bool _is_2_byte(uint8_t c)
{
	return (c >> 5) == 0x06;
}
180
/** True if @a c has the form 1110xxxx (initial byte of a 3-byte sequence). */
static inline bool _is_3_byte(uint8_t c)
{
	return (c >> 4) == 0x0E;
}
185
/** True if @a c has the form 11110xxx (initial byte of a 4-byte sequence). */
static inline bool _is_4_byte(uint8_t c)
{
	return (c >> 3) == 0x1E;
}
190
/** Number of continuation bytes needed to encode character @a c in UTF-8.
 *
 * @param c Character code.
 *
 * @return 0 .. 3 for codes that fit into 7, 11, 16 and 21 bits respectively,
 *         -1 for codes longer than 21 bits (not supported).
 */
static inline int _char_continuation_bytes(char32_t c)
{
	/* Payload capacity (in bits) of a sequence with 0, 1, 2, 3 continuation bytes. */
	static const uint8_t payload_bits[] = { 7, 11, 16, 21 };

	for (int cont = 0; cont < 4; cont++) {
		/* The code fits if nothing remains above the payload bits. */
		if ((c >> payload_bits[cont]) == 0)
			return cont;
	}

	return -1;
}
208
/** Number of continuation bytes announced by an initial byte.
 *
 * @param b First byte of an encoded character.
 *
 * @return 0 .. 3 according to the bit pattern of @a b, or -1 if @a b
 *         is not a valid initial byte (e.g. it is a continuation byte).
 */
static inline int _continuation_bytes(uint8_t b)
{
	if ((b & 0x80) == 0x00)
		return 0;	/* 0xxxxxxx */

	if ((b & 0xE0) == 0xC0)
		return 1;	/* 110xxxxx 10xxxxxx */

	if ((b & 0xF0) == 0xE0)
		return 2;	/* 1110xxxx 10xxxxxx 10xxxxxx */

	if ((b & 0xF8) == 0xF0)
		return 3;	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */

	return -1;
}
229
230	static bool _is_non_shortest(const mbstate_t *mb, uint8_t b)
231	{
232	return (mb->state == 0b1111110000000000 && !(b & 0b00100000)) \|\|
233	(mb->state == 0b1111111111110000 && !(b & 0b00110000));
234	}
235
/* Branch prediction hints: the condition is expected to hold / not to hold. */
#define _likely(expr)    __builtin_expect((expr), 1)
#define _unlikely(expr)  __builtin_expect((expr), 0)

/* Set to 0 to route all multibyte sequences through the generic decoder loop. */
#define FAST_PATHS  1
240
241	static char32_t _str_decode(const char s, size_t offset, size_t size, mbstate_t *mb)
242	{
243	assert(s);
244	assert(offset);
245	assert(*offset <= size);
246	assert(size == STR_NO_LIMIT \|\| s + size >= s);
247	assert(mb);
248
249	if (*offset == size)
250	return 0;
251
252	if (_likely(!mb->state)) {
253	/* Clean slate, read initial byte. */
254	uint8_t b = s[(*offset)++];
255
256	/* Fast exit for the most common case. */
257	if (_likely(_is_ascii(b)))
258	return b;
259
260	/* unexpected continuation byte */
261	if (_unlikely(_is_continuation(b)))
262	return CHAR_INVALID;
263
264	/*
265	* The value stored into `continuation` is designed to have
266	* just enough leading ones that after shifting in one less than
267	* the expected number of continuation bytes, the most significant
268	* bit becomes zero. (The field is 16b wide.)
269	*/
270
271	if (_is_2_byte(b)) {
272	/* Reject non-shortest form. */
273	if (_unlikely(!(b & 0b00011110)))
274	return CHAR_INVALID;
275
276	#if FAST_PATHS
277	/* We can usually take this exit. */
278	if (_likely(offset < size && _is_continuation(s[offset])))
279	return (b & UTF8_MASK_INITIAL2) << 6 \|
280	(s[(*offset)++] & UTF8_MASK_CONT);
281	#endif
282
283	/* 2 byte continuation 110xxxxx */
284	mb->state = b ^ 0b0000000011000000;
285
286	} else if (_is_3_byte(b)) {
287	#if FAST_PATHS
288	/* We can usually take this exit. */
289	if (_likely(offset + 1 < size && _is_continuation(s[offset]) && _is_continuation(s[*offset + 1]))) {
290
291	char32_t ch = (b & UTF8_MASK_INITIAL3) << 12 \|
292	(s[(*offset)] & UTF8_MASK_CONT) << 6 \|
293	(s[(*offset) + 1] & UTF8_MASK_CONT);
294
295	*offset += 2;
296
297	/* Reject non-shortest form. */
298	if (_unlikely(!(ch & 0xFFFFF800)))
299	return CHAR_INVALID;
300
301	return ch;
302	}
303	#endif
304
305	/* 3 byte continuation 1110xxxx */
306	mb->state = b ^ 0b1111110011100000;
307
308	} else if (_is_4_byte(b)) {
309	#if FAST_PATHS
310	/* We can usually take this exit. */
311	if (_likely(offset + 2 < size && _is_continuation(s[offset]) &&
312	_is_continuation(s[offset + 1]) && _is_continuation(s[offset + 2]))) {
313
314	char32_t ch = (b & UTF8_MASK_INITIAL4) << 18 \|
315	(s[(*offset)] & UTF8_MASK_CONT) << 12 \|
316	(s[(*offset) + 1] & UTF8_MASK_CONT) << 6 \|
317	(s[(*offset) + 2] & UTF8_MASK_CONT);
318
319	*offset += 3;
320
321	/* Reject non-shortest form. */
322	if (_unlikely(!(ch & 0xFFFF0000)))
323	return CHAR_INVALID;
324
325	return ch;
326	}
327	#endif
328
329	/* 4 byte continuation 11110xxx */
330	mb->state = b ^ 0b1111111100000000;
331	} else {
332	return CHAR_INVALID;
333	}
334	}
335
336	/* Deal with the remaining edge and invalid cases. */
337	for (; offset < size; (offset)++) {
338	/* Read continuation bytes. */
339	uint8_t b = s[*offset];
340
341	if (!_is_continuation(b) \|\| _is_non_shortest(mb, b)) {
342	mb->state = 0;
343	return CHAR_INVALID;
344	}
345
346	/* Top bit becomes zero when shifting in the second to last byte. */
347	if (!(mb->state & 0x8000)) {
348	char32_t c = ((char32_t) mb->state) << 6 \| (b & UTF8_MASK_CONT);
349	mb->state = 0;
350	(*offset)++;
351	return c;
352	}
353
354	mb->state = mb->state << 6 \| (b & UTF8_MASK_CONT);
355	}
356
357	/* Incomplete character. */
358	assert(mb->state);
359	return 0;
360	}
361
362	/** Standard <uchar.h> function since C11. */
363	size_t mbrtoc32(char32_t c, const char s, size_t n, mbstate_t *mb)
364	{
365	#if __STDC_HOSTED__
366	static fibril_local mbstate_t global_state = { };
367
368	if (!mb)
369	mb = &global_state;
370	#endif
371
372	if (!s) {
373	/* Equivalent to mbrtoc32(NULL, "", 1, mb); */
374	c = NULL;
375	s = "";
376	n = 1;
377	}
378
379	size_t offset = 0;
380	char32_t ret = _str_decode(s, &offset, n, mb);
381	if (ret == CHAR_INVALID) {
382	assert(!mb->state);
383	_set_ilseq();
384	return UCHAR_ILSEQ;
385	}
386	if (mb->state) {
387	assert(ret == 0);
388	return UCHAR_INCOMPLETE;
389	}
390
391	if (c)
392	*c = ret;
393	return ret ? offset : 0;
394	}
395
396	/** Decode a single character from a string.
397	*
398	* Decode a single character from a string of size @a size. Decoding starts
399	* at @a offset and this offset is moved to the beginning of the next
400	* character. In case of decoding error, offset generally advances at least
401	* by one. However, offset is never moved beyond size.
402	*
403	* @param str String (not necessarily NULL-terminated).
404	* @param offset Byte offset in string where to start decoding.
405	* @param size Size of the string (in bytes).
406	*
407	* @return Value of decoded character, U_SPECIAL on decoding error or
408	* NULL if attempt to decode beyond @a size.
409	*
410	*/
411	char32_t str_decode(const char str, size_t offset, size_t size)
412	{
413	mbstate_t mb = { };
414	char32_t ch = _str_decode(str, offset, size, &mb);
415
416	if (ch == CHAR_INVALID)
417	return U_SPECIAL;
418
419	if (mb.state)
420	return U_SPECIAL;
421
422	return ch;
423	}
424
425	/** Decode a single character from a string to the left.
426	*
427	* Decode a single character from a string of size @a size. Decoding starts
428	* at @a offset and this offset is moved to the beginning of the previous
429	* character. In case of decoding error, offset generally decreases at least
430	* by one. However, offset is never moved before 0.
431	*
432	* @param str String (not necessarily NULL-terminated).
433	* @param offset Byte offset in string where to start decoding.
434	* @param size Size of the string (in bytes).
435	*
436	* @return Value of decoded character, U_SPECIAL on decoding error or
437	* NULL if attempt to decode beyond @a start of str.
438	*
439	*/
440	char32_t str_decode_reverse(const char str, size_t offset, size_t size)
441	{
442	if (*offset == 0)
443	return 0;
444
445	int cbytes = 0;
446	/* Continue while continuation bytes found */
447	while (*offset > 0 && cbytes < 4) {
448	uint8_t b = (uint8_t) str[--(*offset)];
449
450	if (_is_continuation(b)) {
451	cbytes++;
452	continue;
453	}
454
455	/* Reject non-shortest form encoding. */
456	if (cbytes != _continuation_bytes(b))
457	return U_SPECIAL;
458
459	/* Start byte */
460	size_t start_offset = *offset;
461	return str_decode(str, &start_offset, size);
462	}
463
464	/* Too many continuation bytes */
465	return U_SPECIAL;
466	}
467
468	/** Encode a single character to string representation.
469	*
470	* Encode a single character to string representation (i.e. UTF-8) and store
471	* it into a buffer at @a offset. Encoding starts at @a offset and this offset
472	* is moved to the position where the next character can be written to.
473	*
474	* @param ch Input character.
475	* @param str Output buffer.
476	* @param offset Byte offset where to start writing.
477	* @param size Size of the output buffer (in bytes).
478	*
479	* @return EOK if the character was encoded successfully, EOVERFLOW if there
480	* was not enough space in the output buffer or EINVAL if the character
481	* code was invalid.
482	*/
483	errno_t chr_encode(char32_t ch, char str, size_t offset, size_t size)
484	{
485	// TODO: merge with c32rtomb()
486
487	if (*offset >= size)
488	return EOVERFLOW;
489
490	/* Fast exit for the most common case. */
491	if (ch < 0x80) {
492	str[(*offset)++] = (char) ch;
493	return EOK;
494	}
495
496	/* Codes longer than 21 bits are not supported */
497	if (!chr_check(ch))
498	return EINVAL;
499
500	/* Determine how many continuation bytes are needed */
501
502	unsigned int cbytes = _char_continuation_bytes(ch);
503	unsigned int b0_bits = 6 - cbytes; /* Data bits in first byte */
504
505	/* Check for available space in buffer */
506	if (*offset + cbytes >= size)
507	return EOVERFLOW;
508
509	/* Encode continuation bytes */
510	unsigned int i;
511	for (i = cbytes; i > 0; i--) {
512	str[*offset + i] = 0x80 \| (ch & LO_MASK_32(CONT_BITS));
513	ch >>= CONT_BITS;
514	}
515
516	/* Encode first byte */
517	str[*offset] = (ch & LO_MASK_32(b0_bits)) \| HI_MASK_8(8 - b0_bits - 1);
518
519	/* Advance offset */
520	*offset += cbytes + 1;
521
522	return EOK;
523	}
524
525	/* Convert in place any bytes that don't form a valid character into U_SPECIAL. */
526	static void _sanitize_string(char *str, size_t n)
527	{
528	uint8_t b = (uint8_t ) str;
529
530	for (; *b && n > 0; b++, n--) {
531	int cont = _continuation_bytes(b[0]);
532	if (__builtin_expect(cont, 0) == 0)
533	continue;
534
535	if (cont < 0 \|\| n <= (size_t) cont) {
536	b[0] = U_SPECIAL;
537	continue;
538	}
539
540	/* Check continuation bytes. */
541	for (int i = 1; i <= cont; i++) {
542	if (!_is_continuation(b[i])) {
543	b[0] = U_SPECIAL;
544	continue;
545	}
546	}
547
548	/*
549	* Check for non-shortest form encoding.
550	* See https://www.unicode.org/versions/corrigendum1.html
551	*/
552
553	switch (cont) {
554	case 1:
555	/* 0b110!!!!x 0b10xxxxxx */
556	if (!(b[0] & 0b00011110))
557	b[0] = U_SPECIAL;
558
559	continue;
560	case 2:
561	/* 0b1110!!!! 0b10!xxxxx 0b10xxxxxx */
562	if (!(b[0] & 0b00001111) && !(b[1] & 0b00100000))
563	b[0] = U_SPECIAL;
564
565	continue;
566	case 3:
567	/* 0b11110!!! 0b10!!xxxx 0b10xxxxxx 0b10xxxxxx */
568	if (!(b[0] & 0b00000111) && !(b[1] & 0b00110000))
569	b[0] = U_SPECIAL;
570
571	continue;
572	}
573	}
574	}
575
/** Count the bytes of @a str preceding its NUL terminator. */
static size_t _str_size(const char *str)
{
	const char *end = str;

	while (*end != 0)
		end++;

	return (size_t) (end - str);
}
585
/** Get size of string.
 *
 * Count the bytes occupied by the string @a str, not including the
 * NULL-terminator.
 *
 * @param str String to consider.
 *
 * @return Number of bytes used by the string
 *
 */
size_t str_size(const char *str)
{
	size_t bytes = _str_size(str);

	return bytes;
}
600
/** Get size of wide string.
 *
 * Count the bytes occupied by the wide string @a str, not including the
 * NULL-terminator.
 *
 * @param str Wide string to consider.
 *
 * @return Number of bytes used by the wide string
 *
 */
size_t wstr_size(const char32_t *str)
{
	size_t chars = wstr_length(str);

	return chars * sizeof(char32_t);
}
615
616	/** Get size of string with length limit.
617	*
618	* Get the number of bytes which are used by up to @a max_len first
619	* characters in the string @a str. If @a max_len is greater than
620	* the length of @a str, the entire string is measured (excluding the
621	* NULL-terminator).
622	*
623	* @param str String to consider.
624	* @param max_len Maximum number of characters to measure.
625	*
626	* @return Number of bytes used by the characters.
627	*
628	*/
629	size_t str_lsize(const char *str, size_t max_len)
630	{
631	size_t len = 0;
632	size_t offset = 0;
633
634	while (len < max_len) {
635	if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
636	break;
637
638	len++;
639	}
640
641	return offset;
642	}
643
/** Count the bytes of @a str before its NUL terminator, at most @a max_size.
 *
 * The bound is tested before each byte is read, so no more than
 * @a max_size bytes of @a str are ever accessed.
 */
static size_t _str_nsize(const char *str, size_t max_size)
{
	size_t size = 0;

	while (size < max_size && str[size] != 0)
		size++;

	return size;
}
653
/** Get size of string with size limit.
 *
 * Count the bytes occupied by the string @a str (not including the
 * NULL-terminator), stopping at @a max_size bytes.
 *
 * @param str      String to consider.
 * @param max_size Maximum number of bytes to measure.
 *
 * @return Number of bytes used by the string
 *
 */
size_t str_nsize(const char *str, size_t max_size)
{
	size_t bytes = _str_nsize(str, max_size);

	return bytes;
}
669
/** Get size of wide string with size limit.
 *
 * Count the bytes occupied by the wide string @a str (not including the
 * NULL-terminator), stopping at @a max_size bytes.
 *
 * @param str      Wide string to consider.
 * @param max_size Maximum number of bytes to measure.
 *
 * @return Number of bytes used by the wide string
 *
 */
size_t wstr_nsize(const char32_t *str, size_t max_size)
{
	size_t chars = wstr_nlength(str, max_size);

	return chars * sizeof(char32_t);
}
685
/** Get size of wide string with length limit.
 *
 * Get the number of bytes which are used by up to @a max_len first
 * wide characters in the wide string @a str. If @a max_len is greater than
 * the length of @a str, the entire wide string is measured (excluding the
 * NULL-terminator).
 *
 * @param str     Wide string to consider.
 * @param max_len Maximum number of wide characters to measure.
 *
 * @return Number of bytes used by the wide characters.
 *
 */
size_t wstr_lsize(const char32_t *str, size_t max_len)
{
	size_t len = 0;

	/*
	 * Count characters directly instead of converting max_len to a byte
	 * limit, which would overflow for large values of max_len.
	 */
	while (len < max_len && str[len] != 0)
		len++;

	return len * sizeof(char32_t);
}
703
704	/** Get number of characters in a string.
705	*
706	* @param str NULL-terminated string.
707	*
708	* @return Number of characters in string.
709	*
710	*/
711	size_t str_length(const char *str)
712	{
713	size_t len = 0;
714	size_t offset = 0;
715
716	while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
717	len++;
718
719	return len;
720	}
721
/** Get number of characters in a wide string.
 *
 * @param wstr NULL-terminated wide string.
 *
 * @return Number of characters in @a wstr.
 *
 */
size_t wstr_length(const char32_t *wstr)
{
	size_t idx;

	for (idx = 0; wstr[idx] != 0; idx++)
		;

	return idx;
}
738
/** Get number of characters in a string with size limit.
 *
 * @param str  NULL-terminated string.
 * @param size Maximum number of bytes to consider.
 *
 * @return Number of characters in string.
 *
 */
size_t str_nlength(const char *str, size_t size)
{
	size_t pos = 0;
	size_t count;

	/* Decoding yields 0 at the terminator or once @a size is reached. */
	for (count = 0; str_decode(str, &pos, size) != 0; count++)
		;

	return count;
}
757
/** Get number of characters in a wide string with size limit.
 *
 * @param str  NULL-terminated wide string.
 * @param size Maximum number of bytes to consider. A trailing partial
 *             character (fewer than sizeof(char32_t) bytes) is ignored.
 *
 * @return Number of characters in the wide string.
 *
 */
size_t wstr_nlength(const char32_t *str, size_t size)
{
	/* Number of whole characters that fit into @a size bytes. */
	size_t max_chars = size / sizeof(char32_t);
	size_t count = 0;

	while (count < max_chars && str[count] != 0)
		count++;

	return count;
}
779
/** Get character display width on a character cell display.
 *
 * Every character is currently reported as one cell wide.
 *
 * @param ch Character
 * @return Width of character in cells.
 */
size_t chr_width(char32_t ch)
{
	(void) ch;

	return 1;
}
789
790	/** Get string display width on a character cell display.
791	*
792	* @param str String
793	* @return Width of string in cells.
794	*/
795	size_t str_width(const char *str)
796	{
797	size_t width = 0;
798	size_t offset = 0;
799	char32_t ch;
800
801	while ((ch = str_decode(str, &offset, STR_NO_LIMIT)) != 0)
802	width += chr_width(ch);
803
804	return width;
805	}
806
/** Check whether character is plain ASCII.
 *
 * @return True if character is plain ASCII (code 0 .. 127).
 *
 */
bool ascii_check(char32_t ch)
{
	return ch < 0x80;
}
819
/** Check whether character is valid
 *
 * @return True if character is a valid Unicode code point
 *         (0 .. 0x10FFFF).
 *
 */
bool chr_check(char32_t ch)
{
	return ch <= 0x10FFFF;
}
832
/** Compare two NULL terminated strings.
 *
 * Do a char-by-char comparison of two NULL-terminated strings.
 * The strings are considered equal iff their length is equal
 * and both strings consist of the same sequence of characters.
 *
 * A string S1 is less than another string S2 if it has a character with
 * lower value at the first character position where the strings differ.
 * If the strings differ in length, the shorter one is treated as if
 * padded by characters with a value of zero.
 *
 * @param s1 First string to compare.
 * @param s2 Second string to compare.
 *
 * @return 0 if the strings are equal, -1 if the first is less than the second,
 *         1 if the second is less than the first.
 *
 */
int str_cmp(const char *s1, const char *s2)
{
	/*
	 * UTF-8 has the nice property that lexicographic ordering on bytes is
	 * the same as the lexicographic ordering of the character sequences.
	 *
	 * That only holds for UNSIGNED bytes: plain char may be signed, which
	 * would order every non-ASCII byte below the ASCII ones.
	 */
	const unsigned char *p1 = (const unsigned char *) s1;
	const unsigned char *p2 = (const unsigned char *) s2;

	while (*p1 == *p2 && *p1 != 0) {
		p1++;
		p2++;
	}

	if (*p1 == *p2)
		return 0;

	return (*p1 < *p2) ? -1 : 1;
}
867
868	/** Compare two NULL terminated strings with length limit.
869	*
870	* Do a char-by-char comparison of two NULL-terminated strings.
871	* The strings are considered equal iff
872	* min(str_length(s1), max_len) == min(str_length(s2), max_len)
873	* and both strings consist of the same sequence of characters,
874	* up to max_len characters.
875	*
876	* A string S1 is less than another string S2 if it has a character with
877	* lower value at the first character position where the strings differ.
878	* If the strings differ in length, the shorter one is treated as if
879	* padded by characters with a value of zero. Only the first max_len
880	* characters are considered.
881	*
882	* @param s1 First string to compare.
883	* @param s2 Second string to compare.
884	* @param max_len Maximum number of characters to consider.
885	*
886	* @return 0 if the strings are equal, -1 if the first is less than the second,
887	* 1 if the second is less than the first.
888	*
889	*/
890	int str_lcmp(const char s1, const char s2, size_t max_len)
891	{
892	char32_t c1 = 0;
893	char32_t c2 = 0;
894
895	size_t off1 = 0;
896	size_t off2 = 0;
897
898	size_t len = 0;
899
900	while (true) {
901	if (len >= max_len)
902	break;
903
904	c1 = str_decode(s1, &off1, STR_NO_LIMIT);
905	c2 = str_decode(s2, &off2, STR_NO_LIMIT);
906
907	if (c1 < c2)
908	return -1;
909
910	if (c1 > c2)
911	return 1;
912
913	if (c1 == 0 \|\| c2 == 0)
914	break;
915
916	++len;
917	}
918
919	return 0;
920
921	}
922
923	/** Compare two NULL terminated strings in case-insensitive manner.
924	*
925	* Do a char-by-char comparison of two NULL-terminated strings.
926	* The strings are considered equal iff their length is equal
927	* and both strings consist of the same sequence of characters
928	* when converted to lower case.
929	*
930	* A string S1 is less than another string S2 if it has a character with
931	* lower value at the first character position where the strings differ.
932	* If the strings differ in length, the shorter one is treated as if
933	* padded by characters with a value of zero.
934	*
935	* @param s1 First string to compare.
936	* @param s2 Second string to compare.
937	*
938	* @return 0 if the strings are equal, -1 if the first is less than the second,
939	* 1 if the second is less than the first.
940	*
941	*/
942	int str_casecmp(const char s1, const char s2)
943	{
944	// FIXME: doesn't work for non-ASCII caseful characters
945
946	char32_t c1 = 0;
947	char32_t c2 = 0;
948
949	size_t off1 = 0;
950	size_t off2 = 0;
951
952	while (true) {
953	c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
954	c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
955
956	if (c1 < c2)
957	return -1;
958
959	if (c1 > c2)
960	return 1;
961
962	if (c1 == 0 \|\| c2 == 0)
963	break;
964	}
965
966	return 0;
967	}
968
969	/** Compare two NULL terminated strings with length limit in case-insensitive
970	* manner.
971	*
972	* Do a char-by-char comparison of two NULL-terminated strings.
973	* The strings are considered equal iff
974	* min(str_length(s1), max_len) == min(str_length(s2), max_len)
975	* and both strings consist of the same sequence of characters,
976	* up to max_len characters.
977	*
978	* A string S1 is less than another string S2 if it has a character with
979	* lower value at the first character position where the strings differ.
980	* If the strings differ in length, the shorter one is treated as if
981	* padded by characters with a value of zero. Only the first max_len
982	* characters are considered.
983	*
984	* @param s1 First string to compare.
985	* @param s2 Second string to compare.
986	* @param max_len Maximum number of characters to consider.
987	*
988	* @return 0 if the strings are equal, -1 if the first is less than the second,
989	* 1 if the second is less than the first.
990	*
991	*/
992	int str_lcasecmp(const char s1, const char s2, size_t max_len)
993	{
994	// FIXME: doesn't work for non-ASCII caseful characters
995
996	char32_t c1 = 0;
997	char32_t c2 = 0;
998
999	size_t off1 = 0;
1000	size_t off2 = 0;
1001
1002	size_t len = 0;
1003
1004	while (true) {
1005	if (len >= max_len)
1006	break;
1007
1008	c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
1009	c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
1010
1011	if (c1 < c2)
1012	return -1;
1013
1014	if (c1 > c2)
1015	return 1;
1016
1017	if (c1 == 0 \|\| c2 == 0)
1018	break;
1019
1020	++len;
1021	}
1022
1023	return 0;
1024
1025	}
1026
/** Byte-wise test whether @a p is a prefix of @a s. */
static bool _test_prefix(const char *s, const char *p)
{
	/* Advance over the common part of both strings. */
	while (*s == *p && *s != 0) {
		s++;
		p++;
	}

	/* p is a prefix iff all of it was consumed. */
	return *p == 0;
}
1036
/** Test whether p is a prefix of s.
 *
 * Do a char-by-char comparison of two NULL-terminated strings
 * and determine if p is a prefix of s.
 *
 * @param s The string in which to look
 * @param p The string to check if it is a prefix of s
 *
 * @return true iff p is prefix of s else false
 *
 */
bool str_test_prefix(const char *s, const char *p)
{
	return _test_prefix(s, p);
}
1052
1053	/** Get a string suffix.
1054	*
1055	* Return a string suffix defined by the prefix length.
1056	*
1057	* @param s The string to get the suffix from.
1058	* @param prefix_length Number of prefix characters to ignore.
1059	*
1060	* @return String suffix.
1061	*
1062	*/
1063	const char str_suffix(const char s, size_t prefix_length)
1064	{
1065	size_t off = 0;
1066	size_t i = 0;
1067
1068	while (true) {
1069	str_decode(s, &off, STR_NO_LIMIT);
1070	i++;
1071
1072	if (i >= prefix_length)
1073	break;
1074	}
1075
1076	return s + off;
1077	}
1078
/** Copy string as a sequence of bytes.
 *
 * Copies @a src including its terminator into @a dest with no bound on the
 * number of bytes written.
 */
static void _str_cpy(char *dest, const char *src)
{
	while (*src)
		*(dest++) = *(src++);

	*dest = 0;
}
1087
1088	/** Copy string as a sequence of bytes. */
1089	static void _str_cpyn(char dest, size_t size, const char src)
1090	{
1091	assert(dest && src && size);
1092
1093	if (!dest \|\| !src \|\| !size)
1094	return;
1095
1096	if (size == STR_NO_LIMIT)
1097	return _str_cpy(dest, src);
1098
1099	char *dest_top = dest + size - 1;
1100	assert(size == 1 \|\| dest < dest_top);
1101
1102	while (*src && dest < dest_top)
1103	(dest++) = (src++);
1104
1105	*dest = 0;
1106	}
1107
1108	/** Copy string.
1109	*
1110	* Copy source string @a src to destination buffer @a dest.
1111	* No more than @a size bytes are written. If the size of the output buffer
1112	* is at least one byte, the output string will always be well-formed, i.e.
1113	* null-terminated and containing only complete characters.
1114	*
1115	* @param dest Destination buffer.
1116	* @param count Size of the destination buffer (must be > 0).
1117	* @param src Source string.
1118	*
1119	*/
1120	void str_cpy(char dest, size_t size, const char src)
1121	{
1122	/* There must be space for a null terminator in the buffer. */
1123	assert(size > 0);
1124	assert(src != NULL);
1125	assert(dest != NULL);
1126	assert(size == STR_NO_LIMIT \|\| dest + size > dest);
1127
1128	/* Copy data. */
1129	_str_cpyn(dest, size, src);
1130
1131	/* In-place translate invalid bytes to U_SPECIAL. */
1132	_sanitize_string(dest, size);
1133	}
1134
/** Copy size-limited substring.
 *
 * Copy prefix of string @a src of max. size @a n to destination buffer
 * @a dest. No more than @a size bytes are written. The output string will
 * always be well-formed, i.e. null-terminated and containing only complete
 * characters.
 *
 * No more than @a n bytes are read from the input string, so it does not
 * have to be null-terminated.
 *
 * @param dest Destination buffer.
 * @param size Size of the destination buffer (must be > 0).
 * @param src  Source string.
 * @param n    Maximum number of bytes to read from @a src.
 *
 */
void str_ncpy(char *dest, size_t size, const char *src, size_t n)
{
	/* There must be space for a null terminator in the buffer. */
	assert(size > 0);
	assert(src != NULL);
	assert(dest != NULL);

	/*
	 * Room needed is n data bytes plus the terminator, capped by the
	 * buffer size. Written so that n + 1 is only computed when it is
	 * known to be below size and therefore cannot wrap around.
	 */
	size_t limit = (n < size - 1) ? n + 1 : size;

	/* Copy data. */
	_str_cpyn(dest, limit, src);

	/* In-place translate invalid bytes to U_SPECIAL. */
	_sanitize_string(dest, size);
}
1163
1164	/** Append one string to another.
1165	*
1166	* Append source string @a src to string in destination buffer @a dest.
1167	* Size of the destination buffer is @a dest. If the size of the output buffer
1168	* is at least one byte, the output string will always be well-formed, i.e.
1169	* null-terminated and containing only complete characters.
1170	*
1171	* @param dest Destination buffer.
1172	* @param count Size of the destination buffer.
1173	* @param src Source string.
1174	*/
1175	void str_append(char dest, size_t size, const char src)
1176	{
1177	assert(src != NULL);
1178	assert(dest != NULL);
1179	assert(size > 0);
1180	assert(size == STR_NO_LIMIT \|\| dest + size > dest);
1181
1182	size_t dstr_size = _str_nsize(dest, size);
1183	if (dstr_size < size) {
1184	_str_cpyn(dest + dstr_size, size - dstr_size, src);
1185	_sanitize_string(dest + dstr_size, size - dstr_size);
1186	}
1187	}
1188
1189	/** Convert space-padded ASCII to string.
1190	*
1191	* Common legacy text encoding in hardware is 7-bit ASCII fitted into
1192	* a fixed-width byte buffer (bit 7 always zero), right-padded with spaces
1193	* (ASCII 0x20). Convert space-padded ascii to string representation.
1194	*
1195	* If the text does not fit into the destination buffer, the function converts
1196	* as many characters as possible and returns EOVERFLOW.
1197	*
1198	* If the text contains non-ASCII bytes (with bit 7 set), the whole string is
1199	* converted anyway and invalid characters are replaced with question marks
1200	* (U_SPECIAL) and the function returns EIO.
1201	*
1202	* Regardless of return value upon return @a dest will always be well-formed.
1203	*
1204	* @param dest Destination buffer
1205	* @param size Size of destination buffer
1206	* @param src Space-padded ASCII.
1207	* @param n Size of the source buffer in bytes.
1208	*
1209	* @return EOK on success, EOVERFLOW if the text does not fit
1210	* destination buffer, EIO if the text contains
1211	* non-ASCII bytes.
1212	*/
1213	errno_t spascii_to_str(char dest, size_t size, const uint8_t src, size_t n)
1214	{
1215	size_t len = 0;
1216
1217	/* Determine the length of the source string. */
1218	for (size_t i = 0; i < n; i++) {
1219	if (src[i] == 0)
1220	break;
1221
1222	if (src[i] != ' ')
1223	len = i + 1;
1224	}
1225
1226	errno_t result = EOK;
1227	size_t out_len = min(len, size - 1);
1228
1229	/* Copy characters */
1230	for (size_t i = 0; i < out_len; i++) {
1231	dest[i] = src[i];
1232
1233	if (dest[i] < 0) {
1234	dest[i] = U_SPECIAL;
1235	result = EIO;
1236	}
1237	}
1238
1239	dest[out_len] = 0;
1240
1241	if (out_len < len)
1242	return EOVERFLOW;
1243
1244	return result;
1245	}
1246
1247	/** Convert wide string to string.
1248	*
1249	* Convert wide string @a src to string. The output is written to the buffer
1250	* specified by @a dest and @a size. @a size must be non-zero and the string
1251	* written will always be well-formed.
1252	*
1253	* @param dest Destination buffer.
1254	* @param size Size of the destination buffer.
1255	* @param src Source wide string.
1256	*/
1257	void wstr_to_str(char dest, size_t size, const char32_t src)
1258	{
1259	char32_t ch;
1260	size_t src_idx;
1261	size_t dest_off;
1262
1263	/* There must be space for a null terminator in the buffer. */
1264	assert(size > 0);
1265
1266	src_idx = 0;
1267	dest_off = 0;
1268
1269	while ((ch = src[src_idx++]) != 0) {
1270	if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
1271	break;
1272	}
1273
1274	dest[dest_off] = '\0';
1275	}
1276
1277	/** Convert UTF16 string to string.
1278	*
1279	* Convert utf16 string @a src to string. The output is written to the buffer
1280	* specified by @a dest and @a size. @a size must be non-zero and the string
1281	* written will always be well-formed. Surrogate pairs also supported.
1282	*
1283	* @param dest Destination buffer.
1284	* @param size Size of the destination buffer.
1285	* @param src Source utf16 string.
1286	*
1287	* @return EOK, if success, an error code otherwise.
1288	*/
1289	errno_t utf16_to_str(char dest, size_t size, const uint16_t src)
1290	{
1291	size_t idx = 0, dest_off = 0;
1292	char32_t ch;
1293	errno_t rc = EOK;
1294
1295	/* There must be space for a null terminator in the buffer. */
1296	assert(size > 0);
1297
1298	while (src[idx]) {
1299	if ((src[idx] & 0xfc00) == 0xd800) {
1300	if (src[idx + 1] && (src[idx + 1] & 0xfc00) == 0xdc00) {
1301	ch = 0x10000;
1302	ch += (src[idx] & 0x03FF) << 10;
1303	ch += (src[idx + 1] & 0x03FF);
1304	idx += 2;
1305	} else
1306	break;
1307	} else {
1308	ch = src[idx];
1309	idx++;
1310	}
1311	rc = chr_encode(ch, dest, &dest_off, size - 1);
1312	if (rc != EOK)
1313	break;
1314	}
1315	dest[dest_off] = '\0';
1316	return rc;
1317	}
1318
1319	/** Convert string to UTF16 string.
1320	*
1321	* Convert string @a src to utf16 string. The output is written to the buffer
1322	* specified by @a dest and @a dlen. @a dlen must be non-zero and the string
1323	* written will always be well-formed. Surrogate pairs also supported.
1324	*
1325	* @param dest Destination buffer.
1326	* @param dlen Number of utf16 characters that fit in the destination buffer.
1327	* @param src Source string.
1328	*
1329	* @return EOK, if success, an error code otherwise.
1330	*/
1331	errno_t str_to_utf16(uint16_t dest, size_t dlen, const char src)
1332	{
1333	errno_t rc = EOK;
1334	size_t offset = 0;
1335	size_t idx = 0;
1336	char32_t c;
1337
1338	assert(dlen > 0);
1339
1340	while ((c = str_decode(src, &offset, STR_NO_LIMIT)) != 0) {
1341	if (c > 0x10000) {
1342	if (idx + 2 >= dlen - 1) {
1343	rc = EOVERFLOW;
1344	break;
1345	}
1346	c = (c - 0x10000);
1347	dest[idx] = 0xD800 \| (c >> 10);
1348	dest[idx + 1] = 0xDC00 \| (c & 0x3FF);
1349	idx++;
1350	} else {
1351	dest[idx] = c;
1352	}
1353
1354	idx++;
1355	if (idx >= dlen - 1) {
1356	rc = EOVERFLOW;
1357	break;
1358	}
1359	}
1360
1361	dest[idx] = '\0';
1362	return rc;
1363	}
1364
/** Get size of UTF-16 string.
 *
 * Count the 16-bit words occupied by the UTF-16 string @a ustr, not
 * counting the terminating zero word.
 *
 * @param ustr UTF-16 string to consider.
 *
 * @return Number of words used by the UTF-16 string.
 *
 */
size_t utf16_wsize(const uint16_t *ustr)
{
	const uint16_t *end = ustr;

	/* Walk to the terminating zero word. */
	while (*end != 0)
		end++;

	return (size_t) (end - ustr);
}
1384
1385	/** Convert wide string to new string.
1386	*
1387	* Convert wide string @a src to string. Space for the new string is allocated
1388	* on the heap.
1389	*
1390	* @param src Source wide string.
1391	* @return New string.
1392	*/
1393	char wstr_to_astr(const char32_t src)
1394	{
1395	char dbuf[STR_BOUNDS(1)];
1396	char *str;
1397	char32_t ch;
1398
1399	size_t src_idx;
1400	size_t dest_off;
1401	size_t dest_size;
1402
1403	/* Compute size of encoded string. */
1404
1405	src_idx = 0;
1406	dest_size = 0;
1407
1408	while ((ch = src[src_idx++]) != 0) {
1409	dest_off = 0;
1410	if (chr_encode(ch, dbuf, &dest_off, STR_BOUNDS(1)) != EOK)
1411	break;
1412	dest_size += dest_off;
1413	}
1414
1415	str = malloc(dest_size + 1);
1416	if (str == NULL)
1417	return NULL;
1418
1419	/* Encode string. */
1420
1421	src_idx = 0;
1422	dest_off = 0;
1423
1424	while ((ch = src[src_idx++]) != 0) {
1425	if (chr_encode(ch, str, &dest_off, dest_size) != EOK)
1426	break;
1427	}
1428
1429	str[dest_size] = '\0';
1430	return str;
1431	}
1432
1433	/** Convert string to wide string.
1434	*
1435	* Convert string @a src to wide string. The output is written to the
1436	* buffer specified by @a dest and @a dlen. @a dlen must be non-zero
1437	* and the wide string written will always be null-terminated.
1438	*
1439	* @param dest Destination buffer.
1440	* @param dlen Length of destination buffer (number of wchars).
1441	* @param src Source string.
1442	*/
1443	void str_to_wstr(char32_t dest, size_t dlen, const char src)
1444	{
1445	size_t offset;
1446	size_t di;
1447	char32_t c;
1448
1449	assert(dlen > 0);
1450
1451	offset = 0;
1452	di = 0;
1453
1454	do {
1455	if (di >= dlen - 1)
1456	break;
1457
1458	c = str_decode(src, &offset, STR_NO_LIMIT);
1459	dest[di++] = c;
1460	} while (c != '\0');
1461
1462	dest[dlen - 1] = '\0';
1463	}
1464
/** Convert string to newly allocated wide string.
 *
 * Convert string @a str to wide string. A new null-terminated wide
 * string is allocated on the heap with calloc().
 *
 * @param str Source string.
 *
 * @return New wide string, or NULL if the allocation fails.
 */
char32_t *str_to_awstr(const char *str)
{
	/* One slot per character plus one for the terminator. */
	size_t len = str_length(str);

	char32_t *wstr = calloc(len + 1, sizeof(char32_t));
	if (wstr == NULL)
		return NULL;

	str_to_wstr(wstr, len + 1, str);
	return wstr;
}
1483
/** Find first occurrence of a byte in a null-terminated string.
 *
 * @param str String to search.
 * @param c   Byte to look for. If it is zero, the terminator matches.
 *
 * @return Pointer to the first matching byte, or NULL if the end of the
 *         string is reached without a match.
 */
static char *_strchr(const char *str, char c)
{
	/* Advance until either the terminator or the wanted byte is hit. */
	while (*str != 0 && *str != c)
		str++;

	return (*str == c) ? (char *) str : NULL;
}
1491
1492	/** Find first occurence of character in string.
1493	*
1494	* @param str String to search.
1495	* @param ch Character to look for.
1496	*
1497	* @return Pointer to character in @a str or NULL if not found.
1498	*/
1499	char str_chr(const char str, char32_t ch)
1500	{
1501	/* Fast path for an ASCII character. */
1502	if (ascii_check(ch))
1503	return _strchr(str, ch);
1504
1505	/* Convert character to UTF-8. */
1506	char utf8[STR_BOUNDS(1) + 1];
1507	size_t offset = 0;
1508
1509	if (chr_encode(ch, utf8, &offset, sizeof(utf8)) != EOK \|\| offset == 0)
1510	return NULL;
1511
1512	utf8[offset] = '\0';
1513
1514	/* Find the first byte, then check if all of them are correct. */
1515	while (*str != 0) {
1516	str = _strchr(str, utf8[0]);
1517	if (!str)
1518	return NULL;
1519
1520	if (_test_prefix(str, utf8))
1521	return (char *) str;
1522
1523	str++;
1524	}
1525
1526	return NULL;
1527	}
1528
/** Find first occurrence of substring in string.
 *
 * @param hs Haystack (string)
 * @param n  Needle (substring to look for)
 *
 * @return Pointer to character in @a hs or @c NULL if not found.
 */
char *str_str(const char *hs, const char *n)
{
	size_t hsize = _str_size(hs);
	size_t nsize = _str_size(n);

	/* Stop as soon as the remaining haystack is shorter than the needle. */
	for (; hsize >= nsize; hs++, hsize--) {
		if (_test_prefix(hs, n))
			return (char *) hs;
	}

	return NULL;
}
1551
/** Remove all trailing occurrences of a byte from a string, in place.
 *
 * @param str String to trim.
 * @param c   Byte to remove from the end of @a str.
 */
static void _str_rtrim(char *str, char c)
{
	/*
	 * Position just past the last byte that must be kept. It starts at the
	 * beginning so that an empty string, or a string made only of @a c,
	 * is truncated to "" without writing past the original terminator.
	 */
	char *end = str;

	for (char *p = str; *p != 0; p++) {
		if (*p != c)
			end = p + 1;
	}

	/* Truncate string. */
	*end = 0;
}
1566
1567	/** Removes specified trailing characters from a string.
1568	*
1569	* @param str String to remove from.
1570	* @param ch Character to remove.
1571	*/
1572	void str_rtrim(char *str, char32_t ch)
1573	{
1574	/* Fast path for the ASCII case. */
1575	if (ascii_check(ch)) {
1576	_str_rtrim(str, ch);
1577	return;
1578	}
1579
1580	size_t off = 0;
1581	size_t pos = 0;
1582	char32_t c;
1583	bool update_last_chunk = true;
1584	char *last_chunk = NULL;
1585
1586	while ((c = str_decode(str, &off, STR_NO_LIMIT))) {
1587	if (c != ch) {
1588	update_last_chunk = true;
1589	last_chunk = NULL;
1590	} else if (update_last_chunk) {
1591	update_last_chunk = false;
1592	last_chunk = (str + pos);
1593	}
1594	pos = off;
1595	}
1596
1597	if (last_chunk)
1598	*last_chunk = '\0';
1599	}
1600
/** Remove all leading occurrences of a byte from a string, in place.
 *
 * @param str String to trim.
 * @param c   Byte to remove from the start of @a str.
 */
static void _str_ltrim(char *str, char c)
{
	char *p = str;

	/*
	 * Never step over the terminator: without the explicit check a zero
	 * @a c would match the terminator and the scan would leave the string.
	 */
	while (*p != 0 && *p == c)
		p++;

	/* Shift the remainder to the front only if something was skipped. */
	if (p != str)
		_str_cpy(str, p);
}
1611
1612	/** Removes specified leading characters from a string.
1613	*
1614	* @param str String to remove from.
1615	* @param ch Character to remove.
1616	*/
1617	void str_ltrim(char *str, char32_t ch)
1618	{
1619	/* Fast path for the ASCII case. */
1620	if (ascii_check(ch)) {
1621	_str_ltrim(str, ch);
1622	return;
1623	}
1624
1625	char32_t acc;
1626	size_t off = 0;
1627	size_t pos = 0;
1628	size_t str_sz = str_size(str);
1629
1630	while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
1631	if (acc != ch)
1632	break;
1633	else
1634	pos = off;
1635	}
1636
1637	if (pos > 0) {
1638	memmove(str, &str[pos], str_sz - pos);
1639	pos = str_sz - pos;
1640	str[pos] = '\0';
1641	}
1642	}
1643
/** Find last occurrence of a byte in a null-terminated string.
 *
 * The terminator itself is never examined, so searching for a zero byte
 * yields NULL.
 *
 * @param str String to search.
 * @param c   Byte to look for.
 *
 * @return Pointer to the last matching byte, or NULL if there is none.
 */
static char *_str_rchr(const char *str, char c)
{
	const char *last = NULL;

	for (; *str != 0; str++) {
		if (*str == c)
			last = str;
	}

	return (char *) last;
}
1657
1658	/** Find last occurence of character in string.
1659	*
1660	* @param str String to search.
1661	* @param ch Character to look for.
1662	*
1663	* @return Pointer to character in @a str or NULL if not found.
1664	*/
1665	char str_rchr(const char str, char32_t ch)
1666	{
1667	if (ascii_check(ch))
1668	return _str_rchr(str, ch);
1669
1670	char32_t acc;
1671	size_t off = 0;
1672	size_t last = 0;
1673	const char *res = NULL;
1674
1675	while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
1676	if (acc == ch)
1677	res = (str + last);
1678	last = off;
1679	}
1680
1681	return (char *) res;
1682	}
1683
/** Insert a wide character into a wide string.
 *
 * Place @a ch at index @a pos of @a str, moving the characters from
 * @a pos onwards (terminator included) one slot towards the end.
 *
 * @param str     String to insert to.
 * @param ch      Character to insert.
 * @param pos     Character index where to insert.
 * @param max_pos Characters in the buffer.
 *
 * @return True if the insertion was successful, false if the position
 *         is out of bounds.
 *
 */
bool wstr_linsert(char32_t *str, char32_t ch, size_t pos, size_t max_pos)
{
	size_t len = wstr_length(str);

	if (pos > len)
		return false;

	if (pos + 1 > max_pos)
		return false;

	/* Shift str[pos..len] up by one, starting from the terminator. */
	for (size_t dst = len + 1; dst > pos; dst--)
		str[dst] = str[dst - 1];

	str[pos] = ch;
	return true;
}
1713
/** Remove a wide character from a wide string.
 *
 * Delete the character at index @a pos of @a str, moving the characters
 * behind it (terminator included) one slot towards the start.
 *
 * @param str String to remove from.
 * @param pos Character index to remove.
 *
 * @return True if the removal was successful, false if the position
 *         is out of bounds.
 *
 */
bool wstr_remove(char32_t *str, size_t pos)
{
	size_t len = wstr_length(str);

	if (pos >= len)
		return false;

	/* Pull str[pos + 1..len] down by one; the last copy moves the terminator. */
	for (size_t dst = pos; dst < len; dst++)
		str[dst] = str[dst + 1];

	return true;
}
1739
/** Duplicate string.
 *
 * Allocate a new string with malloc() and copy characters from the
 * source string into it.
 *
 * On success the duplicate is passed through _sanitize_string(), so it
 * can differ from the source string on the byte level.
 *
 * @param src Source string.
 *
 * @return Duplicate string, or NULL if the allocation fails.
 *
 */
char *str_dup(const char *src)
{
	/* Size in bytes including the null terminator. */
	size_t size = _str_size(src) + 1;

	char *dest = malloc(size);
	if (!dest)
		return NULL;

	memcpy(dest, src, size);
	_sanitize_string(dest, size);
	return dest;
}
1767
/** Duplicate string with size limit.
 *
 * Allocate a new string with malloc() and copy up to @a n bytes from the
 * source string into it. No more than @a n + 1 bytes are allocated, but if
 * the size occupied by the source string is smaller than @a n + 1, less is
 * allocated.
 *
 * On success the duplicate is passed through _sanitize_string() and is
 * null-terminated; it can differ from the source string on the byte level.
 *
 * @param src Source string.
 * @param n   Maximum number of bytes to duplicate.
 *
 * @return Duplicate string, or NULL if the allocation fails.
 *
 */
char *str_ndup(const char *src, size_t n)
{
	/* Size of the source in bytes, looking at no more than n of them. */
	size_t size = _str_nsize(src, n);

	char *dest = malloc(size + 1);
	if (!dest)
		return NULL;

	memcpy(dest, src, size);
	_sanitize_string(dest, size);
	dest[size] = 0;
	return dest;
}
1801
/** Split string by delimiters.
 *
 * The input string is modified: the first delimiter following the token is
 * overwritten with a null terminator.
 *
 * @param s     String to be tokenized. If NULL, NULL is returned.
 * @param delim String with the delimiters.
 * @param next  Variable which will receive the pointer to the
 *              continuation of the string following the first
 *              occurrence of any of the delimiter characters.
 *              May be NULL.
 * @return Pointer to the prefix of @a s before the first
 *         delimiter character. NULL if no such prefix
 *         exists.
 */
char *str_tok(char *s, const char *delim, char **next)
{
	if (!s)
		return NULL;

	size_t len = str_size(s);
	size_t cur = 0;
	size_t tmp = 0;
	char32_t ch;

	/* Skip over leading delimiters. */
	while ((ch = str_decode(s, &tmp, len)) && str_chr(delim, ch))
		cur = tmp;
	char *start = &s[cur];

	/* Skip over token characters. */
	tmp = cur;
	while ((ch = str_decode(s, &tmp, len)) && !str_chr(delim, ch))
		cur = tmp;
	char *end = &s[cur];

	/*
	 * If the scan stopped on a delimiter, continue right after it;
	 * otherwise the end of the string was reached.
	 */
	if (next)
		*next = (ch ? &s[tmp] : &s[cur]);

	if (start == end)
		return NULL; /* No more tokens. */

	/* Overwrite delimiter with NULL terminator. */
	*end = '\0';
	return start;
}
1848
/** Scale a value for display with a decimal (SI) prefix.
 *
 * A prefix is chosen once @a val exceeds 1000 units of that prefix, so the
 * scaled value stays above 1000 while a smaller prefix is available. The top
 * step uses a lower threshold (10^19) because the next power would not fit
 * into 64 bits.
 *
 * @param val    Value to scale.
 * @param rv     Receives @a val divided by the chosen power of ten.
 * @param suffix Receives the prefix letter ('k', 'M', 'G', 'T', 'P', 'E'),
 *               or a space if the value is not scaled.
 */
void order_suffix(const uint64_t val, uint64_t *rv, char *suffix)
{
	if (val > UINT64_C(10000000000000000000)) {
		/* Divisor is 10^18, i.e. exa. */
		*rv = val / UINT64_C(1000000000000000000);
		*suffix = 'E';
	} else if (val > UINT64_C(1000000000000000000)) {
		/* Divisor is 10^15, i.e. peta. */
		*rv = val / UINT64_C(1000000000000000);
		*suffix = 'P';
	} else if (val > UINT64_C(1000000000000000)) {
		*rv = val / UINT64_C(1000000000000);
		*suffix = 'T';
	} else if (val > UINT64_C(1000000000000)) {
		*rv = val / UINT64_C(1000000000);
		*suffix = 'G';
	} else if (val > UINT64_C(1000000000)) {
		*rv = val / UINT64_C(1000000);
		*suffix = 'M';
	} else if (val > UINT64_C(1000000)) {
		*rv = val / UINT64_C(1000);
		*suffix = 'k';
	} else {
		*rv = val;
		*suffix = ' ';
	}
}
1874
/** Scale a byte count for display with a binary (IEC) prefix.
 *
 * A prefix is chosen once @a val exceeds 1024 units of that prefix.
 *
 * @param val    Value to scale.
 * @param rv     Receives @a val divided by the chosen power of two.
 * @param suffix Receives a pointer to a constant suffix string
 *               ("KiB", "MiB", "GiB", "TiB", "PiB" or the byte suffix).
 * @param fixed  If true, the byte suffix is padded with spaces to the
 *               same width as the other suffixes.
 */
void bin_order_suffix(const uint64_t val, uint64_t *rv, const char **suffix,
    bool fixed)
{
	if (val > UINT64_C(1152921504606846976)) {
		/* Divisor is 2^50, i.e. pebi. */
		*rv = val / UINT64_C(1125899906842624);
		*suffix = "PiB";
	} else if (val > UINT64_C(1125899906842624)) {
		*rv = val / UINT64_C(1099511627776);
		*suffix = "TiB";
	} else if (val > UINT64_C(1099511627776)) {
		*rv = val / UINT64_C(1073741824);
		*suffix = "GiB";
	} else if (val > UINT64_C(1073741824)) {
		*rv = val / UINT64_C(1048576);
		*suffix = "MiB";
	} else if (val > UINT64_C(1048576)) {
		*rv = val / UINT64_C(1024);
		*suffix = "KiB";
	} else {
		*rv = val;
		/* Three characters wide in fixed mode, like every other suffix. */
		if (fixed)
			*suffix = "B  ";
		else
			*suffix = "B";
	}
}
1901
1902	/** @}
1903	*/

Note: See TracBrowser for help on using the repository browser.

Download in other formats: