Context Navigation

str.c@ 9bf95d4

Visit:

Last change on this file since 9bf95d4 was 1db4e2ae, checked in by Jiří Zárevúcky <zarevucky.jiri@…>, 2 months ago

Have str_sanitize also remove C0 and C1 control codes

and use it to sanitize KIO output

Property mode set to 100644

File size: 46.2 KB

Line
1	/*
2	* Copyright (c) 2001-2004 Jakub Jermar
3	* Copyright (c) 2005 Martin Decky
4	* Copyright (c) 2008 Jiri Svoboda
5	* Copyright (c) 2011 Martin Sucha
6	* Copyright (c) 2011 Oleg Romanenko
7	* Copyright (c) 2025 Jiří Zárevúcky
8	* All rights reserved.
9	*
10	* Redistribution and use in source and binary forms, with or without
11	* modification, are permitted provided that the following conditions
12	* are met:
13	*
14	* - Redistributions of source code must retain the above copyright
15	* notice, this list of conditions and the following disclaimer.
16	* - Redistributions in binary form must reproduce the above copyright
17	* notice, this list of conditions and the following disclaimer in the
18	* documentation and/or other materials provided with the distribution.
19	* - The name of the author may not be used to endorse or promote products
20	* derived from this software without specific prior written permission.
21	*
22	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
23	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
24	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
25	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
26	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
27	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
31	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32	*/
33
34	/** @addtogroup libc
35	* @{
36	*/
37
38	/**
39	* @file
40	* @brief String functions.
41	*
42	* Strings and characters use the Universal Character Set (UCS). The standard
43	* strings, called just strings are encoded in UTF-8. Wide strings (encoded
44	* in UTF-32) are supported to a limited degree. A single character is
45	* represented as char32_t.@n
46	*
47	* Overview of the terminology:@n
48	*
49	* Term Meaning
50	* -------------------- ----------------------------------------------------
51	* byte 8 bits stored in uint8_t (unsigned 8 bit integer)
52	*
53	* character UTF-32 encoded Unicode character, stored in char32_t
54	* (unsigned 32 bit integer), code points 0 .. 1114111
55	* are valid
56	*
57	* Note that Unicode characters do not match
58	* one-to-one with displayed characters or glyphs on
59	* screen. For that level of precision, look up
60	* Grapheme Clusters.
61	*
62	* ASCII character 7 bit encoded ASCII character, stored in char
63	* (usually signed 8 bit integer), code points 0 .. 127
64	* are valid
65	*
66	* string UTF-8 encoded NULL-terminated Unicode string, char *
67	*
68	* wide string UTF-32 encoded NULL-terminated Unicode string,
69	* char32_t *
70	*
71	* [wide] string size number of BYTES in a [wide] string (excluding
72	* the NULL-terminator), size_t
73	*
74	* [wide] string length number of CHARACTERS in a [wide] string (excluding
75	* the NULL-terminator), size_t
76	*
77	* [wide] string width number of display cells on a monospace display taken
78	* by a [wide] string, size_t
79	*
80	* This is virtually impossible to determine exactly for
81	* all strings without knowing specifics of the display
82	* device, due to various factors affecting text output.
83	* If you have the option to query the terminal for
84	* position change caused by outputting the string,
85	* it is preferable to determine width that way.
86	*
87	*
88	* Overview of string metrics:@n
89	*
90	* Metric Abbrev. Type Meaning
91	* ------ ------ ------ -------------------------------------------------
92	* size n size_t number of BYTES in a string (excluding the
93	* NULL-terminator)
94	*
95	* length l size_t number of CHARACTERS in a string (excluding the
96	* null terminator)
97	*
98	* width w size_t number of display cells on a monospace display
99	* taken by a string
100	*
101	*
102	* Function naming prefixes:@n
103	*
104	* chr_ operate on characters
105	* ascii_ operate on ASCII characters
106	* str_ operate on strings
107	* wstr_ operate on wide strings
108	*
109	* [w]str_[n|l|w] operate on a prefix limited by size, length
110	* or width
111	*
112	*
113	* A specific character inside a [wide] string can be referred to by:@n
114	*
115	* pointer (char *, char32_t *)
116	* byte offset (size_t)
117	* character index (size_t)
118	*
119	*/
120
121	#include <str.h>
122
123	#include <align.h>
124	#include <assert.h>
125	#include <ctype.h>
126	#include <errno.h>
127	#include <limits.h>
128	#include <macros.h>
129	#include <mem.h>
130	#include <stdbool.h>
131	#include <stddef.h>
132	#include <stdint.h>
133	#include <stdlib.h>
134	#include <uchar.h>
135
136	#if __STDC_HOSTED__
137	#include <fibril.h>
138	#endif
139
/**
 * Record an "illegal byte sequence" error in errno.
 *
 * Compiles to nothing when errno is not available as a macro.
 */
static void _set_ilseq(void)
{
#ifdef errno
	errno = EILSEQ;
#endif
}
146
/** Byte mask consisting of lowest @a n bits (out of 8), 0 <= n <= 8 */
#define LO_MASK_8(n)  ((uint8_t) ((1u << (n)) - 1))

/** Mask consisting of lowest @a n bits (out of 32), 0 <= n < 32 */
#define LO_MASK_32(n)  ((uint32_t) ((UINT32_C(1) << (n)) - 1))

/**
 * Byte mask consisting of highest @a n bits (out of 8).
 *
 * The result is cast back to uint8_t: the complement of a promoted byte
 * would otherwise be a negative int with all upper bits set.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6

/* Payload bits of the lead byte of a 2, 3 and 4 byte sequence. */
#define UTF8_MASK_INITIAL2  0b00011111
#define UTF8_MASK_INITIAL3  0b00001111
#define UTF8_MASK_INITIAL4  0b00000111
/* Payload bits of a continuation byte. */
#define UTF8_MASK_CONT      0b00111111

/** Internal marker returned by the decoder for a malformed sequence. */
#define CHAR_INVALID  ((char32_t) UINT_MAX)
165
/** True if @a b is a single-byte (7-bit) UTF-8 unit, i.e. its top bit is clear. */
static inline bool _is_ascii(uint8_t b)
{
	return (b & 0x80) == 0;
}
170
/** True if @a b has the form 10xxxxxx (UTF-8 continuation byte). */
static inline bool _is_continuation(uint8_t b)
{
	return (b >> 6) == 0x2;
}
175
/** True if @a c has the form 110xxxxx (lead byte of a 2-byte sequence). */
static inline bool _is_2_byte(uint8_t c)
{
	return (c >> 5) == 0x6;
}
180
/** True if @a c has the form 1110xxxx (lead byte of a 3-byte sequence). */
static inline bool _is_3_byte(uint8_t c)
{
	return (c >> 4) == 0xE;
}
185
/** True if @a c has the form 11110xxx (lead byte of a 4-byte sequence). */
static inline bool _is_4_byte(uint8_t c)
{
	return (c >> 3) == 0x1E;
}
190
/**
 * Number of continuation bytes needed to encode @a c in UTF-8.
 *
 * @param c Character code.
 *
 * @return 0 to 3 for codes that fit into 7, 11, 16 and 21 bits
 *         respectively, -1 for anything wider.
 */
static inline int _char_continuation_bytes(char32_t c)
{
	/* Payload width (in bits) available with 0, 1, 2 and 3 continuation bytes. */
	static const unsigned int payload_bits[] = { 7, 11, 16, 21 };

	for (int cont = 0; cont < 4; cont++) {
		if ((c >> payload_bits[cont]) == 0)
			return cont;
	}

	/* Codes longer than 21 bits are not supported */
	return -1;
}
208
/**
 * Number of continuation bytes announced by the UTF-8 lead byte @a b.
 *
 * @param b First byte of a sequence.
 *
 * @return 0 to 3, or -1 if @a b cannot start a sequence.
 */
static inline int _continuation_bytes(uint8_t b)
{
	int cont;

	if (_is_ascii(b)) {
		/* 0xxxxxxx */
		cont = 0;
	} else if (_is_2_byte(b)) {
		/* 110xxxxx 10xxxxxx */
		cont = 1;
	} else if (_is_3_byte(b)) {
		/* 1110xxxx 10xxxxxx 10xxxxxx */
		cont = 2;
	} else if (_is_4_byte(b)) {
		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		cont = 3;
	} else {
		cont = -1;
	}

	return cont;
}
229
230	static bool _is_non_shortest(const mbstate_t *mb, uint8_t b)
231	{
232	return (mb->state == 0b1111110000000000 && !(b & 0b00100000)) \|\|
233	(mb->state == 0b1111111111110000 && !(b & 0b00110000));
234	}
235
236	static bool _is_surrogate(const mbstate_t *mb, uint8_t b)
237	{
238	return (mb->state == 0b1111110000001101 && b >= 0xa0);
239	}
240
/*
 * Branch prediction hints. The condition is normalized to 0/1 first so that
 * any non-zero value counts as the expected "true" outcome.
 */
#define _likely(expr)    __builtin_expect(!!(expr), 1)
#define _unlikely(expr)  __builtin_expect(!!(expr), 0)

/** Non-zero enables the whole-sequence fast paths guarded by #if FAST_PATHS. */
#define FAST_PATHS  1
245
246	static char32_t _str_decode(const char s, size_t offset, size_t size, mbstate_t *mb)
247	{
248	assert(s);
249	assert(offset);
250	assert(*offset <= size);
251	assert(size == STR_NO_LIMIT \|\| s + size >= s);
252	assert(mb);
253
254	if (*offset == size)
255	return 0;
256
257	if (_likely(!mb->state)) {
258	/* Clean slate, read initial byte. */
259	uint8_t b = s[(*offset)++];
260
261	/* Fast exit for the most common case. */
262	if (_likely(_is_ascii(b)))
263	return b;
264
265	/* unexpected continuation byte */
266	if (_unlikely(_is_continuation(b)))
267	return CHAR_INVALID;
268
269	/*
270	* The value stored into `continuation` is designed to have
271	* just enough leading ones that after shifting in one less than
272	* the expected number of continuation bytes, the most significant
273	* bit becomes zero. (The field is 16b wide.)
274	*/
275
276	if (_is_2_byte(b)) {
277	/* Reject non-shortest form. */
278	if (_unlikely(!(b & 0b00011110)))
279	return CHAR_INVALID;
280
281	#if FAST_PATHS
282	/* We can usually take this exit. */
283	if (_likely(offset < size && _is_continuation(s[offset])))
284	return (b & UTF8_MASK_INITIAL2) << 6 \|
285	(s[(*offset)++] & UTF8_MASK_CONT);
286	#endif
287
288	/* 2 byte continuation 110xxxxx */
289	mb->state = b ^ 0b0000000011000000;
290
291	} else if (_is_3_byte(b)) {
292	#if FAST_PATHS
293	/* We can usually take this exit. */
294	if (_likely(offset + 1 < size && _is_continuation(s[offset]) && _is_continuation(s[*offset + 1]))) {
295
296	char32_t ch = (b & UTF8_MASK_INITIAL3) << 12 \|
297	(s[(*offset)] & UTF8_MASK_CONT) << 6 \|
298	(s[(*offset) + 1] & UTF8_MASK_CONT);
299
300	*offset += 2;
301
302	/* Reject non-shortest form. */
303	if (_unlikely(!(ch & 0xFFFFF800)))
304	return CHAR_INVALID;
305
306	/* Reject surrogates */
307	if (_unlikely(ch >= 0xD800 && ch < 0xE000))
308	return CHAR_INVALID;
309
310	return ch;
311	}
312	#endif
313
314	/* 3 byte continuation 1110xxxx */
315	mb->state = b ^ 0b1111110011100000;
316
317	} else if (_is_4_byte(b)) {
318	#if FAST_PATHS
319	/* We can usually take this exit. */
320	if (_likely(offset + 2 < size && _is_continuation(s[offset]) &&
321	_is_continuation(s[offset + 1]) && _is_continuation(s[offset + 2]))) {
322
323	char32_t ch = (b & UTF8_MASK_INITIAL4) << 18 \|
324	(s[(*offset)] & UTF8_MASK_CONT) << 12 \|
325	(s[(*offset) + 1] & UTF8_MASK_CONT) << 6 \|
326	(s[(*offset) + 2] & UTF8_MASK_CONT);
327
328	*offset += 3;
329
330	/* Reject non-shortest form. */
331	if (_unlikely(!(ch & 0xFFFF0000)))
332	return CHAR_INVALID;
333
334	/* Reject out-of-range characters. */
335	if (_unlikely(ch >= 0x110000))
336	return CHAR_INVALID;
337
338	return ch;
339	}
340	#endif
341
342	/* 4 byte continuation 11110xxx */
343	mb->state = b ^ 0b1111111100000000;
344	} else {
345	return CHAR_INVALID;
346	}
347	}
348
349	/* Deal with the remaining edge and invalid cases. */
350	for (; offset < size; (offset)++) {
351	/* Read continuation bytes. */
352	uint8_t b = s[*offset];
353
354	if (!_is_continuation(b) \|\| _is_non_shortest(mb, b) \|\| _is_surrogate(mb, b)) {
355	mb->state = 0;
356	return CHAR_INVALID;
357	}
358
359	/* Top bit becomes zero when shifting in the second to last byte. */
360	if (!(mb->state & 0x8000)) {
361	char32_t c = ((char32_t) mb->state) << 6 \| (b & UTF8_MASK_CONT);
362	mb->state = 0;
363	(*offset)++;
364	return c;
365	}
366
367	mb->state = mb->state << 6 \| (b & UTF8_MASK_CONT);
368	}
369
370	/* Incomplete character. */
371	assert(mb->state);
372	return 0;
373	}
374
375	/** Standard <uchar.h> function since C11. */
376	size_t mbrtoc32(char32_t c, const char s, size_t n, mbstate_t *mb)
377	{
378	#if __STDC_HOSTED__
379	static fibril_local mbstate_t global_state = { };
380
381	if (!mb)
382	mb = &global_state;
383	#endif
384
385	if (!s) {
386	/* Equivalent to mbrtoc32(NULL, "", 1, mb); */
387	c = NULL;
388	s = "";
389	n = 1;
390	}
391
392	size_t offset = 0;
393	char32_t ret = _str_decode(s, &offset, n, mb);
394	if (ret == CHAR_INVALID) {
395	assert(!mb->state);
396	_set_ilseq();
397	return UCHAR_ILSEQ;
398	}
399	if (mb->state) {
400	assert(ret == 0);
401	return UCHAR_INCOMPLETE;
402	}
403
404	if (c)
405	*c = ret;
406	return ret ? offset : 0;
407	}
408
409	/** Decode a single character from a string.
410	*
411	* Decode a single character from a string of size @a size. Decoding starts
412	* at @a offset and this offset is moved to the beginning of the next
413	* character. In case of decoding error, offset generally advances at least
414	* by one. However, offset is never moved beyond size.
415	*
416	* @param str String (not necessarily NULL-terminated).
417	* @param offset Byte offset in string where to start decoding.
418	* @param size Size of the string (in bytes).
419	*
420	* @return Value of decoded character, U_SPECIAL on decoding error or
421	* NULL if attempt to decode beyond @a size.
422	*
423	*/
424	char32_t str_decode(const char str, size_t offset, size_t size)
425	{
426	mbstate_t mb = { };
427	char32_t ch = _str_decode(str, offset, size, &mb);
428
429	if (ch == CHAR_INVALID \|\| mb.state)
430	return U_SPECIAL;
431
432	return ch;
433	}
434
435	char32_t str_decode_r(const char str, size_t offset, size_t size,
436	char32_t replacement, mbstate_t *mb)
437	{
438	char32_t ch = _str_decode(str, offset, size, mb);
439	return (ch == CHAR_INVALID) ? replacement : ch;
440	}
441
442	/** Decode a single character from a string to the left.
443	*
444	* Decode a single character from a string of size @a size. Decoding starts
445	* at @a offset and this offset is moved to the beginning of the previous
446	* character. In case of decoding error, offset generally decreases at least
447	* by one. However, offset is never moved before 0.
448	*
449	* @param str String (not necessarily NULL-terminated).
450	* @param offset Byte offset in string where to start decoding.
451	* @param size Size of the string (in bytes).
452	*
453	* @return Value of decoded character, U_SPECIAL on decoding error or
454	* NULL if attempt to decode beyond @a start of str.
455	*
456	*/
457	char32_t str_decode_reverse(const char str, size_t offset, size_t size)
458	{
459	if (*offset == 0)
460	return 0;
461
462	int cbytes = 0;
463	/* Continue while continuation bytes found */
464	while (*offset > 0 && cbytes < 4) {
465	uint8_t b = (uint8_t) str[--(*offset)];
466
467	if (_is_continuation(b)) {
468	cbytes++;
469	continue;
470	}
471
472	/* Reject non-shortest form encoding. */
473	if (cbytes != _continuation_bytes(b))
474	return U_SPECIAL;
475
476	/* Start byte */
477	size_t start_offset = *offset;
478	return str_decode(str, &start_offset, size);
479	}
480
481	/* Too many continuation bytes */
482	return U_SPECIAL;
483	}
484
485	/** Encode a single character to string representation.
486	*
487	* Encode a single character to string representation (i.e. UTF-8) and store
488	* it into a buffer at @a offset. Encoding starts at @a offset and this offset
489	* is moved to the position where the next character can be written to.
490	*
491	* @param ch Input character.
492	* @param str Output buffer.
493	* @param offset Byte offset where to start writing.
494	* @param size Size of the output buffer (in bytes).
495	*
496	* @return EOK if the character was encoded successfully, EOVERFLOW if there
497	* was not enough space in the output buffer or EINVAL if the character
498	* code was invalid.
499	*/
500	errno_t chr_encode(char32_t ch, char str, size_t offset, size_t size)
501	{
502	// TODO: merge with c32rtomb()
503
504	if (*offset >= size)
505	return EOVERFLOW;
506
507	/* Fast exit for the most common case. */
508	if (ch < 0x80) {
509	str[(*offset)++] = (char) ch;
510	return EOK;
511	}
512
513	/* Codes longer than 21 bits are not supported */
514	if (!chr_check(ch))
515	return EINVAL;
516
517	/* Determine how many continuation bytes are needed */
518
519	unsigned int cbytes = _char_continuation_bytes(ch);
520	unsigned int b0_bits = 6 - cbytes; /* Data bits in first byte */
521
522	/* Check for available space in buffer */
523	if (*offset + cbytes >= size)
524	return EOVERFLOW;
525
526	/* Encode continuation bytes */
527	unsigned int i;
528	for (i = cbytes; i > 0; i--) {
529	str[*offset + i] = 0x80 \| (ch & LO_MASK_32(CONT_BITS));
530	ch >>= CONT_BITS;
531	}
532
533	/* Encode first byte */
534	str[*offset] = (ch & LO_MASK_32(b0_bits)) \| HI_MASK_8(8 - b0_bits - 1);
535
536	/* Advance offset */
537	*offset += cbytes + 1;
538
539	return EOK;
540	}
541
/**
 * Shared implementation of in-place string sanitization.
 *
 * Every byte that does not start a complete, valid UTF-8 sequence is
 * overwritten with @a replacement. Processing stops at the first NUL byte
 * or after @a n bytes.
 *
 * @param str            String to sanitize.
 * @param n              Maximum number of bytes to process.
 * @param replacement    Byte written over each rejected byte.
 * @param strip_controls Also reject C0 (below 0x20) and C1 (U+0080..U+009F)
 *                       control codes.
 *
 * @return Number of bytes that were replaced.
 */
static size_t _str_sanitize_common(char *str, size_t n, uint8_t replacement,
    bool strip_controls)
{
	uint8_t *b = (uint8_t *) str;
	size_t count = 0;

	for (; n > 0 && b[0]; b++, n--) {
		if (strip_controls && b[0] < ' ') {
			/* C0 control codes */
			b[0] = replacement;
			count++;
			continue;
		}

		int cont = _continuation_bytes(b[0]);
		if (__builtin_expect(cont == 0, 1))
			continue;

		/* Not a lead byte, or the sequence would not fit into n bytes. */
		bool bad = (cont < 0 || n <= (size_t) cont);

		/* Check continuation bytes. */
		for (int i = 1; !bad && i <= cont; i++)
			bad = !_is_continuation(b[i]);

		if (!bad) {
			switch (cont) {
			case 1:
				/*
				 * Non-shortest form 0b110!!!!x 0b10xxxxxx, see
				 * https://www.unicode.org/versions/corrigendum1.html
				 * Optionally also C1 control codes (0xC2 0x80..0x9F).
				 */
				bad = !(b[0] & 0b00011110) ||
				    (strip_controls && b[0] == 0b11000010 &&
				    b[1] < 0b10100000);
				break;
			case 2:
				/*
				 * Non-shortest form 0b1110!!!! 0b10!xxxxx 0b10xxxxxx
				 * or a surrogate character encoding.
				 */
				bad = (!(b[0] & 0b00001111) && !(b[1] & 0b00100000)) ||
				    (b[0] == 0xED && b[1] >= 0xA0);
				break;
			case 3:
				/*
				 * Non-shortest form
				 * 0b11110!!! 0b10!!xxxx 0b10xxxxxx 0b10xxxxxx
				 * or an out-of-range code point.
				 */
				bad = (!(b[0] & 0b00000111) && !(b[1] & 0b00110000)) ||
				    b[0] > 0xF4 || (b[0] == 0xF4 && b[1] >= 0x90);
				break;
			default:
				break;
			}
		}

		if (bad) {
			/*
			 * Only the lead byte is replaced here; the bytes after it
			 * are examined on their own in the following iterations.
			 */
			b[0] = replacement;
			count++;
			continue;
		}

		/* Skip the validated continuation bytes. */
		b += cont;
		n -= cont;
	}

	return count;
}

/*
 * Convert in place any bytes that don't form a valid character into
 * replacement. Control codes are valid characters and are left alone, so
 * that string copying does not alter newlines or tabs.
 */
static size_t _str_sanitize(char *str, size_t n, uint8_t replacement)
{
	return _str_sanitize_common(str, n, replacement, false);
}

/** Replaces any byte that's not part of a complete valid UTF-8 character
 * encoding with a replacement byte.
 * Also replaces C0 and C1 control codes.
 *
 * @param str         String to sanitize in place.
 * @param n           Maximum number of bytes to process.
 * @param replacement Byte written over each rejected byte.
 *
 * @return Number of bytes that were replaced.
 */
size_t str_sanitize(char *str, size_t n, uint8_t replacement)
{
	return _str_sanitize_common(str, n, replacement, true);
}
643
/** Count the bytes preceding the NUL terminator of @a str. */
static size_t _str_size(const char *str)
{
	const char *end = str;

	while (*end != 0)
		end++;

	return (size_t) (end - str);
}

/** Get size of string.
 *
 * Get the number of bytes which are used by the string @a str (excluding the
 * NULL-terminator).
 *
 * @param str String to consider.
 *
 * @return Number of bytes used by the string
 *
 */
size_t str_size(const char *str)
{
	size_t bytes = _str_size(str);
	return bytes;
}
668
/** Get size of wide string.
 *
 * Get the number of bytes which are used by the wide string @a str
 * (excluding the NULL-terminator).
 *
 * @param str Wide string to consider.
 *
 * @return Number of bytes used by the wide string
 *
 */
size_t wstr_size(const char32_t *str)
{
	size_t chars = wstr_length(str);

	return chars * sizeof(char32_t);
}
683
684	/** Get size of string with length limit.
685	*
686	* Get the number of bytes which are used by up to @a max_len first
687	* characters in the string @a str. If @a max_len is greater than
688	* the length of @a str, the entire string is measured (excluding the
689	* NULL-terminator).
690	*
691	* @param str String to consider.
692	* @param max_len Maximum number of characters to measure.
693	*
694	* @return Number of bytes used by the characters.
695	*
696	*/
697	size_t str_lsize(const char *str, size_t max_len)
698	{
699	size_t len = 0;
700	size_t offset = 0;
701
702	while (len < max_len) {
703	if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
704	break;
705
706	len++;
707	}
708
709	return offset;
710	}
711
/**
 * Count the bytes preceding the NUL terminator of @a str, examining at most
 * @a max_size bytes. The limit is checked before each byte is read, so no
 * byte at or beyond str[max_size] is ever accessed.
 */
static size_t _str_nsize(const char *str, size_t max_size)
{
	size_t size = 0;

	while (size < max_size && str[size] != 0)
		size++;

	return size;
}

/** Get size of string with size limit.
 *
 * Get the number of bytes which are used by the string @a str
 * (excluding the NULL-terminator), but no more than @a max_size bytes.
 *
 * @param str      String to consider.
 * @param max_size Maximum number of bytes to measure.
 *
 * @return Number of bytes used by the string
 *
 */
size_t str_nsize(const char *str, size_t max_size)
{
	return _str_nsize(str, max_size);
}
737
/** Get size of wide string with size limit.
 *
 * Get the number of bytes which are used by the wide string @a str
 * (excluding the NULL-terminator), but no more than @a max_size bytes.
 *
 * @param str      Wide string to consider.
 * @param max_size Maximum number of bytes to measure.
 *
 * @return Number of bytes used by the wide string
 *
 */
size_t wstr_nsize(const char32_t *str, size_t max_size)
{
	size_t chars = wstr_nlength(str, max_size);

	return chars * sizeof(char32_t);
}
753
/** Get size of wide string with length limit.
 *
 * Get the number of bytes which are used by up to @a max_len first
 * wide characters in the wide string @a str. If @a max_len is greater than
 * the length of @a str, the entire wide string is measured (excluding the
 * NULL-terminator).
 *
 * @param str     Wide string to consider.
 * @param max_len Maximum number of wide characters to measure.
 *
 * @return Number of bytes used by the wide characters.
 *
 */
size_t wstr_lsize(const char32_t *str, size_t max_len)
{
	/* Clamp so that the conversion to a byte limit cannot wrap around. */
	const size_t max_chars = SIZE_MAX / sizeof(char32_t);
	if (max_len > max_chars)
		max_len = max_chars;

	return (wstr_nlength(str, max_len * sizeof(char32_t)) * sizeof(char32_t));
}
771
772	/** Get number of characters in a string.
773	*
774	* @param str NULL-terminated string.
775	*
776	* @return Number of characters in string.
777	*
778	*/
779	size_t str_length(const char *str)
780	{
781	size_t len = 0;
782	size_t offset = 0;
783
784	while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
785	len++;
786
787	return len;
788	}
789
/** Get number of characters in a wide string.
 *
 * @param wstr NULL-terminated wide string.
 *
 * @return Number of characters in @a wstr.
 *
 */
size_t wstr_length(const char32_t *wstr)
{
	const char32_t *end = wstr;

	while (*end != 0)
		end++;

	return (size_t) (end - wstr);
}
806
/** Get number of characters in a string with size limit.
 *
 * @param str  NULL-terminated string.
 * @param size Maximum number of bytes to consider.
 *
 * @return Number of characters in string.
 *
 */
size_t str_nlength(const char *str, size_t size)
{
	size_t pos = 0;
	size_t count = 0;

	for (;;) {
		if (str_decode(str, &pos, size) == 0)
			break;

		count++;
	}

	return count;
}
825
/** Get number of characters in a wide string with size limit.
 *
 * @param str  NULL-terminated wide string.
 * @param size Maximum number of bytes to consider. A trailing partial
 *             character (when @a size is not a multiple of the character
 *             size) is not examined.
 *
 * @return Number of characters in the wide string.
 *
 */
size_t wstr_nlength(const char32_t *str, size_t size)
{
	/* Whole characters that fit into the byte limit. */
	const size_t max_chars = size / sizeof(char32_t);
	size_t len = 0;

	while (len < max_chars && str[len] != 0)
		len++;

	return len;
}
847
/** Get character display width on a character cell display.
 *
 * Every character is currently reported as one cell wide.
 *
 * @param ch Character
 * @return Width of character in cells.
 */
size_t chr_width(char32_t ch)
{
	(void) ch;

	return 1;
}
857
858	/** Get string display width on a character cell display.
859	*
860	* @param str String
861	* @return Width of string in cells.
862	*/
863	size_t str_width(const char *str)
864	{
865	size_t width = 0;
866	size_t offset = 0;
867	char32_t ch;
868
869	while ((ch = str_decode(str, &offset, STR_NO_LIMIT)) != 0)
870	width += chr_width(ch);
871
872	return width;
873	}
874
/** Check whether character is plain ASCII.
 *
 * @param ch Character to test.
 *
 * @return True if character is plain ASCII (code point 0 .. 127).
 *
 */
bool ascii_check(char32_t ch)
{
	return ch < 0x80;
}
887
/** Check whether character is valid
 *
 * @param ch Character to test.
 *
 * @return True if character is within the Unicode code point range
 *         (0 .. 0x10FFFF).
 *
 */
bool chr_check(char32_t ch)
{
	return ch <= 0x10FFFF;
}
900
/** Compare two NULL terminated strings.
 *
 * Do a char-by-char comparison of two NULL-terminated strings.
 * The strings are considered equal iff their length is equal
 * and both strings consist of the same sequence of characters.
 *
 * A string S1 is less than another string S2 if it has a character with
 * lower value at the first character position where the strings differ.
 * If the strings differ in length, the shorter one is treated as if
 * padded by characters with a value of zero.
 *
 * @param s1 First string to compare.
 * @param s2 Second string to compare.
 *
 * @return 0 if the strings are equal, -1 if the first is less than the second,
 *         1 if the second is less than the first.
 *
 */
int str_cmp(const char *s1, const char *s2)
{
	/*
	 * UTF-8 has the nice property that lexicographic ordering on bytes is
	 * the same as the lexicographic ordering of the character sequences.
	 * That only holds for unsigned bytes, hence the casts below.
	 */
	while (*s1 == *s2 && *s1 != 0) {
		s1++;
		s2++;
	}

	const unsigned char b1 = (unsigned char) *s1;
	const unsigned char b2 = (unsigned char) *s2;

	if (b1 == b2)
		return 0;

	return (b1 < b2) ? -1 : 1;
}
935
936	/** Compare two NULL terminated strings with length limit.
937	*
938	* Do a char-by-char comparison of two NULL-terminated strings.
939	* The strings are considered equal iff
940	* min(str_length(s1), max_len) == min(str_length(s2), max_len)
941	* and both strings consist of the same sequence of characters,
942	* up to max_len characters.
943	*
944	* A string S1 is less than another string S2 if it has a character with
945	* lower value at the first character position where the strings differ.
946	* If the strings differ in length, the shorter one is treated as if
947	* padded by characters with a value of zero. Only the first max_len
948	* characters are considered.
949	*
950	* @param s1 First string to compare.
951	* @param s2 Second string to compare.
952	* @param max_len Maximum number of characters to consider.
953	*
954	* @return 0 if the strings are equal, -1 if the first is less than the second,
955	* 1 if the second is less than the first.
956	*
957	*/
958	int str_lcmp(const char s1, const char s2, size_t max_len)
959	{
960	char32_t c1 = 0;
961	char32_t c2 = 0;
962
963	size_t off1 = 0;
964	size_t off2 = 0;
965
966	size_t len = 0;
967
968	while (true) {
969	if (len >= max_len)
970	break;
971
972	c1 = str_decode(s1, &off1, STR_NO_LIMIT);
973	c2 = str_decode(s2, &off2, STR_NO_LIMIT);
974
975	if (c1 < c2)
976	return -1;
977
978	if (c1 > c2)
979	return 1;
980
981	if (c1 == 0 \|\| c2 == 0)
982	break;
983
984	++len;
985	}
986
987	return 0;
988
989	}
990
991	/** Compare two NULL terminated strings in case-insensitive manner.
992	*
993	* Do a char-by-char comparison of two NULL-terminated strings.
994	* The strings are considered equal iff their length is equal
995	* and both strings consist of the same sequence of characters
996	* when converted to lower case.
997	*
998	* A string S1 is less than another string S2 if it has a character with
999	* lower value at the first character position where the strings differ.
1000	* If the strings differ in length, the shorter one is treated as if
1001	* padded by characters with a value of zero.
1002	*
1003	* @param s1 First string to compare.
1004	* @param s2 Second string to compare.
1005	*
1006	* @return 0 if the strings are equal, -1 if the first is less than the second,
1007	* 1 if the second is less than the first.
1008	*
1009	*/
1010	int str_casecmp(const char s1, const char s2)
1011	{
1012	// FIXME: doesn't work for non-ASCII caseful characters
1013
1014	char32_t c1 = 0;
1015	char32_t c2 = 0;
1016
1017	size_t off1 = 0;
1018	size_t off2 = 0;
1019
1020	while (true) {
1021	c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
1022	c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
1023
1024	if (c1 < c2)
1025	return -1;
1026
1027	if (c1 > c2)
1028	return 1;
1029
1030	if (c1 == 0 \|\| c2 == 0)
1031	break;
1032	}
1033
1034	return 0;
1035	}
1036
1037	/** Compare two NULL terminated strings with length limit in case-insensitive
1038	* manner.
1039	*
1040	* Do a char-by-char comparison of two NULL-terminated strings.
1041	* The strings are considered equal iff
1042	* min(str_length(s1), max_len) == min(str_length(s2), max_len)
1043	* and both strings consist of the same sequence of characters,
1044	* up to max_len characters.
1045	*
1046	* A string S1 is less than another string S2 if it has a character with
1047	* lower value at the first character position where the strings differ.
1048	* If the strings differ in length, the shorter one is treated as if
1049	* padded by characters with a value of zero. Only the first max_len
1050	* characters are considered.
1051	*
1052	* @param s1 First string to compare.
1053	* @param s2 Second string to compare.
1054	* @param max_len Maximum number of characters to consider.
1055	*
1056	* @return 0 if the strings are equal, -1 if the first is less than the second,
1057	* 1 if the second is less than the first.
1058	*
1059	*/
1060	int str_lcasecmp(const char s1, const char s2, size_t max_len)
1061	{
1062	// FIXME: doesn't work for non-ASCII caseful characters
1063
1064	char32_t c1 = 0;
1065	char32_t c2 = 0;
1066
1067	size_t off1 = 0;
1068	size_t off2 = 0;
1069
1070	size_t len = 0;
1071
1072	while (true) {
1073	if (len >= max_len)
1074	break;
1075
1076	c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
1077	c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
1078
1079	if (c1 < c2)
1080	return -1;
1081
1082	if (c1 > c2)
1083	return 1;
1084
1085	if (c1 == 0 \|\| c2 == 0)
1086	break;
1087
1088	++len;
1089	}
1090
1091	return 0;
1092
1093	}
1094
/**
 * Byte-wise prefix test: advance through both strings while they agree and
 * report whether @a p was exhausted.
 */
static bool _test_prefix(const char *s, const char *p)
{
	while (*s == *p && *s != 0) {
		s++;
		p++;
	}

	return *p == 0;
}

/** Test whether p is a prefix of s.
 *
 * Do a char-by-char comparison of two NULL-terminated strings
 * and determine if p is a prefix of s.
 *
 * @param s The string in which to look
 * @param p The string to check if it is a prefix of s
 *
 * @return true iff p is prefix of s else false
 *
 */
bool str_test_prefix(const char *s, const char *p)
{
	return _test_prefix(s, p);
}
1120
1121	/** Get a string suffix.
1122	*
1123	* Return a string suffix defined by the prefix length.
1124	*
1125	* @param s The string to get the suffix from.
1126	* @param prefix_length Number of prefix characters to ignore.
1127	*
1128	* @return String suffix.
1129	*
1130	*/
1131	const char str_suffix(const char s, size_t prefix_length)
1132	{
1133	size_t off = 0;
1134	size_t i = 0;
1135
1136	while (true) {
1137	str_decode(s, &off, STR_NO_LIMIT);
1138	i++;
1139
1140	if (i >= prefix_length)
1141	break;
1142	}
1143
1144	return s + off;
1145	}
1146
1147	/** Copy string as a sequence of bytes. */
1148	static void _str_cpy(char dest, const char src)
1149	{
1150	while (*src)
1151	(dest++) = (src++);
1152
1153	*dest = 0;
1154	}
1155
1156	/** Copy string as a sequence of bytes. */
1157	static void _str_cpyn(char dest, size_t size, const char src)
1158	{
1159	assert(dest && src && size);
1160
1161	if (!dest \|\| !src \|\| !size)
1162	return;
1163
1164	if (size == STR_NO_LIMIT)
1165	return _str_cpy(dest, src);
1166
1167	char *dest_top = dest + size - 1;
1168	assert(size == 1 \|\| dest < dest_top);
1169
1170	while (*src && dest < dest_top)
1171	(dest++) = (src++);
1172
1173	*dest = 0;
1174	}
1175
1176	/** Copy string.
1177	*
1178	* Copy source string @a src to destination buffer @a dest.
1179	* No more than @a size bytes are written. If the size of the output buffer
1180	* is at least one byte, the output string will always be well-formed, i.e.
1181	* null-terminated and containing only complete characters.
1182	*
1183	* @param dest Destination buffer.
1184	* @param count Size of the destination buffer (must be > 0).
1185	* @param src Source string.
1186	*
1187	*/
1188	void str_cpy(char dest, size_t size, const char src)
1189	{
1190	/* There must be space for a null terminator in the buffer. */
1191	assert(size > 0);
1192	assert(src != NULL);
1193	assert(dest != NULL);
1194	assert(size == STR_NO_LIMIT \|\| dest + size > dest);
1195
1196	/* Copy data. */
1197	_str_cpyn(dest, size, src);
1198
1199	/* In-place translate invalid bytes to U_SPECIAL. */
1200	_str_sanitize(dest, size, U_SPECIAL);
1201	}
1202
1203	/** Copy size-limited substring.
1204	*
1205	* Copy prefix of string @a src of max. size @a size to destination buffer
1206	* @a dest. No more than @a size bytes are written. The output string will
1207	* always be well-formed, i.e. null-terminated and containing only complete
1208	* characters.
1209	*
1210	* No more than @a n bytes are read from the input string, so it does not
1211	* have to be null-terminated.
1212	*
1213	* @param dest Destination buffer.
1214	* @param count Size of the destination buffer (must be > 0).
1215	* @param src Source string.
1216	* @param n Maximum number of bytes to read from @a src.
1217	*
1218	*/
1219	void str_ncpy(char dest, size_t size, const char src, size_t n)
1220	{
1221	/* There must be space for a null terminator in the buffer. */
1222	assert(size > 0);
1223	assert(src != NULL);
1224
1225	/* Copy data. */
1226	_str_cpyn(dest, min(size, n + 1), src);
1227
1228	/* In-place translate invalid bytes to U_SPECIAL. */
1229	_str_sanitize(dest, size, U_SPECIAL);
1230	}
1231
1232	/** Append one string to another.
1233	*
1234	* Append source string @a src to string in destination buffer @a dest.
1235	* Size of the destination buffer is @a dest. If the size of the output buffer
1236	* is at least one byte, the output string will always be well-formed, i.e.
1237	* null-terminated and containing only complete characters.
1238	*
1239	* @param dest Destination buffer.
1240	* @param count Size of the destination buffer.
1241	* @param src Source string.
1242	*/
1243	void str_append(char dest, size_t size, const char src)
1244	{
1245	assert(src != NULL);
1246	assert(dest != NULL);
1247	assert(size > 0);
1248	assert(size == STR_NO_LIMIT \|\| dest + size > dest);
1249
1250	size_t dstr_size = _str_nsize(dest, size);
1251	if (dstr_size < size) {
1252	_str_cpyn(dest + dstr_size, size - dstr_size, src);
1253	_str_sanitize(dest + dstr_size, size - dstr_size, U_SPECIAL);
1254	}
1255	}
1256
1257	/** Convert space-padded ASCII to string.
1258	*
1259	* Common legacy text encoding in hardware is 7-bit ASCII fitted into
1260	* a fixed-width byte buffer (bit 7 always zero), right-padded with spaces
1261	* (ASCII 0x20). Convert space-padded ascii to string representation.
1262	*
1263	* If the text does not fit into the destination buffer, the function converts
1264	* as many characters as possible and returns EOVERFLOW.
1265	*
1266	* If the text contains non-ASCII bytes (with bit 7 set), the whole string is
1267	* converted anyway and invalid characters are replaced with question marks
1268	* (U_SPECIAL) and the function returns EIO.
1269	*
1270	* Regardless of return value upon return @a dest will always be well-formed.
1271	*
1272	* @param dest Destination buffer
1273	* @param size Size of destination buffer
1274	* @param src Space-padded ASCII.
1275	* @param n Size of the source buffer in bytes.
1276	*
1277	* @return EOK on success, EOVERFLOW if the text does not fit
1278	* destination buffer, EIO if the text contains
1279	* non-ASCII bytes.
1280	*/
1281	errno_t spascii_to_str(char dest, size_t size, const uint8_t src, size_t n)
1282	{
1283	size_t len = 0;
1284
1285	/* Determine the length of the source string. */
1286	for (size_t i = 0; i < n; i++) {
1287	if (src[i] == 0)
1288	break;
1289
1290	if (src[i] != ' ')
1291	len = i + 1;
1292	}
1293
1294	errno_t result = EOK;
1295	size_t out_len = min(len, size - 1);
1296
1297	/* Copy characters */
1298	for (size_t i = 0; i < out_len; i++) {
1299	dest[i] = src[i];
1300
1301	if (dest[i] < 0) {
1302	dest[i] = U_SPECIAL;
1303	result = EIO;
1304	}
1305	}
1306
1307	dest[out_len] = 0;
1308
1309	if (out_len < len)
1310	return EOVERFLOW;
1311
1312	return result;
1313	}
1314
1315	/** Convert wide string to string.
1316	*
1317	* Convert wide string @a src to string. The output is written to the buffer
1318	* specified by @a dest and @a size. @a size must be non-zero and the string
1319	* written will always be well-formed.
1320	*
1321	* @param dest Destination buffer.
1322	* @param size Size of the destination buffer.
1323	* @param src Source wide string.
1324	*/
1325	void wstr_to_str(char dest, size_t size, const char32_t src)
1326	{
1327	char32_t ch;
1328	size_t src_idx;
1329	size_t dest_off;
1330
1331	/* There must be space for a null terminator in the buffer. */
1332	assert(size > 0);
1333
1334	src_idx = 0;
1335	dest_off = 0;
1336
1337	while ((ch = src[src_idx++]) != 0) {
1338	if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
1339	break;
1340	}
1341
1342	dest[dest_off] = '\0';
1343	}
1344
1345	/** Convert UTF16 string to string.
1346	*
1347	* Convert utf16 string @a src to string. The output is written to the buffer
1348	* specified by @a dest and @a size. @a size must be non-zero and the string
1349	* written will always be well-formed. Surrogate pairs also supported.
1350	*
1351	* @param dest Destination buffer.
1352	* @param size Size of the destination buffer.
1353	* @param src Source utf16 string.
1354	*
1355	* @return EOK, if success, an error code otherwise.
1356	*/
1357	errno_t utf16_to_str(char dest, size_t size, const uint16_t src)
1358	{
1359	size_t idx = 0, dest_off = 0;
1360	char32_t ch;
1361	errno_t rc = EOK;
1362
1363	/* There must be space for a null terminator in the buffer. */
1364	assert(size > 0);
1365
1366	while (src[idx]) {
1367	if ((src[idx] & 0xfc00) == 0xd800) {
1368	if (src[idx + 1] && (src[idx + 1] & 0xfc00) == 0xdc00) {
1369	ch = 0x10000;
1370	ch += (src[idx] & 0x03FF) << 10;
1371	ch += (src[idx + 1] & 0x03FF);
1372	idx += 2;
1373	} else
1374	break;
1375	} else {
1376	ch = src[idx];
1377	idx++;
1378	}
1379	rc = chr_encode(ch, dest, &dest_off, size - 1);
1380	if (rc != EOK)
1381	break;
1382	}
1383	dest[dest_off] = '\0';
1384	return rc;
1385	}
1386
1387	/** Convert string to UTF16 string.
1388	*
1389	* Convert string @a src to utf16 string. The output is written to the buffer
1390	* specified by @a dest and @a dlen. @a dlen must be non-zero and the string
1391	* written will always be well-formed. Surrogate pairs also supported.
1392	*
1393	* @param dest Destination buffer.
1394	* @param dlen Number of utf16 characters that fit in the destination buffer.
1395	* @param src Source string.
1396	*
1397	* @return EOK, if success, an error code otherwise.
1398	*/
1399	errno_t str_to_utf16(uint16_t dest, size_t dlen, const char src)
1400	{
1401	errno_t rc = EOK;
1402	size_t offset = 0;
1403	size_t idx = 0;
1404	char32_t c;
1405
1406	assert(dlen > 0);
1407
1408	while ((c = str_decode(src, &offset, STR_NO_LIMIT)) != 0) {
1409	if (c > 0x10000) {
1410	if (idx + 2 >= dlen - 1) {
1411	rc = EOVERFLOW;
1412	break;
1413	}
1414	c = (c - 0x10000);
1415	dest[idx] = 0xD800 \| (c >> 10);
1416	dest[idx + 1] = 0xDC00 \| (c & 0x3FF);
1417	idx++;
1418	} else {
1419	dest[idx] = c;
1420	}
1421
1422	idx++;
1423	if (idx >= dlen - 1) {
1424	rc = EOVERFLOW;
1425	break;
1426	}
1427	}
1428
1429	dest[idx] = '\0';
1430	return rc;
1431	}
1432
/** Get size of UTF-16 string.
 *
 * Count the 16-bit words that make up the UTF-16 string @a ustr,
 * not counting the terminating zero word.
 *
 * @param ustr UTF-16 string to consider.
 *
 * @return Number of words used by the UTF-16 string
 *
 */
size_t utf16_wsize(const uint16_t *ustr)
{
	const uint16_t *end = ustr;

	/* Walk up to the terminating zero word. */
	while (*end != 0)
		end++;

	return (size_t) (end - ustr);
}
1452
1453	/** Convert wide string to new string.
1454	*
1455	* Convert wide string @a src to string. Space for the new string is allocated
1456	* on the heap.
1457	*
1458	* @param src Source wide string.
1459	* @return New string.
1460	*/
1461	char wstr_to_astr(const char32_t src)
1462	{
1463	char dbuf[STR_BOUNDS(1)];
1464	char *str;
1465	char32_t ch;
1466
1467	size_t src_idx;
1468	size_t dest_off;
1469	size_t dest_size;
1470
1471	/* Compute size of encoded string. */
1472
1473	src_idx = 0;
1474	dest_size = 0;
1475
1476	while ((ch = src[src_idx++]) != 0) {
1477	dest_off = 0;
1478	if (chr_encode(ch, dbuf, &dest_off, STR_BOUNDS(1)) != EOK)
1479	break;
1480	dest_size += dest_off;
1481	}
1482
1483	str = malloc(dest_size + 1);
1484	if (str == NULL)
1485	return NULL;
1486
1487	/* Encode string. */
1488
1489	src_idx = 0;
1490	dest_off = 0;
1491
1492	while ((ch = src[src_idx++]) != 0) {
1493	if (chr_encode(ch, str, &dest_off, dest_size) != EOK)
1494	break;
1495	}
1496
1497	str[dest_size] = '\0';
1498	return str;
1499	}
1500
1501	/** Convert string to wide string.
1502	*
1503	* Convert string @a src to wide string. The output is written to the
1504	* buffer specified by @a dest and @a dlen. @a dlen must be non-zero
1505	* and the wide string written will always be null-terminated.
1506	*
1507	* @param dest Destination buffer.
1508	* @param dlen Length of destination buffer (number of wchars).
1509	* @param src Source string.
1510	*/
1511	void str_to_wstr(char32_t dest, size_t dlen, const char src)
1512	{
1513	size_t offset;
1514	size_t di;
1515	char32_t c;
1516
1517	assert(dlen > 0);
1518
1519	offset = 0;
1520	di = 0;
1521
1522	do {
1523	if (di >= dlen - 1)
1524	break;
1525
1526	c = str_decode(src, &offset, STR_NO_LIMIT);
1527	dest[di++] = c;
1528	} while (c != '\0');
1529
1530	dest[dlen - 1] = '\0';
1531	}
1532
/** Convert string to newly allocated wide string.
 *
 * Convert string @a str to a wide string. The null-terminated result
 * is allocated with calloc().
 *
 * @param str Source string.
 *
 * @return New wide string, or NULL if the allocation fails.
 */
char32_t *str_to_awstr(const char *str)
{
	/* One element per character plus one for the terminator. */
	const size_t count = str_length(str) + 1;

	char32_t *wstr = calloc(count, sizeof(char32_t));
	if (wstr != NULL)
		str_to_wstr(wstr, count, str);

	return wstr;
}
1551
/** Find the first occurrence of a byte in a string.
 *
 * Searching for the zero byte yields a pointer to the terminator.
 *
 * @return Pointer to the first matching byte, or NULL if there is none.
 */
static char *_strchr(const char *str, char c)
{
	for (; *str != c; str++) {
		/* Hit the terminator without finding c. */
		if (*str == 0)
			return NULL;
	}

	return (char *) str;
}
1559
1560	/** Find first occurence of character in string.
1561	*
1562	* @param str String to search.
1563	* @param ch Character to look for.
1564	*
1565	* @return Pointer to character in @a str or NULL if not found.
1566	*/
1567	char str_chr(const char str, char32_t ch)
1568	{
1569	/* Fast path for an ASCII character. */
1570	if (ascii_check(ch))
1571	return _strchr(str, ch);
1572
1573	/* Convert character to UTF-8. */
1574	char utf8[STR_BOUNDS(1) + 1];
1575	size_t offset = 0;
1576
1577	if (chr_encode(ch, utf8, &offset, sizeof(utf8)) != EOK \|\| offset == 0)
1578	return NULL;
1579
1580	utf8[offset] = '\0';
1581
1582	/* Find the first byte, then check if all of them are correct. */
1583	while (*str != 0) {
1584	str = _strchr(str, utf8[0]);
1585	if (!str)
1586	return NULL;
1587
1588	if (_test_prefix(str, utf8))
1589	return (char *) str;
1590
1591	str++;
1592	}
1593
1594	return NULL;
1595	}
1596
/** Find first occurrence of substring in string.
 *
 * @param hs Haystack (string)
 * @param n  Needle (substring to look for)
 *
 * @return Pointer to character in @a hs or @c NULL if not found.
 */
char *str_str(const char *hs, const char *n)
{
	const size_t hsize = _str_size(hs);
	const size_t nsize = _str_size(n);

	/* Stop as soon as the rest of the haystack is shorter than the needle. */
	for (size_t left = hsize; left >= nsize; left--, hs++) {
		if (_test_prefix(hs, n))
			return (char *) hs;
	}

	return NULL;
}
1619
/** Strip all trailing occurrences of byte @a c from a string, in place.
 *
 * A string that is empty or consists only of @a c becomes empty.
 */
static void _str_rtrim(char *str, char c)
{
	/* One past the last byte that has to be kept. */
	char *end = str;

	for (; *str != 0; str++) {
		if (*str != c)
			end = str + 1;
	}

	/* Truncate string; never writes beyond the original terminator. */
	*end = 0;
}
1634
1635	/** Removes specified trailing characters from a string.
1636	*
1637	* @param str String to remove from.
1638	* @param ch Character to remove.
1639	*/
1640	void str_rtrim(char *str, char32_t ch)
1641	{
1642	/* Fast path for the ASCII case. */
1643	if (ascii_check(ch)) {
1644	_str_rtrim(str, ch);
1645	return;
1646	}
1647
1648	size_t off = 0;
1649	size_t pos = 0;
1650	char32_t c;
1651	bool update_last_chunk = true;
1652	char *last_chunk = NULL;
1653
1654	while ((c = str_decode(str, &off, STR_NO_LIMIT))) {
1655	if (c != ch) {
1656	update_last_chunk = true;
1657	last_chunk = NULL;
1658	} else if (update_last_chunk) {
1659	update_last_chunk = false;
1660	last_chunk = (str + pos);
1661	}
1662	pos = off;
1663	}
1664
1665	if (last_chunk)
1666	*last_chunk = '\0';
1667	}
1668
/** Strip all leading occurrences of byte @a c from a string, in place. */
static void _str_ltrim(char *str, char c)
{
	char *first_kept = str;

	/* Skip the leading run of c. */
	for (; *first_kept == c; first_kept++)
		;

	/* Nothing to strip: leave the string untouched. */
	if (first_kept == str)
		return;

	/* Move the remainder to the front of the buffer. */
	_str_cpy(str, first_kept);
}
1679
1680	/** Removes specified leading characters from a string.
1681	*
1682	* @param str String to remove from.
1683	* @param ch Character to remove.
1684	*/
1685	void str_ltrim(char *str, char32_t ch)
1686	{
1687	/* Fast path for the ASCII case. */
1688	if (ascii_check(ch)) {
1689	_str_ltrim(str, ch);
1690	return;
1691	}
1692
1693	char32_t acc;
1694	size_t off = 0;
1695	size_t pos = 0;
1696	size_t str_sz = str_size(str);
1697
1698	while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
1699	if (acc != ch)
1700	break;
1701	else
1702	pos = off;
1703	}
1704
1705	if (pos > 0) {
1706	memmove(str, &str[pos], str_sz - pos);
1707	pos = str_sz - pos;
1708	str[pos] = '\0';
1709	}
1710	}
1711
/** Find the last occurrence of a byte in a string.
 *
 * The terminator itself is never matched.
 *
 * @return Pointer to the last matching byte, or NULL if there is none.
 */
static char *_str_rchr(const char *str, char c)
{
	const char *found = NULL;

	for (const char *p = str; *p != 0; p++) {
		if (*p == c)
			found = p;
	}

	return (char *) found;
}
1725
1726	/** Find last occurence of character in string.
1727	*
1728	* @param str String to search.
1729	* @param ch Character to look for.
1730	*
1731	* @return Pointer to character in @a str or NULL if not found.
1732	*/
1733	char str_rchr(const char str, char32_t ch)
1734	{
1735	if (ascii_check(ch))
1736	return _str_rchr(str, ch);
1737
1738	char32_t acc;
1739	size_t off = 0;
1740	size_t last = 0;
1741	const char *res = NULL;
1742
1743	while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
1744	if (acc == ch)
1745	res = (str + last);
1746	last = off;
1747	}
1748
1749	return (char *) res;
1750	}
1751
/** Insert a wide character into a wide string.
 *
 * Insert a wide character into a wide string at position
 * @a pos. The characters after the position are shifted.
 *
 * @param str     String to insert to.
 * @param ch      Character to insert to.
 * @param pos     Character index where to insert.
 * @param max_pos Characters in the buffer. The string may grow up to
 *                @a max_pos characters, with the terminator stored at
 *                index @a max_pos at most.
 *
 * @return True if the insertion was successful, false if the position
 *         is out of bounds or the buffer has no room for one more
 *         character.
 *
 */
bool wstr_linsert(char32_t *str, char32_t ch, size_t pos, size_t max_pos)
{
	size_t len = wstr_length(str);

	/*
	 * The string grows by one character, so it is the resulting length
	 * (not just the insert position) that must fit. Since pos <= len,
	 * this also rejects every position beyond the capacity.
	 */
	if ((pos > len) || (len >= max_pos))
		return false;

	/* Shift the tail, terminator included, one slot to the right. */
	for (size_t i = len + 1; i > pos; i--)
		str[i] = str[i - 1];

	str[pos] = ch;

	return true;
}
1781
/** Remove a wide character from a wide string.
 *
 * Remove the wide character at position @a pos from a wide string.
 * The characters after the position are shifted.
 *
 * @param str String to remove from.
 * @param pos Character index to remove.
 *
 * @return True if the removal was successful, false if the position
 *         is out of bounds.
 *
 */
bool wstr_remove(char32_t *str, size_t pos)
{
	const size_t len = wstr_length(str);

	if (len <= pos)
		return false;

	/* Pull the tail, terminator included, one slot to the left. */
	for (size_t i = pos; i < len; i++)
		str[i] = str[i + 1];

	return true;
}
1807
1808	/** Duplicate string.
1809	*
1810	* Allocate a new string and copy characters from the source
1811	* string into it. The duplicate string is allocated via sleeping
1812	* malloc(), thus this function can sleep in no memory conditions.
1813	*
1814	* The allocation cannot fail and the return value is always
1815	* a valid pointer. The duplicate string is always a well-formed
1816	* null-terminated UTF-8 string, but it can differ from the source
1817	* string on the byte level.
1818	*
1819	* @param src Source string.
1820	*
1821	* @return Duplicate string.
1822	*
1823	*/
1824	char str_dup(const char src)
1825	{
1826	size_t size = _str_size(src) + 1;
1827	char *dest = malloc(size);
1828	if (!dest)
1829	return NULL;
1830
1831	memcpy(dest, src, size);
1832	_str_sanitize(dest, size, U_SPECIAL);
1833	return dest;
1834	}
1835
1836	/** Duplicate string with size limit.
1837	*
1838	* Allocate a new string and copy up to @max_size bytes from the source
1839	* string into it. The duplicate string is allocated via sleeping
1840	* malloc(), thus this function can sleep in no memory conditions.
1841	* No more than @max_size + 1 bytes is allocated, but if the size
1842	* occupied by the source string is smaller than @max_size + 1,
1843	* less is allocated.
1844	*
1845	* The allocation cannot fail and the return value is always
1846	* a valid pointer. The duplicate string is always a well-formed
1847	* null-terminated UTF-8 string, but it can differ from the source
1848	* string on the byte level.
1849	*
1850	* @param src Source string.
1851	* @param n Maximum number of bytes to duplicate.
1852	*
1853	* @return Duplicate string.
1854	*
1855	*/
1856	char str_ndup(const char src, size_t n)
1857	{
1858	size_t size = _str_nsize(src, n);
1859
1860	char *dest = malloc(size + 1);
1861	if (!dest)
1862	return NULL;
1863
1864	memcpy(dest, src, size);
1865	_str_sanitize(dest, size, U_SPECIAL);
1866	dest[size] = 0;
1867	return dest;
1868	}
1869
/** Split string by delimiters.
 *
 * The input string is modified: the delimiter that ends the token
 * is overwritten with a null terminator.
 *
 * @param s     String to be tokenized. NULL yields NULL.
 * @param delim String with the delimiters.
 * @param next  Variable which will receive the pointer to the
 *              continuation of the string following the first
 *              occurrence of any of the delimiter characters.
 *              May be NULL.
 * @return Pointer to the prefix of @a s before the first
 *         delimiter character. NULL if no such prefix
 *         exists.
 */
char *str_tok(char *s, const char *delim, char **next)
{
	if (s == NULL)
		return NULL;

	const size_t len = str_size(s);
	size_t probe = 0;
	size_t tok_begin = 0;
	char32_t ch;

	/* Skip over leading delimiters. */
	for (;;) {
		ch = str_decode(s, &probe, len);
		if (ch == 0 || str_chr(delim, ch) == NULL)
			break;

		tok_begin = probe;
	}

	/* Skip over token characters, restarting at the token's first one. */
	size_t tok_end = tok_begin;
	probe = tok_begin;

	for (;;) {
		ch = str_decode(s, &probe, len);
		if (ch == 0 || str_chr(delim, ch) != NULL)
			break;

		tok_end = probe;
	}

	/*
	 * Continue after the delimiter that ended the token, or at the
	 * token's end when the string ran out first.
	 */
	if (next != NULL)
		*next = (ch != 0) ? &s[probe] : &s[tok_end];

	if (tok_begin == tok_end)
		return NULL; /* No more tokens. */

	/* Overwrite delimiter with NULL terminator. */
	s[tok_end] = '\0';
	return &s[tok_begin];
}
1916
/** Scale a value for display with a decimal (SI) order suffix.
 *
 * The value is divided by the largest power of 1000 that still leaves
 * more than four significant digits' worth of magnitude (e.g. anything
 * above 10^6 is reported in thousands with suffix 'k').
 *
 * @param val    Value to scale.
 * @param rv     Receives the scaled value.
 * @param suffix Receives the SI prefix letter matching the divisor:
 *               'k' (10^3), 'M' (10^6), 'G' (10^9), 'T' (10^12),
 *               'P' (10^15), 'E' (10^18), or ' ' when unscaled.
 */
void order_suffix(const uint64_t val, uint64_t *rv, char *suffix)
{
	if (val > UINT64_C(10000000000000000000)) {
		/* Divisor is 10^18, i.e. exa. */
		*rv = val / UINT64_C(1000000000000000000);
		*suffix = 'E';
	} else if (val > UINT64_C(1000000000000000000)) {
		/* Divisor is 10^15, i.e. peta. */
		*rv = val / UINT64_C(1000000000000000);
		*suffix = 'P';
	} else if (val > UINT64_C(1000000000000000)) {
		*rv = val / UINT64_C(1000000000000);
		*suffix = 'T';
	} else if (val > UINT64_C(1000000000000)) {
		*rv = val / UINT64_C(1000000000);
		*suffix = 'G';
	} else if (val > UINT64_C(1000000000)) {
		*rv = val / UINT64_C(1000000);
		*suffix = 'M';
	} else if (val > UINT64_C(1000000)) {
		*rv = val / UINT64_C(1000);
		*suffix = 'k';
	} else {
		*rv = val;
		*suffix = ' ';
	}
}
1942
/** Scale a byte count for display with a binary (IEC) unit suffix.
 *
 * The value is divided by the largest power of 1024 that still leaves
 * a quotient above 1024 (e.g. anything above 2^20 is reported in KiB).
 *
 * @param val    Value to scale.
 * @param rv     Receives the scaled value.
 * @param suffix Receives a pointer to a static unit string matching the
 *               divisor: "KiB" (2^10), "MiB" (2^20), "GiB" (2^30),
 *               "TiB" (2^40), "PiB" (2^50), or "B" when unscaled.
 * @param fixed  If true, the unscaled unit is padded with spaces to the
 *               same three-character width as the other units.
 */
void bin_order_suffix(const uint64_t val, uint64_t *rv, const char **suffix,
    bool fixed)
{
	if (val > UINT64_C(1152921504606846976)) {
		/* Divisor is 2^50, i.e. pebibytes. */
		*rv = val / UINT64_C(1125899906842624);
		*suffix = "PiB";
	} else if (val > UINT64_C(1125899906842624)) {
		*rv = val / UINT64_C(1099511627776);
		*suffix = "TiB";
	} else if (val > UINT64_C(1099511627776)) {
		*rv = val / UINT64_C(1073741824);
		*suffix = "GiB";
	} else if (val > UINT64_C(1073741824)) {
		*rv = val / UINT64_C(1048576);
		*suffix = "MiB";
	} else if (val > UINT64_C(1048576)) {
		*rv = val / UINT64_C(1024);
		*suffix = "KiB";
	} else {
		*rv = val;
		*suffix = fixed ? "B  " : "B";
	}
}
1969
1970	/** @}
1971	*/

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: mainline/common/str.c@ 9bf95d4

Download in other formats: