Context Navigation

source: mainline/common/str.c@ e2b417f

Visit:

Last change on this file since e2b417f was fdfb24e, checked in by Jiří Zárevúcky <zarevucky.jiri@…>, 20 months ago
Deduplicate string related functions
Property mode set to `100644`
File size: 37.2 KB

Line
1	/*
2	* Copyright (c) 2001-2004 Jakub Jermar
3	* Copyright (c) 2005 Martin Decky
4	* Copyright (c) 2008 Jiri Svoboda
5	* Copyright (c) 2011 Martin Sucha
6	* Copyright (c) 2011 Oleg Romanenko
7	* All rights reserved.
8	*
9	* Redistribution and use in source and binary forms, with or without
10	* modification, are permitted provided that the following conditions
11	* are met:
12	*
13	* - Redistributions of source code must retain the above copyright
14	* notice, this list of conditions and the following disclaimer.
15	* - Redistributions in binary form must reproduce the above copyright
16	* notice, this list of conditions and the following disclaimer in the
17	* documentation and/or other materials provided with the distribution.
18	* - The name of the author may not be used to endorse or promote products
19	* derived from this software without specific prior written permission.
20	*
21	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31	*/
32
33	/** @addtogroup libc
34	* @{
35	*/
36
37	/**
38	* @file
39	* @brief String functions.
40	*
41	* Strings and characters use the Universal Character Set (UCS). The standard
42	* strings, called just strings are encoded in UTF-8. Wide strings (encoded
43	* in UTF-32) are supported to a limited degree. A single character is
44	* represented as char32_t.@n
45	*
46	* Overview of the terminology:@n
47	*
48	* Term Meaning
49	* -------------------- ----------------------------------------------------
50	* byte 8 bits stored in uint8_t (unsigned 8 bit integer)
51	*
52	* character UTF-32 encoded Unicode character, stored in char32_t
53	* (unsigned 32 bit integer), code points 0 .. 1114111
54	* are valid
55	*
56	* ASCII character 7 bit encoded ASCII character, stored in char
57	* (usually signed 8 bit integer), code points 0 .. 127
58	* are valid
59	*
60	* string UTF-8 encoded NULL-terminated Unicode string, char *
61	*
62	* wide string UTF-32 encoded NULL-terminated Unicode string,
63	* char32_t *
64	*
65	* [wide] string size number of BYTES in a [wide] string (excluding
66	* the NULL-terminator), size_t
67	*
68	* [wide] string length number of CHARACTERS in a [wide] string (excluding
69	* the NULL-terminator), size_t
70	*
71	* [wide] string width number of display cells on a monospace display taken
72	* by a [wide] string, size_t
73	*
74	*
75	* Overview of string metrics:@n
76	*
77	* Metric Abbrev. Type Meaning
78	* ------ ------ ------ -------------------------------------------------
79	* size n size_t number of BYTES in a string (excluding the
80	* NULL-terminator)
81	*
82	* length l size_t number of CHARACTERS in a string (excluding the
83	* null terminator)
84	*
85	* width w size_t number of display cells on a monospace display
86	* taken by a string
87	*
88	*
89	* Function naming prefixes:@n
90	*
91	* chr_ operate on characters
92	* ascii_ operate on ASCII characters
93	* str_ operate on strings
94	* wstr_ operate on wide strings
95	*
96	* [w]str_[n|l|w] operate on a prefix limited by size, length
97	* or width
98	*
99	*
100	* A specific character inside a [wide] string can be referred to by:@n
101	*
102	* pointer (char *, char32_t *)
103	* byte offset (size_t)
104	* character index (size_t)
105	*
106	*/
107
108	#include <str.h>
109
110	#include <assert.h>
111	#include <ctype.h>
112	#include <errno.h>
113	#include <stdbool.h>
114	#include <stddef.h>
115	#include <stdint.h>
116	#include <stdlib.h>
117
118	#include <align.h>
119	#include <mem.h>
120
/** Byte mask consisting of the lowest @a n bits (out of 8), 0 <= n <= 8 */
#define LO_MASK_8(n)  ((uint8_t) ((1u << (n)) - 1))

/** Mask consisting of the lowest @a n bits (out of 32), 0 <= n < 32 */
#define LO_MASK_32(n)  ((uint32_t) ((UINT32_C(1) << (n)) - 1))

/** Byte mask consisting of the highest @a n bits (out of 8), 0 <= n <= 8
 *
 * The complement is cast back to uint8_t: operator ~ promotes its operand
 * to int, so without the cast the result would be a negative int rather
 * than a byte value.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6
132
133	/** Decode a single character from a string.
134	*
135	* Decode a single character from a string of size @a size. Decoding starts
136	* at @a offset and this offset is moved to the beginning of the next
137	* character. In case of decoding error, offset generally advances at least
138	* by one. However, offset is never moved beyond size.
139	*
140	* @param str String (not necessarily NULL-terminated).
141	* @param offset Byte offset in string where to start decoding.
142	* @param size Size of the string (in bytes).
143	*
144	* @return Value of decoded character, U_SPECIAL on decoding error or
145	* NULL if attempt to decode beyond @a size.
146	*
147	*/
148	char32_t str_decode(const char str, size_t offset, size_t size)
149	{
150	if (*offset + 1 > size)
151	return 0;
152
153	/* First byte read from string */
154	uint8_t b0 = (uint8_t) str[(*offset)++];
155
156	/* Determine code length */
157
158	unsigned int b0_bits; /* Data bits in first byte */
159	unsigned int cbytes; /* Number of continuation bytes */
160
161	if ((b0 & 0x80) == 0) {
162	/* 0xxxxxxx (Plain ASCII) */
163	b0_bits = 7;
164	cbytes = 0;
165	} else if ((b0 & 0xe0) == 0xc0) {
166	/* 110xxxxx 10xxxxxx */
167	b0_bits = 5;
168	cbytes = 1;
169	} else if ((b0 & 0xf0) == 0xe0) {
170	/* 1110xxxx 10xxxxxx 10xxxxxx */
171	b0_bits = 4;
172	cbytes = 2;
173	} else if ((b0 & 0xf8) == 0xf0) {
174	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
175	b0_bits = 3;
176	cbytes = 3;
177	} else {
178	/* 10xxxxxx -- unexpected continuation byte */
179	return U_SPECIAL;
180	}
181
182	if (*offset + cbytes > size)
183	return U_SPECIAL;
184
185	char32_t ch = b0 & LO_MASK_8(b0_bits);
186
187	/* Decode continuation bytes */
188	while (cbytes > 0) {
189	uint8_t b = (uint8_t) str[(*offset)++];
190
191	/* Must be 10xxxxxx */
192	if ((b & 0xc0) != 0x80)
193	return U_SPECIAL;
194
195	/* Shift data bits to ch */
196	ch = (ch << CONT_BITS) \| (char32_t) (b & LO_MASK_8(CONT_BITS));
197	cbytes--;
198	}
199
200	return ch;
201	}
202
203	/** Decode a single character from a string to the left.
204	*
205	* Decode a single character from a string of size @a size. Decoding starts
206	* at @a offset and this offset is moved to the beginning of the previous
207	* character. In case of decoding error, offset generally decreases at least
208	* by one. However, offset is never moved before 0.
209	*
210	* @param str String (not necessarily NULL-terminated).
211	* @param offset Byte offset in string where to start decoding.
212	* @param size Size of the string (in bytes).
213	*
214	* @return Value of decoded character, U_SPECIAL on decoding error or
215	* NULL if attempt to decode beyond @a start of str.
216	*
217	*/
218	char32_t str_decode_reverse(const char str, size_t offset, size_t size)
219	{
220	if (*offset == 0)
221	return 0;
222
223	size_t processed = 0;
224	/* Continue while continuation bytes found */
225	while (*offset > 0 && processed < 4) {
226	uint8_t b = (uint8_t) str[--(*offset)];
227
228	if (processed == 0 && (b & 0x80) == 0) {
229	/* 0xxxxxxx (Plain ASCII) */
230	return b & 0x7f;
231	} else if ((b & 0xe0) == 0xc0 \|\| (b & 0xf0) == 0xe0 \|\|
232	(b & 0xf8) == 0xf0) {
233	/* Start byte */
234	size_t start_offset = *offset;
235	return str_decode(str, &start_offset, size);
236	} else if ((b & 0xc0) != 0x80) {
237	/* Not a continuation byte */
238	return U_SPECIAL;
239	}
240	processed++;
241	}
242	/* Too many continuation bytes */
243	return U_SPECIAL;
244	}
245
246	/** Encode a single character to string representation.
247	*
248	* Encode a single character to string representation (i.e. UTF-8) and store
249	* it into a buffer at @a offset. Encoding starts at @a offset and this offset
250	* is moved to the position where the next character can be written to.
251	*
252	* @param ch Input character.
253	* @param str Output buffer.
254	* @param offset Byte offset where to start writing.
255	* @param size Size of the output buffer (in bytes).
256	*
257	* @return EOK if the character was encoded successfully, EOVERFLOW if there
258	* was not enough space in the output buffer or EINVAL if the character
259	* code was invalid.
260	*/
261	errno_t chr_encode(const char32_t ch, char str, size_t offset, size_t size)
262	{
263	if (*offset >= size)
264	return EOVERFLOW;
265
266	if (!chr_check(ch))
267	return EINVAL;
268
269	/*
270	* Unsigned version of ch (bit operations should only be done
271	* on unsigned types).
272	*/
273	uint32_t cc = (uint32_t) ch;
274
275	/* Determine how many continuation bytes are needed */
276
277	unsigned int b0_bits; /* Data bits in first byte */
278	unsigned int cbytes; /* Number of continuation bytes */
279
280	if ((cc & ~LO_MASK_32(7)) == 0) {
281	b0_bits = 7;
282	cbytes = 0;
283	} else if ((cc & ~LO_MASK_32(11)) == 0) {
284	b0_bits = 5;
285	cbytes = 1;
286	} else if ((cc & ~LO_MASK_32(16)) == 0) {
287	b0_bits = 4;
288	cbytes = 2;
289	} else if ((cc & ~LO_MASK_32(21)) == 0) {
290	b0_bits = 3;
291	cbytes = 3;
292	} else {
293	/* Codes longer than 21 bits are not supported */
294	return EINVAL;
295	}
296
297	/* Check for available space in buffer */
298	if (*offset + cbytes >= size)
299	return EOVERFLOW;
300
301	/* Encode continuation bytes */
302	unsigned int i;
303	for (i = cbytes; i > 0; i--) {
304	str[*offset + i] = 0x80 \| (cc & LO_MASK_32(CONT_BITS));
305	cc = cc >> CONT_BITS;
306	}
307
308	/* Encode first byte */
309	str[*offset] = (cc & LO_MASK_32(b0_bits)) \| HI_MASK_8(8 - b0_bits - 1);
310
311	/* Advance offset */
312	*offset += cbytes + 1;
313
314	return EOK;
315	}
316
/** Get size of string.
 *
 * Count the bytes of @a str up to, but not including, the NUL terminator.
 *
 * @param str String to consider.
 *
 * @return Number of bytes used by the string.
 *
 */
size_t str_size(const char *str)
{
	const char *end = str;

	/* Advance to the terminator; the distance travelled is the size */
	while (*end != 0)
		end++;

	return (size_t) (end - str);
}
336
/** Get size of wide string.
 *
 * Compute the number of bytes occupied by the wide string @a str
 * (excluding the terminator) as its length times sizeof(char32_t).
 *
 * @param str Wide string to consider.
 *
 * @return Number of bytes used by the wide string.
 *
 */
size_t wstr_size(const char32_t *str)
{
	const size_t chars = wstr_length(str);

	return chars * sizeof(char32_t);
}
351
352	/** Get size of string with length limit.
353	*
354	* Get the number of bytes which are used by up to @a max_len first
355	* characters in the string @a str. If @a max_len is greater than
356	* the length of @a str, the entire string is measured (excluding the
357	* NULL-terminator).
358	*
359	* @param str String to consider.
360	* @param max_len Maximum number of characters to measure.
361	*
362	* @return Number of bytes used by the characters.
363	*
364	*/
365	size_t str_lsize(const char *str, size_t max_len)
366	{
367	size_t len = 0;
368	size_t offset = 0;
369
370	while (len < max_len) {
371	if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
372	break;
373
374	len++;
375	}
376
377	return offset;
378	}
379
/** Get size of string with size limit.
 *
 * Get the number of bytes which are used by the string @a str
 * (excluding the NUL terminator), but no more than @a max_size bytes.
 * At most @a max_size bytes of @a str are examined, so the buffer does
 * not need to be NUL-terminated as long as it holds @a max_size bytes.
 *
 * @param str      String to consider.
 * @param max_size Maximum number of bytes to measure.
 *
 * @return Number of bytes used by the string.
 *
 */
size_t str_nsize(const char *str, size_t max_size)
{
	size_t size = 0;

	/* Test the limit first so that str[max_size] is never read */
	while ((size < max_size) && (str[size] != 0))
		size++;

	return size;
}
400
/** Get size of wide string with size limit.
 *
 * Get the number of bytes which are used by the wide string @a str
 * (excluding the terminator), considering no more than @a max_size bytes.
 *
 * @param str      Wide string to consider.
 * @param max_size Maximum number of bytes to measure.
 *
 * @return Number of bytes used by the wide string.
 *
 */
size_t wstr_nsize(const char32_t *str, size_t max_size)
{
	const size_t chars = wstr_nlength(str, max_size);

	return chars * sizeof(char32_t);
}
416
/** Get size of wide string with length limit.
 *
 * Get the number of bytes which are used by up to @a max_len first
 * wide characters in the wide string @a str. If @a max_len is greater than
 * the length of @a str, the entire wide string is measured (excluding the
 * terminator).
 *
 * @param str     Wide string to consider.
 * @param max_len Maximum number of wide characters to measure.
 *
 * @return Number of bytes used by the wide characters.
 *
 */
size_t wstr_lsize(const char32_t *str, size_t max_len)
{
	/* Express the character limit as a byte limit for wstr_nlength() */
	const size_t byte_limit = max_len * sizeof(char32_t);
	const size_t chars = wstr_nlength(str, byte_limit);

	return chars * sizeof(char32_t);
}
434
435	/** Get number of characters in a string.
436	*
437	* @param str NULL-terminated string.
438	*
439	* @return Number of characters in string.
440	*
441	*/
442	size_t str_length(const char *str)
443	{
444	size_t len = 0;
445	size_t offset = 0;
446
447	while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
448	len++;
449
450	return len;
451	}
452
/** Get number of characters in a wide string.
 *
 * @param wstr Zero-terminated wide string.
 *
 * @return Number of characters in @a wstr (excluding the terminator).
 *
 */
size_t wstr_length(const char32_t *wstr)
{
	size_t idx = 0;

	/* The index of the terminating zero equals the length */
	while (wstr[idx] != 0)
		idx++;

	return idx;
}
469
/** Get number of characters in a string with size limit.
 *
 * Decodes @a str with @a size as the buffer size and counts characters
 * until str_decode() returns 0.
 *
 * @param str  String to consider.
 * @param size Maximum number of bytes to consider.
 *
 * @return Number of characters in string.
 *
 */
size_t str_nlength(const char *str, size_t size)
{
	size_t pos = 0;
	size_t count = 0;

	for (;;) {
		if (str_decode(str, &pos, size) == 0)
			break;

		count++;
	}

	return count;
}
488
/** Get number of characters in a wide string with size limit.
 *
 * The byte limit @a size is first aligned down to a multiple of
 * sizeof(char32_t), so only whole wide characters are considered.
 *
 * @param str  Wide string to consider.
 * @param size Maximum number of bytes to consider.
 *
 * @return Number of characters in the wide string.
 *
 */
size_t wstr_nlength(const char32_t *str, size_t size)
{
	const size_t limit = ALIGN_DOWN(size, sizeof(char32_t));
	size_t count = 0;

	for (size_t used = 0; used < limit; used += sizeof(char32_t)) {
		if (str[count] == 0)
			break;

		count++;
	}

	return count;
}
510
/** Get character display width on a character cell display.
 *
 * The argument is not inspected; the result is always one cell.
 *
 * @param ch Character
 * @return Width of character in cells.
 */
size_t chr_width(char32_t ch)
{
	(void) ch;

	return 1;
}
520
521	/** Get string display width on a character cell display.
522	*
523	* @param str String
524	* @return Width of string in cells.
525	*/
526	size_t str_width(const char *str)
527	{
528	size_t width = 0;
529	size_t offset = 0;
530	char32_t ch;
531
532	while ((ch = str_decode(str, &offset, STR_NO_LIMIT)) != 0)
533	width += chr_width(ch);
534
535	return width;
536	}
537
/** Check whether character is plain ASCII.
 *
 * @param ch Character to test.
 *
 * @return True if @a ch is in the range 0 .. 127.
 *
 */
bool ascii_check(char32_t ch)
{
	return ch <= 0x7f;
}
550
/** Check whether character is valid.
 *
 * Only the upper bound is tested; any value up to 0x10FFFF (1114111)
 * is accepted.
 *
 * @param ch Character to test.
 *
 * @return True if @a ch does not exceed the highest Unicode code point.
 *
 */
bool chr_check(char32_t ch)
{
	return ch <= 0x10ffff;
}
563
564	/** Compare two NULL terminated strings.
565	*
566	* Do a char-by-char comparison of two NULL-terminated strings.
567	* The strings are considered equal iff their length is equal
568	* and both strings consist of the same sequence of characters.
569	*
570	* A string S1 is less than another string S2 if it has a character with
571	* lower value at the first character position where the strings differ.
572	* If the strings differ in length, the shorter one is treated as if
573	* padded by characters with a value of zero.
574	*
575	* @param s1 First string to compare.
576	* @param s2 Second string to compare.
577	*
578	* @return 0 if the strings are equal, -1 if the first is less than the second,
579	* 1 if the second is less than the first.
580	*
581	*/
582	int str_cmp(const char s1, const char s2)
583	{
584	char32_t c1 = 0;
585	char32_t c2 = 0;
586
587	size_t off1 = 0;
588	size_t off2 = 0;
589
590	while (true) {
591	c1 = str_decode(s1, &off1, STR_NO_LIMIT);
592	c2 = str_decode(s2, &off2, STR_NO_LIMIT);
593
594	if (c1 < c2)
595	return -1;
596
597	if (c1 > c2)
598	return 1;
599
600	if (c1 == 0 \|\| c2 == 0)
601	break;
602	}
603
604	return 0;
605	}
606
607	/** Compare two NULL terminated strings with length limit.
608	*
609	* Do a char-by-char comparison of two NULL-terminated strings.
610	* The strings are considered equal iff
611	* min(str_length(s1), max_len) == min(str_length(s2), max_len)
612	* and both strings consist of the same sequence of characters,
613	* up to max_len characters.
614	*
615	* A string S1 is less than another string S2 if it has a character with
616	* lower value at the first character position where the strings differ.
617	* If the strings differ in length, the shorter one is treated as if
618	* padded by characters with a value of zero. Only the first max_len
619	* characters are considered.
620	*
621	* @param s1 First string to compare.
622	* @param s2 Second string to compare.
623	* @param max_len Maximum number of characters to consider.
624	*
625	* @return 0 if the strings are equal, -1 if the first is less than the second,
626	* 1 if the second is less than the first.
627	*
628	*/
629	int str_lcmp(const char s1, const char s2, size_t max_len)
630	{
631	char32_t c1 = 0;
632	char32_t c2 = 0;
633
634	size_t off1 = 0;
635	size_t off2 = 0;
636
637	size_t len = 0;
638
639	while (true) {
640	if (len >= max_len)
641	break;
642
643	c1 = str_decode(s1, &off1, STR_NO_LIMIT);
644	c2 = str_decode(s2, &off2, STR_NO_LIMIT);
645
646	if (c1 < c2)
647	return -1;
648
649	if (c1 > c2)
650	return 1;
651
652	if (c1 == 0 \|\| c2 == 0)
653	break;
654
655	++len;
656	}
657
658	return 0;
659
660	}
661
662	/** Compare two NULL terminated strings in case-insensitive manner.
663	*
664	* Do a char-by-char comparison of two NULL-terminated strings.
665	* The strings are considered equal iff their length is equal
666	* and both strings consist of the same sequence of characters
667	* when converted to lower case.
668	*
669	* A string S1 is less than another string S2 if it has a character with
670	* lower value at the first character position where the strings differ.
671	* If the strings differ in length, the shorter one is treated as if
672	* padded by characters with a value of zero.
673	*
674	* @param s1 First string to compare.
675	* @param s2 Second string to compare.
676	*
677	* @return 0 if the strings are equal, -1 if the first is less than the second,
678	* 1 if the second is less than the first.
679	*
680	*/
681	int str_casecmp(const char s1, const char s2)
682	{
683	char32_t c1 = 0;
684	char32_t c2 = 0;
685
686	size_t off1 = 0;
687	size_t off2 = 0;
688
689	while (true) {
690	c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
691	c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
692
693	if (c1 < c2)
694	return -1;
695
696	if (c1 > c2)
697	return 1;
698
699	if (c1 == 0 \|\| c2 == 0)
700	break;
701	}
702
703	return 0;
704	}
705
706	/** Compare two NULL terminated strings with length limit in case-insensitive
707	* manner.
708	*
709	* Do a char-by-char comparison of two NULL-terminated strings.
710	* The strings are considered equal iff
711	* min(str_length(s1), max_len) == min(str_length(s2), max_len)
712	* and both strings consist of the same sequence of characters,
713	* up to max_len characters.
714	*
715	* A string S1 is less than another string S2 if it has a character with
716	* lower value at the first character position where the strings differ.
717	* If the strings differ in length, the shorter one is treated as if
718	* padded by characters with a value of zero. Only the first max_len
719	* characters are considered.
720	*
721	* @param s1 First string to compare.
722	* @param s2 Second string to compare.
723	* @param max_len Maximum number of characters to consider.
724	*
725	* @return 0 if the strings are equal, -1 if the first is less than the second,
726	* 1 if the second is less than the first.
727	*
728	*/
729	int str_lcasecmp(const char s1, const char s2, size_t max_len)
730	{
731	char32_t c1 = 0;
732	char32_t c2 = 0;
733
734	size_t off1 = 0;
735	size_t off2 = 0;
736
737	size_t len = 0;
738
739	while (true) {
740	if (len >= max_len)
741	break;
742
743	c1 = tolower(str_decode(s1, &off1, STR_NO_LIMIT));
744	c2 = tolower(str_decode(s2, &off2, STR_NO_LIMIT));
745
746	if (c1 < c2)
747	return -1;
748
749	if (c1 > c2)
750	return 1;
751
752	if (c1 == 0 \|\| c2 == 0)
753	break;
754
755	++len;
756	}
757
758	return 0;
759
760	}
761
762	/** Test whether p is a prefix of s.
763	*
764	* Do a char-by-char comparison of two NULL-terminated strings
765	* and determine if p is a prefix of s.
766	*
767	* @param s The string in which to look
768	* @param p The string to check if it is a prefix of s
769	*
770	* @return true iff p is prefix of s else false
771	*
772	*/
773	bool str_test_prefix(const char s, const char p)
774	{
775	char32_t c1 = 0;
776	char32_t c2 = 0;
777
778	size_t off1 = 0;
779	size_t off2 = 0;
780
781	while (true) {
782	c1 = str_decode(s, &off1, STR_NO_LIMIT);
783	c2 = str_decode(p, &off2, STR_NO_LIMIT);
784
785	if (c2 == 0)
786	return true;
787
788	if (c1 != c2)
789	return false;
790
791	if (c1 == 0)
792	break;
793	}
794
795	return false;
796	}
797
798	/** Get a string suffix.
799	*
800	* Return a string suffix defined by the prefix length.
801	*
802	* @param s The string to get the suffix from.
803	* @param prefix_length Number of prefix characters to ignore.
804	*
805	* @return String suffix.
806	*
807	*/
808	const char str_suffix(const char s, size_t prefix_length)
809	{
810	size_t off = 0;
811	size_t i = 0;
812
813	while (true) {
814	str_decode(s, &off, STR_NO_LIMIT);
815	i++;
816
817	if (i >= prefix_length)
818	break;
819	}
820
821	return s + off;
822	}
823
824	/** Copy string.
825	*
826	* Copy source string @a src to destination buffer @a dest.
827	* No more than @a size bytes are written. If the size of the output buffer
828	* is at least one byte, the output string will always be well-formed, i.e.
829	* null-terminated and containing only complete characters.
830	*
831	* @param dest Destination buffer.
832	* @param count Size of the destination buffer (must be > 0).
833	* @param src Source string.
834	*
835	*/
836	void str_cpy(char dest, size_t size, const char src)
837	{
838	/* There must be space for a null terminator in the buffer. */
839	assert(size > 0);
840	assert(src != NULL);
841
842	size_t src_off = 0;
843	size_t dest_off = 0;
844
845	char32_t ch;
846	while ((ch = str_decode(src, &src_off, STR_NO_LIMIT)) != 0) {
847	if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
848	break;
849	}
850
851	dest[dest_off] = '\0';
852	}
853
854	/** Copy size-limited substring.
855	*
856	* Copy prefix of string @a src of max. size @a size to destination buffer
857	* @a dest. No more than @a size bytes are written. The output string will
858	* always be well-formed, i.e. null-terminated and containing only complete
859	* characters.
860	*
861	* No more than @a n bytes are read from the input string, so it does not
862	* have to be null-terminated.
863	*
864	* @param dest Destination buffer.
865	* @param count Size of the destination buffer (must be > 0).
866	* @param src Source string.
867	* @param n Maximum number of bytes to read from @a src.
868	*
869	*/
870	void str_ncpy(char dest, size_t size, const char src, size_t n)
871	{
872	/* There must be space for a null terminator in the buffer. */
873	assert(size > 0);
874
875	size_t src_off = 0;
876	size_t dest_off = 0;
877
878	char32_t ch;
879	while ((ch = str_decode(src, &src_off, n)) != 0) {
880	if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
881	break;
882	}
883
884	dest[dest_off] = '\0';
885	}
886
/** Append one string to another.
 *
 * Append source string @a src to the string in destination buffer
 * @a dest. Size of the destination buffer is @a size. The append is done
 * by str_cpy() into the space behind the current contents; if the current
 * contents already occupy @a size bytes or more, nothing is written.
 *
 * @param dest Destination buffer holding a NUL-terminated string.
 * @param size Size of the destination buffer.
 * @param src  Source string.
 */
void str_append(char *dest, size_t size, const char *src)
{
	const size_t used = str_size(dest);

	/* Only proceed when there is room left behind the current contents */
	if (used < size)
		str_cpy(dest + used, size - used, src);
}
908
909	/** Convert space-padded ASCII to string.
910	*
911	* Common legacy text encoding in hardware is 7-bit ASCII fitted into
912	* a fixed-width byte buffer (bit 7 always zero), right-padded with spaces
913	* (ASCII 0x20). Convert space-padded ascii to string representation.
914	*
915	* If the text does not fit into the destination buffer, the function converts
916	* as many characters as possible and returns EOVERFLOW.
917	*
918	* If the text contains non-ASCII bytes (with bit 7 set), the whole string is
919	* converted anyway and invalid characters are replaced with question marks
920	* (U_SPECIAL) and the function returns EIO.
921	*
922	* Regardless of return value upon return @a dest will always be well-formed.
923	*
924	* @param dest Destination buffer
925	* @param size Size of destination buffer
926	* @param src Space-padded ASCII.
927	* @param n Size of the source buffer in bytes.
928	*
929	* @return EOK on success, EOVERFLOW if the text does not fit
930	* destination buffer, EIO if the text contains
931	* non-ASCII bytes.
932	*/
933	errno_t spascii_to_str(char dest, size_t size, const uint8_t src, size_t n)
934	{
935	size_t sidx;
936	size_t didx;
937	size_t dlast;
938	uint8_t byte;
939	errno_t rc;
940	errno_t result;
941
942	/* There must be space for a null terminator in the buffer. */
943	assert(size > 0);
944	result = EOK;
945
946	didx = 0;
947	dlast = 0;
948	for (sidx = 0; sidx < n; ++sidx) {
949	byte = src[sidx];
950	if (!ascii_check(byte)) {
951	byte = U_SPECIAL;
952	result = EIO;
953	}
954
955	rc = chr_encode(byte, dest, &didx, size - 1);
956	if (rc != EOK) {
957	assert(rc == EOVERFLOW);
958	dest[didx] = '\0';
959	return rc;
960	}
961
962	/* Remember dest index after last non-empty character */
963	if (byte != 0x20)
964	dlast = didx;
965	}
966
967	/* Terminate string after last non-empty character */
968	dest[dlast] = '\0';
969	return result;
970	}
971
972	/** Convert wide string to string.
973	*
974	* Convert wide string @a src to string. The output is written to the buffer
975	* specified by @a dest and @a size. @a size must be non-zero and the string
976	* written will always be well-formed.
977	*
978	* @param dest Destination buffer.
979	* @param size Size of the destination buffer.
980	* @param src Source wide string.
981	*/
982	void wstr_to_str(char dest, size_t size, const char32_t src)
983	{
984	char32_t ch;
985	size_t src_idx;
986	size_t dest_off;
987
988	/* There must be space for a null terminator in the buffer. */
989	assert(size > 0);
990
991	src_idx = 0;
992	dest_off = 0;
993
994	while ((ch = src[src_idx++]) != 0) {
995	if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
996	break;
997	}
998
999	dest[dest_off] = '\0';
1000	}
1001
1002	/** Convert UTF16 string to string.
1003	*
1004	* Convert utf16 string @a src to string. The output is written to the buffer
1005	* specified by @a dest and @a size. @a size must be non-zero and the string
1006	* written will always be well-formed. Surrogate pairs also supported.
1007	*
1008	* @param dest Destination buffer.
1009	* @param size Size of the destination buffer.
1010	* @param src Source utf16 string.
1011	*
1012	* @return EOK, if success, an error code otherwise.
1013	*/
1014	errno_t utf16_to_str(char dest, size_t size, const uint16_t src)
1015	{
1016	size_t idx = 0, dest_off = 0;
1017	char32_t ch;
1018	errno_t rc = EOK;
1019
1020	/* There must be space for a null terminator in the buffer. */
1021	assert(size > 0);
1022
1023	while (src[idx]) {
1024	if ((src[idx] & 0xfc00) == 0xd800) {
1025	if (src[idx + 1] && (src[idx + 1] & 0xfc00) == 0xdc00) {
1026	ch = 0x10000;
1027	ch += (src[idx] & 0x03FF) << 10;
1028	ch += (src[idx + 1] & 0x03FF);
1029	idx += 2;
1030	} else
1031	break;
1032	} else {
1033	ch = src[idx];
1034	idx++;
1035	}
1036	rc = chr_encode(ch, dest, &dest_off, size - 1);
1037	if (rc != EOK)
1038	break;
1039	}
1040	dest[dest_off] = '\0';
1041	return rc;
1042	}
1043
1044	/** Convert string to UTF16 string.
1045	*
1046	* Convert string @a src to utf16 string. The output is written to the buffer
1047	* specified by @a dest and @a dlen. @a dlen must be non-zero and the string
1048	* written will always be well-formed. Surrogate pairs also supported.
1049	*
1050	* @param dest Destination buffer.
1051	* @param dlen Number of utf16 characters that fit in the destination buffer.
1052	* @param src Source string.
1053	*
1054	* @return EOK, if success, an error code otherwise.
1055	*/
1056	errno_t str_to_utf16(uint16_t dest, size_t dlen, const char src)
1057	{
1058	errno_t rc = EOK;
1059	size_t offset = 0;
1060	size_t idx = 0;
1061	char32_t c;
1062
1063	assert(dlen > 0);
1064
1065	while ((c = str_decode(src, &offset, STR_NO_LIMIT)) != 0) {
1066	if (c > 0x10000) {
1067	if (idx + 2 >= dlen - 1) {
1068	rc = EOVERFLOW;
1069	break;
1070	}
1071	c = (c - 0x10000);
1072	dest[idx] = 0xD800 \| (c >> 10);
1073	dest[idx + 1] = 0xDC00 \| (c & 0x3FF);
1074	idx++;
1075	} else {
1076	dest[idx] = c;
1077	}
1078
1079	idx++;
1080	if (idx >= dlen - 1) {
1081	rc = EOVERFLOW;
1082	break;
1083	}
1084	}
1085
1086	dest[idx] = '\0';
1087	return rc;
1088	}
1089
/** Get size of UTF-16 string.
 *
 * Count the 16-bit words of the UTF-16 string @a ustr up to, but not
 * including, the terminating zero word.
 *
 * @param ustr UTF-16 string to consider.
 *
 * @return Number of words used by the UTF-16 string.
 *
 */
size_t utf16_wsize(const uint16_t *ustr)
{
	size_t idx = 0;

	/* The index of the terminating zero equals the word count */
	while (ustr[idx] != 0)
		idx++;

	return idx;
}
1109
1110	/** Convert wide string to new string.
1111	*
1112	* Convert wide string @a src to string. Space for the new string is allocated
1113	* on the heap.
1114	*
1115	* @param src Source wide string.
1116	* @return New string.
1117	*/
1118	char wstr_to_astr(const char32_t src)
1119	{
1120	char dbuf[STR_BOUNDS(1)];
1121	char *str;
1122	char32_t ch;
1123
1124	size_t src_idx;
1125	size_t dest_off;
1126	size_t dest_size;
1127
1128	/* Compute size of encoded string. */
1129
1130	src_idx = 0;
1131	dest_size = 0;
1132
1133	while ((ch = src[src_idx++]) != 0) {
1134	dest_off = 0;
1135	if (chr_encode(ch, dbuf, &dest_off, STR_BOUNDS(1)) != EOK)
1136	break;
1137	dest_size += dest_off;
1138	}
1139
1140	str = malloc(dest_size + 1);
1141	if (str == NULL)
1142	return NULL;
1143
1144	/* Encode string. */
1145
1146	src_idx = 0;
1147	dest_off = 0;
1148
1149	while ((ch = src[src_idx++]) != 0) {
1150	if (chr_encode(ch, str, &dest_off, dest_size) != EOK)
1151	break;
1152	}
1153
1154	str[dest_size] = '\0';
1155	return str;
1156	}
1157
1158	/** Convert string to wide string.
1159	*
1160	* Convert string @a src to wide string. The output is written to the
1161	* buffer specified by @a dest and @a dlen. @a dlen must be non-zero
1162	* and the wide string written will always be null-terminated.
1163	*
1164	* @param dest Destination buffer.
1165	* @param dlen Length of destination buffer (number of wchars).
1166	* @param src Source string.
1167	*/
1168	void str_to_wstr(char32_t dest, size_t dlen, const char src)
1169	{
1170	size_t offset;
1171	size_t di;
1172	char32_t c;
1173
1174	assert(dlen > 0);
1175
1176	offset = 0;
1177	di = 0;
1178
1179	do {
1180	if (di >= dlen - 1)
1181	break;
1182
1183	c = str_decode(src, &offset, STR_NO_LIMIT);
1184	dest[di++] = c;
1185	} while (c != '\0');
1186
1187	dest[dlen - 1] = '\0';
1188	}
1189
/** Convert a string to a newly allocated wide string.
 *
 * Allocates a zero-filled buffer with one element per character of
 * @a str plus one for the terminator and fills it using str_to_wstr().
 *
 * @param str Source string.
 *
 * @return Newly allocated null-terminated wide string, or NULL if the
 *         allocation failed.
 */
char32_t *str_to_awstr(const char *str)
{
	const size_t slots = str_length(str) + 1;
	char32_t *wide = calloc(slots, sizeof(char32_t));

	if (wide != NULL)
		str_to_wstr(wide, slots, str);

	return wide;
}
1208
1209	/** Find first occurence of character in string.
1210	*
1211	* @param str String to search.
1212	* @param ch Character to look for.
1213	*
1214	* @return Pointer to character in @a str or NULL if not found.
1215	*/
1216	char str_chr(const char str, char32_t ch)
1217	{
1218	char32_t acc;
1219	size_t off = 0;
1220	size_t last = 0;
1221
1222	while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
1223	if (acc == ch)
1224	return (char *) (str + last);
1225	last = off;
1226	}
1227
1228	return NULL;
1229	}
1230
1231	/** Find first occurence of substring in string.
1232	*
1233	* @param hs Haystack (string)
1234	* @param n Needle (substring to look for)
1235	*
1236	* @return Pointer to character in @a hs or @c NULL if not found.
1237	*/
1238	char str_str(const char hs, const char *n)
1239	{
1240	size_t off = 0;
1241
1242	if (str_lcmp(hs, n, str_length(n)) == 0)
1243	return (char *)hs;
1244
1245	while (str_decode(hs, &off, STR_NO_LIMIT) != 0) {
1246	if (str_lcmp(hs + off, n, str_length(n)) == 0)
1247	return (char *)(hs + off);
1248	}
1249
1250	return NULL;
1251	}
1252
1253	/** Removes specified trailing characters from a string.
1254	*
1255	* @param str String to remove from.
1256	* @param ch Character to remove.
1257	*/
1258	void str_rtrim(char *str, char32_t ch)
1259	{
1260	size_t off = 0;
1261	size_t pos = 0;
1262	char32_t c;
1263	bool update_last_chunk = true;
1264	char *last_chunk = NULL;
1265
1266	while ((c = str_decode(str, &off, STR_NO_LIMIT))) {
1267	if (c != ch) {
1268	update_last_chunk = true;
1269	last_chunk = NULL;
1270	} else if (update_last_chunk) {
1271	update_last_chunk = false;
1272	last_chunk = (str + pos);
1273	}
1274	pos = off;
1275	}
1276
1277	if (last_chunk)
1278	*last_chunk = '\0';
1279	}
1280
1281	/** Removes specified leading characters from a string.
1282	*
1283	* @param str String to remove from.
1284	* @param ch Character to remove.
1285	*/
1286	void str_ltrim(char *str, char32_t ch)
1287	{
1288	char32_t acc;
1289	size_t off = 0;
1290	size_t pos = 0;
1291	size_t str_sz = str_size(str);
1292
1293	while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
1294	if (acc != ch)
1295	break;
1296	else
1297	pos = off;
1298	}
1299
1300	if (pos > 0) {
1301	memmove(str, &str[pos], str_sz - pos);
1302	pos = str_sz - pos;
1303	str[pos] = '\0';
1304	}
1305	}
1306
1307	/** Find last occurence of character in string.
1308	*
1309	* @param str String to search.
1310	* @param ch Character to look for.
1311	*
1312	* @return Pointer to character in @a str or NULL if not found.
1313	*/
1314	char str_rchr(const char str, char32_t ch)
1315	{
1316	char32_t acc;
1317	size_t off = 0;
1318	size_t last = 0;
1319	const char *res = NULL;
1320
1321	while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) {
1322	if (acc == ch)
1323	res = (str + last);
1324	last = off;
1325	}
1326
1327	return (char *) res;
1328	}
1329
/** Insert a wide character into a wide string.
 *
 * Insert a wide character into a wide string at position @a pos. The
 * characters from that position on, including the terminator, are
 * shifted one place to the right.
 *
 * The insertion is refused when the grown string would be longer than
 * @a max_pos characters. On success the terminator ends up at index
 * length + 1, which is at most @a max_pos, so the buffer has to provide
 * at least @a max_pos + 1 elements.
 *
 * @param str     String to insert to.
 * @param ch      Character to insert.
 * @param pos     Character index where to insert.
 * @param max_pos Maximum number of characters the string may hold.
 *
 * @return True if the insertion was successful, false if the position
 *         is out of bounds or the string is already full.
 *
 */
bool wstr_linsert(char32_t *str, char32_t ch, size_t pos, size_t max_pos)
{
	const size_t len = wstr_length(str);

	if (pos > len)
		return false;

	/*
	 * Bound the resulting length, not just the insertion point:
	 * inserting into the middle of a full string would otherwise push
	 * the tail past index max_pos.
	 */
	if (len + 1 > max_pos)
		return false;

	/* Shift the tail (terminator included) one slot to the right. */
	for (size_t i = len + 1; i > pos; i--)
		str[i] = str[i - 1];

	str[pos] = ch;

	return true;
}
1359
/** Remove a wide character from a wide string.
 *
 * Remove the wide character at position @a pos. The characters after
 * it, including the terminator, are shifted one place to the left.
 *
 * @param str String to remove from.
 * @param pos Character index to remove.
 *
 * @return True if the removal was successful, false if the position
 *         is out of bounds.
 *
 */
bool wstr_remove(char32_t *str, size_t pos)
{
	const size_t len = wstr_length(str);

	if (pos >= len)
		return false;

	/* Pull every following element, terminator included, one slot left. */
	char32_t *const stop = str + len;
	for (char32_t *slot = str + pos; slot < stop; slot++)
		slot[0] = slot[1];

	return true;
}
1385
/** Duplicate string.
 *
 * Allocate a buffer of str_size(@a src) + 1 bytes with malloc() and
 * copy the source string into it using str_cpy().
 *
 * @param src Source string.
 *
 * @return Newly allocated duplicate string, or NULL if the allocation
 *         failed.
 *
 */
char *str_dup(const char *src)
{
	const size_t bytes = str_size(src) + 1;
	char *copy = malloc(bytes);

	if (copy != NULL)
		str_cpy(copy, bytes, src);

	return copy;
}
1412
/** Duplicate string with size limit.
 *
 * Allocate a new string with malloc() and copy up to @a n bytes from
 * the source string into it using str_ncpy(). No more than @a n + 1
 * bytes are allocated; if the source string occupies fewer than @a n
 * bytes, only its size plus one byte for the terminator is allocated.
 *
 * @param src Source string.
 * @param n   Maximum number of bytes to duplicate.
 *
 * @return Newly allocated duplicate string, or NULL if the allocation
 *         failed.
 *
 */
char *str_ndup(const char *src, size_t n)
{
	const size_t avail = str_size(src);
	const size_t take = (avail > n) ? n : avail;

	char *copy = malloc(take + 1);

	if (copy != NULL)
		str_ncpy(copy, take + 1, src, take);

	return copy;
}
1446
/** Split string by delimiters.
 *
 * Leading delimiter characters are skipped, then characters are
 * consumed up to the next delimiter or the end of the string. If a
 * token was found, the byte where it ends is overwritten with a null
 * terminator, so @a s is modified in place.
 *
 * @param s     String to be tokenized. NULL yields NULL.
 * @param delim String with the delimiters.
 * @param next  Variable which will receive the pointer to the
 *              continuation of the string: just past the delimiter
 *              that ended the token, or the end of the string if no
 *              delimiter followed. May be NULL.
 *
 * @return Pointer to the token, or NULL if @a s is NULL or holds no
 *         further token.
 */
char *str_tok(char *s, const char *delim, char **next)
{
	if (s == NULL)
		return NULL;

	const size_t size = str_size(s);
	size_t probe = 0;
	size_t mark = 0;
	char32_t ch;

	/* Advance mark past every leading delimiter. */
	for (;;) {
		ch = str_decode(s, &probe, size);
		if (ch == 0 || str_chr(delim, ch) == NULL)
			break;
		mark = probe;
	}
	char *const tok_begin = s + mark;

	/* Advance mark past the token; probe runs one character ahead. */
	probe = mark;
	for (;;) {
		ch = str_decode(s, &probe, size);
		if (ch == 0 || str_chr(delim, ch) != NULL)
			break;
		mark = probe;
	}
	char *const tok_end = s + mark;

	/* A non-zero ch is the delimiter that stopped the scan; skip it. */
	if (next != NULL)
		*next = (ch != 0) ? (s + probe) : tok_end;

	if (tok_begin == tok_end)
		return NULL;  /* No more tokens. */

	/* Overwrite delimiter with NULL terminator. */
	*tok_end = '\0';
	return tok_begin;
}
1493
/** Scale a value down and pick the matching decimal order suffix.
 *
 * The value is divided by a power of 1000 chosen so that the result
 * keeps at least four significant digits, and the SI prefix letter of
 * that divisor is stored in @a suffix (a space when no scaling is
 * applied).
 *
 * @param val    Value to scale.
 * @param rv     Receives the scaled value.
 * @param suffix Receives the suffix character: ' ', 'k', 'M', 'G',
 *               'T', 'P' or 'E'.
 */
void order_suffix(const uint64_t val, uint64_t *rv, char *suffix)
{
	if (val > UINT64_C(10000000000000000000)) {
		/* Divisor is 10^18, i.e. exa. */
		*rv = val / UINT64_C(1000000000000000000);
		*suffix = 'E';
	} else if (val > UINT64_C(1000000000000000000)) {
		/* Divisor is 10^15, i.e. peta. */
		*rv = val / UINT64_C(1000000000000000);
		*suffix = 'P';
	} else if (val > UINT64_C(1000000000000000)) {
		*rv = val / UINT64_C(1000000000000);
		*suffix = 'T';
	} else if (val > UINT64_C(1000000000000)) {
		*rv = val / UINT64_C(1000000000);
		*suffix = 'G';
	} else if (val > UINT64_C(1000000000)) {
		*rv = val / UINT64_C(1000000);
		*suffix = 'M';
	} else if (val > UINT64_C(1000000)) {
		*rv = val / UINT64_C(1000);
		*suffix = 'k';
	} else {
		*rv = val;
		*suffix = ' ';
	}
}
1519
/** Scale a byte count down and pick the matching binary unit suffix.
 *
 * The value is divided by a power of 1024 chosen so that the result
 * stays above 1024, and a pointer to the unit name of that divisor is
 * stored in @a suffix. The stored pointer refers to a string literal.
 *
 * @param val    Value to scale.
 * @param rv     Receives the scaled value.
 * @param suffix Receives the unit string: "B", "KiB", "MiB", "GiB",
 *               "TiB" or "PiB".
 * @param fixed  If true, the plain byte unit is returned in its
 *               space-padded form.
 */
void bin_order_suffix(const uint64_t val, uint64_t *rv, const char **suffix,
    bool fixed)
{
	if (val > UINT64_C(1152921504606846976)) {
		/* Divisor is 2^50, i.e. pebi. */
		*rv = val / UINT64_C(1125899906842624);
		*suffix = "PiB";
	} else if (val > UINT64_C(1125899906842624)) {
		*rv = val / UINT64_C(1099511627776);
		*suffix = "TiB";
	} else if (val > UINT64_C(1099511627776)) {
		*rv = val / UINT64_C(1073741824);
		*suffix = "GiB";
	} else if (val > UINT64_C(1073741824)) {
		*rv = val / UINT64_C(1048576);
		*suffix = "MiB";
	} else if (val > UINT64_C(1048576)) {
		*rv = val / UINT64_C(1024);
		*suffix = "KiB";
	} else {
		*rv = val;
		/*
		 * NOTE(review): the other suffixes are three characters
		 * wide; confirm the padded form below has the intended width.
		 */
		if (fixed)
			*suffix = "B ";
		else
			*suffix = "B";
	}
}
1546
1547	/** @}
1548	*/

Note: See TracBrowser for help on using the repository browser.

Download in other formats: