Context Navigation

str.c@ 39916d6

Visit:

Last change on this file since 39916d6 was d7f7a4a, checked in by Jiří Zárevúcky <zarevucky.jiri@…>, 3 years ago

Replace some license headers with SPDX identifier

Headers are replaced using tools/transorm-copyright.sh only
when it can be matched verbatim with the license header used
throughout most of the codebase.

Property mode set to 100644

File size: 10.2 KB

Line
1	/*
2	* SPDX-FileCopyrightText: 2001-2004 Jakub Jermar
3	* SPDX-FileCopyrightText: 2005 Martin Decky
4	* SPDX-FileCopyrightText: 2008 Jiri Svoboda
5	* SPDX-FileCopyrightText: 2011 Martin Sucha
6	* SPDX-FileCopyrightText: 2011 Oleg Romanenko
7	*
8	* SPDX-License-Identifier: BSD-3-Clause
9	*/
10
11	/**
12	* @file
13	* @brief String functions.
14	*
15	* Strings and characters use the Universal Character Set (UCS). The standard
16	* strings, called just strings are encoded in UTF-8. Wide strings (encoded
17	* in UTF-32) are supported to a limited degree. A single character is
18	* represented as char32_t.@n
19	*
20	* Overview of the terminology:@n
21	*
22	*  Term                  Meaning
23	*  --------------------  ----------------------------------------------------
24	*  byte                  8 bits stored in uint8_t (unsigned 8 bit integer)
25	*
26	*  character             UTF-32 encoded Unicode character, stored in char32_t
27	*                        (unsigned 32 bit integer), code points 0 .. 1114111
28	*                        are valid
29	*
30	*  ASCII character       7 bit encoded ASCII character, stored in char
31	*                        (usually signed 8 bit integer), code points 0 .. 127
32	*                        are valid
33	*
34	*  string                UTF-8 encoded NULL-terminated Unicode string, char *
35	*
36	*  wide string           UTF-32 encoded NULL-terminated Unicode string,
37	*                        char32_t *
38	*
39	*  [wide] string size    number of BYTES in a [wide] string (excluding
40	*                        the NULL-terminator), size_t
41	*
42	*  [wide] string length  number of CHARACTERS in a [wide] string (excluding
43	*                        the NULL-terminator), size_t
44	*
45	*  [wide] string width   number of display cells on a monospace display taken
46	*                        by a [wide] string, size_t
47	*
48	*
49	* Overview of string metrics:@n
50	*
51	*  Metric  Abbrev.  Type    Meaning
52	*  ------  -------  ------  -------------------------------------------------
53	*  size    n        size_t  number of BYTES in a string (excluding the
54	*                           NULL-terminator)
55	*
56	*  length  l        size_t  number of CHARACTERS in a string (excluding the
57	*                           NULL-terminator)
58	*
59	*  width   w        size_t  number of display cells on a monospace display
60	*                           taken by a string
61	*
62	*
63	* Function naming prefixes:@n
64	*
65	*  chr_            operate on characters
66	*  ascii_          operate on ASCII characters
67	*  str_            operate on strings
68	*  wstr_           operate on wide strings
69	*
70	*  [w]str_[n|l|w]  operate on a prefix limited by size, length
71	*                  or width
72	*
73	*
74	* A specific character inside a [wide] string can be referred to by:@n
75	*
76	*  pointer (char *, char32_t *)
77	* byte offset (size_t)
78	* character index (size_t)
79	*
80	*/
81
82	#include <str.h>
83
84	#include <errno.h>
85	#include <stdbool.h>
86	#include <stddef.h>
87	#include <stdint.h>
88
/** Byte mask consisting of the lowest @a n bits (out of 8).
 *
 * Valid for @a n in the range 0 .. 8. The shift is performed on an unsigned
 * operand so that no signed arithmetic is involved.
 */
#define LO_MASK_8(n)  ((uint8_t) ((1u << (n)) - 1))

/** Mask consisting of the lowest @a n bits (out of 32).
 *
 * Valid for @a n in the range 0 .. 31 (shifting a 32-bit operand by 32 or
 * more is undefined).
 */
#define LO_MASK_32(n)  ((uint32_t) ((UINT32_C(1) << (n)) - 1))

/** Byte mask consisting of the highest @a n bits (out of 8).
 *
 * The complement is evaluated after integer promotion, so the result is
 * cast back to uint8_t to discard the promoted high bits and yield a value
 * in the range 0 .. 255.
 */
#define HI_MASK_8(n)  ((uint8_t) ~LO_MASK_8(8 - (n)))

/** Number of data bits in a UTF-8 continuation byte */
#define CONT_BITS  6
100
101	/** Decode a single character from a string.
102	*
103	* Decode a single character from a string of size @a size. Decoding starts
104	* at @a offset and this offset is moved to the beginning of the next
105	* character. In case of decoding error, offset generally advances at least
106	* by one. However, offset is never moved beyond size.
107	*
108	* @param str String (not necessarily NULL-terminated).
109	* @param offset Byte offset in string where to start decoding.
110	* @param size Size of the string (in bytes).
111	*
112	* @return Value of decoded character, U_SPECIAL on decoding error or
113	* NULL if attempt to decode beyond @a size.
114	*
115	*/
116	char32_t str_decode(const char str, size_t offset, size_t size)
117	{
118	if (*offset + 1 > size)
119	return 0;
120
121	/* First byte read from string */
122	uint8_t b0 = (uint8_t) str[(*offset)++];
123
124	/* Determine code length */
125
126	unsigned int b0_bits; /* Data bits in first byte */
127	unsigned int cbytes; /* Number of continuation bytes */
128
129	if ((b0 & 0x80) == 0) {
130	/* 0xxxxxxx (Plain ASCII) */
131	b0_bits = 7;
132	cbytes = 0;
133	} else if ((b0 & 0xe0) == 0xc0) {
134	/* 110xxxxx 10xxxxxx */
135	b0_bits = 5;
136	cbytes = 1;
137	} else if ((b0 & 0xf0) == 0xe0) {
138	/* 1110xxxx 10xxxxxx 10xxxxxx */
139	b0_bits = 4;
140	cbytes = 2;
141	} else if ((b0 & 0xf8) == 0xf0) {
142	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
143	b0_bits = 3;
144	cbytes = 3;
145	} else {
146	/* 10xxxxxx -- unexpected continuation byte */
147	return U_SPECIAL;
148	}
149
150	if (*offset + cbytes > size)
151	return U_SPECIAL;
152
153	char32_t ch = b0 & LO_MASK_8(b0_bits);
154
155	/* Decode continuation bytes */
156	while (cbytes > 0) {
157	uint8_t b = (uint8_t) str[(*offset)++];
158
159	/* Must be 10xxxxxx */
160	if ((b & 0xc0) != 0x80)
161	return U_SPECIAL;
162
163	/* Shift data bits to ch */
164	ch = (ch << CONT_BITS) \| (char32_t) (b & LO_MASK_8(CONT_BITS));
165	cbytes--;
166	}
167
168	return ch;
169	}
170
171	/** Encode a single character to string representation.
172	*
173	* Encode a single character to string representation (i.e. UTF-8) and store
174	* it into a buffer at @a offset. Encoding starts at @a offset and this offset
175	* is moved to the position where the next character can be written to.
176	*
177	* @param ch Input character.
178	* @param str Output buffer.
179	* @param offset Byte offset where to start writing.
180	* @param size Size of the output buffer (in bytes).
181	*
182	* @return EOK if the character was encoded successfully, EOVERFLOW if there
183	* was not enough space in the output buffer or EINVAL if the character
184	* code was invalid.
185	*/
186	errno_t chr_encode(const char32_t ch, char str, size_t offset, size_t size)
187	{
188	if (*offset >= size)
189	return EOVERFLOW;
190
191	if (!chr_check(ch))
192	return EINVAL;
193
194	/*
195	* Unsigned version of ch (bit operations should only be done
196	* on unsigned types).
197	*/
198	uint32_t cc = (uint32_t) ch;
199
200	/* Determine how many continuation bytes are needed */
201
202	unsigned int b0_bits; /* Data bits in first byte */
203	unsigned int cbytes; /* Number of continuation bytes */
204
205	if ((cc & ~LO_MASK_32(7)) == 0) {
206	b0_bits = 7;
207	cbytes = 0;
208	} else if ((cc & ~LO_MASK_32(11)) == 0) {
209	b0_bits = 5;
210	cbytes = 1;
211	} else if ((cc & ~LO_MASK_32(16)) == 0) {
212	b0_bits = 4;
213	cbytes = 2;
214	} else if ((cc & ~LO_MASK_32(21)) == 0) {
215	b0_bits = 3;
216	cbytes = 3;
217	} else {
218	/* Codes longer than 21 bits are not supported */
219	return EINVAL;
220	}
221
222	/* Check for available space in buffer */
223	if (*offset + cbytes >= size)
224	return EOVERFLOW;
225
226	/* Encode continuation bytes */
227	unsigned int i;
228	for (i = cbytes; i > 0; i--) {
229	str[*offset + i] = 0x80 \| (cc & LO_MASK_32(CONT_BITS));
230	cc = cc >> CONT_BITS;
231	}
232
233	/* Encode first byte */
234	str[*offset] = (cc & LO_MASK_32(b0_bits)) \| HI_MASK_8(8 - b0_bits - 1);
235
236	/* Advance offset */
237	*offset += cbytes + 1;
238
239	return EOK;
240	}
241
/** Get size of string.
 *
 * Get the number of bytes which are used by the string @a str (excluding the
 * NULL-terminator).
 *
 * @param str String to consider.
 *
 * @return Number of bytes used by the string
 *
 */
size_t str_size(const char *str)
{
	const char *end = str;

	/* Walk to the terminator; the distance walked is the size */
	while (*end != '\0')
		end++;

	return (size_t) (end - str);
}
261
262	/** Get size of string with length limit.
263	*
264	* Get the number of bytes which are used by up to @a max_len first
265	* characters in the string @a str. If @a max_len is greater than
266	* the length of @a str, the entire string is measured (excluding the
267	* NULL-terminator).
268	*
269	* @param str String to consider.
270	* @param max_len Maximum number of characters to measure.
271	*
272	* @return Number of bytes used by the characters.
273	*
274	*/
275	size_t str_lsize(const char *str, size_t max_len)
276	{
277	size_t len = 0;
278	size_t offset = 0;
279
280	while (len < max_len) {
281	if (str_decode(str, &offset, STR_NO_LIMIT) == 0)
282	break;
283
284	len++;
285	}
286
287	return offset;
288	}
289
290	/** Get number of characters in a string.
291	*
292	* @param str NULL-terminated string.
293	*
294	* @return Number of characters in string.
295	*
296	*/
297	size_t str_length(const char *str)
298	{
299	size_t len = 0;
300	size_t offset = 0;
301
302	while (str_decode(str, &offset, STR_NO_LIMIT) != 0)
303	len++;
304
305	return len;
306	}
307
/** Check whether character is plain ASCII.
 *
 * @param ch Character to check.
 *
 * @return True if character is plain ASCII (code point 0 .. 127).
 *
 */
bool ascii_check(char32_t ch)
{
	return ch <= 0x7f;
}
320
/** Check whether character is valid
 *
 * @param ch Character to check.
 *
 * @return True if character is a valid Unicode code point
 *         (0 .. 0x10ffff, i.e. 0 .. 1114111).
 *
 */
bool chr_check(char32_t ch)
{
	return ch <= 0x10ffff;
}
333
334	/** Compare two NULL terminated strings.
335	*
336	* Do a char-by-char comparison of two NULL-terminated strings.
337	* The strings are considered equal iff their length is equal
338	* and both strings consist of the same sequence of characters.
339	*
340	* A string S1 is less than another string S2 if it has a character with
341	* lower value at the first character position where the strings differ.
342	* If the strings differ in length, the shorter one is treated as if
343	* padded by characters with a value of zero.
344	*
345	* @param s1 First string to compare.
346	* @param s2 Second string to compare.
347	*
348	* @return 0 if the strings are equal, -1 if the first is less than the second,
349	* 1 if the second is less than the first.
350	*
351	*/
352	int str_cmp(const char s1, const char s2)
353	{
354	char32_t c1 = 0;
355	char32_t c2 = 0;
356
357	size_t off1 = 0;
358	size_t off2 = 0;
359
360	while (true) {
361	c1 = str_decode(s1, &off1, STR_NO_LIMIT);
362	c2 = str_decode(s2, &off2, STR_NO_LIMIT);
363
364	if (c1 < c2)
365	return -1;
366
367	if (c1 > c2)
368	return 1;
369
370	if (c1 == 0 \|\| c2 == 0)
371	break;
372	}
373
374	return 0;
375	}
376
377	/** Copy string.
378	*
379	* Copy source string @a src to destination buffer @a dest.
380	* No more than @a size bytes are written. If the size of the output buffer
381	* is at least one byte, the output string will always be well-formed, i.e.
382	* null-terminated and containing only complete characters.
383	*
384	* @param dest Destination buffer.
385	* @param count Size of the destination buffer (must be > 0).
386	* @param src Source string.
387	*
388	*/
389	void str_cpy(char dest, size_t size, const char src)
390	{
391	size_t src_off = 0;
392	size_t dest_off = 0;
393
394	char32_t ch;
395	while ((ch = str_decode(src, &src_off, STR_NO_LIMIT)) != 0) {
396	if (chr_encode(ch, dest, &dest_off, size - 1) != EOK)
397	break;
398	}
399
400	dest[dest_off] = '\0';
401	}
402
403	/** @}
404	*/

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: mainline/boot/generic/src/str.c@ 39916d6

Download in other formats: