Changes in common/str.c [1db4e2ae:fdfb24e] in mainline
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
common/str.c
r1db4e2ae rfdfb24e 5 5 * Copyright (c) 2011 Martin Sucha 6 6 * Copyright (c) 2011 Oleg Romanenko 7 * Copyright (c) 2025 Jiří Zárevúcky8 7 * All rights reserved. 9 8 * … … 55 54 * are valid 56 55 * 57 * Note that Unicode characters do not match58 * one-to-one with displayed characters or glyphs on59 * screen. For that level of precision, look up60 * Grapheme Clusters.61 *62 56 * ASCII character 7 bit encoded ASCII character, stored in char 63 57 * (usually signed 8 bit integer), code points 0 .. 127 … … 77 71 * [wide] string width number of display cells on a monospace display taken 78 72 * by a [wide] string, size_t 79 *80 * This is virtually impossible to determine exactly for81 * all strings without knowing specifics of the display82 * device, due to various factors affecting text output.83 * If you have the option to query the terminal for84 * position change caused by outputting the string,85 * it is preferrable to determine width that way.86 73 * 87 74 * … … 121 108 #include <str.h> 122 109 123 #include <align.h>124 110 #include <assert.h> 125 111 #include <ctype.h> 126 112 #include <errno.h> 127 #include <limits.h>128 #include <macros.h>129 #include <mem.h>130 113 #include <stdbool.h> 131 114 #include <stddef.h> 132 115 #include <stdint.h> 133 116 #include <stdlib.h> 134 #include <uchar.h> 135 136 #if __STDC_HOSTED__ 137 #include <fibril.h> 138 #endif 139 140 static void _set_ilseq() 141 { 142 #ifdef errno 143 errno = EILSEQ; 144 #endif 145 } 117 118 #include <align.h> 119 #include <mem.h> 146 120 147 121 /** Byte mask consisting of lowest @n bits (out of 8) */ … … 156 130 /** Number of data bits in a UTF-8 continuation byte */ 157 131 #define CONT_BITS 6 158 159 #define UTF8_MASK_INITIAL2 0b00011111160 #define UTF8_MASK_INITIAL3 0b00001111161 #define UTF8_MASK_INITIAL4 0b00000111162 #define UTF8_MASK_CONT 0b00111111163 164 #define CHAR_INVALID ((char32_t) UINT_MAX)165 166 static inline bool _is_ascii(uint8_t b)167 {168 return b < 0x80;169 }170 171 static inline bool _is_continuation(uint8_t b)172 {173 return (b & 0xC0) == 0x80;174 }175 176 static inline bool _is_2_byte(uint8_t c)177 {178 return (c & 0xE0) == 0xC0;179 }180 181 static inline bool _is_3_byte(uint8_t c)182 {183 return (c & 0xF0) == 0xE0;184 }185 186 static inline bool _is_4_byte(uint8_t c)187 {188 return (c & 0xF8) == 0xF0;189 }190 191 static inline int _char_continuation_bytes(char32_t c)192 {193 if ((c & ~LO_MASK_32(7)) == 0)194 return 0;195 196 if ((c & ~LO_MASK_32(11)) == 0)197 return 1;198 199 if ((c & ~LO_MASK_32(16)) == 0)200 return 2;201 202 if ((c & ~LO_MASK_32(21)) == 0)203 return 3;204 205 /* Codes longer than 21 bits are not supported */206 return -1;207 }208 209 static inline int _continuation_bytes(uint8_t b)210 {211 /* 0xxxxxxx */212 if (_is_ascii(b))213 return 0;214 215 /* 110xxxxx 10xxxxxx */216 if (_is_2_byte(b))217 return 1;218 219 /* 1110xxxx 10xxxxxx 10xxxxxx */220 if (_is_3_byte(b))221 return 2;222 223 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */224 if (_is_4_byte(b))225 return 3;226 227 return -1;228 }229 230 static bool _is_non_shortest(const mbstate_t *mb, uint8_t b)231 {232 return (mb->state == 0b1111110000000000 && !(b & 0b00100000)) ||233 (mb->state == 0b1111111111110000 && !(b & 0b00110000));234 }235 236 static bool _is_surrogate(const mbstate_t *mb, uint8_t b)237 {238 return (mb->state == 0b1111110000001101 && b >= 0xa0);239 }240 241 #define _likely(expr) __builtin_expect((expr), true)242 #define _unlikely(expr) __builtin_expect((expr), false)243 244 #define FAST_PATHS 1245 246 static char32_t _str_decode(const char *s, size_t *offset, size_t size, mbstate_t *mb)247 {248 assert(s);249 assert(offset);250 assert(*offset <= size);251 assert(size == STR_NO_LIMIT || s + size >= s);252 assert(mb);253 254 if (*offset == size)255 return 0;256 257 if (_likely(!mb->state)) {258 /* Clean slate, read initial byte. */259 uint8_t b = s[(*offset)++];260 261 /* Fast exit for the most common case. */262 if (_likely(_is_ascii(b)))263 return b;264 265 /* unexpected continuation byte */266 if (_unlikely(_is_continuation(b)))267 return CHAR_INVALID;268 269 /*270 * The value stored into `continuation` is designed to have271 * just enough leading ones that after shifting in one less than272 * the expected number of continuation bytes, the most significant273 * bit becomes zero. (The field is 16b wide.)274 */275 276 if (_is_2_byte(b)) {277 /* Reject non-shortest form. */278 if (_unlikely(!(b & 0b00011110)))279 return CHAR_INVALID;280 281 #if FAST_PATHS282 /* We can usually take this exit. */283 if (_likely(*offset < size && _is_continuation(s[*offset])))284 return (b & UTF8_MASK_INITIAL2) << 6 |285 (s[(*offset)++] & UTF8_MASK_CONT);286 #endif287 288 /* 2 byte continuation 110xxxxx */289 mb->state = b ^ 0b0000000011000000;290 291 } else if (_is_3_byte(b)) {292 #if FAST_PATHS293 /* We can usually take this exit. */294 if (_likely(*offset + 1 < size && _is_continuation(s[*offset]) && _is_continuation(s[*offset + 1]))) {295 296 char32_t ch = (b & UTF8_MASK_INITIAL3) << 12 |297 (s[(*offset)] & UTF8_MASK_CONT) << 6 |298 (s[(*offset) + 1] & UTF8_MASK_CONT);299 300 *offset += 2;301 302 /* Reject non-shortest form. */303 if (_unlikely(!(ch & 0xFFFFF800)))304 return CHAR_INVALID;305 306 /* Reject surrogates */307 if (_unlikely(ch >= 0xD800 && ch < 0xE000))308 return CHAR_INVALID;309 310 return ch;311 }312 #endif313 314 /* 3 byte continuation 1110xxxx */315 mb->state = b ^ 0b1111110011100000;316 317 } else if (_is_4_byte(b)) {318 #if FAST_PATHS319 /* We can usually take this exit. */320 if (_likely(*offset + 2 < size && _is_continuation(s[*offset]) &&321 _is_continuation(s[*offset + 1]) && _is_continuation(s[*offset + 2]))) {322 323 char32_t ch = (b & UTF8_MASK_INITIAL4) << 18 |324 (s[(*offset)] & UTF8_MASK_CONT) << 12 |325 (s[(*offset) + 1] & UTF8_MASK_CONT) << 6 |326 (s[(*offset) + 2] & UTF8_MASK_CONT);327 328 *offset += 3;329 330 /* Reject non-shortest form. */331 if (_unlikely(!(ch & 0xFFFF0000)))332 return CHAR_INVALID;333 334 /* Reject out-of-range characters. */335 if (_unlikely(ch >= 0x110000))336 return CHAR_INVALID;337 338 return ch;339 }340 #endif341 342 /* 4 byte continuation 11110xxx */343 mb->state = b ^ 0b1111111100000000;344 } else {345 return CHAR_INVALID;346 }347 }348 349 /* Deal with the remaining edge and invalid cases. */350 for (; *offset < size; (*offset)++) {351 /* Read continuation bytes. */352 uint8_t b = s[*offset];353 354 if (!_is_continuation(b) || _is_non_shortest(mb, b) || _is_surrogate(mb, b)) {355 mb->state = 0;356 return CHAR_INVALID;357 }358 359 /* Top bit becomes zero when shifting in the second to last byte. */360 if (!(mb->state & 0x8000)) {361 char32_t c = ((char32_t) mb->state) << 6 | (b & UTF8_MASK_CONT);362 mb->state = 0;363 (*offset)++;364 return c;365 }366 367 mb->state = mb->state << 6 | (b & UTF8_MASK_CONT);368 }369 370 /* Incomplete character. */371 assert(mb->state);372 return 0;373 }374 375 /** Standard <uchar.h> function since C11. */376 size_t mbrtoc32(char32_t *c, const char *s, size_t n, mbstate_t *mb)377 {378 #if __STDC_HOSTED__379 static fibril_local mbstate_t global_state = { };380 381 if (!mb)382 mb = &global_state;383 #endif384 385 if (!s) {386 /* Equivalent to mbrtoc32(NULL, "", 1, mb); */387 c = NULL;388 s = "";389 n = 1;390 }391 392 size_t offset = 0;393 char32_t ret = _str_decode(s, &offset, n, mb);394 if (ret == CHAR_INVALID) {395 assert(!mb->state);396 _set_ilseq();397 return UCHAR_ILSEQ;398 }399 if (mb->state) {400 assert(ret == 0);401 return UCHAR_INCOMPLETE;402 }403 404 if (c)405 *c = ret;406 return ret ? offset : 0;407 }408 132 409 133 /** Decode a single character from a string. … … 424 148 char32_t str_decode(const char *str, size_t *offset, size_t size) 425 149 { 426 mbstate_t mb = { }; 427 char32_t ch = _str_decode(str, offset, size, &mb); 428 429 if (ch == CHAR_INVALID || mb.state) 150 if (*offset + 1 > size) 151 return 0; 152 153 /* First byte read from string */ 154 uint8_t b0 = (uint8_t) str[(*offset)++]; 155 156 /* Determine code length */ 157 158 unsigned int b0_bits; /* Data bits in first byte */ 159 unsigned int cbytes; /* Number of continuation bytes */ 160 161 if ((b0 & 0x80) == 0) { 162 /* 0xxxxxxx (Plain ASCII) */ 163 b0_bits = 7; 164 cbytes = 0; 165 } else if ((b0 & 0xe0) == 0xc0) { 166 /* 110xxxxx 10xxxxxx */ 167 b0_bits = 5; 168 cbytes = 1; 169 } else if ((b0 & 0xf0) == 0xe0) { 170 /* 1110xxxx 10xxxxxx 10xxxxxx */ 171 b0_bits = 4; 172 cbytes = 2; 173 } else if ((b0 & 0xf8) == 0xf0) { 174 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 175 b0_bits = 3; 176 cbytes = 3; 177 } else { 178 /* 10xxxxxx -- unexpected continuation byte */ 430 179 return U_SPECIAL; 180 } 181 182 if (*offset + cbytes > size) 183 return U_SPECIAL; 184 185 char32_t ch = b0 & LO_MASK_8(b0_bits); 186 187 /* Decode continuation bytes */ 188 while (cbytes > 0) { 189 uint8_t b = (uint8_t) str[(*offset)++]; 190 191 /* Must be 10xxxxxx */ 192 if ((b & 0xc0) != 0x80) 193 return U_SPECIAL; 194 195 /* Shift data bits to ch */ 196 ch = (ch << CONT_BITS) | (char32_t) (b & LO_MASK_8(CONT_BITS)); 197 cbytes--; 198 } 431 199 432 200 return ch; 433 }434 435 char32_t str_decode_r(const char *str, size_t *offset, size_t size,436 char32_t replacement, mbstate_t *mb)437 {438 char32_t ch = _str_decode(str, offset, size, mb);439 return (ch == CHAR_INVALID) ? replacement : ch;440 201 } 441 202 … … 460 221 return 0; 461 222 462 int cbytes= 0;223 size_t processed = 0; 463 224 /* Continue while continuation bytes found */ 464 while (*offset > 0 && cbytes< 4) {225 while (*offset > 0 && processed < 4) { 465 226 uint8_t b = (uint8_t) str[--(*offset)]; 466 227 467 if (_is_continuation(b)) { 468 cbytes++; 469 continue; 228 if (processed == 0 && (b & 0x80) == 0) { 229 /* 0xxxxxxx (Plain ASCII) */ 230 return b & 0x7f; 231 } else if ((b & 0xe0) == 0xc0 || (b & 0xf0) == 0xe0 || 232 (b & 0xf8) == 0xf0) { 233 /* Start byte */ 234 size_t start_offset = *offset; 235 return str_decode(str, &start_offset, size); 236 } else if ((b & 0xc0) != 0x80) { 237 /* Not a continuation byte */ 238 return U_SPECIAL; 470 239 } 471 472 /* Reject non-shortest form encoding. */ 473 if (cbytes != _continuation_bytes(b)) 474 return U_SPECIAL; 475 476 /* Start byte */ 477 size_t start_offset = *offset; 478 return str_decode(str, &start_offset, size); 479 } 480 240 processed++; 241 } 481 242 /* Too many continuation bytes */ 482 243 return U_SPECIAL; … … 498 259 * code was invalid. 499 260 */ 500 errno_t chr_encode(char32_t ch, char *str, size_t *offset, size_t size) 501 { 502 // TODO: merge with c32rtomb() 503 261 errno_t chr_encode(const char32_t ch, char *str, size_t *offset, size_t size) 262 { 504 263 if (*offset >= size) 505 264 return EOVERFLOW; 506 265 507 /* Fast exit for the most common case. */508 if (ch < 0x80) {509 str[(*offset)++] = (char) ch;510 return EOK;511 }512 513 /* Codes longer than 21 bits are not supported */514 266 if (!chr_check(ch)) 515 267 return EINVAL; 516 268 269 /* 270 * Unsigned version of ch (bit operations should only be done 271 * on unsigned types). 272 */ 273 uint32_t cc = (uint32_t) ch; 274 517 275 /* Determine how many continuation bytes are needed */ 518 276 519 unsigned int cbytes = _char_continuation_bytes(ch); 520 unsigned int b0_bits = 6 - cbytes; /* Data bits in first byte */ 277 unsigned int b0_bits; /* Data bits in first byte */ 278 unsigned int cbytes; /* Number of continuation bytes */ 279 280 if ((cc & ~LO_MASK_32(7)) == 0) { 281 b0_bits = 7; 282 cbytes = 0; 283 } else if ((cc & ~LO_MASK_32(11)) == 0) { 284 b0_bits = 5; 285 cbytes = 1; 286 } else if ((cc & ~LO_MASK_32(16)) == 0) { 287 b0_bits = 4; 288 cbytes = 2; 289 } else if ((cc & ~LO_MASK_32(21)) == 0) { 290 b0_bits = 3; 291 cbytes = 3; 292 } else { 293 /* Codes longer than 21 bits are not supported */ 294 return EINVAL; 295 } 521 296 522 297 /* Check for available space in buffer */ … … 527 302 unsigned int i; 528 303 for (i = cbytes; i > 0; i--) { 529 str[*offset + i] = 0x80 | (c h& LO_MASK_32(CONT_BITS));530 c h >>=CONT_BITS;304 str[*offset + i] = 0x80 | (cc & LO_MASK_32(CONT_BITS)); 305 cc = cc >> CONT_BITS; 531 306 } 532 307 533 308 /* Encode first byte */ 534 str[*offset] = (c h& LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);309 str[*offset] = (cc & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1); 535 310 536 311 /* Advance offset */ … … 540 315 } 541 316 542 /* Convert in place any bytes that don't form a valid character into replacement. */ 543 static size_t _str_sanitize(char *str, size_t n, uint8_t replacement) 544 { 545 uint8_t *b = (uint8_t *) str; 546 size_t count = 0; 547 548 for (; n > 0 && b[0]; b++, n--) { 549 if (b[0] < ' ') { 550 /* C0 control codes */ 551 b[0] = replacement; 552 count++; 553 continue; 554 } 555 556 int cont = _continuation_bytes(b[0]); 557 if (__builtin_expect(cont, 0) == 0) 558 continue; 559 560 if (cont < 0 || n <= (size_t) cont) { 561 b[0] = replacement; 562 count++; 563 continue; 564 } 565 566 /* Check continuation bytes. */ 567 bool valid = true; 568 for (int i = 1; i <= cont; i++) { 569 if (!_is_continuation(b[i])) { 570 valid = false; 571 break; 572 } 573 } 574 575 if (!valid) { 576 b[0] = replacement; 577 count++; 578 continue; 579 } 580 581 /* 582 * Check for non-shortest form encoding. 583 * See https://www.unicode.org/versions/corrigendum1.html 584 */ 585 586 /* 0b110!!!!x 0b10xxxxxx */ 587 if (cont == 1 && !(b[0] & 0b00011110)) { 588 b[0] = replacement; 589 count++; 590 continue; 591 } 592 593 bool c1_control = (b[0] == 0b11000010 && b[1] < 0b10100000); 594 if (cont == 1 && c1_control) { 595 b[0] = replacement; 596 count++; 597 continue; 598 } 599 600 /* 0b1110!!!! 0b10!xxxxx 0b10xxxxxx */ 601 if (cont == 2 && !(b[0] & 0b00001111) && !(b[1] & 0b00100000)) { 602 b[0] = replacement; 603 count++; 604 continue; 605 } 606 607 /* 0b11110!!! 0b10!!xxxx 0b10xxxxxx 0b10xxxxxx */ 608 if (cont == 3 && !(b[0] & 0b00000111) && !(b[1] & 0b00110000)) { 609 b[0] = replacement; 610 count++; 611 continue; 612 } 613 614 /* Check for surrogate character encoding. */ 615 if (cont == 2 && b[0] == 0xED && b[1] >= 0xA0) { 616 b[0] = replacement; 617 count++; 618 continue; 619 } 620 621 /* Check for out-of-range code points. */ 622 if (cont == 3 && (b[0] > 0xF4 || (b[0] == 0xF4 && b[1] >= 0x90))) { 623 b[0] = replacement; 624 count++; 625 continue; 626 } 627 628 b += cont; 629 n -= cont; 630 } 631 632 return count; 633 } 634 635 /** Replaces any byte that's not part of a complete valid UTF-8 character 636 * encoding with a replacement byte. 637 * Also replaces C0 and C1 control codes. 638 */ 639 size_t str_sanitize(char *str, size_t n, uint8_t replacement) 640 { 641 return _str_sanitize(str, n, replacement); 642 } 643 644 static size_t _str_size(const char *str) 317 /** Get size of string. 318 * 319 * Get the number of bytes which are used by the string @a str (excluding the 320 * NULL-terminator). 321 * 322 * @param str String to consider. 323 * 324 * @return Number of bytes used by the string 325 * 326 */ 327 size_t str_size(const char *str) 645 328 { 646 329 size_t size = 0; … … 650 333 651 334 return size; 652 }653 654 /** Get size of string.655 *656 * Get the number of bytes which are used by the string @a str (excluding the657 * NULL-terminator).658 *659 * @param str String to consider.660 *661 * @return Number of bytes used by the string662 *663 */664 size_t str_size(const char *str)665 {666 return _str_size(str);667 335 } 668 336 … … 710 378 } 711 379 712 static size_t _str_nsize(const char *str, size_t max_size) 380 /** Get size of string with size limit. 381 * 382 * Get the number of bytes which are used by the string @a str 383 * (excluding the NULL-terminator), but no more than @max_size bytes. 384 * 385 * @param str String to consider. 386 * @param max_size Maximum number of bytes to measure. 387 * 388 * @return Number of bytes used by the string 389 * 390 */ 391 size_t str_nsize(const char *str, size_t max_size) 713 392 { 714 393 size_t size = 0; … … 718 397 719 398 return size; 720 }721 722 /** Get size of string with size limit.723 *724 * Get the number of bytes which are used by the string @a str725 * (excluding the NULL-terminator), but no more than @max_size bytes.726 *727 * @param str String to consider.728 * @param max_size Maximum number of bytes to measure.729 *730 * @return Number of bytes used by the string731 *732 */733 size_t str_nsize(const char *str, size_t max_size)734 {735 return _str_nsize(str, max_size);736 399 } 737 400 … … 919 582 int str_cmp(const char *s1, const char *s2) 920 583 { 921 /* 922 * UTF-8 has the nice property that lexicographic ordering on bytes is 923 * the same as the lexicographic ordering of the character sequences. 924 */ 925 while (*s1 == *s2 && *s1 != 0) { 926 s1++; 927 s2++; 928 } 929 930 if (*s1 == *s2) 931 return 0; 932 933 return (*s1 < *s2) ? -1 : 1; 584 char32_t c1 = 0; 585 char32_t c2 = 0; 586 587 size_t off1 = 0; 588 size_t off2 = 0; 589 590 while (true) { 591 c1 = str_decode(s1, &off1, STR_NO_LIMIT); 592 c2 = str_decode(s2, &off2, STR_NO_LIMIT); 593 594 if (c1 < c2) 595 return -1; 596 597 if (c1 > c2) 598 return 1; 599 600 if (c1 == 0 || c2 == 0) 601 break; 602 } 603 604 return 0; 934 605 } 935 606 … … 1010 681 int str_casecmp(const char *s1, const char *s2) 1011 682 { 1012 // FIXME: doesn't work for non-ASCII caseful characters1013 1014 683 char32_t c1 = 0; 1015 684 char32_t c2 = 0; … … 1060 729 int str_lcasecmp(const char *s1, const char *s2, size_t max_len) 1061 730 { 1062 // FIXME: doesn't work for non-ASCII caseful characters1063 1064 731 char32_t c1 = 0; 1065 732 char32_t c2 = 0; … … 1093 760 } 1094 761 1095 static bool _test_prefix(const char *s, const char *p)1096 {1097 while (*s == *p && *s != 0) {1098 s++;1099 p++;1100 }1101 1102 return *p == 0;1103 }1104 1105 762 /** Test whether p is a prefix of s. 1106 763 * … … 1116 773 bool str_test_prefix(const char *s, const char *p) 1117 774 { 1118 return _test_prefix(s, p); 775 char32_t c1 = 0; 776 char32_t c2 = 0; 777 778 size_t off1 = 0; 779 size_t off2 = 0; 780 781 while (true) { 782 c1 = str_decode(s, &off1, STR_NO_LIMIT); 783 c2 = str_decode(p, &off2, STR_NO_LIMIT); 784 785 if (c2 == 0) 786 return true; 787 788 if (c1 != c2) 789 return false; 790 791 if (c1 == 0) 792 break; 793 } 794 795 return false; 1119 796 } 1120 797 … … 1143 820 1144 821 return s + off; 1145 }1146 1147 /** Copy string as a sequence of bytes. */1148 static void _str_cpy(char *dest, const char *src)1149 {1150 while (*src)1151 *(dest++) = *(src++);1152 1153 *dest = 0;1154 }1155 1156 /** Copy string as a sequence of bytes. */1157 static void _str_cpyn(char *dest, size_t size, const char *src)1158 {1159 assert(dest && src && size);1160 1161 if (!dest || !src || !size)1162 return;1163 1164 if (size == STR_NO_LIMIT)1165 return _str_cpy(dest, src);1166 1167 char *dest_top = dest + size - 1;1168 assert(size == 1 || dest < dest_top);1169 1170 while (*src && dest < dest_top)1171 *(dest++) = *(src++);1172 1173 *dest = 0;1174 822 } 1175 823 … … 1191 839 assert(size > 0); 1192 840 assert(src != NULL); 1193 assert(dest != NULL); 1194 assert(size == STR_NO_LIMIT || dest + size > dest); 1195 1196 /* Copy data. */ 1197 _str_cpyn(dest, size, src); 1198 1199 /* In-place translate invalid bytes to U_SPECIAL. */ 1200 _str_sanitize(dest, size, U_SPECIAL); 841 842 size_t src_off = 0; 843 size_t dest_off = 0; 844 845 char32_t ch; 846 while ((ch = str_decode(src, &src_off, STR_NO_LIMIT)) != 0) { 847 if (chr_encode(ch, dest, &dest_off, size - 1) != EOK) 848 break; 849 } 850 851 dest[dest_off] = '\0'; 1201 852 } 1202 853 … … 1221 872 /* There must be space for a null terminator in the buffer. */ 1222 873 assert(size > 0); 1223 assert(src != NULL); 1224 1225 /* Copy data. */ 1226 _str_cpyn(dest, min(size, n + 1), src); 1227 1228 /* In-place translate invalid bytes to U_SPECIAL. */ 1229 _str_sanitize(dest, size, U_SPECIAL); 874 875 size_t src_off = 0; 876 size_t dest_off = 0; 877 878 char32_t ch; 879 while ((ch = str_decode(src, &src_off, n)) != 0) { 880 if (chr_encode(ch, dest, &dest_off, size - 1) != EOK) 881 break; 882 } 883 884 dest[dest_off] = '\0'; 1230 885 } 1231 886 … … 1243 898 void str_append(char *dest, size_t size, const char *src) 1244 899 { 1245 assert(src != NULL); 1246 assert(dest != NULL); 1247 assert(size > 0); 1248 assert(size == STR_NO_LIMIT || dest + size > dest); 1249 1250 size_t dstr_size = _str_nsize(dest, size); 1251 if (dstr_size < size) { 1252 _str_cpyn(dest + dstr_size, size - dstr_size, src); 1253 _str_sanitize(dest + dstr_size, size - dstr_size, U_SPECIAL); 1254 } 900 size_t dstr_size; 901 902 dstr_size = str_size(dest); 903 if (dstr_size >= size) 904 return; 905 906 str_cpy(dest + dstr_size, size - dstr_size, src); 1255 907 } 1256 908 … … 1281 933 errno_t spascii_to_str(char *dest, size_t size, const uint8_t *src, size_t n) 1282 934 { 1283 size_t len = 0; 1284 1285 /* Determine the length of the source string. */ 1286 for (size_t i = 0; i < n; i++) { 1287 if (src[i] == 0) 1288 break; 1289 1290 if (src[i] != ' ') 1291 len = i + 1; 1292 } 1293 1294 errno_t result = EOK; 1295 size_t out_len = min(len, size - 1); 1296 1297 /* Copy characters */ 1298 for (size_t i = 0; i < out_len; i++) { 1299 dest[i] = src[i]; 1300 1301 if (dest[i] < 0) { 1302 dest[i] = U_SPECIAL; 935 size_t sidx; 936 size_t didx; 937 size_t dlast; 938 uint8_t byte; 939 errno_t rc; 940 errno_t result; 941 942 /* There must be space for a null terminator in the buffer. */ 943 assert(size > 0); 944 result = EOK; 945 946 didx = 0; 947 dlast = 0; 948 for (sidx = 0; sidx < n; ++sidx) { 949 byte = src[sidx]; 950 if (!ascii_check(byte)) { 951 byte = U_SPECIAL; 1303 952 result = EIO; 1304 953 } 1305 } 1306 1307 dest[out_len] = 0; 1308 1309 if (out_len < len) 1310 return EOVERFLOW; 1311 954 955 rc = chr_encode(byte, dest, &didx, size - 1); 956 if (rc != EOK) { 957 assert(rc == EOVERFLOW); 958 dest[didx] = '\0'; 959 return rc; 960 } 961 962 /* Remember dest index after last non-empty character */ 963 if (byte != 0x20) 964 dlast = didx; 965 } 966 967 /* Terminate string after last non-empty character */ 968 dest[dlast] = '\0'; 1312 969 return result; 1313 970 } … … 1550 1207 } 1551 1208 1552 static char *_strchr(const char *str, char c)1553 {1554 while (*str != 0 && *str != c)1555 str++;1556 1557 return (*str == c) ? (char *) str : NULL;1558 }1559 1560 1209 /** Find first occurence of character in string. 1561 1210 * … … 1567 1216 char *str_chr(const char *str, char32_t ch) 1568 1217 { 1569 /* Fast path for an ASCII character. */ 1570 if (ascii_check(ch)) 1571 return _strchr(str, ch); 1572 1573 /* Convert character to UTF-8. */ 1574 char utf8[STR_BOUNDS(1) + 1]; 1575 size_t offset = 0; 1576 1577 if (chr_encode(ch, utf8, &offset, sizeof(utf8)) != EOK || offset == 0) 1578 return NULL; 1579 1580 utf8[offset] = '\0'; 1581 1582 /* Find the first byte, then check if all of them are correct. */ 1583 while (*str != 0) { 1584 str = _strchr(str, utf8[0]); 1585 if (!str) 1586 return NULL; 1587 1588 if (_test_prefix(str, utf8)) 1589 return (char *) str; 1590 1591 str++; 1218 char32_t acc; 1219 size_t off = 0; 1220 size_t last = 0; 1221 1222 while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) { 1223 if (acc == ch) 1224 return (char *) (str + last); 1225 last = off; 1592 1226 } 1593 1227 … … 1604 1238 char *str_str(const char *hs, const char *n) 1605 1239 { 1606 size_t hsize = _str_size(hs); 1607 size_t nsize = _str_size(n); 1608 1609 while (hsize >= nsize) { 1610 if (_test_prefix(hs, n)) 1611 return (char *) hs; 1612 1613 hs++; 1614 hsize--; 1240 size_t off = 0; 1241 1242 if (str_lcmp(hs, n, str_length(n)) == 0) 1243 return (char *)hs; 1244 1245 while (str_decode(hs, &off, STR_NO_LIMIT) != 0) { 1246 if (str_lcmp(hs + off, n, str_length(n)) == 0) 1247 return (char *)(hs + off); 1615 1248 } 1616 1249 1617 1250 return NULL; 1618 }1619 1620 static void _str_rtrim(char *str, char c)1621 {1622 char *last = str;1623 1624 while (*str) {1625 if (*str != c)1626 last = str;1627 1628 str++;1629 }1630 1631 /* Truncate string. */1632 last[1] = 0;1633 1251 } 1634 1252 … … 1640 1258 void str_rtrim(char *str, char32_t ch) 1641 1259 { 1642 /* Fast path for the ASCII case. */1643 if (ascii_check(ch)) {1644 _str_rtrim(str, ch);1645 return;1646 }1647 1648 1260 size_t off = 0; 1649 1261 size_t pos = 0; … … 1667 1279 } 1668 1280 1669 static void _str_ltrim(char *str, char c)1670 {1671 char *p = str;1672 1673 while (*p == c)1674 p++;1675 1676 if (str != p)1677 _str_cpy(str, p);1678 }1679 1680 1281 /** Removes specified leading characters from a string. 1681 1282 * … … 1685 1286 void str_ltrim(char *str, char32_t ch) 1686 1287 { 1687 /* Fast path for the ASCII case. */1688 if (ascii_check(ch)) {1689 _str_ltrim(str, ch);1690 return;1691 }1692 1693 1288 char32_t acc; 1694 1289 size_t off = 0; … … 1710 1305 } 1711 1306 1712 static char *_str_rchr(const char *str, char c)1713 {1714 const char *last = NULL;1715 1716 while (*str) {1717 if (*str == c)1718 last = str;1719 1720 str++;1721 }1722 1723 return (char *) last;1724 }1725 1726 1307 /** Find last occurence of character in string. 1727 1308 * … … 1733 1314 char *str_rchr(const char *str, char32_t ch) 1734 1315 { 1735 if (ascii_check(ch))1736 return _str_rchr(str, ch);1737 1738 1316 char32_t acc; 1739 1317 size_t off = 0; … … 1824 1402 char *str_dup(const char *src) 1825 1403 { 1826 size_t size = _str_size(src) + 1;1404 size_t size = str_size(src) + 1; 1827 1405 char *dest = malloc(size); 1828 1406 if (!dest) 1829 1407 return NULL; 1830 1408 1831 memcpy(dest, src, size); 1832 _str_sanitize(dest, size, U_SPECIAL); 1409 str_cpy(dest, size, src); 1833 1410 return dest; 1834 1411 } … … 1856 1433 char *str_ndup(const char *src, size_t n) 1857 1434 { 1858 size_t size = _str_nsize(src, n); 1435 size_t size = str_size(src); 1436 if (size > n) 1437 size = n; 1859 1438 1860 1439 char *dest = malloc(size + 1); … … 1862 1441 return NULL; 1863 1442 1864 memcpy(dest, src, size); 1865 _str_sanitize(dest, size, U_SPECIAL); 1866 dest[size] = 0; 1443 str_ncpy(dest, size + 1, src, size); 1867 1444 return dest; 1868 1445 }
Note:
See TracChangeset
for help on using the changeset viewer.