Changes in common/str.c [fdfb24e:1db4e2ae] in mainline
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
common/str.c
rfdfb24e r1db4e2ae 5 5 * Copyright (c) 2011 Martin Sucha 6 6 * Copyright (c) 2011 Oleg Romanenko 7 * Copyright (c) 2025 Jiří Zárevúcky 7 8 * All rights reserved. 8 9 * … … 54 55 * are valid 55 56 * 57 * Note that Unicode characters do not match 58 * one-to-one with displayed characters or glyphs on 59 * screen. For that level of precision, look up 60 * Grapheme Clusters. 61 * 56 62 * ASCII character 7 bit encoded ASCII character, stored in char 57 63 * (usually signed 8 bit integer), code points 0 .. 127 … … 71 77 * [wide] string width number of display cells on a monospace display taken 72 78 * by a [wide] string, size_t 79 * 80 * This is virtually impossible to determine exactly for 81 * all strings without knowing specifics of the display 82 * device, due to various factors affecting text output. 83 * If you have the option to query the terminal for 84 * position change caused by outputting the string, 85 * it is preferable to determine width that way. 73 86 * 74 87 * … … 108 121 #include <str.h> 109 122 123 #include <align.h> 110 124 #include <assert.h> 111 125 #include <ctype.h> 112 126 #include <errno.h> 127 #include <limits.h> 128 #include <macros.h> 129 #include <mem.h> 113 130 #include <stdbool.h> 114 131 #include <stddef.h> 115 132 #include <stdint.h> 116 133 #include <stdlib.h> 117 118 #include <align.h> 119 #include <mem.h> 134 #include <uchar.h> 135 136 #if __STDC_HOSTED__ 137 #include <fibril.h> 138 #endif 139 140 static void _set_ilseq() 141 { 142 #ifdef errno 143 errno = EILSEQ; 144 #endif 145 } 120 146 121 147 /** Byte mask consisting of lowest @n bits (out of 8) */ … … 130 156 /** Number of data bits in a UTF-8 continuation byte */ 131 157 #define CONT_BITS 6 158 159 #define UTF8_MASK_INITIAL2 0b00011111 160 #define UTF8_MASK_INITIAL3 0b00001111 161 #define UTF8_MASK_INITIAL4 0b00000111 162 #define UTF8_MASK_CONT 0b00111111 163 164 #define CHAR_INVALID ((char32_t) UINT_MAX) 165 166 static inline bool _is_ascii(uint8_t b) 167 { 168 return b <
0x80; 169 } 170 171 static inline bool _is_continuation(uint8_t b) 172 { 173 return (b & 0xC0) == 0x80; 174 } 175 176 static inline bool _is_2_byte(uint8_t c) 177 { 178 return (c & 0xE0) == 0xC0; 179 } 180 181 static inline bool _is_3_byte(uint8_t c) 182 { 183 return (c & 0xF0) == 0xE0; 184 } 185 186 static inline bool _is_4_byte(uint8_t c) 187 { 188 return (c & 0xF8) == 0xF0; 189 } 190 191 static inline int _char_continuation_bytes(char32_t c) 192 { 193 if ((c & ~LO_MASK_32(7)) == 0) 194 return 0; 195 196 if ((c & ~LO_MASK_32(11)) == 0) 197 return 1; 198 199 if ((c & ~LO_MASK_32(16)) == 0) 200 return 2; 201 202 if ((c & ~LO_MASK_32(21)) == 0) 203 return 3; 204 205 /* Codes longer than 21 bits are not supported */ 206 return -1; 207 } 208 209 static inline int _continuation_bytes(uint8_t b) 210 { 211 /* 0xxxxxxx */ 212 if (_is_ascii(b)) 213 return 0; 214 215 /* 110xxxxx 10xxxxxx */ 216 if (_is_2_byte(b)) 217 return 1; 218 219 /* 1110xxxx 10xxxxxx 10xxxxxx */ 220 if (_is_3_byte(b)) 221 return 2; 222 223 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 224 if (_is_4_byte(b)) 225 return 3; 226 227 return -1; 228 } 229 230 static bool _is_non_shortest(const mbstate_t *mb, uint8_t b) 231 { 232 return (mb->state == 0b1111110000000000 && !(b & 0b00100000)) || 233 (mb->state == 0b1111111111110000 && !(b & 0b00110000)); 234 } 235 236 static bool _is_surrogate(const mbstate_t *mb, uint8_t b) 237 { 238 return (mb->state == 0b1111110000001101 && b >= 0xa0); 239 } 240 241 #define _likely(expr) __builtin_expect((expr), true) 242 #define _unlikely(expr) __builtin_expect((expr), false) 243 244 #define FAST_PATHS 1 245 246 static char32_t _str_decode(const char *s, size_t *offset, size_t size, mbstate_t *mb) 247 { 248 assert(s); 249 assert(offset); 250 assert(*offset <= size); 251 assert(size == STR_NO_LIMIT || s + size >= s); 252 assert(mb); 253 254 if (*offset == size) 255 return 0; 256 257 if (_likely(!mb->state)) { 258 /* Clean slate, read initial byte. 
*/ 259 uint8_t b = s[(*offset)++]; 260 261 /* Fast exit for the most common case. */ 262 if (_likely(_is_ascii(b))) 263 return b; 264 265 /* unexpected continuation byte */ 266 if (_unlikely(_is_continuation(b))) 267 return CHAR_INVALID; 268 269 /* 270 * The value stored into `mb->state` is designed to have 271 * just enough leading ones that after shifting in one less than 272 * the expected number of continuation bytes, the most significant 273 * bit becomes zero. (The field is 16b wide.) 274 */ 275 276 if (_is_2_byte(b)) { 277 /* Reject non-shortest form. */ 278 if (_unlikely(!(b & 0b00011110))) 279 return CHAR_INVALID; 280 281 #if FAST_PATHS 282 /* We can usually take this exit. */ 283 if (_likely(*offset < size && _is_continuation(s[*offset]))) 284 return (b & UTF8_MASK_INITIAL2) << 6 | 285 (s[(*offset)++] & UTF8_MASK_CONT); 286 #endif 287 288 /* 2 byte continuation 110xxxxx */ 289 mb->state = b ^ 0b0000000011000000; 290 291 } else if (_is_3_byte(b)) { 292 #if FAST_PATHS 293 /* We can usually take this exit. */ 294 if (_likely(*offset + 1 < size && _is_continuation(s[*offset]) && _is_continuation(s[*offset + 1]))) { 295 296 char32_t ch = (b & UTF8_MASK_INITIAL3) << 12 | 297 (s[(*offset)] & UTF8_MASK_CONT) << 6 | 298 (s[(*offset) + 1] & UTF8_MASK_CONT); 299 300 *offset += 2; 301 302 /* Reject non-shortest form. */ 303 if (_unlikely(!(ch & 0xFFFFF800))) 304 return CHAR_INVALID; 305 306 /* Reject surrogates */ 307 if (_unlikely(ch >= 0xD800 && ch < 0xE000)) 308 return CHAR_INVALID; 309 310 return ch; 311 } 312 #endif 313 314 /* 3 byte continuation 1110xxxx */ 315 mb->state = b ^ 0b1111110011100000; 316 317 } else if (_is_4_byte(b)) { 318 #if FAST_PATHS 319 /* We can usually take this exit.
*/ 320 if (_likely(*offset + 2 < size && _is_continuation(s[*offset]) && 321 _is_continuation(s[*offset + 1]) && _is_continuation(s[*offset + 2]))) { 322 323 char32_t ch = (b & UTF8_MASK_INITIAL4) << 18 | 324 (s[(*offset)] & UTF8_MASK_CONT) << 12 | 325 (s[(*offset) + 1] & UTF8_MASK_CONT) << 6 | 326 (s[(*offset) + 2] & UTF8_MASK_CONT); 327 328 *offset += 3; 329 330 /* Reject non-shortest form. */ 331 if (_unlikely(!(ch & 0xFFFF0000))) 332 return CHAR_INVALID; 333 334 /* Reject out-of-range characters. */ 335 if (_unlikely(ch >= 0x110000)) 336 return CHAR_INVALID; 337 338 return ch; 339 } 340 #endif 341 342 /* 4 byte continuation 11110xxx */ 343 mb->state = b ^ 0b1111111100000000; 344 } else { 345 return CHAR_INVALID; 346 } 347 } 348 349 /* Deal with the remaining edge and invalid cases. */ 350 for (; *offset < size; (*offset)++) { 351 /* Read continuation bytes. */ 352 uint8_t b = s[*offset]; 353 354 if (!_is_continuation(b) || _is_non_shortest(mb, b) || _is_surrogate(mb, b)) { 355 mb->state = 0; 356 return CHAR_INVALID; 357 } 358 359 /* Top bit becomes zero when shifting in the second to last byte. */ 360 if (!(mb->state & 0x8000)) { 361 char32_t c = ((char32_t) mb->state) << 6 | (b & UTF8_MASK_CONT); 362 mb->state = 0; 363 (*offset)++; 364 return c; 365 } 366 367 mb->state = mb->state << 6 | (b & UTF8_MASK_CONT); 368 } 369 370 /* Incomplete character. */ 371 assert(mb->state); 372 return 0; 373 } 374 375 /** Standard <uchar.h> function since C11. 
*/ 376 size_t mbrtoc32(char32_t *c, const char *s, size_t n, mbstate_t *mb) 377 { 378 #if __STDC_HOSTED__ 379 static fibril_local mbstate_t global_state = { }; 380 381 if (!mb) 382 mb = &global_state; 383 #endif 384 385 if (!s) { 386 /* Equivalent to mbrtoc32(NULL, "", 1, mb); */ 387 c = NULL; 388 s = ""; 389 n = 1; 390 } 391 392 size_t offset = 0; 393 char32_t ret = _str_decode(s, &offset, n, mb); 394 if (ret == CHAR_INVALID) { 395 assert(!mb->state); 396 _set_ilseq(); 397 return UCHAR_ILSEQ; 398 } 399 if (mb->state) { 400 assert(ret == 0); 401 return UCHAR_INCOMPLETE; 402 } 403 404 if (c) 405 *c = ret; 406 return ret ? offset : 0; 407 } 132 408 133 409 /** Decode a single character from a string. … … 148 424 char32_t str_decode(const char *str, size_t *offset, size_t size) 149 425 { 150 if (*offset + 1 > size) 151 return 0; 152 153 /* First byte read from string */ 154 uint8_t b0 = (uint8_t) str[(*offset)++]; 155 156 /* Determine code length */ 157 158 unsigned int b0_bits; /* Data bits in first byte */ 159 unsigned int cbytes; /* Number of continuation bytes */ 160 161 if ((b0 & 0x80) == 0) { 162 /* 0xxxxxxx (Plain ASCII) */ 163 b0_bits = 7; 164 cbytes = 0; 165 } else if ((b0 & 0xe0) == 0xc0) { 166 /* 110xxxxx 10xxxxxx */ 167 b0_bits = 5; 168 cbytes = 1; 169 } else if ((b0 & 0xf0) == 0xe0) { 170 /* 1110xxxx 10xxxxxx 10xxxxxx */ 171 b0_bits = 4; 172 cbytes = 2; 173 } else if ((b0 & 0xf8) == 0xf0) { 174 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 175 b0_bits = 3; 176 cbytes = 3; 177 } else { 178 /* 10xxxxxx -- unexpected continuation byte */ 426 mbstate_t mb = { }; 427 char32_t ch = _str_decode(str, offset, size, &mb); 428 429 if (ch == CHAR_INVALID || mb.state) 179 430 return U_SPECIAL; 180 }181 182 if (*offset + cbytes > size)183 return U_SPECIAL;184 185 char32_t ch = b0 & LO_MASK_8(b0_bits);186 187 /* Decode continuation bytes */188 while (cbytes > 0) {189 uint8_t b = (uint8_t) str[(*offset)++];190 191 /* Must be 10xxxxxx */192 if ((b & 0xc0) != 0x80)193 return 
U_SPECIAL;194 195 /* Shift data bits to ch */196 ch = (ch << CONT_BITS) | (char32_t) (b & LO_MASK_8(CONT_BITS));197 cbytes--;198 }199 431 200 432 return ch; 433 } 434 435 char32_t str_decode_r(const char *str, size_t *offset, size_t size, 436 char32_t replacement, mbstate_t *mb) 437 { 438 char32_t ch = _str_decode(str, offset, size, mb); 439 return (ch == CHAR_INVALID) ? replacement : ch; 201 440 } 202 441 … … 221 460 return 0; 222 461 223 size_t processed= 0;462 int cbytes = 0; 224 463 /* Continue while continuation bytes found */ 225 while (*offset > 0 && processed< 4) {464 while (*offset > 0 && cbytes < 4) { 226 465 uint8_t b = (uint8_t) str[--(*offset)]; 227 466 228 if (processed == 0 && (b & 0x80) == 0) { 229 /* 0xxxxxxx (Plain ASCII) */ 230 return b & 0x7f; 231 } else if ((b & 0xe0) == 0xc0 || (b & 0xf0) == 0xe0 || 232 (b & 0xf8) == 0xf0) { 233 /* Start byte */ 234 size_t start_offset = *offset; 235 return str_decode(str, &start_offset, size); 236 } else if ((b & 0xc0) != 0x80) { 237 /* Not a continuation byte */ 467 if (_is_continuation(b)) { 468 cbytes++; 469 continue; 470 } 471 472 /* Reject non-shortest form encoding. */ 473 if (cbytes != _continuation_bytes(b)) 238 474 return U_SPECIAL; 239 } 240 processed++; 241 } 475 476 /* Start byte */ 477 size_t start_offset = *offset; 478 return str_decode(str, &start_offset, size); 479 } 480 242 481 /* Too many continuation bytes */ 243 482 return U_SPECIAL; … … 259 498 * code was invalid. 260 499 */ 261 errno_t chr_encode(const char32_t ch, char *str, size_t *offset, size_t size) 262 { 500 errno_t chr_encode(char32_t ch, char *str, size_t *offset, size_t size) 501 { 502 // TODO: merge with c32rtomb() 503 263 504 if (*offset >= size) 264 505 return EOVERFLOW; 265 506 507 /* Fast exit for the most common case. 
*/ 508 if (ch < 0x80) { 509 str[(*offset)++] = (char) ch; 510 return EOK; 511 } 512 513 /* Codes longer than 21 bits are not supported */ 266 514 if (!chr_check(ch)) 267 515 return EINVAL; 268 516 269 /*270 * Unsigned version of ch (bit operations should only be done271 * on unsigned types).272 */273 uint32_t cc = (uint32_t) ch;274 275 517 /* Determine how many continuation bytes are needed */ 276 518 277 unsigned int b0_bits; /* Data bits in first byte */ 278 unsigned int cbytes; /* Number of continuation bytes */ 279 280 if ((cc & ~LO_MASK_32(7)) == 0) { 281 b0_bits = 7; 282 cbytes = 0; 283 } else if ((cc & ~LO_MASK_32(11)) == 0) { 284 b0_bits = 5; 285 cbytes = 1; 286 } else if ((cc & ~LO_MASK_32(16)) == 0) { 287 b0_bits = 4; 288 cbytes = 2; 289 } else if ((cc & ~LO_MASK_32(21)) == 0) { 290 b0_bits = 3; 291 cbytes = 3; 292 } else { 293 /* Codes longer than 21 bits are not supported */ 294 return EINVAL; 295 } 519 unsigned int cbytes = _char_continuation_bytes(ch); 520 unsigned int b0_bits = 6 - cbytes; /* Data bits in first byte */ 296 521 297 522 /* Check for available space in buffer */ … … 302 527 unsigned int i; 303 528 for (i = cbytes; i > 0; i--) { 304 str[*offset + i] = 0x80 | (c c& LO_MASK_32(CONT_BITS));305 c c = cc >>CONT_BITS;529 str[*offset + i] = 0x80 | (ch & LO_MASK_32(CONT_BITS)); 530 ch >>= CONT_BITS; 306 531 } 307 532 308 533 /* Encode first byte */ 309 str[*offset] = (c c& LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1);534 str[*offset] = (ch & LO_MASK_32(b0_bits)) | HI_MASK_8(8 - b0_bits - 1); 310 535 311 536 /* Advance offset */ … … 315 540 } 316 541 542 /* Convert in place any bytes that don't form a valid character into replacement. 
*/ 543 static size_t _str_sanitize(char *str, size_t n, uint8_t replacement) 544 { 545 uint8_t *b = (uint8_t *) str; 546 size_t count = 0; 547 548 for (; n > 0 && b[0]; b++, n--) { 549 if (b[0] < ' ') { 550 /* C0 control codes */ 551 b[0] = replacement; 552 count++; 553 continue; 554 } 555 556 int cont = _continuation_bytes(b[0]); 557 if (__builtin_expect(cont, 0) == 0) 558 continue; 559 560 if (cont < 0 || n <= (size_t) cont) { 561 b[0] = replacement; 562 count++; 563 continue; 564 } 565 566 /* Check continuation bytes. */ 567 bool valid = true; 568 for (int i = 1; i <= cont; i++) { 569 if (!_is_continuation(b[i])) { 570 valid = false; 571 break; 572 } 573 } 574 575 if (!valid) { 576 b[0] = replacement; 577 count++; 578 continue; 579 } 580 581 /* 582 * Check for non-shortest form encoding. 583 * See https://www.unicode.org/versions/corrigendum1.html 584 */ 585 586 /* 0b110!!!!x 0b10xxxxxx */ 587 if (cont == 1 && !(b[0] & 0b00011110)) { 588 b[0] = replacement; 589 count++; 590 continue; 591 } 592 593 bool c1_control = (b[0] == 0b11000010 && b[1] < 0b10100000); 594 if (cont == 1 && c1_control) { 595 b[0] = replacement; 596 count++; 597 continue; 598 } 599 600 /* 0b1110!!!! 0b10!xxxxx 0b10xxxxxx */ 601 if (cont == 2 && !(b[0] & 0b00001111) && !(b[1] & 0b00100000)) { 602 b[0] = replacement; 603 count++; 604 continue; 605 } 606 607 /* 0b11110!!! 0b10!!xxxx 0b10xxxxxx 0b10xxxxxx */ 608 if (cont == 3 && !(b[0] & 0b00000111) && !(b[1] & 0b00110000)) { 609 b[0] = replacement; 610 count++; 611 continue; 612 } 613 614 /* Check for surrogate character encoding. */ 615 if (cont == 2 && b[0] == 0xED && b[1] >= 0xA0) { 616 b[0] = replacement; 617 count++; 618 continue; 619 } 620 621 /* Check for out-of-range code points. 
*/ 622 if (cont == 3 && (b[0] > 0xF4 || (b[0] == 0xF4 && b[1] >= 0x90))) { 623 b[0] = replacement; 624 count++; 625 continue; 626 } 627 628 b += cont; 629 n -= cont; 630 } 631 632 return count; 633 } 634 635 /** Replaces any byte that's not part of a complete valid UTF-8 character 636 * encoding with a replacement byte. 637 * Also replaces C0 and C1 control codes. 638 */ 639 size_t str_sanitize(char *str, size_t n, uint8_t replacement) 640 { 641 return _str_sanitize(str, n, replacement); 642 } 643 644 static size_t _str_size(const char *str) 645 { 646 size_t size = 0; 647 648 while (*str++ != 0) 649 size++; 650 651 return size; 652 } 653 317 654 /** Get size of string. 318 655 * … … 327 664 size_t str_size(const char *str) 328 665 { 329 size_t size = 0; 330 331 while (*str++ != 0) 332 size++; 333 334 return size; 666 return _str_size(str); 335 667 } 336 668 … … 378 710 } 379 711 712 static size_t _str_nsize(const char *str, size_t max_size) 713 { 714 size_t size = 0; 715 716 while ((*str++ != 0) && (size < max_size)) 717 size++; 718 719 return size; 720 } 721 380 722 /** Get size of string with size limit. 381 723 * … … 391 733 size_t str_nsize(const char *str, size_t max_size) 392 734 { 393 size_t size = 0; 394 395 while ((*str++ != 0) && (size < max_size)) 396 size++; 397 398 return size; 735 return _str_nsize(str, max_size); 399 736 } 400 737 … … 582 919 int str_cmp(const char *s1, const char *s2) 583 920 { 584 char32_t c1 = 0; 585 char32_t c2 = 0; 586 587 size_t off1 = 0; 588 size_t off2 = 0; 589 590 while (true) { 591 c1 = str_decode(s1, &off1, STR_NO_LIMIT); 592 c2 = str_decode(s2, &off2, STR_NO_LIMIT); 593 594 if (c1 < c2) 595 return -1; 596 597 if (c1 > c2) 598 return 1; 599 600 if (c1 == 0 || c2 == 0) 601 break; 602 } 603 604 return 0; 921 /* 922 * UTF-8 has the nice property that lexicographic ordering on bytes is 923 * the same as the lexicographic ordering of the character sequences. 
924 */ 925 while (*s1 == *s2 && *s1 != 0) { 926 s1++; 927 s2++; 928 } 929 930 if (*s1 == *s2) 931 return 0; 932 933 return (*s1 < *s2) ? -1 : 1; 605 934 } 606 935 … … 681 1010 int str_casecmp(const char *s1, const char *s2) 682 1011 { 1012 // FIXME: doesn't work for non-ASCII caseful characters 1013 683 1014 char32_t c1 = 0; 684 1015 char32_t c2 = 0; … … 729 1060 int str_lcasecmp(const char *s1, const char *s2, size_t max_len) 730 1061 { 1062 // FIXME: doesn't work for non-ASCII caseful characters 1063 731 1064 char32_t c1 = 0; 732 1065 char32_t c2 = 0; … … 760 1093 } 761 1094 1095 static bool _test_prefix(const char *s, const char *p) 1096 { 1097 while (*s == *p && *s != 0) { 1098 s++; 1099 p++; 1100 } 1101 1102 return *p == 0; 1103 } 1104 762 1105 /** Test whether p is a prefix of s. 763 1106 * … … 773 1116 bool str_test_prefix(const char *s, const char *p) 774 1117 { 775 char32_t c1 = 0; 776 char32_t c2 = 0; 777 778 size_t off1 = 0; 779 size_t off2 = 0; 780 781 while (true) { 782 c1 = str_decode(s, &off1, STR_NO_LIMIT); 783 c2 = str_decode(p, &off2, STR_NO_LIMIT); 784 785 if (c2 == 0) 786 return true; 787 788 if (c1 != c2) 789 return false; 790 791 if (c1 == 0) 792 break; 793 } 794 795 return false; 1118 return _test_prefix(s, p); 796 1119 } 797 1120 … … 820 1143 821 1144 return s + off; 1145 } 1146 1147 /** Copy string as a sequence of bytes. */ 1148 static void _str_cpy(char *dest, const char *src) 1149 { 1150 while (*src) 1151 *(dest++) = *(src++); 1152 1153 *dest = 0; 1154 } 1155 1156 /** Copy string as a sequence of bytes. 
*/ 1157 static void _str_cpyn(char *dest, size_t size, const char *src) 1158 { 1159 assert(dest && src && size); 1160 1161 if (!dest || !src || !size) 1162 return; 1163 1164 if (size == STR_NO_LIMIT) 1165 return _str_cpy(dest, src); 1166 1167 char *dest_top = dest + size - 1; 1168 assert(size == 1 || dest < dest_top); 1169 1170 while (*src && dest < dest_top) 1171 *(dest++) = *(src++); 1172 1173 *dest = 0; 822 1174 } 823 1175 … … 839 1191 assert(size > 0); 840 1192 assert(src != NULL); 841 842 size_t src_off = 0; 843 size_t dest_off = 0; 844 845 char32_t ch; 846 while ((ch = str_decode(src, &src_off, STR_NO_LIMIT)) != 0) { 847 if (chr_encode(ch, dest, &dest_off, size - 1) != EOK) 848 break; 849 } 850 851 dest[dest_off] = '\0'; 1193 assert(dest != NULL); 1194 assert(size == STR_NO_LIMIT || dest + size > dest); 1195 1196 /* Copy data. */ 1197 _str_cpyn(dest, size, src); 1198 1199 /* In-place translate invalid bytes to U_SPECIAL. */ 1200 _str_sanitize(dest, size, U_SPECIAL); 852 1201 } 853 1202 … … 872 1221 /* There must be space for a null terminator in the buffer. */ 873 1222 assert(size > 0); 874 875 size_t src_off = 0; 876 size_t dest_off = 0; 877 878 char32_t ch; 879 while ((ch = str_decode(src, &src_off, n)) != 0) { 880 if (chr_encode(ch, dest, &dest_off, size - 1) != EOK) 881 break; 882 } 883 884 dest[dest_off] = '\0'; 1223 assert(src != NULL); 1224 1225 /* Copy data. */ 1226 _str_cpyn(dest, min(size, n + 1), src); 1227 1228 /* In-place translate invalid bytes to U_SPECIAL. 
*/ 1229 _str_sanitize(dest, size, U_SPECIAL); 885 1230 } 886 1231 … … 898 1243 void str_append(char *dest, size_t size, const char *src) 899 1244 { 900 size_t dstr_size; 901 902 dstr_size = str_size(dest); 903 if (dstr_size >= size) 904 return; 905 906 str_cpy(dest + dstr_size, size - dstr_size, src); 1245 assert(src != NULL); 1246 assert(dest != NULL); 1247 assert(size > 0); 1248 assert(size == STR_NO_LIMIT || dest + size > dest); 1249 1250 size_t dstr_size = _str_nsize(dest, size); 1251 if (dstr_size < size) { 1252 _str_cpyn(dest + dstr_size, size - dstr_size, src); 1253 _str_sanitize(dest + dstr_size, size - dstr_size, U_SPECIAL); 1254 } 907 1255 } 908 1256 … … 933 1281 errno_t spascii_to_str(char *dest, size_t size, const uint8_t *src, size_t n) 934 1282 { 935 size_t sidx; 936 size_t didx; 937 size_t dlast; 938 uint8_t byte; 939 errno_t rc; 940 errno_t result; 941 942 /* There must be space for a null terminator in the buffer. */ 943 assert(size > 0); 944 result = EOK; 945 946 didx = 0; 947 dlast = 0; 948 for (sidx = 0; sidx < n; ++sidx) { 949 byte = src[sidx]; 950 if (!ascii_check(byte)) { 951 byte = U_SPECIAL; 1283 size_t len = 0; 1284 1285 /* Determine the length of the source string. 
*/ 1286 for (size_t i = 0; i < n; i++) { 1287 if (src[i] == 0) 1288 break; 1289 1290 if (src[i] != ' ') 1291 len = i + 1; 1292 } 1293 1294 errno_t result = EOK; 1295 size_t out_len = min(len, size - 1); 1296 1297 /* Copy characters */ 1298 for (size_t i = 0; i < out_len; i++) { 1299 dest[i] = src[i]; 1300 1301 if (dest[i] < 0) { 1302 dest[i] = U_SPECIAL; 952 1303 result = EIO; 953 1304 } 954 955 rc = chr_encode(byte, dest, &didx, size - 1); 956 if (rc != EOK) { 957 assert(rc == EOVERFLOW); 958 dest[didx] = '\0'; 959 return rc; 960 } 961 962 /* Remember dest index after last non-empty character */ 963 if (byte != 0x20) 964 dlast = didx; 965 } 966 967 /* Terminate string after last non-empty character */ 968 dest[dlast] = '\0'; 1305 } 1306 1307 dest[out_len] = 0; 1308 1309 if (out_len < len) 1310 return EOVERFLOW; 1311 969 1312 return result; 970 1313 } … … 1207 1550 } 1208 1551 1552 static char *_strchr(const char *str, char c) 1553 { 1554 while (*str != 0 && *str != c) 1555 str++; 1556 1557 return (*str == c) ? (char *) str : NULL; 1558 } 1559 1209 1560 /** Find first occurrence of character in string. 1210 1561 * … … 1216 1567 char *str_chr(const char *str, char32_t ch) 1217 1568 { 1218 char32_t acc; 1219 size_t off = 0; 1220 size_t last = 0; 1221 1222 while ((acc = str_decode(str, &off, STR_NO_LIMIT)) != 0) { 1223 if (acc == ch) 1224 return (char *) (str + last); 1225 last = off; 1569 /* Fast path for an ASCII character. */ 1570 if (ascii_check(ch)) 1571 return _strchr(str, ch); 1572 1573 /* Convert character to UTF-8. */ 1574 char utf8[STR_BOUNDS(1) + 1]; 1575 size_t offset = 0; 1576 1577 if (chr_encode(ch, utf8, &offset, sizeof(utf8)) != EOK || offset == 0) 1578 return NULL; 1579 1580 utf8[offset] = '\0'; 1581 1582 /* Find the first byte, then check if all of them are correct.
*/ 1583 while (*str != 0) { 1584 str = _strchr(str, utf8[0]); 1585 if (!str) 1586 return NULL; 1587 1588 if (_test_prefix(str, utf8)) 1589 return (char *) str; 1590 1591 str++; 1226 1592 } 1227 1593 … … 1238 1604 char *str_str(const char *hs, const char *n) 1239 1605 { 1240 size_t off = 0; 1241 1242 if (str_lcmp(hs, n, str_length(n)) == 0) 1243 return (char *)hs; 1244 1245 while (str_decode(hs, &off, STR_NO_LIMIT) != 0) { 1246 if (str_lcmp(hs + off, n, str_length(n)) == 0) 1247 return (char *)(hs + off); 1606 size_t hsize = _str_size(hs); 1607 size_t nsize = _str_size(n); 1608 1609 while (hsize >= nsize) { 1610 if (_test_prefix(hs, n)) 1611 return (char *) hs; 1612 1613 hs++; 1614 hsize--; 1248 1615 } 1249 1616 1250 1617 return NULL; 1618 } 1619 1620 static void _str_rtrim(char *str, char c) 1621 { 1622 char *last = str; 1623 1624 while (*str) { 1625 if (*str != c) 1626 last = str; 1627 1628 str++; 1629 } 1630 1631 /* Truncate string. */ 1632 last[1] = 0; 1251 1633 } 1252 1634 … … 1258 1640 void str_rtrim(char *str, char32_t ch) 1259 1641 { 1642 /* Fast path for the ASCII case. */ 1643 if (ascii_check(ch)) { 1644 _str_rtrim(str, ch); 1645 return; 1646 } 1647 1260 1648 size_t off = 0; 1261 1649 size_t pos = 0; … … 1279 1667 } 1280 1668 1669 static void _str_ltrim(char *str, char c) 1670 { 1671 char *p = str; 1672 1673 while (*p == c) 1674 p++; 1675 1676 if (str != p) 1677 _str_cpy(str, p); 1678 } 1679 1281 1680 /** Removes specified leading characters from a string. 1282 1681 * … … 1286 1685 void str_ltrim(char *str, char32_t ch) 1287 1686 { 1687 /* Fast path for the ASCII case. 
*/ 1688 if (ascii_check(ch)) { 1689 _str_ltrim(str, ch); 1690 return; 1691 } 1692 1288 1693 char32_t acc; 1289 1694 size_t off = 0; … … 1305 1710 } 1306 1711 1712 static char *_str_rchr(const char *str, char c) 1713 { 1714 const char *last = NULL; 1715 1716 while (*str) { 1717 if (*str == c) 1718 last = str; 1719 1720 str++; 1721 } 1722 1723 return (char *) last; 1724 } 1725 1307 1726 /** Find last occurrence of character in string. 1308 1727 * … … 1314 1733 char *str_rchr(const char *str, char32_t ch) 1315 1734 { 1735 if (ascii_check(ch)) 1736 return _str_rchr(str, ch); 1737 1316 1738 char32_t acc; 1317 1739 size_t off = 0; … … 1402 1824 char *str_dup(const char *src) 1403 1825 { 1404 size_t size = str_size(src) + 1;1826 size_t size = _str_size(src) + 1; 1405 1827 char *dest = malloc(size); 1406 1828 if (!dest) 1407 1829 return NULL; 1408 1830 1409 str_cpy(dest, size, src); 1831 memcpy(dest, src, size); 1832 _str_sanitize(dest, size, U_SPECIAL); 1410 1833 return dest; 1411 1834 } … … 1433 1856 char *str_ndup(const char *src, size_t n) 1434 1857 { 1435 size_t size = str_size(src); 1436 if (size > n) 1437 size = n; 1858 size_t size = _str_nsize(src, n); 1438 1859 1439 1860 char *dest = malloc(size + 1); … … 1441 1862 return NULL; 1442 1863 1443 str_ncpy(dest, size + 1, src, size); 1864 memcpy(dest, src, size); 1865 _str_sanitize(dest, size, U_SPECIAL); 1866 dest[size] = 0; 1444 1867 return dest; 1445 1868 }
Note:
See TracChangeset
for help on using the changeset viewer.