[a18a8b9] | 1 | /*
|
---|
| 2 | * Copyright (c) 2015 Michal Koutny
|
---|
| 3 | * All rights reserved.
|
---|
| 4 | *
|
---|
| 5 | * Redistribution and use in source and binary forms, with or without
|
---|
| 6 | * modification, are permitted provided that the following conditions
|
---|
| 7 | * are met:
|
---|
| 8 | *
|
---|
| 9 | * - Redistributions of source code must retain the above copyright
|
---|
| 10 | * notice, this list of conditions and the following disclaimer.
|
---|
| 11 | * - Redistributions in binary form must reproduce the above copyright
|
---|
| 12 | * notice, this list of conditions and the following disclaimer in the
|
---|
| 13 | * documentation and/or other materials provided with the distribution.
|
---|
| 14 | * - The name of the author may not be used to endorse or promote products
|
---|
| 15 | * derived from this software without specific prior written permission.
|
---|
| 16 | *
|
---|
| 17 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
---|
| 18 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
---|
| 19 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
---|
| 20 | * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
---|
| 21 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
---|
| 22 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
---|
| 23 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
---|
| 24 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
---|
| 25 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
---|
| 26 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
---|
| 27 | */
|
---|
| 28 |
|
---|
[0600976] | 29 | #include "pcut/asserts.h"
|
---|
[b31323f] | 30 | #include <assert.h>
|
---|
| 31 | #include <stdint.h>
|
---|
[a18a8b9] | 32 | #include <stdio.h>
|
---|
| 33 | #include <str.h>
|
---|
| 34 | #include <pcut/pcut.h>
|
---|
| 35 |
|
---|
/* Size in bytes of the scratch buffer shared by the tests in this file. */
#define BUFFER_SIZE 256

/* Format a string into the shared scratch buffer, bounded by BUFFER_SIZE. */
#define SET_BUFFER(str) snprintf(buffer, BUFFER_SIZE, "%s", (str))
/* Shorthand for the PCUT string-equality assertion. */
#define EQ(expected, value) PCUT_ASSERT_STR_EQUALS(expected, value)

PCUT_INIT;

PCUT_TEST_SUITE(str);

/* Scratch buffer the tests write into; cleared by the PCUT_TEST_BEFORE hook. */
static char buffer[BUFFER_SIZE];
| 46 |
|
---|
[3bacee1] | 47 | PCUT_TEST_BEFORE
|
---|
| 48 | {
|
---|
[a18a8b9] | 49 | memset(buffer, 0, BUFFER_SIZE);
|
---|
| 50 | }
|
---|
| 51 |
|
---|
/*
 * Debugging helper: render a string as a double-quoted, printable literal.
 *
 * Bytes in the printable ASCII range (32..126) are copied verbatim; every
 * other byte is written as a four-character "\xNN" escape with lowercase
 * hex digits. Rendering stops after len bytes or at the first NUL byte in
 * s, whichever comes first. The result is always NUL-terminated.
 *
 * out must have room for the worst case of 4 * len + 3 bytes (every byte
 * escaped, two quotes, terminator); no bounds checking is done here.
 */
static void print_string_hex(char *out, const char *s, size_t len)
{
	char *dst = out;
	size_t pos = 0;

	*dst++ = '"';
	while (pos < len && s[pos] != '\0') {
		/* Classify as an unsigned byte so char signedness is irrelevant. */
		uint8_t byte = (uint8_t) s[pos];

		if (byte < 32 || byte > 126) {
			/* 5 = strlen("\\xNN") + terminator; snprintf returns 4. */
			dst += snprintf(dst, 5, "\\x%02x", byte);
		} else {
			*dst++ = s[pos];
		}
		pos++;
	}
	*dst++ = '"';
	*dst = '\0';
}
| 65 |
|
---|
[3bacee1] | 66 | PCUT_TEST(rtrim)
|
---|
| 67 | {
|
---|
[a18a8b9] | 68 | SET_BUFFER("foobar");
|
---|
| 69 | str_rtrim(buffer, ' ');
|
---|
| 70 | EQ("foobar", buffer);
|
---|
| 71 |
|
---|
| 72 | SET_BUFFER(" foobar ");
|
---|
| 73 | str_rtrim(buffer, ' ');
|
---|
| 74 | EQ(" foobar", buffer);
|
---|
| 75 |
|
---|
| 76 | SET_BUFFER(" ššš ");
|
---|
| 77 | str_rtrim(buffer, ' ');
|
---|
| 78 | EQ(" ššš", buffer);
|
---|
| 79 |
|
---|
| 80 | SET_BUFFER("ššAAAšš");
|
---|
| 81 | str_rtrim(buffer, L'š');
|
---|
| 82 | EQ("ššAAA", buffer);
|
---|
| 83 | }
|
---|
| 84 |
|
---|
[3bacee1] | 85 | PCUT_TEST(ltrim)
|
---|
| 86 | {
|
---|
[a18a8b9] | 87 | SET_BUFFER("foobar");
|
---|
| 88 | str_ltrim(buffer, ' ');
|
---|
| 89 | EQ("foobar", buffer);
|
---|
| 90 |
|
---|
| 91 | SET_BUFFER(" foobar ");
|
---|
| 92 | str_ltrim(buffer, ' ');
|
---|
| 93 | EQ("foobar ", buffer);
|
---|
| 94 |
|
---|
| 95 | SET_BUFFER(" ššš ");
|
---|
| 96 | str_ltrim(buffer, ' ');
|
---|
| 97 | EQ("ššš ", buffer);
|
---|
| 98 |
|
---|
| 99 | SET_BUFFER("ššAAAšš");
|
---|
| 100 | str_ltrim(buffer, L'š');
|
---|
| 101 | EQ("AAAšš", buffer);
|
---|
| 102 | }
|
---|
| 103 |
|
---|
/* str_str(): a needle present in the haystack yields a pointer to the match. */
PCUT_TEST(str_str_found)
{
	const char *hs = "abracadabra";
	char *p = str_str(hs, "raca");

	/* "raca" begins at byte offset 2 of "abracadabra". */
	PCUT_ASSERT_TRUE((const char *)p == hs + 2);
}
| 113 |
|
---|
/* str_str(): a needle that does not occur in the haystack yields NULL. */
PCUT_TEST(str_str_not_found)
{
	const char *hs = "abracadabra";
	char *p = str_str(hs, "racab");

	PCUT_ASSERT_TRUE(p == NULL);
}
| 123 |
|
---|
/* str_str(): an empty needle matches at the very start of the haystack. */
PCUT_TEST(str_str_empty_n)
{
	const char *hs = "abracadabra";
	char *p = str_str(hs, "");

	PCUT_ASSERT_TRUE((const char *)p == hs);
}
[a18a8b9] | 133 |
|
---|
[0600976] | 134 | PCUT_TEST(str_non_shortest)
|
---|
| 135 | {
|
---|
| 136 | /* Overlong zero. */
|
---|
[b31323f] | 137 | const char overlong1[] = "\xC0\x80";
|
---|
| 138 | const char overlong2[] = "\xE0\x80\x80";
|
---|
| 139 | const char overlong3[] = "\xF0\x80\x80\x80";
|
---|
[0600976] | 140 |
|
---|
[b31323f] | 141 | const char overlong4[] = "\xC1\xBF";
|
---|
| 142 | const char overlong5[] = "\xE0\x9F\xBF";
|
---|
| 143 | const char overlong6[] = "\xF0\x8F\xBF\xBF";
|
---|
[0600976] | 144 |
|
---|
| 145 | size_t offset = 0;
|
---|
| 146 | PCUT_ASSERT_INT_EQUALS(U_SPECIAL, str_decode(overlong1, &offset, sizeof(overlong1)));
|
---|
| 147 | offset = 0;
|
---|
| 148 | PCUT_ASSERT_INT_EQUALS(U_SPECIAL, str_decode(overlong2, &offset, sizeof(overlong2)));
|
---|
| 149 | offset = 0;
|
---|
| 150 | PCUT_ASSERT_INT_EQUALS(U_SPECIAL, str_decode(overlong3, &offset, sizeof(overlong3)));
|
---|
| 151 | offset = 0;
|
---|
| 152 | PCUT_ASSERT_INT_EQUALS(U_SPECIAL, str_decode(overlong4, &offset, sizeof(overlong4)));
|
---|
| 153 | offset = 0;
|
---|
| 154 | PCUT_ASSERT_INT_EQUALS(U_SPECIAL, str_decode(overlong5, &offset, sizeof(overlong5)));
|
---|
| 155 | offset = 0;
|
---|
| 156 | PCUT_ASSERT_INT_EQUALS(U_SPECIAL, str_decode(overlong6, &offset, sizeof(overlong6)));
|
---|
[b31323f] | 157 | }
|
---|
| 158 |
|
---|
/* One str_sanitize() test case: a raw input and the expected result. */
struct sanitize_test {
	/* Byte string handed to str_sanitize(). */
	const char *input;
	/* Expected buffer contents afterwards; same byte length as input. */
	const char *output;
};
| 163 |
|
---|
| 164 | static const struct sanitize_test sanitize_tests[] = {
|
---|
| 165 | // Empty string
|
---|
| 166 | { "", "" },
|
---|
| 167 | // ASCII only
|
---|
| 168 | { "Hello, world!", "Hello, world!" },
|
---|
| 169 | // Valid multi-byte sequences
|
---|
| 170 | { "Aπ你🐱", "Aπ你🐱" },
|
---|
| 171 | // U+D7FF is last valid before surrogates
|
---|
| 172 | { "A\xED\x9F\xBFZ", "A\xED\x9F\xBFZ" },
|
---|
| 173 | // 0x10FFFF is the highest legal code point
|
---|
| 174 | { "A\xF4\x8F\xBF\xBFZ", "A\xF4\x8F\xBF\xBFZ" },
|
---|
| 175 |
|
---|
| 176 | // Missing continuation byte
|
---|
| 177 | { "A\xC2Z", "A?Z" },
|
---|
| 178 | // Truncated multi-byte at buffer end
|
---|
| 179 | { "A\xE2\x82", "A??" },
|
---|
| 180 | // Continuation byte without leading byte (0x80-0xBF are never valid first bytes)
|
---|
| 181 | { "A\x80Y\xBFZ", "A?Y?Z" },
|
---|
| 182 |
|
---|
| 183 | // 'A' (U+0041) normally encoded as 0x41
|
---|
| 184 | // Overlong 2-byte encoding: 0xC1 0x81
|
---|
| 185 | { "\xC1\x81X", "??X" },
|
---|
| 186 |
|
---|
| 187 | // ¢ (U+00A2) normally encoded as 0xC2 0xA2
|
---|
| 188 | // Overlong 3-byte encoding: 0xE0 0x82 0xA2
|
---|
| 189 | { "\xE0\x82\xA2X", "???X" },
|
---|
| 190 |
|
---|
| 191 | // ¢ (U+00A2) normally encoded as 0xC2 0xA2
|
---|
| 192 | // Overlong 4-byte encoding: 0xF0 0x80 0x82 0xA2
|
---|
| 193 | { "\xF0\x80\x82\xA2X", "????X" },
|
---|
| 194 |
|
---|
| 195 | // € (U+20AC) normally encoded as 0xE2 0x82 0xAC
|
---|
| 196 | // Overlong 4-byte encoding: 0xF0 0x82 0x82 0xAC
|
---|
| 197 | { "\xF0\x82\x82\xACX", "????X" },
|
---|
| 198 |
|
---|
| 199 | // Using 0xC0 0x80 as overlong encoding for NUL (which should be just 0x00)
|
---|
| 200 | { "\xC0\x80X", "??X" },
|
---|
| 201 |
|
---|
| 202 | // 0xED 0xA0 0x80 encodes a surrogate half (U+D800), not allowed in UTF-8
|
---|
| 203 | { "A\xED\xA0\x80Z", "A???Z" },
|
---|
| 204 |
|
---|
| 205 | // 0x110000 is not a legal code point
|
---|
| 206 | { "A\xF4\x90\x80\x80Z", "A????Z" },
|
---|
| 207 |
|
---|
| 208 | // Mix of valid and invalid sequences
|
---|
| 209 | { "A\xC2\xA9\xE2\x28\xA1\xF0\x9F\x98\x81\x80Z", "A©?(?😁?Z" },
|
---|
| 210 | };
|
---|
| 211 |
|
---|
/*
 * Count the positions at which two byte buffers differ.
 *
 * Exactly n bytes of both a and b are compared; NUL bytes get no special
 * treatment. Returns the number of indices i < n where a[i] != b[i].
 */
static size_t count_diff(const char *a, const char *b, size_t n)
{
	size_t mismatches = 0;

	for (size_t left = n; left > 0; left--, a++, b++) {
		if (*a != *b) {
			mismatches++;
		}
	}

	return mismatches;
}
| 223 |
|
---|
| 224 | PCUT_TEST(str_sanitize)
|
---|
| 225 | {
|
---|
| 226 | char replacement = '?';
|
---|
| 227 | char buffer2[255];
|
---|
| 228 |
|
---|
| 229 | for (size_t i = 0; i < sizeof(sanitize_tests) / sizeof(sanitize_tests[0]); i++) {
|
---|
| 230 | const char *in = sanitize_tests[i].input;
|
---|
| 231 | const char *out = sanitize_tests[i].output;
|
---|
| 232 | size_t n = str_size(in) + 1;
|
---|
| 233 | assert(str_size(out) + 1 == n);
|
---|
| 234 |
|
---|
| 235 | memcpy(buffer, in, n);
|
---|
| 236 | size_t replaced = str_sanitize(buffer, n, replacement);
|
---|
| 237 | if (memcmp(buffer, out, n) != 0) {
|
---|
| 238 | print_string_hex(buffer2, buffer, n);
|
---|
| 239 | print_string_hex(buffer, out, n);
|
---|
| 240 | PCUT_ASSERTION_FAILED("Expected %s, got %s", buffer, buffer2);
|
---|
| 241 | }
|
---|
| 242 |
|
---|
| 243 | size_t expect_replaced = count_diff(buffer, in, n);
|
---|
| 244 | PCUT_ASSERT_INT_EQUALS(expect_replaced, replaced);
|
---|
| 245 | }
|
---|
| 246 |
|
---|
| 247 | // Test with n smaller than string length - truncated valid encoding for €
|
---|
| 248 | const char *in = "ABC€";
|
---|
| 249 | const char *out = "ABC??\xAC";
|
---|
| 250 | size_t n = str_size(in) + 1;
|
---|
| 251 | memcpy(buffer, in, n);
|
---|
| 252 | size_t replaced = str_sanitize(buffer, 5, replacement);
|
---|
| 253 | if (memcmp(buffer, out, n) != 0) {
|
---|
| 254 | print_string_hex(buffer2, buffer, n);
|
---|
| 255 | print_string_hex(buffer, out, n);
|
---|
| 256 | PCUT_ASSERTION_FAILED("Expected %s, got %s", buffer, buffer2);
|
---|
| 257 | }
|
---|
[0600976] | 258 |
|
---|
[b31323f] | 259 | PCUT_ASSERT_INT_EQUALS(2, replaced);
|
---|
[0600976] | 260 | }
|
---|
| 261 |
|
---|
[a18a8b9] | 262 | PCUT_EXPORT(str);
|
---|