1 | /*
|
---|
2 | * Copyright (c) 2015 Michal Koutny
|
---|
3 | * All rights reserved.
|
---|
4 | *
|
---|
5 | * Redistribution and use in source and binary forms, with or without
|
---|
6 | * modification, are permitted provided that the following conditions
|
---|
7 | * are met:
|
---|
8 | *
|
---|
9 | * - Redistributions of source code must retain the above copyright
|
---|
10 | * notice, this list of conditions and the following disclaimer.
|
---|
11 | * - Redistributions in binary form must reproduce the above copyright
|
---|
12 | * notice, this list of conditions and the following disclaimer in the
|
---|
13 | * documentation and/or other materials provided with the distribution.
|
---|
14 | * - The name of the author may not be used to endorse or promote products
|
---|
15 | * derived from this software without specific prior written permission.
|
---|
16 | *
|
---|
17 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
---|
18 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
---|
19 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
---|
20 | * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
---|
21 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
---|
22 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
---|
23 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
---|
24 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
---|
25 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
---|
26 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
---|
27 | */
|
---|
28 |
|
---|
29 | #include "pcut/asserts.h"
|
---|
30 | #include <assert.h>
|
---|
31 | #include <stdint.h>
|
---|
32 | #include <stdio.h>
|
---|
33 | #include <str.h>
|
---|
34 | #include <pcut/pcut.h>
|
---|
35 |
|
---|
/* Capacity in bytes of the shared scratch buffer used by the tests. */
#define BUFFER_SIZE 256

/*
 * Copy a C string into the shared scratch buffer. snprintf() limits the
 * write to BUFFER_SIZE bytes, terminator included.
 */
#define SET_BUFFER(str) snprintf(buffer, BUFFER_SIZE, "%s", (str))

/* Shorthand for the PCUT string-equality assertion. */
#define EQ(expected, value) PCUT_ASSERT_STR_EQUALS(expected, value)

PCUT_INIT;

PCUT_TEST_SUITE(str);

/* Scratch buffer the tests write into via SET_BUFFER() and memcpy(). */
static char buffer[BUFFER_SIZE];
|
---|
46 |
|
---|
47 | PCUT_TEST_BEFORE
|
---|
48 | {
|
---|
49 | memset(buffer, 0, BUFFER_SIZE);
|
---|
50 | }
|
---|
51 |
|
---|
/**
 * Render a byte string as a double-quoted, printable-ASCII debug string.
 *
 * Printable ASCII bytes (0x20-0x7E) are copied verbatim, except that the
 * double quote and the backslash are prefixed with a backslash. Without
 * that, a literal backslash sequence in the input could not be told apart
 * from an escape produced by this function. Every other byte is written as
 * a four-character \xNN escape using lowercase hex digits.
 *
 * @param out Destination buffer. It is NOT bounds-checked: the caller must
 *            provide at least 4 * len + 3 bytes (worst case of four output
 *            characters per input byte, two quotes and the terminator).
 * @param s   Bytes to render. Rendering stops at the first NUL byte.
 * @param len Maximum number of bytes of @a s to examine.
 */
static void print_string_hex(char *out, const char *s, size_t len)
{
	static const char hex_digits[] = "0123456789abcdef";

	*out++ = '"';
	for (size_t i = 0; i < len && s[i] != '\0'; i++) {
		/* Use the unsigned value so bytes >= 0x80 never compare as negative. */
		unsigned char byte = (unsigned char) s[i];

		if (byte == '"' || byte == '\\') {
			/* Keep the quoting unambiguous. */
			*out++ = '\\';
			*out++ = (char) byte;
		} else if (byte >= 0x20 && byte <= 0x7e) {
			*out++ = (char) byte;
		} else {
			*out++ = '\\';
			*out++ = 'x';
			*out++ = hex_digits[byte >> 4];
			*out++ = hex_digits[byte & 0x0f];
		}
	}
	*out++ = '"';
	*out = '\0';
}
|
---|
65 |
|
---|
/* str_rtrim() must strip the given character from the end of the string only. */
PCUT_TEST(rtrim)
{
	/* Nothing to trim: the string stays unchanged. */
	SET_BUFFER("foobar");
	str_rtrim(buffer, ' ');
	EQ("foobar", buffer);

	/* Trailing space is removed, leading space is kept. */
	SET_BUFFER(" foobar ");
	str_rtrim(buffer, ' ');
	EQ(" foobar", buffer);

	/* Same with multi-byte characters in the middle of the string. */
	SET_BUFFER(" ššš ");
	str_rtrim(buffer, ' ');
	EQ(" ššš", buffer);

	/* Trimming a multi-byte character: only the trailing run is removed. */
	SET_BUFFER("ššAAAšš");
	str_rtrim(buffer, L'š');
	EQ("ššAAA", buffer);
}
|
---|
84 |
|
---|
/* str_ltrim() must strip the given character from the start of the string only. */
PCUT_TEST(ltrim)
{
	/* Nothing to trim: the string stays unchanged. */
	SET_BUFFER("foobar");
	str_ltrim(buffer, ' ');
	EQ("foobar", buffer);

	/* Leading space is removed, trailing space is kept. */
	SET_BUFFER(" foobar ");
	str_ltrim(buffer, ' ');
	EQ("foobar ", buffer);

	/* Same with multi-byte characters in the middle of the string. */
	SET_BUFFER(" ššš ");
	str_ltrim(buffer, ' ');
	EQ("ššš ", buffer);

	/* Trimming a multi-byte character: only the leading run is removed. */
	SET_BUFFER("ššAAAšš");
	str_ltrim(buffer, L'š');
	EQ("AAAšš", buffer);
}
|
---|
103 |
|
---|
/*
 * Needle present in the haystack: str_str() must return a pointer to the
 * start of the match ("raca" begins at byte offset 2 of "abracadabra").
 */
PCUT_TEST(str_str_found)
{
	const char *hs = "abracadabra";
	const char *needle = "raca";
	char *p = str_str(hs, needle);

	PCUT_ASSERT_TRUE((const char *)p == hs + 2);
}
|
---|
113 |
|
---|
/* Needle absent from the haystack: str_str() must return NULL. */
PCUT_TEST(str_str_not_found)
{
	const char *hs = "abracadabra";
	const char *needle = "racab";
	char *p = str_str(hs, needle);

	PCUT_ASSERT_TRUE(p == NULL);
}
|
---|
123 |
|
---|
/* Empty needle: str_str() must return the start of the haystack. */
PCUT_TEST(str_str_empty_n)
{
	const char *hs = "abracadabra";
	const char *needle = "";
	char *p = str_str(hs, needle);

	PCUT_ASSERT_TRUE((const char *)p == hs);
}
|
---|
133 |
|
---|
/*
 * str_decode() must reject non-shortest-form (overlong) UTF-8 sequences and
 * return U_SPECIAL for each of them.
 */
PCUT_TEST(str_non_shortest)
{
	/* Overlong zero: U+0000 encoded in 2, 3 and 4 bytes. */
	const char overlong1[] = "\xC0\x80";
	const char overlong2[] = "\xE0\x80\x80";
	const char overlong3[] = "\xF0\x80\x80\x80";

	/*
	 * Largest values that still fit a shorter encoding: U+007F in 2 bytes,
	 * U+07FF in 3 bytes and U+FFFF in 4 bytes.
	 */
	const char overlong4[] = "\xC1\xBF";
	const char overlong5[] = "\xE0\x9F\xBF";
	const char overlong6[] = "\xF0\x8F\xBF\xBF";

	/*
	 * The offset is reset before every call so each sequence is decoded from
	 * its first byte. sizeof() counts the terminating NUL of the array too.
	 */
	size_t offset = 0;
	PCUT_ASSERT_INT_EQUALS(U_SPECIAL, str_decode(overlong1, &offset, sizeof(overlong1)));
	offset = 0;
	PCUT_ASSERT_INT_EQUALS(U_SPECIAL, str_decode(overlong2, &offset, sizeof(overlong2)));
	offset = 0;
	PCUT_ASSERT_INT_EQUALS(U_SPECIAL, str_decode(overlong3, &offset, sizeof(overlong3)));
	offset = 0;
	PCUT_ASSERT_INT_EQUALS(U_SPECIAL, str_decode(overlong4, &offset, sizeof(overlong4)));
	offset = 0;
	PCUT_ASSERT_INT_EQUALS(U_SPECIAL, str_decode(overlong5, &offset, sizeof(overlong5)));
	offset = 0;
	PCUT_ASSERT_INT_EQUALS(U_SPECIAL, str_decode(overlong6, &offset, sizeof(overlong6)));
}
|
---|
158 |
|
---|
/* One str_sanitize() test vector. */
struct sanitize_test {
	/* Bytes handed to str_sanitize(). */
	const char *input;
	/* Expected buffer contents afterwards; same byte length as input. */
	const char *output;
};

/*
 * Test vectors for str_sanitize(). In every expected output each rejected
 * byte is replaced by a single '?', so input and output have equal size.
 */
static const struct sanitize_test sanitize_tests[] = {
	/* Empty string */
	{ .input = "", .output = "" },
	/* ASCII only */
	{ .input = "Hello, world!", .output = "Hello, world!" },
	/* Valid multi-byte sequences */
	{ .input = "Aπ你🐱", .output = "Aπ你🐱" },
	/* U+D7FF is last valid before surrogates */
	{ .input = "A\xED\x9F\xBFZ", .output = "A\xED\x9F\xBFZ" },
	/* 0x10FFFF is the highest legal code point */
	{ .input = "A\xF4\x8F\xBF\xBFZ", .output = "A\xF4\x8F\xBF\xBFZ" },

	/* Missing continuation byte */
	{ .input = "A\xC2Z", .output = "A?Z" },
	/* Truncated multi-byte at buffer end */
	{ .input = "A\xE2\x82", .output = "A??" },
	/* Continuation byte without leading byte (0x80-0xBF are never valid first bytes) */
	{ .input = "A\x80Y\xBFZ", .output = "A?Y?Z" },

	/*
	 * 'A' (U+0041) normally encoded as 0x41
	 * Overlong 2-byte encoding: 0xC1 0x81
	 */
	{ .input = "\xC1\x81X", .output = "??X" },

	/*
	 * ¢ (U+00A2) normally encoded as 0xC2 0xA2
	 * Overlong 3-byte encoding: 0xE0 0x82 0xA2
	 */
	{ .input = "\xE0\x82\xA2X", .output = "???X" },

	/*
	 * ¢ (U+00A2) normally encoded as 0xC2 0xA2
	 * Overlong 4-byte encoding: 0xF0 0x80 0x82 0xA2
	 */
	{ .input = "\xF0\x80\x82\xA2X", .output = "????X" },

	/*
	 * € (U+20AC) normally encoded as 0xE2 0x82 0xAC
	 * Overlong 4-byte encoding: 0xF0 0x82 0x82 0xAC
	 */
	{ .input = "\xF0\x82\x82\xACX", .output = "????X" },

	/* Using 0xC0 0x80 as overlong encoding for NUL (which should be just 0x00) */
	{ .input = "\xC0\x80X", .output = "??X" },

	/* 0xED 0xA0 0x80 encodes a surrogate half (U+D800), not allowed in UTF-8 */
	{ .input = "A\xED\xA0\x80Z", .output = "A???Z" },

	/* 0x110000 is not a legal code point */
	{ .input = "A\xF4\x90\x80\x80Z", .output = "A????Z" },

	/* Mix of valid and invalid sequences */
	{ .input = "A\xC2\xA9\xE2\x28\xA1\xF0\x9F\x98\x81\x80Z", .output = "A©?(?😁?Z" },
};
|
---|
211 |
|
---|
/**
 * Count the positions at which two byte buffers differ.
 *
 * @param a First buffer, at least @a n bytes long.
 * @param b Second buffer, at least @a n bytes long.
 * @param n Number of bytes to compare.
 *
 * @return Number of indices i < n for which a[i] != b[i].
 */
static size_t count_diff(const char *a, const char *b, size_t n)
{
	size_t mismatches = 0;
	const char *const end = a + n;

	while (a != end) {
		if (*a++ != *b++) {
			mismatches++;
		}
	}

	return mismatches;
}
|
---|
223 |
|
---|
224 | PCUT_TEST(str_sanitize)
|
---|
225 | {
|
---|
226 | char replacement = '?';
|
---|
227 | char buffer2[255];
|
---|
228 |
|
---|
229 | for (size_t i = 0; i < sizeof(sanitize_tests) / sizeof(sanitize_tests[0]); i++) {
|
---|
230 | const char *in = sanitize_tests[i].input;
|
---|
231 | const char *out = sanitize_tests[i].output;
|
---|
232 | size_t n = str_size(in) + 1;
|
---|
233 | assert(str_size(out) + 1 == n);
|
---|
234 |
|
---|
235 | memcpy(buffer, in, n);
|
---|
236 | size_t replaced = str_sanitize(buffer, n, replacement);
|
---|
237 | if (memcmp(buffer, out, n) != 0) {
|
---|
238 | print_string_hex(buffer2, buffer, n);
|
---|
239 | print_string_hex(buffer, out, n);
|
---|
240 | PCUT_ASSERTION_FAILED("Expected %s, got %s", buffer, buffer2);
|
---|
241 | }
|
---|
242 |
|
---|
243 | size_t expect_replaced = count_diff(buffer, in, n);
|
---|
244 | PCUT_ASSERT_INT_EQUALS(expect_replaced, replaced);
|
---|
245 | }
|
---|
246 |
|
---|
247 | // Test with n smaller than string length - truncated valid encoding for €
|
---|
248 | const char *in = "ABC€";
|
---|
249 | const char *out = "ABC??\xAC";
|
---|
250 | size_t n = str_size(in) + 1;
|
---|
251 | memcpy(buffer, in, n);
|
---|
252 | size_t replaced = str_sanitize(buffer, 5, replacement);
|
---|
253 | if (memcmp(buffer, out, n) != 0) {
|
---|
254 | print_string_hex(buffer2, buffer, n);
|
---|
255 | print_string_hex(buffer, out, n);
|
---|
256 | PCUT_ASSERTION_FAILED("Expected %s, got %s", buffer, buffer2);
|
---|
257 | }
|
---|
258 |
|
---|
259 | PCUT_ASSERT_INT_EQUALS(2, replaced);
|
---|
260 | }
|
---|
261 |
|
---|
/*
 * Pairs with PCUT_TEST_SUITE(str) earlier in this file.
 * NOTE(review): presumably publishes the suite under the name "str" for a
 * test runner defined elsewhere; confirm against the PCUT documentation.
 */
PCUT_EXPORT(str);
|
---|