source: mainline/uspace/app/bdsh/tok.c@ 0a9b918

Last change on this file since 0a9b918 was 87eba56, checked in by Manuele Conti <manuele.conti@…>, 4 years ago

Fix tokenize command

  • Property mode set to 100644
File size: 8.0 KB
Line 
1/*
2 * Copyright (c) 2011 Martin Sucha
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include <str.h>
30#include <assert.h>
31#include <stdlib.h>
32#include <stddef.h>
33#include <errno.h>
34
35#include "tok.h"
36
37/* Forward declarations of static functions */
38static char32_t tok_get_char(tokenizer_t *);
39static char32_t tok_look_char(tokenizer_t *);
40static errno_t tok_push_char(tokenizer_t *, char32_t);
41static errno_t tok_push_token(tokenizer_t *);
42static bool tok_pending_chars(tokenizer_t *);
43static errno_t tok_finish_string(tokenizer_t *);
44static void tok_start_token(tokenizer_t *, token_type_t);
45
46/** Initialize the token parser
47 *
48 * @param tok the tokenizer structure to initialize
49 * @param input the input string to tokenize
50 * @param out_tokens array of strings where to store the result
51 * @param max_tokens number of elements of the out_tokens array
52 */
53errno_t tok_init(tokenizer_t *tok, char *input, token_t *out_tokens,
54 size_t max_tokens)
55{
56 tok->in = input;
57 tok->in_offset = 0;
58 tok->last_in_offset = 0;
59 tok->in_char_offset = 0;
60 tok->last_in_char_offset = 0;
61
62 tok->outtok = out_tokens;
63 tok->outtok_offset = 0;
64 tok->outtok_size = max_tokens;
65
66 /* Prepare a buffer where all the token strings will be stored */
67 size_t len = str_size(input) + max_tokens + 1;
68 char *tmp = malloc(len);
69
70 if (tmp == NULL) {
71 return ENOMEM;
72 }
73
74 tok->outbuf = tmp;
75 tok->outbuf_offset = 0;
76 tok->outbuf_size = len;
77 tok->outbuf_last_start = 0;
78
79 return EOK;
80}
81
82/** Finalize the token parser */
83void tok_fini(tokenizer_t *tok)
84{
85 if (tok->outbuf != NULL) {
86 free(tok->outbuf);
87 }
88}
89
90/** Tokenize the input string into the tokens */
91errno_t tok_tokenize(tokenizer_t *tok, size_t *tokens_length)
92{
93 errno_t rc;
94 char32_t next_char;
95
96 /* Read the input line char by char and append tokens */
97 while ((next_char = tok_look_char(tok)) != 0) {
98 if (next_char == ' ') {
99 /*
100 * Push the token if there is any.
101 * There may not be any pending char for a token in case
102 * there are several spaces in the input.
103 */
104 if (tok_pending_chars(tok)) {
105 rc = tok_push_token(tok);
106 if (rc != EOK) {
107 return rc;
108 }
109 }
110 tok_start_token(tok, TOKTYPE_SPACE);
111 /* Eat all the spaces */
112 while (tok_look_char(tok) == ' ') {
113 tok_push_char(tok, tok_get_char(tok));
114 }
115 tok_push_token(tok);
116
117 } else if (next_char == '|') {
118 /*
119 * Pipes are tokens that are delimiters and should be
120 * output as a separate token
121 */
122 if (tok_pending_chars(tok)) {
123 rc = tok_push_token(tok);
124 if (rc != EOK) {
125 return rc;
126 }
127 }
128
129 tok_start_token(tok, TOKTYPE_PIPE);
130
131 rc = tok_push_char(tok, tok_get_char(tok));
132 if (rc != EOK) {
133 return rc;
134 }
135
136 rc = tok_push_token(tok);
137 if (rc != EOK) {
138 return rc;
139 }
140 } else if (next_char == '<') {
141 if (tok_pending_chars(tok)) {
142 rc = tok_push_token(tok);
143 if (rc != EOK) {
144 return rc;
145 }
146 }
147
148 tok_start_token(tok, TOKTYPE_RDIN);
149
150 rc = tok_push_char(tok, tok_get_char(tok));
151 if (rc != EOK) {
152 return rc;
153 }
154
155 rc = tok_push_token(tok);
156 if (rc != EOK) {
157 return rc;
158 }
159 } else if (next_char == '>') {
160 if (tok_pending_chars(tok)) {
161 rc = tok_push_token(tok);
162 if (rc != EOK) {
163 return rc;
164 }
165 }
166
167 tok_start_token(tok, TOKTYPE_RDOU);
168
169 rc = tok_push_char(tok, tok_get_char(tok));
170 if (rc != EOK) {
171 return rc;
172 }
173
174 rc = tok_push_token(tok);
175 if (rc != EOK) {
176 return rc;
177 }
178 } else if (next_char == '\'') {
179 /*
180 * A string starts with a quote (') and ends again with a quote.
181 * A literal quote is written as ''
182 */
183 tok_start_token(tok, TOKTYPE_TEXT);
184 /* Eat the quote */
185 tok_get_char(tok);
186 rc = tok_finish_string(tok);
187 if (rc != EOK) {
188 return rc;
189 }
190 } else {
191 if (!tok_pending_chars(tok)) {
192 tok_start_token(tok, TOKTYPE_TEXT);
193 }
194 /*
195 * If we are handling any other character, just append it to
196 * the current token.
197 */
198 rc = tok_push_char(tok, tok_get_char(tok));
199 if (rc != EOK) {
200 return rc;
201 }
202 }
203 }
204
205 /* Push the last token */
206 if (tok_pending_chars(tok)) {
207 rc = tok_push_token(tok);
208 if (rc != EOK) {
209 return rc;
210 }
211 }
212
213 *tokens_length = tok->outtok_offset;
214
215 return EOK;
216}
217
218/** Finish tokenizing an opened string */
219errno_t tok_finish_string(tokenizer_t *tok)
220{
221 errno_t rc;
222 char32_t next_char;
223
224 while ((next_char = tok_look_char(tok)) != 0) {
225 if (next_char == '\'') {
226 /* Eat the quote */
227 tok_get_char(tok);
228 if (tok_look_char(tok) == '\'') {
229 /* Encode a single literal quote */
230 rc = tok_push_char(tok, '\'');
231 if (rc != EOK) {
232 return rc;
233 }
234
235 /* Swallow the additional one in the input */
236 tok_get_char(tok);
237 } else {
238 /* The string end */
239 return tok_push_token(tok);
240 }
241 } else {
242 rc = tok_push_char(tok, tok_get_char(tok));
243 if (rc != EOK) {
244 return rc;
245 }
246 }
247 }
248
249 /* If we are here, the string run to the end without being closed */
250 return EINVAL;
251}
252
253/** Get a char from input, advancing the input position */
254char32_t tok_get_char(tokenizer_t *tok)
255{
256 tok->in_char_offset++;
257 return str_decode(tok->in, &tok->in_offset, STR_NO_LIMIT);
258}
259
260/** Get a char from input, while staying on the same input position */
261char32_t tok_look_char(tokenizer_t *tok)
262{
263 size_t old_offset = tok->in_offset;
264 size_t old_char_offset = tok->in_char_offset;
265 char32_t ret = tok_get_char(tok);
266 tok->in_offset = old_offset;
267 tok->in_char_offset = old_char_offset;
268 return ret;
269}
270
271/** Append a char to the end of the current token */
272errno_t tok_push_char(tokenizer_t *tok, char32_t ch)
273{
274 return chr_encode(ch, tok->outbuf, &tok->outbuf_offset, tok->outbuf_size);
275}
276
277void tok_start_token(tokenizer_t *tok, token_type_t type)
278{
279 tok->current_type = type;
280}
281
282/** Push the current token to the output array */
283errno_t tok_push_token(tokenizer_t *tok)
284{
285 if (tok->outtok_offset >= tok->outtok_size) {
286 return EOVERFLOW;
287 }
288
289 if (tok->outbuf_offset >= tok->outbuf_size) {
290 return EOVERFLOW;
291 }
292
293 tok->outbuf[tok->outbuf_offset++] = 0;
294 token_t *tokinfo = &tok->outtok[tok->outtok_offset++];
295 tokinfo->type = tok->current_type;
296 tokinfo->text = tok->outbuf + tok->outbuf_last_start;
297 tokinfo->byte_start = tok->last_in_offset;
298 tokinfo->byte_length = tok->in_offset - tok->last_in_offset;
299 tokinfo->char_start = tok->last_in_char_offset;
300 tokinfo->char_length = tok->in_char_offset - tok->last_in_char_offset;
301 tok->outbuf_last_start = tok->outbuf_offset;
302
303 /* We have consumed the first char of the next token already */
304 tok->last_in_offset = tok->in_offset;
305 tok->last_in_char_offset = tok->in_char_offset;
306
307 return EOK;
308}
309
310/** Return true, if the current token is not empty */
311bool tok_pending_chars(tokenizer_t *tok)
312{
313 assert(tok->outbuf_offset >= tok->outbuf_last_start);
314 return (tok->outbuf_offset != tok->outbuf_last_start);
315}
Note: See TracBrowser for help on using the repository browser.