source: mainline/uspace/app/bdsh/tok.c@ b9e3af6b

Last change on this file since b9e3af6b was 87eba56, checked in by Manuele Conti <manuele.conti@…>, 4 years ago

Fix tokenize command

  • Property mode set to 100644
File size: 8.0 KB
RevLine 
[36ab7c7]1/*
2 * Copyright (c) 2011 Martin Sucha
[6939edb]3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
[36ab7c7]6 * modification, are permitted provided that the following conditions
7 * are met:
[6939edb]8 *
[36ab7c7]9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
[6939edb]16 *
[36ab7c7]17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
[6939edb]27 */
28
29#include <str.h>
30#include <assert.h>
31#include <stdlib.h>
[582a0b8]32#include <stddef.h>
[6939edb]33#include <errno.h>
34
35#include "tok.h"
36
37/* Forward declarations of static functions */
[28a5ebd]38static char32_t tok_get_char(tokenizer_t *);
39static char32_t tok_look_char(tokenizer_t *);
40static errno_t tok_push_char(tokenizer_t *, char32_t);
[b7fd2a0]41static errno_t tok_push_token(tokenizer_t *);
[6939edb]42static bool tok_pending_chars(tokenizer_t *);
[b7fd2a0]43static errno_t tok_finish_string(tokenizer_t *);
[0662451]44static void tok_start_token(tokenizer_t *, token_type_t);
[6939edb]45
46/** Initialize the token parser
[1b20da0]47 *
[6939edb]48 * @param tok the tokenizer structure to initialize
49 * @param input the input string to tokenize
50 * @param out_tokens array of strings where to store the result
51 * @param max_tokens number of elements of the out_tokens array
52 */
[b7fd2a0]53errno_t tok_init(tokenizer_t *tok, char *input, token_t *out_tokens,
[6939edb]54 size_t max_tokens)
[1b20da0]55{
[6939edb]56 tok->in = input;
57 tok->in_offset = 0;
[7dcb7981]58 tok->last_in_offset = 0;
59 tok->in_char_offset = 0;
60 tok->last_in_char_offset = 0;
[a35b458]61
[6939edb]62 tok->outtok = out_tokens;
63 tok->outtok_offset = 0;
[0662451]64 tok->outtok_size = max_tokens;
[a35b458]65
[6939edb]66 /* Prepare a buffer where all the token strings will be stored */
67 size_t len = str_size(input) + max_tokens + 1;
68 char *tmp = malloc(len);
[a35b458]69
[6939edb]70 if (tmp == NULL) {
71 return ENOMEM;
72 }
[a35b458]73
[6939edb]74 tok->outbuf = tmp;
75 tok->outbuf_offset = 0;
76 tok->outbuf_size = len;
77 tok->outbuf_last_start = 0;
[a35b458]78
[6939edb]79 return EOK;
80}
81
82/** Finalize the token parser */
83void tok_fini(tokenizer_t *tok)
84{
85 if (tok->outbuf != NULL) {
86 free(tok->outbuf);
87 }
88}
89
90/** Tokenize the input string into the tokens */
[b7fd2a0]91errno_t tok_tokenize(tokenizer_t *tok, size_t *tokens_length)
[6939edb]92{
[b7fd2a0]93 errno_t rc;
[28a5ebd]94 char32_t next_char;
[a35b458]95
[6939edb]96 /* Read the input line char by char and append tokens */
[5992e0e]97 while ((next_char = tok_look_char(tok)) != 0) {
98 if (next_char == ' ') {
[7c3fb9b]99 /*
100 * Push the token if there is any.
[6939edb]101 * There may not be any pending char for a token in case
102 * there are several spaces in the input.
103 */
104 if (tok_pending_chars(tok)) {
105 rc = tok_push_token(tok);
106 if (rc != EOK) {
107 return rc;
108 }
109 }
[0662451]110 tok_start_token(tok, TOKTYPE_SPACE);
[5992e0e]111 /* Eat all the spaces */
[0662451]112 while (tok_look_char(tok) == ' ') {
113 tok_push_char(tok, tok_get_char(tok));
114 }
115 tok_push_token(tok);
[a35b458]116
[1433ecda]117 } else if (next_char == '|') {
[7c3fb9b]118 /*
119 * Pipes are tokens that are delimiters and should be
[5992e0e]120 * output as a separate token
[6939edb]121 */
122 if (tok_pending_chars(tok)) {
123 rc = tok_push_token(tok);
124 if (rc != EOK) {
125 return rc;
126 }
127 }
[a35b458]128
[0662451]129 tok_start_token(tok, TOKTYPE_PIPE);
[a35b458]130
[5992e0e]131 rc = tok_push_char(tok, tok_get_char(tok));
[6939edb]132 if (rc != EOK) {
133 return rc;
134 }
[a35b458]135
[6939edb]136 rc = tok_push_token(tok);
137 if (rc != EOK) {
138 return rc;
139 }
[21b0013]140 } else if (next_char == '<') {
141 if (tok_pending_chars(tok)) {
142 rc = tok_push_token(tok);
143 if (rc != EOK) {
144 return rc;
145 }
146 }
147
148 tok_start_token(tok, TOKTYPE_RDIN);
149
150 rc = tok_push_char(tok, tok_get_char(tok));
151 if (rc != EOK) {
152 return rc;
153 }
[87eba56]154
[21b0013]155 rc = tok_push_token(tok);
156 if (rc != EOK) {
157 return rc;
158 }
159 } else if (next_char == '>') {
160 if (tok_pending_chars(tok)) {
161 rc = tok_push_token(tok);
162 if (rc != EOK) {
163 return rc;
164 }
165 }
166
167 tok_start_token(tok, TOKTYPE_RDOU);
168
169 rc = tok_push_char(tok, tok_get_char(tok));
170 if (rc != EOK) {
171 return rc;
172 }
[87eba56]173
[21b0013]174 rc = tok_push_token(tok);
175 if (rc != EOK) {
176 return rc;
177 }
178 } else if (next_char == '\'') {
[7c3fb9b]179 /*
180 * A string starts with a quote (') and ends again with a quote.
[6939edb]181 * A literal quote is written as ''
182 */
[0662451]183 tok_start_token(tok, TOKTYPE_TEXT);
[5992e0e]184 /* Eat the quote */
185 tok_get_char(tok);
[6939edb]186 rc = tok_finish_string(tok);
187 if (rc != EOK) {
188 return rc;
189 }
[1433ecda]190 } else {
[0662451]191 if (!tok_pending_chars(tok)) {
192 tok_start_token(tok, TOKTYPE_TEXT);
193 }
[7c3fb9b]194 /*
195 * If we are handling any other character, just append it to
[6939edb]196 * the current token.
197 */
[5992e0e]198 rc = tok_push_char(tok, tok_get_char(tok));
[6939edb]199 if (rc != EOK) {
200 return rc;
201 }
202 }
203 }
[a35b458]204
[6939edb]205 /* Push the last token */
206 if (tok_pending_chars(tok)) {
207 rc = tok_push_token(tok);
208 if (rc != EOK) {
209 return rc;
210 }
211 }
[a35b458]212
[0662451]213 *tokens_length = tok->outtok_offset;
[a35b458]214
[6939edb]215 return EOK;
216}
217
218/** Finish tokenizing an opened string */
[b7fd2a0]219errno_t tok_finish_string(tokenizer_t *tok)
[6939edb]220{
[b7fd2a0]221 errno_t rc;
[28a5ebd]222 char32_t next_char;
[a35b458]223
[5992e0e]224 while ((next_char = tok_look_char(tok)) != 0) {
225 if (next_char == '\'') {
226 /* Eat the quote */
227 tok_get_char(tok);
[6939edb]228 if (tok_look_char(tok) == '\'') {
229 /* Encode a single literal quote */
230 rc = tok_push_char(tok, '\'');
231 if (rc != EOK) {
232 return rc;
233 }
[a35b458]234
[6939edb]235 /* Swallow the additional one in the input */
236 tok_get_char(tok);
[1433ecda]237 } else {
[6939edb]238 /* The string end */
239 return tok_push_token(tok);
240 }
[1433ecda]241 } else {
[5992e0e]242 rc = tok_push_char(tok, tok_get_char(tok));
[6939edb]243 if (rc != EOK) {
244 return rc;
245 }
246 }
247 }
[a35b458]248
[6939edb]249 /* If we are here, the string run to the end without being closed */
250 return EINVAL;
251}
252
253/** Get a char from input, advancing the input position */
[28a5ebd]254char32_t tok_get_char(tokenizer_t *tok)
[6939edb]255{
[7dcb7981]256 tok->in_char_offset++;
[6939edb]257 return str_decode(tok->in, &tok->in_offset, STR_NO_LIMIT);
258}
259
260/** Get a char from input, while staying on the same input position */
[28a5ebd]261char32_t tok_look_char(tokenizer_t *tok)
[6939edb]262{
[f41682c]263 size_t old_offset = tok->in_offset;
264 size_t old_char_offset = tok->in_char_offset;
[28a5ebd]265 char32_t ret = tok_get_char(tok);
[6939edb]266 tok->in_offset = old_offset;
[7dcb7981]267 tok->in_char_offset = old_char_offset;
[6939edb]268 return ret;
269}
270
271/** Append a char to the end of the current token */
[28a5ebd]272errno_t tok_push_char(tokenizer_t *tok, char32_t ch)
[6939edb]273{
274 return chr_encode(ch, tok->outbuf, &tok->outbuf_offset, tok->outbuf_size);
275}
276
[0662451]277void tok_start_token(tokenizer_t *tok, token_type_t type)
278{
279 tok->current_type = type;
280}
281
[6939edb]282/** Push the current token to the output array */
[b7fd2a0]283errno_t tok_push_token(tokenizer_t *tok)
[6939edb]284{
285 if (tok->outtok_offset >= tok->outtok_size) {
286 return EOVERFLOW;
287 }
[a35b458]288
[6939edb]289 if (tok->outbuf_offset >= tok->outbuf_size) {
290 return EOVERFLOW;
291 }
[a35b458]292
[6939edb]293 tok->outbuf[tok->outbuf_offset++] = 0;
[0662451]294 token_t *tokinfo = &tok->outtok[tok->outtok_offset++];
295 tokinfo->type = tok->current_type;
296 tokinfo->text = tok->outbuf + tok->outbuf_last_start;
297 tokinfo->byte_start = tok->last_in_offset;
[5992e0e]298 tokinfo->byte_length = tok->in_offset - tok->last_in_offset;
[0662451]299 tokinfo->char_start = tok->last_in_char_offset;
[5992e0e]300 tokinfo->char_length = tok->in_char_offset - tok->last_in_char_offset;
[6939edb]301 tok->outbuf_last_start = tok->outbuf_offset;
[a35b458]302
[7dcb7981]303 /* We have consumed the first char of the next token already */
[5992e0e]304 tok->last_in_offset = tok->in_offset;
305 tok->last_in_char_offset = tok->in_char_offset;
[a35b458]306
[6939edb]307 return EOK;
308}
309
310/** Return true, if the current token is not empty */
311bool tok_pending_chars(tokenizer_t *tok)
312{
313 assert(tok->outbuf_offset >= tok->outbuf_last_start);
314 return (tok->outbuf_offset != tok->outbuf_last_start);
[7dcb7981]315}
Note: See TracBrowser for help on using the repository browser.