source: mainline/uspace/app/bdsh/tok.c@ 28ee877e

lfn serial ticket/834-toolchain-update topic/msim-upgrade topic/simplify-dev-export
Last change on this file since 28ee877e was 6939edb, checked in by Martin Sucha <sucha14@…>, 14 years ago

Add more powerful tokenizer to bdsh

  • Property mode set to 100644
File size: 6.4 KB
Line 
1/* Copyright (c) 2011, Martin Sucha <sucha14@st.fmph.uniba.sk>
2 * All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * Redistributions of source code must retain the above copyright notice, this
8 * list of conditions and the following disclaimer.
9 *
10 * Redistributions in binary form must reproduce the above copyright notice,
11 * this list of conditions and the following disclaimer in the documentation
12 * and/or other materials provided with the distribution.
13 *
14 * Neither the name of the original program's authors nor the names of its
15 * contributors may be used to endorse or promote products derived from this
16 * software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include <str.h>
32#include <assert.h>
33#include <malloc.h>
34#include <stdlib.h>
35#include <errno.h>
36
37#include "tok.h"
38
39/* Forward declarations of static functions */
40static wchar_t tok_get_char(tokenizer_t *);
41static wchar_t tok_look_char(tokenizer_t *);
42static int tok_push_char(tokenizer_t *, wchar_t);
43static int tok_push_token(tokenizer_t *);
44static bool tok_pending_chars(tokenizer_t *);
45static int tok_finish_string(tokenizer_t *);
46
47/** Initialize the token parser
48 *
49 * @param tok the tokenizer structure to initialize
50 * @param input the input string to tokenize
51 * @param out_tokens array of strings where to store the result
52 * @param max_tokens number of elements of the out_tokens array
53 */
54int tok_init(tokenizer_t *tok, char *input, char **out_tokens,
55 size_t max_tokens)
56{
57 tok->in = input;
58 tok->in_offset = 0;
59
60 tok->outtok = out_tokens;
61 tok->outtok_offset = 0;
62 /* Leave one slot for a null terminator */
63 assert(max_tokens > 0);
64 tok->outtok_size = max_tokens - 1;
65
66 /* Prepare a buffer where all the token strings will be stored */
67 size_t len = str_size(input) + max_tokens + 1;
68 char *tmp = malloc(len);
69
70 if (tmp == NULL) {
71 return ENOMEM;
72 }
73
74 tok->outbuf = tmp;
75 tok->outbuf_offset = 0;
76 tok->outbuf_size = len;
77 tok->outbuf_last_start = 0;
78
79 return EOK;
80}
81
82/** Finalize the token parser */
83void tok_fini(tokenizer_t *tok)
84{
85 if (tok->outbuf != NULL) {
86 free(tok->outbuf);
87 }
88}
89
90/** Tokenize the input string into the tokens */
91int tok_tokenize(tokenizer_t *tok)
92{
93 int rc;
94 wchar_t cur_char;
95
96 /* Read the input line char by char and append tokens */
97 while ((cur_char = tok_get_char(tok)) != 0) {
98 if (cur_char == ' ') {
99 /* Spaces delimit tokens, but are not processed in any way
100 * Push the token if there is any.
101 * There may not be any pending char for a token in case
102 * there are several spaces in the input.
103 */
104 if (tok_pending_chars(tok)) {
105 rc = tok_push_token(tok);
106 if (rc != EOK) {
107 return rc;
108 }
109 }
110 }
111 else if (cur_char == '|') {
112 /* Pipes are tokens that are delimiters and should be output
113 * as a separate token
114 */
115 if (tok_pending_chars(tok)) {
116 rc = tok_push_token(tok);
117 if (rc != EOK) {
118 return rc;
119 }
120 }
121
122 rc = tok_push_char(tok, '|');
123 if (rc != EOK) {
124 return rc;
125 }
126
127 rc = tok_push_token(tok);
128 if (rc != EOK) {
129 return rc;
130 }
131 }
132 else if (cur_char == '\'') {
133 /* A string starts with a quote (') and ends again with a quote.
134 * A literal quote is written as ''
135 */
136 rc = tok_finish_string(tok);
137 if (rc != EOK) {
138 return rc;
139 }
140 }
141 else {
142 /* If we are handling any other character, just append it to
143 * the current token.
144 */
145 rc = tok_push_char(tok, cur_char);
146 if (rc != EOK) {
147 return rc;
148 }
149 }
150 }
151
152 /* Push the last token */
153 if (tok_pending_chars(tok)) {
154 rc = tok_push_token(tok);
155 if (rc != EOK) {
156 return rc;
157 }
158 }
159
160 /* We always have a space for the terminator, as we
161 * reserved it in tok_init */
162 tok->outtok[tok->outtok_offset] = 0;
163
164 return EOK;
165}
166
167/** Finish tokenizing an opened string */
168int tok_finish_string(tokenizer_t *tok)
169{
170 int rc;
171 wchar_t cur_char;
172
173 while ((cur_char = tok_get_char(tok)) != 0) {
174 if (cur_char == '\'') {
175 if (tok_look_char(tok) == '\'') {
176 /* Encode a single literal quote */
177 rc = tok_push_char(tok, '\'');
178 if (rc != EOK) {
179 return rc;
180 }
181
182 /* Swallow the additional one in the input */
183 tok_get_char(tok);
184 }
185 else {
186 /* The string end */
187 return tok_push_token(tok);
188 }
189 }
190 else {
191 rc = tok_push_char(tok, cur_char);
192 if (rc != EOK) {
193 return rc;
194 }
195 }
196 }
197
198 /* If we are here, the string run to the end without being closed */
199 return EINVAL;
200}
201
202/** Get a char from input, advancing the input position */
203wchar_t tok_get_char(tokenizer_t *tok)
204{
205 return str_decode(tok->in, &tok->in_offset, STR_NO_LIMIT);
206}
207
208/** Get a char from input, while staying on the same input position */
209wchar_t tok_look_char(tokenizer_t *tok)
210{
211 size_t old_offset = tok->in_offset;
212 wchar_t ret = tok_get_char(tok);
213 tok->in_offset = old_offset;
214 return ret;
215}
216
217/** Append a char to the end of the current token */
218int tok_push_char(tokenizer_t *tok, wchar_t ch)
219{
220 return chr_encode(ch, tok->outbuf, &tok->outbuf_offset, tok->outbuf_size);
221}
222
223/** Push the current token to the output array */
224int tok_push_token(tokenizer_t *tok)
225{
226 if (tok->outtok_offset >= tok->outtok_size) {
227 return EOVERFLOW;
228 }
229
230 if (tok->outbuf_offset >= tok->outbuf_size) {
231 return EOVERFLOW;
232 }
233
234 tok->outbuf[tok->outbuf_offset++] = 0;
235 tok->outtok[tok->outtok_offset++] = tok->outbuf + tok->outbuf_last_start;
236 tok->outbuf_last_start = tok->outbuf_offset;
237
238 return EOK;
239}
240
241/** Return true, if the current token is not empty */
242bool tok_pending_chars(tokenizer_t *tok)
243{
244 assert(tok->outbuf_offset >= tok->outbuf_last_start);
245 return (tok->outbuf_offset != tok->outbuf_last_start);
246}
Note: See TracBrowser for help on using the repository browser.