source: mainline/uspace/app/bdsh/tok.c

Last change on this file was 28a5ebd, checked in by Martin Decky <martin@…>, 5 years ago

Use char32_t instead of wchar_t to represent UTF-32 strings

The intention of the native HelenOS string API has always been to
support Unicode in the UTF-8 and UTF-32 encodings as the sole character
representations and ignore the obsolete mess of older single-byte and
multibyte character encodings. Before C11, the wchar_t type has been
slightly misused for the purpose of the UTF-32 strings. The newer
char32_t type is obviously a much more suitable option. The standard
defines char32_t as uint_least32_t, thus we can take the liberty to fix
it to uint32_t.

To maintain compatibility with the C Standard, the putwchar(wchar_t)
functions have been replaced by our custom putuchar(char32_t) functions
where appropriate.

  • Property mode set to 100644
File size: 7.3 KB
RevLine 
[36ab7c7]1/*
2 * Copyright (c) 2011 Martin Sucha
[6939edb]3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
[36ab7c7]6 * modification, are permitted provided that the following conditions
7 * are met:
[6939edb]8 *
[36ab7c7]9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
[6939edb]16 *
[36ab7c7]17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
[6939edb]27 */
28
29#include <str.h>
30#include <assert.h>
31#include <stdlib.h>
[582a0b8]32#include <stddef.h>
[6939edb]33#include <errno.h>
34
35#include "tok.h"
36
37/* Forward declarations of static functions */
[28a5ebd]38static char32_t tok_get_char(tokenizer_t *);
39static char32_t tok_look_char(tokenizer_t *);
40static errno_t tok_push_char(tokenizer_t *, char32_t);
[b7fd2a0]41static errno_t tok_push_token(tokenizer_t *);
[6939edb]42static bool tok_pending_chars(tokenizer_t *);
[b7fd2a0]43static errno_t tok_finish_string(tokenizer_t *);
[0662451]44static void tok_start_token(tokenizer_t *, token_type_t);
[6939edb]45
46/** Initialize the token parser
[1b20da0]47 *
[6939edb]48 * @param tok the tokenizer structure to initialize
49 * @param input the input string to tokenize
50 * @param out_tokens array of strings where to store the result
51 * @param max_tokens number of elements of the out_tokens array
52 */
[b7fd2a0]53errno_t tok_init(tokenizer_t *tok, char *input, token_t *out_tokens,
[6939edb]54 size_t max_tokens)
[1b20da0]55{
[6939edb]56 tok->in = input;
57 tok->in_offset = 0;
[7dcb7981]58 tok->last_in_offset = 0;
59 tok->in_char_offset = 0;
60 tok->last_in_char_offset = 0;
[a35b458]61
[6939edb]62 tok->outtok = out_tokens;
63 tok->outtok_offset = 0;
[0662451]64 tok->outtok_size = max_tokens;
[a35b458]65
[6939edb]66 /* Prepare a buffer where all the token strings will be stored */
67 size_t len = str_size(input) + max_tokens + 1;
68 char *tmp = malloc(len);
[a35b458]69
[6939edb]70 if (tmp == NULL) {
71 return ENOMEM;
72 }
[a35b458]73
[6939edb]74 tok->outbuf = tmp;
75 tok->outbuf_offset = 0;
76 tok->outbuf_size = len;
77 tok->outbuf_last_start = 0;
[a35b458]78
[6939edb]79 return EOK;
80}
81
82/** Finalize the token parser */
83void tok_fini(tokenizer_t *tok)
84{
85 if (tok->outbuf != NULL) {
86 free(tok->outbuf);
87 }
88}
89
90/** Tokenize the input string into the tokens */
[b7fd2a0]91errno_t tok_tokenize(tokenizer_t *tok, size_t *tokens_length)
[6939edb]92{
[b7fd2a0]93 errno_t rc;
[28a5ebd]94 char32_t next_char;
[a35b458]95
[6939edb]96 /* Read the input line char by char and append tokens */
[5992e0e]97 while ((next_char = tok_look_char(tok)) != 0) {
98 if (next_char == ' ') {
[7c3fb9b]99 /*
100 * Push the token if there is any.
[6939edb]101 * There may not be any pending char for a token in case
102 * there are several spaces in the input.
103 */
104 if (tok_pending_chars(tok)) {
105 rc = tok_push_token(tok);
106 if (rc != EOK) {
107 return rc;
108 }
109 }
[0662451]110 tok_start_token(tok, TOKTYPE_SPACE);
[5992e0e]111 /* Eat all the spaces */
[0662451]112 while (tok_look_char(tok) == ' ') {
113 tok_push_char(tok, tok_get_char(tok));
114 }
115 tok_push_token(tok);
[a35b458]116
[1433ecda]117 } else if (next_char == '|') {
[7c3fb9b]118 /*
119 * Pipes are tokens that are delimiters and should be
[5992e0e]120 * output as a separate token
[6939edb]121 */
122 if (tok_pending_chars(tok)) {
123 rc = tok_push_token(tok);
124 if (rc != EOK) {
125 return rc;
126 }
127 }
[a35b458]128
[0662451]129 tok_start_token(tok, TOKTYPE_PIPE);
[a35b458]130
[5992e0e]131 rc = tok_push_char(tok, tok_get_char(tok));
[6939edb]132 if (rc != EOK) {
133 return rc;
134 }
[a35b458]135
[6939edb]136 rc = tok_push_token(tok);
137 if (rc != EOK) {
138 return rc;
139 }
[1433ecda]140 } else if (next_char == '\'') {
[7c3fb9b]141 /*
142 * A string starts with a quote (') and ends again with a quote.
[6939edb]143 * A literal quote is written as ''
144 */
[0662451]145 tok_start_token(tok, TOKTYPE_TEXT);
[5992e0e]146 /* Eat the quote */
147 tok_get_char(tok);
[6939edb]148 rc = tok_finish_string(tok);
149 if (rc != EOK) {
150 return rc;
151 }
[1433ecda]152 } else {
[0662451]153 if (!tok_pending_chars(tok)) {
154 tok_start_token(tok, TOKTYPE_TEXT);
155 }
[7c3fb9b]156 /*
157 * If we are handling any other character, just append it to
[6939edb]158 * the current token.
159 */
[5992e0e]160 rc = tok_push_char(tok, tok_get_char(tok));
[6939edb]161 if (rc != EOK) {
162 return rc;
163 }
164 }
165 }
[a35b458]166
[6939edb]167 /* Push the last token */
168 if (tok_pending_chars(tok)) {
169 rc = tok_push_token(tok);
170 if (rc != EOK) {
171 return rc;
172 }
173 }
[a35b458]174
[0662451]175 *tokens_length = tok->outtok_offset;
[a35b458]176
[6939edb]177 return EOK;
178}
179
180/** Finish tokenizing an opened string */
[b7fd2a0]181errno_t tok_finish_string(tokenizer_t *tok)
[6939edb]182{
[b7fd2a0]183 errno_t rc;
[28a5ebd]184 char32_t next_char;
[a35b458]185
[5992e0e]186 while ((next_char = tok_look_char(tok)) != 0) {
187 if (next_char == '\'') {
188 /* Eat the quote */
189 tok_get_char(tok);
[6939edb]190 if (tok_look_char(tok) == '\'') {
191 /* Encode a single literal quote */
192 rc = tok_push_char(tok, '\'');
193 if (rc != EOK) {
194 return rc;
195 }
[a35b458]196
[6939edb]197 /* Swallow the additional one in the input */
198 tok_get_char(tok);
[1433ecda]199 } else {
[6939edb]200 /* The string end */
201 return tok_push_token(tok);
202 }
[1433ecda]203 } else {
[5992e0e]204 rc = tok_push_char(tok, tok_get_char(tok));
[6939edb]205 if (rc != EOK) {
206 return rc;
207 }
208 }
209 }
[a35b458]210
[6939edb]211 /* If we are here, the string run to the end without being closed */
212 return EINVAL;
213}
214
215/** Get a char from input, advancing the input position */
[28a5ebd]216char32_t tok_get_char(tokenizer_t *tok)
[6939edb]217{
[7dcb7981]218 tok->in_char_offset++;
[6939edb]219 return str_decode(tok->in, &tok->in_offset, STR_NO_LIMIT);
220}
221
222/** Get a char from input, while staying on the same input position */
[28a5ebd]223char32_t tok_look_char(tokenizer_t *tok)
[6939edb]224{
[f41682c]225 size_t old_offset = tok->in_offset;
226 size_t old_char_offset = tok->in_char_offset;
[28a5ebd]227 char32_t ret = tok_get_char(tok);
[6939edb]228 tok->in_offset = old_offset;
[7dcb7981]229 tok->in_char_offset = old_char_offset;
[6939edb]230 return ret;
231}
232
233/** Append a char to the end of the current token */
[28a5ebd]234errno_t tok_push_char(tokenizer_t *tok, char32_t ch)
[6939edb]235{
236 return chr_encode(ch, tok->outbuf, &tok->outbuf_offset, tok->outbuf_size);
237}
238
[0662451]239void tok_start_token(tokenizer_t *tok, token_type_t type)
240{
241 tok->current_type = type;
242}
243
[6939edb]244/** Push the current token to the output array */
[b7fd2a0]245errno_t tok_push_token(tokenizer_t *tok)
[6939edb]246{
247 if (tok->outtok_offset >= tok->outtok_size) {
248 return EOVERFLOW;
249 }
[a35b458]250
[6939edb]251 if (tok->outbuf_offset >= tok->outbuf_size) {
252 return EOVERFLOW;
253 }
[a35b458]254
[6939edb]255 tok->outbuf[tok->outbuf_offset++] = 0;
[0662451]256 token_t *tokinfo = &tok->outtok[tok->outtok_offset++];
257 tokinfo->type = tok->current_type;
258 tokinfo->text = tok->outbuf + tok->outbuf_last_start;
259 tokinfo->byte_start = tok->last_in_offset;
[5992e0e]260 tokinfo->byte_length = tok->in_offset - tok->last_in_offset;
[0662451]261 tokinfo->char_start = tok->last_in_char_offset;
[5992e0e]262 tokinfo->char_length = tok->in_char_offset - tok->last_in_char_offset;
[6939edb]263 tok->outbuf_last_start = tok->outbuf_offset;
[a35b458]264
[7dcb7981]265 /* We have consumed the first char of the next token already */
[5992e0e]266 tok->last_in_offset = tok->in_offset;
267 tok->last_in_char_offset = tok->in_char_offset;
[a35b458]268
[6939edb]269 return EOK;
270}
271
272/** Return true, if the current token is not empty */
273bool tok_pending_chars(tokenizer_t *tok)
274{
275 assert(tok->outbuf_offset >= tok->outbuf_last_start);
276 return (tok->outbuf_offset != tok->outbuf_last_start);
[7dcb7981]277}
Note: See TracBrowser for help on using the repository browser.