Context Navigation

source: mainline/uspace/app/bdsh/tok.c@ 0a9b918

Visit:

Last change on this file since 0a9b918 was 87eba56, checked in by Manuele Conti <manuele.conti@…>, 4 years ago
Fix tokenize command
Property mode set to `100644`
File size: 8.0 KB

Line
1	/*
2	* Copyright (c) 2011 Martin Sucha
3	* All rights reserved.
4	*
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions
7	* are met:
8	*
9	* - Redistributions of source code must retain the above copyright
10	* notice, this list of conditions and the following disclaimer.
11	* - Redistributions in binary form must reproduce the above copyright
12	* notice, this list of conditions and the following disclaimer in the
13	* documentation and/or other materials provided with the distribution.
14	* - The name of the author may not be used to endorse or promote products
15	* derived from this software without specific prior written permission.
16	*
17	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27	*/
28
29	#include <str.h>
30	#include <assert.h>
31	#include <stdlib.h>
32	#include <stddef.h>
33	#include <errno.h>
34
35	#include "tok.h"
36
37	/* Forward declarations of static functions */
38	static char32_t tok_get_char(tokenizer_t *);
39	static char32_t tok_look_char(tokenizer_t *);
40	static errno_t tok_push_char(tokenizer_t *, char32_t);
41	static errno_t tok_push_token(tokenizer_t *);
42	static bool tok_pending_chars(tokenizer_t *);
43	static errno_t tok_finish_string(tokenizer_t *);
44	static void tok_start_token(tokenizer_t *, token_type_t);
45
46	/** Initialize the token parser
47	*
48	* @param tok the tokenizer structure to initialize
49	* @param input the input string to tokenize
50	* @param out_tokens array of strings where to store the result
51	* @param max_tokens number of elements of the out_tokens array
52	*/
53	errno_t tok_init(tokenizer_t tok, char input, token_t *out_tokens,
54	size_t max_tokens)
55	{
56	tok->in = input;
57	tok->in_offset = 0;
58	tok->last_in_offset = 0;
59	tok->in_char_offset = 0;
60	tok->last_in_char_offset = 0;
61
62	tok->outtok = out_tokens;
63	tok->outtok_offset = 0;
64	tok->outtok_size = max_tokens;
65
66	/* Prepare a buffer where all the token strings will be stored */
67	size_t len = str_size(input) + max_tokens + 1;
68	char *tmp = malloc(len);
69
70	if (tmp == NULL) {
71	return ENOMEM;
72	}
73
74	tok->outbuf = tmp;
75	tok->outbuf_offset = 0;
76	tok->outbuf_size = len;
77	tok->outbuf_last_start = 0;
78
79	return EOK;
80	}
81
82	/** Finalize the token parser */
83	void tok_fini(tokenizer_t *tok)
84	{
85	if (tok->outbuf != NULL) {
86	free(tok->outbuf);
87	}
88	}
89
90	/** Tokenize the input string into the tokens */
91	errno_t tok_tokenize(tokenizer_t tok, size_t tokens_length)
92	{
93	errno_t rc;
94	char32_t next_char;
95
96	/* Read the input line char by char and append tokens */
97	while ((next_char = tok_look_char(tok)) != 0) {
98	if (next_char == ' ') {
99	/*
100	* Push the token if there is any.
101	* There may not be any pending char for a token in case
102	* there are several spaces in the input.
103	*/
104	if (tok_pending_chars(tok)) {
105	rc = tok_push_token(tok);
106	if (rc != EOK) {
107	return rc;
108	}
109	}
110	tok_start_token(tok, TOKTYPE_SPACE);
111	/* Eat all the spaces */
112	while (tok_look_char(tok) == ' ') {
113	tok_push_char(tok, tok_get_char(tok));
114	}
115	tok_push_token(tok);
116
117	} else if (next_char == '\|') {
118	/*
119	* Pipes are tokens that are delimiters and should be
120	* output as a separate token
121	*/
122	if (tok_pending_chars(tok)) {
123	rc = tok_push_token(tok);
124	if (rc != EOK) {
125	return rc;
126	}
127	}
128
129	tok_start_token(tok, TOKTYPE_PIPE);
130
131	rc = tok_push_char(tok, tok_get_char(tok));
132	if (rc != EOK) {
133	return rc;
134	}
135
136	rc = tok_push_token(tok);
137	if (rc != EOK) {
138	return rc;
139	}
140	} else if (next_char == '<') {
141	if (tok_pending_chars(tok)) {
142	rc = tok_push_token(tok);
143	if (rc != EOK) {
144	return rc;
145	}
146	}
147
148	tok_start_token(tok, TOKTYPE_RDIN);
149
150	rc = tok_push_char(tok, tok_get_char(tok));
151	if (rc != EOK) {
152	return rc;
153	}
154
155	rc = tok_push_token(tok);
156	if (rc != EOK) {
157	return rc;
158	}
159	} else if (next_char == '>') {
160	if (tok_pending_chars(tok)) {
161	rc = tok_push_token(tok);
162	if (rc != EOK) {
163	return rc;
164	}
165	}
166
167	tok_start_token(tok, TOKTYPE_RDOU);
168
169	rc = tok_push_char(tok, tok_get_char(tok));
170	if (rc != EOK) {
171	return rc;
172	}
173
174	rc = tok_push_token(tok);
175	if (rc != EOK) {
176	return rc;
177	}
178	} else if (next_char == '\'') {
179	/*
180	* A string starts with a quote (') and ends again with a quote.
181	* A literal quote is written as ''
182	*/
183	tok_start_token(tok, TOKTYPE_TEXT);
184	/* Eat the quote */
185	tok_get_char(tok);
186	rc = tok_finish_string(tok);
187	if (rc != EOK) {
188	return rc;
189	}
190	} else {
191	if (!tok_pending_chars(tok)) {
192	tok_start_token(tok, TOKTYPE_TEXT);
193	}
194	/*
195	* If we are handling any other character, just append it to
196	* the current token.
197	*/
198	rc = tok_push_char(tok, tok_get_char(tok));
199	if (rc != EOK) {
200	return rc;
201	}
202	}
203	}
204
205	/* Push the last token */
206	if (tok_pending_chars(tok)) {
207	rc = tok_push_token(tok);
208	if (rc != EOK) {
209	return rc;
210	}
211	}
212
213	*tokens_length = tok->outtok_offset;
214
215	return EOK;
216	}
217
218	/** Finish tokenizing an opened string */
219	errno_t tok_finish_string(tokenizer_t *tok)
220	{
221	errno_t rc;
222	char32_t next_char;
223
224	while ((next_char = tok_look_char(tok)) != 0) {
225	if (next_char == '\'') {
226	/* Eat the quote */
227	tok_get_char(tok);
228	if (tok_look_char(tok) == '\'') {
229	/* Encode a single literal quote */
230	rc = tok_push_char(tok, '\'');
231	if (rc != EOK) {
232	return rc;
233	}
234
235	/* Swallow the additional one in the input */
236	tok_get_char(tok);
237	} else {
238	/* The string end */
239	return tok_push_token(tok);
240	}
241	} else {
242	rc = tok_push_char(tok, tok_get_char(tok));
243	if (rc != EOK) {
244	return rc;
245	}
246	}
247	}
248
249	/* If we are here, the string run to the end without being closed */
250	return EINVAL;
251	}
252
253	/** Get a char from input, advancing the input position */
254	char32_t tok_get_char(tokenizer_t *tok)
255	{
256	tok->in_char_offset++;
257	return str_decode(tok->in, &tok->in_offset, STR_NO_LIMIT);
258	}
259
260	/** Get a char from input, while staying on the same input position */
261	char32_t tok_look_char(tokenizer_t *tok)
262	{
263	size_t old_offset = tok->in_offset;
264	size_t old_char_offset = tok->in_char_offset;
265	char32_t ret = tok_get_char(tok);
266	tok->in_offset = old_offset;
267	tok->in_char_offset = old_char_offset;
268	return ret;
269	}
270
271	/** Append a char to the end of the current token */
272	errno_t tok_push_char(tokenizer_t *tok, char32_t ch)
273	{
274	return chr_encode(ch, tok->outbuf, &tok->outbuf_offset, tok->outbuf_size);
275	}
276
277	void tok_start_token(tokenizer_t *tok, token_type_t type)
278	{
279	tok->current_type = type;
280	}
281
282	/** Push the current token to the output array */
283	errno_t tok_push_token(tokenizer_t *tok)
284	{
285	if (tok->outtok_offset >= tok->outtok_size) {
286	return EOVERFLOW;
287	}
288
289	if (tok->outbuf_offset >= tok->outbuf_size) {
290	return EOVERFLOW;
291	}
292
293	tok->outbuf[tok->outbuf_offset++] = 0;
294	token_t *tokinfo = &tok->outtok[tok->outtok_offset++];
295	tokinfo->type = tok->current_type;
296	tokinfo->text = tok->outbuf + tok->outbuf_last_start;
297	tokinfo->byte_start = tok->last_in_offset;
298	tokinfo->byte_length = tok->in_offset - tok->last_in_offset;
299	tokinfo->char_start = tok->last_in_char_offset;
300	tokinfo->char_length = tok->in_char_offset - tok->last_in_char_offset;
301	tok->outbuf_last_start = tok->outbuf_offset;
302
303	/* We have consumed the first char of the next token already */
304	tok->last_in_offset = tok->in_offset;
305	tok->last_in_char_offset = tok->in_char_offset;
306
307	return EOK;
308	}
309
310	/** Return true, if the current token is not empty */
311	bool tok_pending_chars(tokenizer_t *tok)
312	{
313	assert(tok->outbuf_offset >= tok->outbuf_last_start);
314	return (tok->outbuf_offset != tok->outbuf_last_start);
315	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: