Context Navigation

tok.c

Visit:

Last change on this file was 28a5ebd, checked in by Martin Decky <martin@…>, 5 years ago

Use char32_t instead of wchar_t to represent UTF-32 strings

The intention of the native HelenOS string API has always been to
support Unicode in the UTF-8 and UTF-32 encodings as the sole character
representations and ignore the obsolete mess of older single-byte and
multibyte character encodings. Before C11, the wchar_t type has been
slightly misused for the purpose of the UTF-32 strings. The newer
char32_t type is obviously a much more suitable option. The standard
defines char32_t as uint_least32_t, thus we can take the liberty to fix
it to uint32_t.

To maintain compatibility with the C Standard, the putwchar(wchar_t)
functions have been replaced by our custom putuchar(char32_t) functions
where appropriate.

Property mode set to 100644

File size: 7.3 KB

Line
1	/*
2	* Copyright (c) 2011 Martin Sucha
3	* All rights reserved.
4	*
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions
7	* are met:
8	*
9	* - Redistributions of source code must retain the above copyright
10	* notice, this list of conditions and the following disclaimer.
11	* - Redistributions in binary form must reproduce the above copyright
12	* notice, this list of conditions and the following disclaimer in the
13	* documentation and/or other materials provided with the distribution.
14	* - The name of the author may not be used to endorse or promote products
15	* derived from this software without specific prior written permission.
16	*
17	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27	*/
28
29	#include <str.h>
30	#include <assert.h>
31	#include <stdlib.h>
32	#include <stddef.h>
33	#include <errno.h>
34
35	#include "tok.h"
36
37	/* Forward declarations of static functions */
38	static char32_t tok_get_char(tokenizer_t *);
39	static char32_t tok_look_char(tokenizer_t *);
40	static errno_t tok_push_char(tokenizer_t *, char32_t);
41	static errno_t tok_push_token(tokenizer_t *);
42	static bool tok_pending_chars(tokenizer_t *);
43	static errno_t tok_finish_string(tokenizer_t *);
44	static void tok_start_token(tokenizer_t *, token_type_t);
45
46	/** Initialize the token parser
47	*
48	* @param tok the tokenizer structure to initialize
49	* @param input the input string to tokenize
50	* @param out_tokens array of strings where to store the result
51	* @param max_tokens number of elements of the out_tokens array
52	*/
53	errno_t tok_init(tokenizer_t tok, char input, token_t *out_tokens,
54	size_t max_tokens)
55	{
56	tok->in = input;
57	tok->in_offset = 0;
58	tok->last_in_offset = 0;
59	tok->in_char_offset = 0;
60	tok->last_in_char_offset = 0;
61
62	tok->outtok = out_tokens;
63	tok->outtok_offset = 0;
64	tok->outtok_size = max_tokens;
65
66	/* Prepare a buffer where all the token strings will be stored */
67	size_t len = str_size(input) + max_tokens + 1;
68	char *tmp = malloc(len);
69
70	if (tmp == NULL) {
71	return ENOMEM;
72	}
73
74	tok->outbuf = tmp;
75	tok->outbuf_offset = 0;
76	tok->outbuf_size = len;
77	tok->outbuf_last_start = 0;
78
79	return EOK;
80	}
81
82	/** Finalize the token parser */
83	void tok_fini(tokenizer_t *tok)
84	{
85	if (tok->outbuf != NULL) {
86	free(tok->outbuf);
87	}
88	}
89
90	/** Tokenize the input string into the tokens */
91	errno_t tok_tokenize(tokenizer_t tok, size_t tokens_length)
92	{
93	errno_t rc;
94	char32_t next_char;
95
96	/* Read the input line char by char and append tokens */
97	while ((next_char = tok_look_char(tok)) != 0) {
98	if (next_char == ' ') {
99	/*
100	* Push the token if there is any.
101	* There may not be any pending char for a token in case
102	* there are several spaces in the input.
103	*/
104	if (tok_pending_chars(tok)) {
105	rc = tok_push_token(tok);
106	if (rc != EOK) {
107	return rc;
108	}
109	}
110	tok_start_token(tok, TOKTYPE_SPACE);
111	/* Eat all the spaces */
112	while (tok_look_char(tok) == ' ') {
113	tok_push_char(tok, tok_get_char(tok));
114	}
115	tok_push_token(tok);
116
117	} else if (next_char == '\|') {
118	/*
119	* Pipes are tokens that are delimiters and should be
120	* output as a separate token
121	*/
122	if (tok_pending_chars(tok)) {
123	rc = tok_push_token(tok);
124	if (rc != EOK) {
125	return rc;
126	}
127	}
128
129	tok_start_token(tok, TOKTYPE_PIPE);
130
131	rc = tok_push_char(tok, tok_get_char(tok));
132	if (rc != EOK) {
133	return rc;
134	}
135
136	rc = tok_push_token(tok);
137	if (rc != EOK) {
138	return rc;
139	}
140	} else if (next_char == '\'') {
141	/*
142	* A string starts with a quote (') and ends again with a quote.
143	* A literal quote is written as ''
144	*/
145	tok_start_token(tok, TOKTYPE_TEXT);
146	/* Eat the quote */
147	tok_get_char(tok);
148	rc = tok_finish_string(tok);
149	if (rc != EOK) {
150	return rc;
151	}
152	} else {
153	if (!tok_pending_chars(tok)) {
154	tok_start_token(tok, TOKTYPE_TEXT);
155	}
156	/*
157	* If we are handling any other character, just append it to
158	* the current token.
159	*/
160	rc = tok_push_char(tok, tok_get_char(tok));
161	if (rc != EOK) {
162	return rc;
163	}
164	}
165	}
166
167	/* Push the last token */
168	if (tok_pending_chars(tok)) {
169	rc = tok_push_token(tok);
170	if (rc != EOK) {
171	return rc;
172	}
173	}
174
175	*tokens_length = tok->outtok_offset;
176
177	return EOK;
178	}
179
180	/** Finish tokenizing an opened string */
181	errno_t tok_finish_string(tokenizer_t *tok)
182	{
183	errno_t rc;
184	char32_t next_char;
185
186	while ((next_char = tok_look_char(tok)) != 0) {
187	if (next_char == '\'') {
188	/* Eat the quote */
189	tok_get_char(tok);
190	if (tok_look_char(tok) == '\'') {
191	/* Encode a single literal quote */
192	rc = tok_push_char(tok, '\'');
193	if (rc != EOK) {
194	return rc;
195	}
196
197	/* Swallow the additional one in the input */
198	tok_get_char(tok);
199	} else {
200	/* The string end */
201	return tok_push_token(tok);
202	}
203	} else {
204	rc = tok_push_char(tok, tok_get_char(tok));
205	if (rc != EOK) {
206	return rc;
207	}
208	}
209	}
210
211	/* If we are here, the string run to the end without being closed */
212	return EINVAL;
213	}
214
215	/** Get a char from input, advancing the input position */
216	char32_t tok_get_char(tokenizer_t *tok)
217	{
218	tok->in_char_offset++;
219	return str_decode(tok->in, &tok->in_offset, STR_NO_LIMIT);
220	}
221
222	/** Get a char from input, while staying on the same input position */
223	char32_t tok_look_char(tokenizer_t *tok)
224	{
225	size_t old_offset = tok->in_offset;
226	size_t old_char_offset = tok->in_char_offset;
227	char32_t ret = tok_get_char(tok);
228	tok->in_offset = old_offset;
229	tok->in_char_offset = old_char_offset;
230	return ret;
231	}
232
233	/** Append a char to the end of the current token */
234	errno_t tok_push_char(tokenizer_t *tok, char32_t ch)
235	{
236	return chr_encode(ch, tok->outbuf, &tok->outbuf_offset, tok->outbuf_size);
237	}
238
239	void tok_start_token(tokenizer_t *tok, token_type_t type)
240	{
241	tok->current_type = type;
242	}
243
244	/** Push the current token to the output array */
245	errno_t tok_push_token(tokenizer_t *tok)
246	{
247	if (tok->outtok_offset >= tok->outtok_size) {
248	return EOVERFLOW;
249	}
250
251	if (tok->outbuf_offset >= tok->outbuf_size) {
252	return EOVERFLOW;
253	}
254
255	tok->outbuf[tok->outbuf_offset++] = 0;
256	token_t *tokinfo = &tok->outtok[tok->outtok_offset++];
257	tokinfo->type = tok->current_type;
258	tokinfo->text = tok->outbuf + tok->outbuf_last_start;
259	tokinfo->byte_start = tok->last_in_offset;
260	tokinfo->byte_length = tok->in_offset - tok->last_in_offset;
261	tokinfo->char_start = tok->last_in_char_offset;
262	tokinfo->char_length = tok->in_char_offset - tok->last_in_char_offset;
263	tok->outbuf_last_start = tok->outbuf_offset;
264
265	/* We have consumed the first char of the next token already */
266	tok->last_in_offset = tok->in_offset;
267	tok->last_in_char_offset = tok->in_char_offset;
268
269	return EOK;
270	}
271
272	/** Return true, if the current token is not empty */
273	bool tok_pending_chars(tokenizer_t *tok)
274	{
275	assert(tok->outbuf_offset >= tok->outbuf_last_start);
276	return (tok->outbuf_offset != tok->outbuf_last_start);
277	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: mainline/uspace/app/bdsh/tok.c

Download in other formats: