Context Navigation

tok.c

Visit:

Last change on this file was 28a5ebd, checked in by Martin Decky <martin@…>, 5 years ago

Use char32_t instead of wchar_t to represent UTF-32 strings

The intention of the native HelenOS string API has been always to
support Unicode in the UTF-8 and UTF-32 encodings as the sole character
representations and ignore the obsolete mess of older single-byte and
multibyte character encodings. Before C11, the wchar_t type has been
slightly misused for the purpose of the UTF-32 strings. The newer
char32_t type is obviously a much more suitable option. The standard
defines char32_t as uint_least32_t, thus we can take the liberty to fix
it to uint32_t.

To maintain compatibility with the C Standard, the putwchar(wchar_t)
functions have been replaced by our custom putuchar(char32_t) functions
where appropriate.

Property mode set to 100644

File size: 7.3 KB

Rev	Line
[36ab7c7]	1	/*
	2	* Copyright (c) 2011 Martin Sucha
[6939edb]	3	* All rights reserved.
	4	*
	5	* Redistribution and use in source and binary forms, with or without
[36ab7c7]	6	* modification, are permitted provided that the following conditions
	7	* are met:
[6939edb]	8	*
[36ab7c7]	9	* - Redistributions of source code must retain the above copyright
	10	* notice, this list of conditions and the following disclaimer.
	11	* - Redistributions in binary form must reproduce the above copyright
	12	* notice, this list of conditions and the following disclaimer in the
	13	* documentation and/or other materials provided with the distribution.
	14	* - The name of the author may not be used to endorse or promote products
	15	* derived from this software without specific prior written permission.
[6939edb]	16	*
[36ab7c7]	17	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
	18	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
	19	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
	20	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
	21	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
	22	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	23	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	24	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	25	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	26	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
[6939edb]	27	*/
	28
	29	#include <str.h>
	30	#include <assert.h>
	31	#include <stdlib.h>
[582a0b8]	32	#include <stddef.h>
[6939edb]	33	#include <errno.h>
	34
	35	#include "tok.h"
	36
	37	/* Forward declarations of static functions */
[28a5ebd]	38	static char32_t tok_get_char(tokenizer_t *);
	39	static char32_t tok_look_char(tokenizer_t *);
	40	static errno_t tok_push_char(tokenizer_t *, char32_t);
[b7fd2a0]	41	static errno_t tok_push_token(tokenizer_t *);
[6939edb]	42	static bool tok_pending_chars(tokenizer_t *);
[b7fd2a0]	43	static errno_t tok_finish_string(tokenizer_t *);
[0662451]	44	static void tok_start_token(tokenizer_t *, token_type_t);
[6939edb]	45
	46	/** Initialize the token parser
[1b20da0]	47	*
[6939edb]	48	* @param tok the tokenizer structure to initialize
	49	* @param input the input string to tokenize
	50	* @param out_tokens array of strings where to store the result
	51	* @param max_tokens number of elements of the out_tokens array
	52	*/
[b7fd2a0]	53	errno_t tok_init(tokenizer_t tok, char input, token_t *out_tokens,
[6939edb]	54	size_t max_tokens)
[1b20da0]	55	{
[6939edb]	56	tok->in = input;
	57	tok->in_offset = 0;
[7dcb7981]	58	tok->last_in_offset = 0;
	59	tok->in_char_offset = 0;
	60	tok->last_in_char_offset = 0;
[a35b458]	61
[6939edb]	62	tok->outtok = out_tokens;
	63	tok->outtok_offset = 0;
[0662451]	64	tok->outtok_size = max_tokens;
[a35b458]	65
[6939edb]	66	/* Prepare a buffer where all the token strings will be stored */
	67	size_t len = str_size(input) + max_tokens + 1;
	68	char *tmp = malloc(len);
[a35b458]	69
[6939edb]	70	if (tmp == NULL) {
	71	return ENOMEM;
	72	}
[a35b458]	73
[6939edb]	74	tok->outbuf = tmp;
	75	tok->outbuf_offset = 0;
	76	tok->outbuf_size = len;
	77	tok->outbuf_last_start = 0;
[a35b458]	78
[6939edb]	79	return EOK;
	80	}
	81
	82	/** Finalize the token parser */
	83	void tok_fini(tokenizer_t *tok)
	84	{
	85	if (tok->outbuf != NULL) {
	86	free(tok->outbuf);
	87	}
	88	}
	89
	90	/** Tokenize the input string into the tokens */
[b7fd2a0]	91	errno_t tok_tokenize(tokenizer_t tok, size_t tokens_length)
[6939edb]	92	{
[b7fd2a0]	93	errno_t rc;
[28a5ebd]	94	char32_t next_char;
[a35b458]	95
[6939edb]	96	/* Read the input line char by char and append tokens */
[5992e0e]	97	while ((next_char = tok_look_char(tok)) != 0) {
	98	if (next_char == ' ') {
[7c3fb9b]	99	/*
	100	* Push the token if there is any.
[6939edb]	101	* There may not be any pending char for a token in case
	102	* there are several spaces in the input.
	103	*/
	104	if (tok_pending_chars(tok)) {
	105	rc = tok_push_token(tok);
	106	if (rc != EOK) {
	107	return rc;
	108	}
	109	}
[0662451]	110	tok_start_token(tok, TOKTYPE_SPACE);
[5992e0e]	111	/* Eat all the spaces */
[0662451]	112	while (tok_look_char(tok) == ' ') {
	113	tok_push_char(tok, tok_get_char(tok));
	114	}
	115	tok_push_token(tok);
[a35b458]	116
[1433ecda]	117	} else if (next_char == '\|') {
[7c3fb9b]	118	/*
	119	* Pipes are tokens that are delimiters and should be
[5992e0e]	120	* output as a separate token
[6939edb]	121	*/
	122	if (tok_pending_chars(tok)) {
	123	rc = tok_push_token(tok);
	124	if (rc != EOK) {
	125	return rc;
	126	}
	127	}
[a35b458]	128
[0662451]	129	tok_start_token(tok, TOKTYPE_PIPE);
[a35b458]	130
[5992e0e]	131	rc = tok_push_char(tok, tok_get_char(tok));
[6939edb]	132	if (rc != EOK) {
	133	return rc;
	134	}
[a35b458]	135
[6939edb]	136	rc = tok_push_token(tok);
	137	if (rc != EOK) {
	138	return rc;
	139	}
[1433ecda]	140	} else if (next_char == '\'') {
[7c3fb9b]	141	/*
	142	* A string starts with a quote (') and ends again with a quote.
[6939edb]	143	* A literal quote is written as ''
	144	*/
[0662451]	145	tok_start_token(tok, TOKTYPE_TEXT);
[5992e0e]	146	/* Eat the quote */
	147	tok_get_char(tok);
[6939edb]	148	rc = tok_finish_string(tok);
	149	if (rc != EOK) {
	150	return rc;
	151	}
[1433ecda]	152	} else {
[0662451]	153	if (!tok_pending_chars(tok)) {
	154	tok_start_token(tok, TOKTYPE_TEXT);
	155	}
[7c3fb9b]	156	/*
	157	* If we are handling any other character, just append it to
[6939edb]	158	* the current token.
	159	*/
[5992e0e]	160	rc = tok_push_char(tok, tok_get_char(tok));
[6939edb]	161	if (rc != EOK) {
	162	return rc;
	163	}
	164	}
	165	}
[a35b458]	166
[6939edb]	167	/* Push the last token */
	168	if (tok_pending_chars(tok)) {
	169	rc = tok_push_token(tok);
	170	if (rc != EOK) {
	171	return rc;
	172	}
	173	}
[a35b458]	174
[0662451]	175	*tokens_length = tok->outtok_offset;
[a35b458]	176
[6939edb]	177	return EOK;
	178	}
	179
	180	/** Finish tokenizing an opened string */
[b7fd2a0]	181	errno_t tok_finish_string(tokenizer_t *tok)
[6939edb]	182	{
[b7fd2a0]	183	errno_t rc;
[28a5ebd]	184	char32_t next_char;
[a35b458]	185
[5992e0e]	186	while ((next_char = tok_look_char(tok)) != 0) {
	187	if (next_char == '\'') {
	188	/* Eat the quote */
	189	tok_get_char(tok);
[6939edb]	190	if (tok_look_char(tok) == '\'') {
	191	/* Encode a single literal quote */
	192	rc = tok_push_char(tok, '\'');
	193	if (rc != EOK) {
	194	return rc;
	195	}
[a35b458]	196
[6939edb]	197	/* Swallow the additional one in the input */
	198	tok_get_char(tok);
[1433ecda]	199	} else {
[6939edb]	200	/* The string end */
	201	return tok_push_token(tok);
	202	}
[1433ecda]	203	} else {
[5992e0e]	204	rc = tok_push_char(tok, tok_get_char(tok));
[6939edb]	205	if (rc != EOK) {
	206	return rc;
	207	}
	208	}
	209	}
[a35b458]	210
[6939edb]	211	/* If we are here, the string run to the end without being closed */
	212	return EINVAL;
	213	}
	214
	215	/** Get a char from input, advancing the input position */
[28a5ebd]	216	char32_t tok_get_char(tokenizer_t *tok)
[6939edb]	217	{
[7dcb7981]	218	tok->in_char_offset++;
[6939edb]	219	return str_decode(tok->in, &tok->in_offset, STR_NO_LIMIT);
	220	}
	221
	222	/** Get a char from input, while staying on the same input position */
[28a5ebd]	223	char32_t tok_look_char(tokenizer_t *tok)
[6939edb]	224	{
[f41682c]	225	size_t old_offset = tok->in_offset;
	226	size_t old_char_offset = tok->in_char_offset;
[28a5ebd]	227	char32_t ret = tok_get_char(tok);
[6939edb]	228	tok->in_offset = old_offset;
[7dcb7981]	229	tok->in_char_offset = old_char_offset;
[6939edb]	230	return ret;
	231	}
	232
	233	/** Append a char to the end of the current token */
[28a5ebd]	234	errno_t tok_push_char(tokenizer_t *tok, char32_t ch)
[6939edb]	235	{
	236	return chr_encode(ch, tok->outbuf, &tok->outbuf_offset, tok->outbuf_size);
	237	}
	238
[0662451]	239	void tok_start_token(tokenizer_t *tok, token_type_t type)
	240	{
	241	tok->current_type = type;
	242	}
	243
[6939edb]	244	/** Push the current token to the output array */
[b7fd2a0]	245	errno_t tok_push_token(tokenizer_t *tok)
[6939edb]	246	{
	247	if (tok->outtok_offset >= tok->outtok_size) {
	248	return EOVERFLOW;
	249	}
[a35b458]	250
[6939edb]	251	if (tok->outbuf_offset >= tok->outbuf_size) {
	252	return EOVERFLOW;
	253	}
[a35b458]	254
[6939edb]	255	tok->outbuf[tok->outbuf_offset++] = 0;
[0662451]	256	token_t *tokinfo = &tok->outtok[tok->outtok_offset++];
	257	tokinfo->type = tok->current_type;
	258	tokinfo->text = tok->outbuf + tok->outbuf_last_start;
	259	tokinfo->byte_start = tok->last_in_offset;
[5992e0e]	260	tokinfo->byte_length = tok->in_offset - tok->last_in_offset;
[0662451]	261	tokinfo->char_start = tok->last_in_char_offset;
[5992e0e]	262	tokinfo->char_length = tok->in_char_offset - tok->last_in_char_offset;
[6939edb]	263	tok->outbuf_last_start = tok->outbuf_offset;
[a35b458]	264
[7dcb7981]	265	/* We have consumed the first char of the next token already */
[5992e0e]	266	tok->last_in_offset = tok->in_offset;
	267	tok->last_in_char_offset = tok->in_char_offset;
[a35b458]	268
[6939edb]	269	return EOK;
	270	}
	271
	272	/** Return true, if the current token is not empty */
	273	bool tok_pending_chars(tokenizer_t *tok)
	274	{
	275	assert(tok->outbuf_offset >= tok->outbuf_last_start);
	276	return (tok->outbuf_offset != tok->outbuf_last_start);
[7dcb7981]	277	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: mainline/uspace/app/bdsh/tok.c

Download in other formats: