Context Navigation

source: mainline/uspace/app/bdsh/tok.c@ b9e3af6b

Visit:

Last change on this file since b9e3af6b was 87eba56, checked in by Manuele Conti <manuele.conti@…>, 4 years ago
Fix tokenize command
Property mode set to `100644`
File size: 8.0 KB

Rev	Line
[36ab7c7]	1	/*
	2	* Copyright (c) 2011 Martin Sucha
[6939edb]	3	* All rights reserved.
	4	*
	5	* Redistribution and use in source and binary forms, with or without
[36ab7c7]	6	* modification, are permitted provided that the following conditions
	7	* are met:
[6939edb]	8	*
[36ab7c7]	9	* - Redistributions of source code must retain the above copyright
	10	* notice, this list of conditions and the following disclaimer.
	11	* - Redistributions in binary form must reproduce the above copyright
	12	* notice, this list of conditions and the following disclaimer in the
	13	* documentation and/or other materials provided with the distribution.
	14	* - The name of the author may not be used to endorse or promote products
	15	* derived from this software without specific prior written permission.
[6939edb]	16	*
[36ab7c7]	17	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
	18	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
	19	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
	20	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
	21	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
	22	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	23	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	24	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	25	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	26	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
[6939edb]	27	*/
	28
	29	#include <str.h>
	30	#include <assert.h>
	31	#include <stdlib.h>
[582a0b8]	32	#include <stddef.h>
[6939edb]	33	#include <errno.h>
	34
	35	#include "tok.h"
	36
	37	/* Forward declarations of static functions */
[28a5ebd]	38	static char32_t tok_get_char(tokenizer_t *);
	39	static char32_t tok_look_char(tokenizer_t *);
	40	static errno_t tok_push_char(tokenizer_t *, char32_t);
[b7fd2a0]	41	static errno_t tok_push_token(tokenizer_t *);
[6939edb]	42	static bool tok_pending_chars(tokenizer_t *);
[b7fd2a0]	43	static errno_t tok_finish_string(tokenizer_t *);
[0662451]	44	static void tok_start_token(tokenizer_t *, token_type_t);
[6939edb]	45
	46	/** Initialize the token parser
[1b20da0]	47	*
[6939edb]	48	* @param tok the tokenizer structure to initialize
	49	* @param input the input string to tokenize
	50	* @param out_tokens array of strings where to store the result
	51	* @param max_tokens number of elements of the out_tokens array
	52	*/
[b7fd2a0]	53	errno_t tok_init(tokenizer_t tok, char input, token_t *out_tokens,
[6939edb]	54	size_t max_tokens)
[1b20da0]	55	{
[6939edb]	56	tok->in = input;
	57	tok->in_offset = 0;
[7dcb7981]	58	tok->last_in_offset = 0;
	59	tok->in_char_offset = 0;
	60	tok->last_in_char_offset = 0;
[a35b458]	61
[6939edb]	62	tok->outtok = out_tokens;
	63	tok->outtok_offset = 0;
[0662451]	64	tok->outtok_size = max_tokens;
[a35b458]	65
[6939edb]	66	/* Prepare a buffer where all the token strings will be stored */
	67	size_t len = str_size(input) + max_tokens + 1;
	68	char *tmp = malloc(len);
[a35b458]	69
[6939edb]	70	if (tmp == NULL) {
	71	return ENOMEM;
	72	}
[a35b458]	73
[6939edb]	74	tok->outbuf = tmp;
	75	tok->outbuf_offset = 0;
	76	tok->outbuf_size = len;
	77	tok->outbuf_last_start = 0;
[a35b458]	78
[6939edb]	79	return EOK;
	80	}
	81
	82	/** Finalize the token parser */
	83	void tok_fini(tokenizer_t *tok)
	84	{
	85	if (tok->outbuf != NULL) {
	86	free(tok->outbuf);
	87	}
	88	}
	89
	90	/** Tokenize the input string into the tokens */
[b7fd2a0]	91	errno_t tok_tokenize(tokenizer_t tok, size_t tokens_length)
[6939edb]	92	{
[b7fd2a0]	93	errno_t rc;
[28a5ebd]	94	char32_t next_char;
[a35b458]	95
[6939edb]	96	/* Read the input line char by char and append tokens */
[5992e0e]	97	while ((next_char = tok_look_char(tok)) != 0) {
	98	if (next_char == ' ') {
[7c3fb9b]	99	/*
	100	* Push the token if there is any.
[6939edb]	101	* There may not be any pending char for a token in case
	102	* there are several spaces in the input.
	103	*/
	104	if (tok_pending_chars(tok)) {
	105	rc = tok_push_token(tok);
	106	if (rc != EOK) {
	107	return rc;
	108	}
	109	}
[0662451]	110	tok_start_token(tok, TOKTYPE_SPACE);
[5992e0e]	111	/* Eat all the spaces */
[0662451]	112	while (tok_look_char(tok) == ' ') {
	113	tok_push_char(tok, tok_get_char(tok));
	114	}
	115	tok_push_token(tok);
[a35b458]	116
[1433ecda]	117	} else if (next_char == '\|') {
[7c3fb9b]	118	/*
	119	* Pipes are tokens that are delimiters and should be
[5992e0e]	120	* output as a separate token
[6939edb]	121	*/
	122	if (tok_pending_chars(tok)) {
	123	rc = tok_push_token(tok);
	124	if (rc != EOK) {
	125	return rc;
	126	}
	127	}
[a35b458]	128
[0662451]	129	tok_start_token(tok, TOKTYPE_PIPE);
[a35b458]	130
[5992e0e]	131	rc = tok_push_char(tok, tok_get_char(tok));
[6939edb]	132	if (rc != EOK) {
	133	return rc;
	134	}
[a35b458]	135
[6939edb]	136	rc = tok_push_token(tok);
	137	if (rc != EOK) {
	138	return rc;
	139	}
[21b0013]	140	} else if (next_char == '<') {
	141	if (tok_pending_chars(tok)) {
	142	rc = tok_push_token(tok);
	143	if (rc != EOK) {
	144	return rc;
	145	}
	146	}
	147
	148	tok_start_token(tok, TOKTYPE_RDIN);
	149
	150	rc = tok_push_char(tok, tok_get_char(tok));
	151	if (rc != EOK) {
	152	return rc;
	153	}
[87eba56]	154
[21b0013]	155	rc = tok_push_token(tok);
	156	if (rc != EOK) {
	157	return rc;
	158	}
	159	} else if (next_char == '>') {
	160	if (tok_pending_chars(tok)) {
	161	rc = tok_push_token(tok);
	162	if (rc != EOK) {
	163	return rc;
	164	}
	165	}
	166
	167	tok_start_token(tok, TOKTYPE_RDOU);
	168
	169	rc = tok_push_char(tok, tok_get_char(tok));
	170	if (rc != EOK) {
	171	return rc;
	172	}
[87eba56]	173
[21b0013]	174	rc = tok_push_token(tok);
	175	if (rc != EOK) {
	176	return rc;
	177	}
	178	} else if (next_char == '\'') {
[7c3fb9b]	179	/*
	180	* A string starts with a quote (') and ends again with a quote.
[6939edb]	181	* A literal quote is written as ''
	182	*/
[0662451]	183	tok_start_token(tok, TOKTYPE_TEXT);
[5992e0e]	184	/* Eat the quote */
	185	tok_get_char(tok);
[6939edb]	186	rc = tok_finish_string(tok);
	187	if (rc != EOK) {
	188	return rc;
	189	}
[1433ecda]	190	} else {
[0662451]	191	if (!tok_pending_chars(tok)) {
	192	tok_start_token(tok, TOKTYPE_TEXT);
	193	}
[7c3fb9b]	194	/*
	195	* If we are handling any other character, just append it to
[6939edb]	196	* the current token.
	197	*/
[5992e0e]	198	rc = tok_push_char(tok, tok_get_char(tok));
[6939edb]	199	if (rc != EOK) {
	200	return rc;
	201	}
	202	}
	203	}
[a35b458]	204
[6939edb]	205	/* Push the last token */
	206	if (tok_pending_chars(tok)) {
	207	rc = tok_push_token(tok);
	208	if (rc != EOK) {
	209	return rc;
	210	}
	211	}
[a35b458]	212
[0662451]	213	*tokens_length = tok->outtok_offset;
[a35b458]	214
[6939edb]	215	return EOK;
	216	}
	217
	218	/** Finish tokenizing an opened string */
[b7fd2a0]	219	errno_t tok_finish_string(tokenizer_t *tok)
[6939edb]	220	{
[b7fd2a0]	221	errno_t rc;
[28a5ebd]	222	char32_t next_char;
[a35b458]	223
[5992e0e]	224	while ((next_char = tok_look_char(tok)) != 0) {
	225	if (next_char == '\'') {
	226	/* Eat the quote */
	227	tok_get_char(tok);
[6939edb]	228	if (tok_look_char(tok) == '\'') {
	229	/* Encode a single literal quote */
	230	rc = tok_push_char(tok, '\'');
	231	if (rc != EOK) {
	232	return rc;
	233	}
[a35b458]	234
[6939edb]	235	/* Swallow the additional one in the input */
	236	tok_get_char(tok);
[1433ecda]	237	} else {
[6939edb]	238	/* The string end */
	239	return tok_push_token(tok);
	240	}
[1433ecda]	241	} else {
[5992e0e]	242	rc = tok_push_char(tok, tok_get_char(tok));
[6939edb]	243	if (rc != EOK) {
	244	return rc;
	245	}
	246	}
	247	}
[a35b458]	248
[6939edb]	249	/* If we are here, the string run to the end without being closed */
	250	return EINVAL;
	251	}
	252
	253	/** Get a char from input, advancing the input position */
[28a5ebd]	254	char32_t tok_get_char(tokenizer_t *tok)
[6939edb]	255	{
[7dcb7981]	256	tok->in_char_offset++;
[6939edb]	257	return str_decode(tok->in, &tok->in_offset, STR_NO_LIMIT);
	258	}
	259
	260	/** Get a char from input, while staying on the same input position */
[28a5ebd]	261	char32_t tok_look_char(tokenizer_t *tok)
[6939edb]	262	{
[f41682c]	263	size_t old_offset = tok->in_offset;
	264	size_t old_char_offset = tok->in_char_offset;
[28a5ebd]	265	char32_t ret = tok_get_char(tok);
[6939edb]	266	tok->in_offset = old_offset;
[7dcb7981]	267	tok->in_char_offset = old_char_offset;
[6939edb]	268	return ret;
	269	}
	270
	271	/** Append a char to the end of the current token */
[28a5ebd]	272	errno_t tok_push_char(tokenizer_t *tok, char32_t ch)
[6939edb]	273	{
	274	return chr_encode(ch, tok->outbuf, &tok->outbuf_offset, tok->outbuf_size);
	275	}
	276
[0662451]	277	void tok_start_token(tokenizer_t *tok, token_type_t type)
	278	{
	279	tok->current_type = type;
	280	}
	281
[6939edb]	282	/** Push the current token to the output array */
[b7fd2a0]	283	errno_t tok_push_token(tokenizer_t *tok)
[6939edb]	284	{
	285	if (tok->outtok_offset >= tok->outtok_size) {
	286	return EOVERFLOW;
	287	}
[a35b458]	288
[6939edb]	289	if (tok->outbuf_offset >= tok->outbuf_size) {
	290	return EOVERFLOW;
	291	}
[a35b458]	292
[6939edb]	293	tok->outbuf[tok->outbuf_offset++] = 0;
[0662451]	294	token_t *tokinfo = &tok->outtok[tok->outtok_offset++];
	295	tokinfo->type = tok->current_type;
	296	tokinfo->text = tok->outbuf + tok->outbuf_last_start;
	297	tokinfo->byte_start = tok->last_in_offset;
[5992e0e]	298	tokinfo->byte_length = tok->in_offset - tok->last_in_offset;
[0662451]	299	tokinfo->char_start = tok->last_in_char_offset;
[5992e0e]	300	tokinfo->char_length = tok->in_char_offset - tok->last_in_char_offset;
[6939edb]	301	tok->outbuf_last_start = tok->outbuf_offset;
[a35b458]	302
[7dcb7981]	303	/* We have consumed the first char of the next token already */
[5992e0e]	304	tok->last_in_offset = tok->in_offset;
	305	tok->last_in_char_offset = tok->in_char_offset;
[a35b458]	306
[6939edb]	307	return EOK;
	308	}
	309
	310	/** Return true, if the current token is not empty */
	311	bool tok_pending_chars(tokenizer_t *tok)
	312	{
	313	assert(tok->outbuf_offset >= tok->outbuf_last_start);
	314	return (tok->outbuf_offset != tok->outbuf_last_start);
[7dcb7981]	315	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: