Context Navigation

source: mainline/uspace/app/sbi/src/lex.c@ 0485135

Visit:

lfn serial ticket/834-toolchain-update topic/msim-upgrade topic/simplify-dev-export

Last change on this file since 0485135 was 051bc69a, checked in by Jiri Svoboda <jiri@…>, 15 years ago
Update SBI to rev. 244.
Property mode set to `100644`
File size: 15.5 KB

Line
1	/*
2	* Copyright (c) 2010 Jiri Svoboda
3	* All rights reserved.
4	*
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions
7	* are met:
8	*
9	* - Redistributions of source code must retain the above copyright
10	* notice, this list of conditions and the following disclaimer.
11	* - Redistributions in binary form must reproduce the above copyright
12	* notice, this list of conditions and the following disclaimer in the
13	* documentation and/or other materials provided with the distribution.
14	* - The name of the author may not be used to endorse or promote products
15	* derived from this software without specific prior written permission.
16	*
17	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27	*/
28
29	/** @file Lexer (lexical analyzer).
30	*
31	* Consumes a text file and produces a sequence of lexical elements (lems).
32	*/
33
34	#include <stdio.h>
35	#include <stdlib.h>
36	#include "bigint.h"
37	#include "cspan.h"
38	#include "mytypes.h"
39	#include "input.h"
40	#include "os/os.h"
41	#include "strtab.h"
42
43	#include "lex.h"
44
45	#define TAB_WIDTH 8
46
47	static void lex_touch(lex_t *lex);
48	static bool_t lex_read_try(lex_t *lex);
49
50	static void lex_skip_comment(lex_t *lex);
51	static void lex_skip_ws(lex_t *lex);
52	static bool_t is_wstart(char c);
53	static bool_t is_wcont(char c);
54	static bool_t is_digit(char c);
55	static void lex_word(lex_t *lex);
56	static void lex_char(lex_t *lex);
57	static void lex_number(lex_t *lex);
58	static void lex_string(lex_t *lex);
59	static int digit_value(char c);
60
61	/* Note: This imposes an implementation limit on identifier length. */
62	#define IBUF_SIZE 128
63	static char ident_buf[IBUF_SIZE + 1];
64
65	/* XXX This imposes an implementation limit on string literal length. */
66	#define SLBUF_SIZE 128
67	static char strlit_buf[SLBUF_SIZE + 1];
68
69	/** Lclass-string pair */
70	struct lc_name {
71	lclass_t lclass;
72	const char *name;
73	};
74
75	/** Keyword names. Used both for printing and recognition. */
76	static struct lc_name keywords[] = {
77	{ lc_and, "and" },
78	{ lc_as, "as" },
79	{ lc_bool, "bool" },
80	{ lc_break, "break" },
81	{ lc_builtin, "builtin" },
82	{ lc_char, "char" },
83	{ lc_class, "class" },
84	{ lc_deleg, "deleg" },
85	{ lc_do, "do" },
86	{ lc_elif, "elif" },
87	{ lc_else, "else" },
88	{ lc_end, "end" },
89	{ lc_enum, "enum" },
90	{ lc_except, "except" },
91	{ lc_false, "false" },
92	{ lc_finally, "finally" },
93	{ lc_for, "for" },
94	{ lc_fun, "fun" },
95	{ lc_get, "get" },
96	{ lc_if, "if" },
97	{ lc_in, "in" },
98	{ lc_int, "int" },
99	{ lc_interface, "interface" },
100	{ lc_is, "is" },
101	{ lc_new, "new" },
102	{ lc_not, "not" },
103	{ lc_nil, "nil" },
104	{ lc_or, "or" },
105	{ lc_override, "override" },
106	{ lc_packed, "packed" },
107	{ lc_private, "private" },
108	{ lc_prop, "prop" },
109	{ lc_protected, "protected" },
110	{ lc_public, "public" },
111	{ lc_raise, "raise" },
112	{ lc_resource, "resource" },
113	{ lc_return, "return" },
114	{ lc_self, "self" },
115	{ lc_set, "set" },
116	{ lc_static, "static" },
117	{ lc_string, "string" },
118	{ lc_struct, "struct" },
119	{ lc_then, "then" },
120	{ lc_this, "this" },
121	{ lc_true, "true" },
122	{ lc_var, "var" },
123	{ lc_with, "with" },
124	{ lc_while, "while" },
125	{ lc_yield, "yield" },
126
127	{ 0, NULL }
128	};
129
130	/** Other simple lclasses. Only used for printing. */
131	static struct lc_name simple_lc[] = {
132	{ lc_invalid, "INVALID" },
133	{ lc_eof, "EOF" },
134
135	/* Operators */
136	{ lc_period, "." },
137	{ lc_slash, "/" },
138	{ lc_lparen, "(" },
139	{ lc_rparen, ")" },
140	{ lc_lsbr, "[" },
141	{ lc_rsbr, "]" },
142	{ lc_equal, "==" },
143	{ lc_notequal, "!=" },
144	{ lc_lt, "<" },
145	{ lc_gt, ">" },
146	{ lc_lt_equal, "<=" },
147	{ lc_gt_equal, ">=" },
148	{ lc_assign, "=" },
149	{ lc_plus, "+" },
150	{ lc_minus, "-" },
151	{ lc_mult, "*" },
152	{ lc_increase, "+=" },
153
154	/* Punctuators */
155	{ lc_comma, "," },
156	{ lc_colon, ":" },
157	{ lc_scolon, ";" },
158
159	{ 0, NULL },
160	};
161
162	/** Print lclass value.
163	*
164	* Prints lclass (lexical element class) value in human-readable form
165	* (for debugging).
166	*
167	* @param lclass Lclass value for display.
168	*/
169	void lclass_print(lclass_t lclass)
170	{
171	struct lc_name *dp;
172
173	dp = keywords;
174	while (dp->name != NULL) {
175	if (dp->lclass == lclass) {
176	printf("%s", dp->name);
177	return;
178	}
179	++dp;
180	}
181
182	dp = simple_lc;
183	while (dp->name != NULL) {
184	if (dp->lclass == lclass) {
185	printf("%s", dp->name);
186	return;
187	}
188	++dp;
189	}
190
191	switch (lclass) {
192	case lc_ident:
193	printf("ident");
194	break;
195	case lc_lit_int:
196	printf("int_literal");
197	break;
198	case lc_lit_string:
199	printf("string_literal");
200	break;
201	default:
202	printf("<unknown?>");
203	break;
204	}
205	}
206
207	/** Print lexical element.
208	*
209	* Prints lexical element in human-readable form (for debugging).
210	*
211	* @param lem Lexical element for display.
212	*/
213	void lem_print(lem_t *lem)
214	{
215	lclass_print(lem->lclass);
216
217	switch (lem->lclass) {
218	case lc_ident:
219	printf("('%s')", strtab_get_str(lem->u.ident.sid));
220	break;
221	case lc_lit_int:
222	printf("(");
223	bigint_print(&lem->u.lit_int.value);
224	printf(")");
225	break;
226	case lc_lit_string:
227	printf("(\"%s\")", lem->u.lit_string.value);
228	default:
229	break;
230	}
231	}
232
233	/** Print lem coordinates.
234	*
235	* Print the coordinates (line number, column number) of a lexical element.
236	*
237	* @param lem Lexical element for coordinate printing.
238	*/
239	void lem_print_coords(lem_t *lem)
240	{
241	cspan_print(lem->cspan);
242	}
243
244	/** Initialize lexer instance.
245	*
246	* @param lex Lexer object to initialize.
247	* @param input Input to associate with lexer.
248	*/
249	void lex_init(lex_t lex, struct input input)
250	{
251	int rc;
252
253	lex->input = input;
254
255	rc = input_get_line(lex->input, &lex->inbuf);
256	if (rc != EOK) {
257	printf("Error reading input.\n");
258	exit(1);
259	}
260
261	lex->ibp = lex->inbuf;
262	lex->col_adj = 0;
263	lex->prev_valid = b_false;
264	lex->current_valid = b_true;
265	}
266
267	/** Advance to next lexical element.
268	*
269	* The new element is read in lazily then it is actually accessed.
270	*
271	* @param lex Lexer object.
272	*/
273	void lex_next(lex_t *lex)
274	{
275	/* Make sure the current lem has already been read in. */
276	lex_touch(lex);
277
278	/* Force a new lem to be read on next access. */
279	lex->current_valid = b_false;
280	}
281
282	/** Get current lem.
283	*
284	* The returned pointer is invalidated by next call to lex_next()
285	*
286	* @param lex Lexer object.
287	* @return Pointer to current lem. Owned by @a lex and only valid
288	* until next call to lex_xxx().
289	*/
290	lem_t lex_get_current(lex_t lex)
291	{
292	lex_touch(lex);
293	return &lex->current;
294	}
295
296	/** Get previous lem if valid.
297	*
298	* The returned pointer is invalidated by next call to lex_next()
299	*
300	* @param lex Lexer object.
301	* @return Pointer to previous lem. Owned by @a lex and only valid
302	* until next call to lex_xxx().
303	*/
304	lem_t lex_peek_prev(lex_t lex)
305	{
306	if (lex->current_valid == b_false) {
307	/*
308	* This means the head is advanced but next lem was not read.
309	* Thus the previous lem is still in @a current.
310	*/
311	return &lex->current;
312	}
313
314	if (lex->prev_valid != b_true) {
315	/* Looks like we are still at the first lem. */
316	return NULL;
317	}
318
319	/*
320	* Current lem has been read in. Thus the previous lem was moved to
321	* @a previous.
322	*/
323	return &lex->prev;
324	}
325
326	/** Read in the current lexical element (unless already read in).
327	*
328	* @param lex Lexer object.
329	*/
330	static void lex_touch(lex_t *lex)
331	{
332	bool_t got_lem;
333
334	if (lex->current_valid == b_true)
335	return;
336
337	/* Copy previous lem */
338	lex->prev = lex->current;
339	lex->prev_valid = b_true;
340
341	do {
342	got_lem = lex_read_try(lex);
343	} while (got_lem == b_false);
344
345	lex->current_valid = b_true;
346	}
347
348	/** Try reading next lexical element.
349	*
350	* Attemps to read the next lexical element. In some cases (such as a comment)
351	* this function will need to give it another try and returns @c b_false
352	* in such case.
353	*
354	* @param lex Lexer object.
355	* @return @c b_true on success or @c b_false if it needs
356	* restarting. On success the lem is stored to
357	* the current lem in @a lex.
358	*/
359	static bool_t lex_read_try(lex_t *lex)
360	{
361	char bp, lsp;
362	int line0, col0;
363
364	lex_skip_ws(lex);
365
366	/*
367	* Record lem coordinates. Line number we already have. For column
368	* number we start with position in the input buffer. This works
369	* for all characters except tab. Thus we keep track of tabs
370	* separately using col_adj.
371	*/
372	line0 = input_get_line_no(lex->input);
373	col0 = 1 + lex->col_adj + (lex->ibp - lex->inbuf);
374
375	lex->current.cspan = cspan_new(lex->input, line0, col0, line0, col0);
376
377	lsp = lex->ibp;
378	bp = lex->ibp;
379
380	if (bp[0] == '\0') {
381	/* End of input */
382	lex->current.lclass = lc_eof;
383	goto finish;
384	}
385
386	if (is_wstart(bp[0])) {
387	lex_word(lex);
388	goto finish;
389	}
390
391	if (bp[0] == '\'') {
392	lex_char(lex);
393	goto finish;
394	}
395
396	if (is_digit(bp[0])) {
397	lex_number(lex);
398	goto finish;
399	}
400
401	if (bp[0] == '"') {
402	lex_string(lex);
403	goto finish;
404	}
405
406	if (bp[0] == '-' && bp[1] == '-') {
407	lex_skip_comment(lex);
408
409	/* Compute ending column number */
410	lex->current.cspan->col1 = col0 + (lex->ibp - lsp) - 1;
411
412	/* Try again */
413	return b_false;
414	}
415
416	switch (bp[0]) {
417	case ',': lex->current.lclass = lc_comma; ++bp; break;
418	case ':': lex->current.lclass = lc_colon; ++bp; break;
419	case ';': lex->current.lclass = lc_scolon; ++bp; break;
420
421	case '.': lex->current.lclass = lc_period; ++bp; break;
422	case '/': lex->current.lclass = lc_slash; ++bp; break;
423	case '(': lex->current.lclass = lc_lparen; ++bp; break;
424	case ')': lex->current.lclass = lc_rparen; ++bp; break;
425	case '[': lex->current.lclass = lc_lsbr; ++bp; break;
426	case ']': lex->current.lclass = lc_rsbr; ++bp; break;
427
428	case '=':
429	if (bp[1] == '=') {
430	lex->current.lclass = lc_equal; bp += 2; break;
431	}
432	lex->current.lclass = lc_assign; ++bp; break;
433
434	case '!':
435	if (bp[1] == '=') {
436	lex->current.lclass = lc_notequal; bp += 2; break;
437	}
438	goto invalid;
439
440	case '+':
441	if (bp[1] == '=') {
442	lex->current.lclass = lc_increase; bp += 2; break;
443	}
444	lex->current.lclass = lc_plus; ++bp; break;
445
446	case '-':
447	lex->current.lclass = lc_minus; ++bp; break;
448
449	case '*':
450	lex->current.lclass = lc_mult; ++bp; break;
451
452	case '<':
453	if (bp[1] == '=') {
454	lex->current.lclass = lc_lt_equal; bp += 2; break;
455	}
456	lex->current.lclass = lc_lt; ++bp; break;
457
458	case '>':
459	if (bp[1] == '=') {
460	lex->current.lclass = lc_gt_equal; bp += 2; break;
461	}
462	lex->current.lclass = lc_gt; ++bp; break;
463
464	default:
465	goto invalid;
466	}
467
468	lex->ibp = bp;
469
470	finish:
471	/* Compute ending column number */
472	lex->current.cspan->col1 = col0 + (lex->ibp - lsp) - 1;
473	return b_true;
474
475	invalid:
476	lex->current.lclass = lc_invalid;
477	++bp;
478	lex->ibp = bp;
479
480	return b_true;
481	}
482
483	/** Lex a word (identifier or keyword).
484	*
485	* Read in a word. This may later turn out to be a keyword or a regular
486	* identifier. It is stored in the current lem in @a lex.
487	*
488	* @param lex Lexer object.
489	*/
490	static void lex_word(lex_t *lex)
491	{
492	struct lc_name *dp;
493	char *bp;
494	int idx;
495
496	bp = lex->ibp;
497	ident_buf[0] = bp[0];
498	idx = 1;
499
500	while (is_wcont(bp[idx])) {
501	if (idx >= IBUF_SIZE) {
502	printf("Error: Identifier too long.\n");
503	exit(1);
504	}
505
506	ident_buf[idx] = bp[idx];
507	++idx;
508	}
509
510	lex->ibp = bp + idx;
511
512	ident_buf[idx] = '\0';
513
514	dp = keywords;
515	while (dp->name != NULL) {
516	if (os_str_cmp(ident_buf, dp->name) == 0) {
517	/* Match */
518	lex->current.lclass = dp->lclass;
519	return;
520	}
521	++dp;
522	}
523
524	/* No matching keyword -- it must be an identifier. */
525	lex->current.lclass = lc_ident;
526	lex->current.u.ident.sid = strtab_get_sid(ident_buf);
527	}
528
529	/** Lex a character literal.
530	*
531	* Reads in a character literal and stores it in the current lem in @a lex.
532	*
533	* @param lex Lexer object.
534	*/
535	static void lex_char(lex_t *lex)
536	{
537	char *bp;
538	int idx;
539	size_t len;
540	int char_val;
541
542	bp = lex->ibp + 1;
543	idx = 0;
544
545	while (bp[idx] != '\'') {
546	if (idx >= SLBUF_SIZE) {
547	printf("Error: Character literal too long.\n");
548	exit(1);
549	}
550
551	if (bp[idx] == '\0') {
552	printf("Error: Unterminated character literal.\n");
553	exit(1);
554	}
555
556	strlit_buf[idx] = bp[idx];
557	++idx;
558	}
559
560	lex->ibp = bp + idx + 1;
561
562	strlit_buf[idx] = '\0';
563	len = os_str_length(strlit_buf);
564	if (len != 1) {
565	printf("Character literal should contain one character, "
566	"but contains %u characters instead.\n", (unsigned) len);
567	exit(1);
568	}
569
570	os_str_get_char(strlit_buf, 0, &char_val);
571	lex->current.lclass = lc_lit_char;
572	bigint_init(&lex->current.u.lit_char.value, char_val);
573	}
574
575	/** Lex a numeric literal.
576	*
577	* Reads in a numeric literal and stores it in the current lem in @a lex.
578	*
579	* @param lex Lexer object.
580	*/
581	static void lex_number(lex_t *lex)
582	{
583	char *bp;
584	bigint_t value;
585	bigint_t dgval;
586	bigint_t base;
587	bigint_t tprod;
588
589	bp = lex->ibp;
590
591	bigint_init(&value, 0);
592	bigint_init(&base, 10);
593
594	while (is_digit(*bp)) {
595	bigint_mul(&value, &base, &tprod);
596	bigint_init(&dgval, digit_value(*bp));
597
598	bigint_destroy(&value);
599	bigint_add(&tprod, &dgval, &value);
600	bigint_destroy(&tprod);
601	bigint_destroy(&dgval);
602
603	++bp;
604	}
605
606	bigint_destroy(&base);
607
608	lex->ibp = bp;
609
610	lex->current.lclass = lc_lit_int;
611	bigint_shallow_copy(&value, &lex->current.u.lit_int.value);
612	}
613
614	/** Lex a string literal.
615	*
616	* Reads in a string literal and stores it in the current lem in @a lex.
617	*
618	* @param lex Lexer object.
619	*/
620	static void lex_string(lex_t *lex)
621	{
622	char *bp;
623	int idx;
624
625	bp = lex->ibp + 1;
626	idx = 0;
627
628	while (bp[idx] != '"') {
629	if (idx >= SLBUF_SIZE) {
630	printf("Error: String literal too long.\n");
631	exit(1);
632	}
633
634	if (bp[idx] == '\0') {
635	printf("Error: Unterminated string literal.\n");
636	exit(1);
637	}
638
639	strlit_buf[idx] = bp[idx];
640	++idx;
641	}
642
643	lex->ibp = bp + idx + 1;
644
645	strlit_buf[idx] = '\0';
646
647	lex->current.lclass = lc_lit_string;
648	lex->current.u.lit_string.value = os_str_dup(strlit_buf);
649	}
650
651	/** Lex a single-line comment.
652	*
653	* This does not produce any lem. The comment is just skipped.
654	*
655	* @param lex Lexer object.
656	*/
657	static void lex_skip_comment(lex_t *lex)
658	{
659	char *bp;
660
661	bp = lex->ibp + 2;
662
663	while (bp != '\n' && bp != '\0') {
664	++bp;
665	}
666
667	lex->ibp = bp;
668	}
669
670	/** Skip whitespace characters.
671	*
672	* This does not produce any lem. The whitespace is just skipped.
673	*
674	* @param lex Lexer object.
675	*/
676	static void lex_skip_ws(lex_t *lex)
677	{
678	char *bp;
679	int rc;
680
681	bp = lex->ibp;
682
683	while (b_true) {
684	while (bp == ' ' \|\| bp == '\t') {
685	if (*bp == '\t') {
686	/* XXX This is too simplifed. */
687	lex->col_adj += (TAB_WIDTH - 1);
688	}
689	++bp;
690	}
691
692	if (*bp != '\n')
693	break;
694
695	/* Read next line */
696	rc = input_get_line(lex->input, &lex->inbuf);
697	if (rc != EOK) {
698	printf("Error reading input.\n");
699	exit(1);
700	}
701
702	bp = lex->inbuf;
703	lex->col_adj = 0;
704	}
705
706	lex->ibp = bp;
707	}
708
709	/** Determine if character can start a word.
710	*
711	* @param c Character.
712	* @return @c b_true if @a c can start a word, @c b_false otherwise.
713	*/
714	static bool_t is_wstart(char c)
715	{
716	return ((c >= 'a') && (c <= 'z')) \|\| ((c >= 'A') && (c <= 'Z')) \|\|
717	(c == '_');
718	}
719
720	/** Determine if character can continue a word.
721	*
722	* @param c Character.
723	* @return @c b_true if @a c can start continue word, @c b_false
724	* otherwise.
725	*/
726	static bool_t is_wcont(char c)
727	{
728	return is_digit(c) \|\| is_wstart(c);
729	}
730
731	/** Determine if character is a numeric digit.
732	*
733	* @param c Character.
734	* @return @c b_true if @a c is a numeric digit, @c b_false otherwise.
735	*/
736	static bool_t is_digit(char c)
737	{
738	return ((c >= '0') && (c <= '9'));
739	}
740
741	/** Determine numeric value of digit character.
742	*
743	* @param c Character, must be a valid decimal digit.
744	* @return Value of the digit (0-9).
745	*/
746	static int digit_value(char c)
747	{
748	return (c - '0');
749	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: