Context Navigation

source: mainline/uspace/app/sbi/src/lex.c

Visit:

Last change on this file was 1433ecda, checked in by Jiri Svoboda <jiri@…>, 7 years ago
Fix cstyle: make ccheck-fix and commit only files where all the changes are good.
Property mode set to `100644`
File size: 16.4 KB

Line
1	/*
2	* Copyright (c) 2011 Jiri Svoboda
3	* All rights reserved.
4	*
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions
7	* are met:
8	*
9	* - Redistributions of source code must retain the above copyright
10	* notice, this list of conditions and the following disclaimer.
11	* - Redistributions in binary form must reproduce the above copyright
12	* notice, this list of conditions and the following disclaimer in the
13	* documentation and/or other materials provided with the distribution.
14	* - The name of the author may not be used to endorse or promote products
15	* derived from this software without specific prior written permission.
16	*
17	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27	*/
28
29	/** @file Lexer (lexical analyzer).
30	*
31	* Consumes a text file and produces a sequence of lexical elements (lems).
32	*/
33
34	#include <stdio.h>
35	#include <stdlib.h>
36	#include "bigint.h"
37	#include "cspan.h"
38	#include "mytypes.h"
39	#include "input.h"
40	#include "os/os.h"
41	#include "strtab.h"
42
43	#include "lex.h"
44
45	#define TAB_WIDTH 8
46
47	typedef enum {
48	cs_chr,
49	cs_str
50	} chr_str_t;
51
52	static void lex_touch(lex_t *lex);
53	static bool_t lex_read_try(lex_t *lex);
54
55	static void lex_skip_comment(lex_t *lex);
56	static void lex_skip_ws(lex_t *lex);
57	static bool_t is_wstart(char c);
58	static bool_t is_wcont(char c);
59	static bool_t is_digit(char c);
60	static void lex_word(lex_t *lex);
61	static void lex_char(lex_t *lex);
62	static void lex_number(lex_t *lex);
63	static void lex_string(lex_t *lex);
64	static void lex_char_string_core(lex_t *lex, chr_str_t cs);
65	static int digit_value(char c);
66
67	/* Note: This imposes an implementation limit on identifier length. */
68	#define IBUF_SIZE 128
69	static char ident_buf[IBUF_SIZE + 1];
70
71	/* XXX This imposes an implementation limit on string literal length. */
72	#define SLBUF_SIZE 128
73	static char strlit_buf[SLBUF_SIZE + 1];
74
75	/** Lclass-string pair */
76	struct lc_name {
77	lclass_t lclass;
78	const char *name;
79	};
80
81	/** Keyword names. Used both for printing and recognition. */
82	static struct lc_name keywords[] = {
83	{ lc_and, "and" },
84	{ lc_as, "as" },
85	{ lc_bool, "bool" },
86	{ lc_break, "break" },
87	{ lc_builtin, "builtin" },
88	{ lc_char, "char" },
89	{ lc_class, "class" },
90	{ lc_deleg, "deleg" },
91	{ lc_do, "do" },
92	{ lc_elif, "elif" },
93	{ lc_else, "else" },
94	{ lc_end, "end" },
95	{ lc_enum, "enum" },
96	{ lc_except, "except" },
97	{ lc_false, "false" },
98	{ lc_finally, "finally" },
99	{ lc_for, "for" },
100	{ lc_fun, "fun" },
101	{ lc_get, "get" },
102	{ lc_if, "if" },
103	{ lc_in, "in" },
104	{ lc_int, "int" },
105	{ lc_interface, "interface" },
106	{ lc_is, "is" },
107	{ lc_new, "new" },
108	{ lc_not, "not" },
109	{ lc_nil, "nil" },
110	{ lc_or, "or" },
111	{ lc_override, "override" },
112	{ lc_packed, "packed" },
113	{ lc_private, "private" },
114	{ lc_prop, "prop" },
115	{ lc_protected, "protected" },
116	{ lc_public, "public" },
117	{ lc_raise, "raise" },
118	{ lc_resource, "resource" },
119	{ lc_return, "return" },
120	{ lc_self, "self" },
121	{ lc_set, "set" },
122	{ lc_static, "static" },
123	{ lc_string, "string" },
124	{ lc_struct, "struct" },
125	{ lc_switch, "switch" },
126	{ lc_then, "then" },
127	{ lc_this, "this" },
128	{ lc_true, "true" },
129	{ lc_var, "var" },
130	{ lc_with, "with" },
131	{ lc_when, "when" },
132	{ lc_while, "while" },
133	{ lc_yield, "yield" },
134
135	{ 0, NULL }
136	};
137
138	/** Other simple lclasses. Only used for printing. */
139	static struct lc_name simple_lc[] = {
140	{ lc_invalid, "INVALID" },
141	{ lc_eof, "EOF" },
142
143	/* Operators */
144	{ lc_period, "." },
145	{ lc_slash, "/" },
146	{ lc_lparen, "(" },
147	{ lc_rparen, ")" },
148	{ lc_lsbr, "[" },
149	{ lc_rsbr, "]" },
150	{ lc_equal, "==" },
151	{ lc_notequal, "!=" },
152	{ lc_lt, "<" },
153	{ lc_gt, ">" },
154	{ lc_lt_equal, "<=" },
155	{ lc_gt_equal, ">=" },
156	{ lc_assign, "=" },
157	{ lc_plus, "+" },
158	{ lc_minus, "-" },
159	{ lc_mult, "*" },
160	{ lc_increase, "+=" },
161
162	/* Punctuators */
163	{ lc_comma, "," },
164	{ lc_colon, ":" },
165	{ lc_scolon, ";" },
166
167	{ 0, NULL },
168	};
169
170	/** Print lclass value.
171	*
172	* Prints lclass (lexical element class) value in human-readable form
173	* (for debugging).
174	*
175	* @param lclass Lclass value for display.
176	*/
177	void lclass_print(lclass_t lclass)
178	{
179	struct lc_name *dp;
180
181	dp = keywords;
182	while (dp->name != NULL) {
183	if (dp->lclass == lclass) {
184	printf("%s", dp->name);
185	return;
186	}
187	++dp;
188	}
189
190	dp = simple_lc;
191	while (dp->name != NULL) {
192	if (dp->lclass == lclass) {
193	printf("%s", dp->name);
194	return;
195	}
196	++dp;
197	}
198
199	switch (lclass) {
200	case lc_ident:
201	printf("ident");
202	break;
203	case lc_lit_int:
204	printf("int_literal");
205	break;
206	case lc_lit_string:
207	printf("string_literal");
208	break;
209	default:
210	printf("<unknown?>");
211	break;
212	}
213	}
214
215	/** Print lexical element.
216	*
217	* Prints lexical element in human-readable form (for debugging).
218	*
219	* @param lem Lexical element for display.
220	*/
221	void lem_print(lem_t *lem)
222	{
223	lclass_print(lem->lclass);
224
225	switch (lem->lclass) {
226	case lc_ident:
227	printf("('%s')", strtab_get_str(lem->u.ident.sid));
228	break;
229	case lc_lit_int:
230	printf("(");
231	bigint_print(&lem->u.lit_int.value);
232	printf(")");
233	break;
234	case lc_lit_string:
235	printf("(\"%s\")", lem->u.lit_string.value);
236	default:
237	break;
238	}
239	}
240
241	/** Print lem coordinates.
242	*
243	* Print the coordinates (line number, column number) of a lexical element.
244	*
245	* @param lem Lexical element for coordinate printing.
246	*/
247	void lem_print_coords(lem_t *lem)
248	{
249	cspan_print(lem->cspan);
250	}
251
252	/** Initialize lexer instance.
253	*
254	* @param lex Lexer object to initialize.
255	* @param input Input to associate with lexer.
256	*/
257	void lex_init(lex_t lex, struct input input)
258	{
259	errno_t rc;
260
261	lex->input = input;
262
263	rc = input_get_line(lex->input, &lex->inbuf);
264	if (rc != EOK) {
265	printf("Error reading input.\n");
266	exit(1);
267	}
268
269	lex->ibp = lex->inbuf;
270	lex->col_adj = 0;
271	lex->prev_valid = b_false;
272	lex->current_valid = b_true;
273	}
274
275	/** Advance to next lexical element.
276	*
277	* The new element is read in lazily then it is actually accessed.
278	*
279	* @param lex Lexer object.
280	*/
281	void lex_next(lex_t *lex)
282	{
283	/* Make sure the current lem has already been read in. */
284	lex_touch(lex);
285
286	/* Force a new lem to be read on next access. */
287	lex->current_valid = b_false;
288	}
289
290	/** Get current lem.
291	*
292	* The returned pointer is invalidated by next call to lex_next()
293	*
294	* @param lex Lexer object.
295	* @return Pointer to current lem. Owned by @a lex and only valid
296	* until next call to lex_xxx().
297	*/
298	lem_t lex_get_current(lex_t lex)
299	{
300	lex_touch(lex);
301	return &lex->current;
302	}
303
304	/** Get previous lem if valid.
305	*
306	* The returned pointer is invalidated by next call to lex_next()
307	*
308	* @param lex Lexer object.
309	* @return Pointer to previous lem. Owned by @a lex and only valid
310	* until next call to lex_xxx().
311	*/
312	lem_t lex_peek_prev(lex_t lex)
313	{
314	if (lex->current_valid == b_false) {
315	/*
316	* This means the head is advanced but next lem was not read.
317	* Thus the previous lem is still in @a current.
318	*/
319	return &lex->current;
320	}
321
322	if (lex->prev_valid != b_true) {
323	/* Looks like we are still at the first lem. */
324	return NULL;
325	}
326
327	/*
328	* Current lem has been read in. Thus the previous lem was moved to
329	* @a previous.
330	*/
331	return &lex->prev;
332	}
333
334	/** Read in the current lexical element (unless already read in).
335	*
336	* @param lex Lexer object.
337	*/
338	static void lex_touch(lex_t *lex)
339	{
340	bool_t got_lem;
341
342	if (lex->current_valid == b_true)
343	return;
344
345	/* Copy previous lem */
346	lex->prev = lex->current;
347	lex->prev_valid = b_true;
348
349	do {
350	got_lem = lex_read_try(lex);
351	} while (got_lem == b_false);
352
353	lex->current_valid = b_true;
354	}
355
356	/** Try reading next lexical element.
357	*
358	* Attemps to read the next lexical element. In some cases (such as a comment)
359	* this function will need to give it another try and returns @c b_false
360	* in such case.
361	*
362	* @param lex Lexer object.
363	* @return @c b_true on success or @c b_false if it needs
364	* restarting. On success the lem is stored to
365	* the current lem in @a lex.
366	*/
367	static bool_t lex_read_try(lex_t *lex)
368	{
369	char bp, lsp;
370	int line0, col0;
371
372	lex_skip_ws(lex);
373
374	/*
375	* Record lem coordinates. Line number we already have. For column
376	* number we start with position in the input buffer. This works
377	* for all characters except tab. Thus we keep track of tabs
378	* separately using col_adj.
379	*/
380	line0 = input_get_line_no(lex->input);
381	col0 = 1 + lex->col_adj + (lex->ibp - lex->inbuf);
382
383	lex->current.cspan = cspan_new(lex->input, line0, col0, line0, col0);
384
385	lsp = lex->ibp;
386	bp = lex->ibp;
387
388	if (bp[0] == '\0') {
389	/* End of input */
390	lex->current.lclass = lc_eof;
391	goto finish;
392	}
393
394	if (is_wstart(bp[0])) {
395	lex_word(lex);
396	goto finish;
397	}
398
399	if (bp[0] == '\'') {
400	lex_char(lex);
401	goto finish;
402	}
403
404	if (is_digit(bp[0])) {
405	lex_number(lex);
406	goto finish;
407	}
408
409	if (bp[0] == '"') {
410	lex_string(lex);
411	goto finish;
412	}
413
414	if (bp[0] == '-' && bp[1] == '-') {
415	lex_skip_comment(lex);
416
417	/* Compute ending column number */
418	lex->current.cspan->col1 = col0 + (lex->ibp - lsp) - 1;
419
420	/* Try again */
421	return b_false;
422	}
423
424	switch (bp[0]) {
425	case ',':
426	lex->current.lclass = lc_comma;
427	++bp;
428	break;
429	case ':':
430	lex->current.lclass = lc_colon;
431	++bp;
432	break;
433	case ';':
434	lex->current.lclass = lc_scolon;
435	++bp;
436	break;
437
438	case '.':
439	lex->current.lclass = lc_period;
440	++bp;
441	break;
442	case '/':
443	lex->current.lclass = lc_slash;
444	++bp;
445	break;
446	case '(':
447	lex->current.lclass = lc_lparen;
448	++bp;
449	break;
450	case ')':
451	lex->current.lclass = lc_rparen;
452	++bp;
453	break;
454	case '[':
455	lex->current.lclass = lc_lsbr;
456	++bp;
457	break;
458	case ']':
459	lex->current.lclass = lc_rsbr;
460	++bp;
461	break;
462
463	case '=':
464	if (bp[1] == '=') {
465	lex->current.lclass = lc_equal;
466	bp += 2;
467	break;
468	}
469	lex->current.lclass = lc_assign;
470	++bp;
471	break;
472
473	case '!':
474	if (bp[1] == '=') {
475	lex->current.lclass = lc_notequal;
476	bp += 2;
477	break;
478	}
479	goto invalid;
480
481	case '+':
482	if (bp[1] == '=') {
483	lex->current.lclass = lc_increase;
484	bp += 2;
485	break;
486	}
487	lex->current.lclass = lc_plus;
488	++bp;
489	break;
490
491	case '-':
492	lex->current.lclass = lc_minus;
493	++bp;
494	break;
495
496	case '*':
497	lex->current.lclass = lc_mult;
498	++bp;
499	break;
500
501	case '<':
502	if (bp[1] == '=') {
503	lex->current.lclass = lc_lt_equal;
504	bp += 2;
505	break;
506	}
507	lex->current.lclass = lc_lt;
508	++bp;
509	break;
510
511	case '>':
512	if (bp[1] == '=') {
513	lex->current.lclass = lc_gt_equal;
514	bp += 2;
515	break;
516	}
517	lex->current.lclass = lc_gt;
518	++bp;
519	break;
520
521	default:
522	goto invalid;
523	}
524
525	lex->ibp = bp;
526
527	finish:
528	/* Compute ending column number */
529	lex->current.cspan->col1 = col0 + (lex->ibp - lsp) - 1;
530	return b_true;
531
532	invalid:
533	lex->current.lclass = lc_invalid;
534	++bp;
535	lex->ibp = bp;
536
537	return b_true;
538	}
539
540	/** Lex a word (identifier or keyword).
541	*
542	* Read in a word. This may later turn out to be a keyword or a regular
543	* identifier. It is stored in the current lem in @a lex.
544	*
545	* @param lex Lexer object.
546	*/
547	static void lex_word(lex_t *lex)
548	{
549	struct lc_name *dp;
550	char *bp;
551	int idx;
552
553	bp = lex->ibp;
554	ident_buf[0] = bp[0];
555	idx = 1;
556
557	while (is_wcont(bp[idx])) {
558	if (idx >= IBUF_SIZE) {
559	printf("Error: Identifier too long.\n");
560	exit(1);
561	}
562
563	ident_buf[idx] = bp[idx];
564	++idx;
565	}
566
567	lex->ibp = bp + idx;
568
569	ident_buf[idx] = '\0';
570
571	dp = keywords;
572	while (dp->name != NULL) {
573	if (os_str_cmp(ident_buf, dp->name) == 0) {
574	/* Match */
575	lex->current.lclass = dp->lclass;
576	return;
577	}
578	++dp;
579	}
580
581	/* No matching keyword -- it must be an identifier. */
582	lex->current.lclass = lc_ident;
583	lex->current.u.ident.sid = strtab_get_sid(ident_buf);
584	}
585
586	/** Lex a character literal.
587	*
588	* Reads in a character literal and stores it in the current lem in @a lex.
589	*
590	* @param lex Lexer object.
591	*/
592	static void lex_char(lex_t *lex)
593	{
594	size_t len;
595	int char_val;
596
597	lex_char_string_core(lex, cs_chr);
598
599	len = os_str_length(strlit_buf);
600	if (len != 1) {
601	printf("Character literal should contain one character, "
602	"but contains %u characters instead.\n", (unsigned) len);
603	exit(1);
604	}
605
606	os_str_get_char(strlit_buf, 0, &char_val);
607	lex->current.lclass = lc_lit_char;
608	bigint_init(&lex->current.u.lit_char.value, char_val);
609	}
610
611	/** Lex a numeric literal.
612	*
613	* Reads in a numeric literal and stores it in the current lem in @a lex.
614	*
615	* @param lex Lexer object.
616	*/
617	static void lex_number(lex_t *lex)
618	{
619	char *bp;
620	bigint_t value;
621	bigint_t dgval;
622	bigint_t base;
623	bigint_t tprod;
624
625	bp = lex->ibp;
626
627	bigint_init(&value, 0);
628	bigint_init(&base, 10);
629
630	while (is_digit(*bp)) {
631	bigint_mul(&value, &base, &tprod);
632	bigint_init(&dgval, digit_value(*bp));
633
634	bigint_destroy(&value);
635	bigint_add(&tprod, &dgval, &value);
636	bigint_destroy(&tprod);
637	bigint_destroy(&dgval);
638
639	++bp;
640	}
641
642	bigint_destroy(&base);
643
644	lex->ibp = bp;
645
646	lex->current.lclass = lc_lit_int;
647	bigint_shallow_copy(&value, &lex->current.u.lit_int.value);
648	}
649
650	/** Lex a string literal.
651	*
652	* Reads in a string literal and stores it in the current lem in @a lex.
653	*
654	* @param lex Lexer object.
655	*/
656	static void lex_string(lex_t *lex)
657	{
658	lex_char_string_core(lex, cs_str);
659
660	lex->current.lclass = lc_lit_string;
661	lex->current.u.lit_string.value = os_str_dup(strlit_buf);
662	}
663
664	static void lex_char_string_core(lex_t *lex, chr_str_t cs)
665	{
666	char *bp;
667	int sidx, didx;
668	char term;
669	const char descr, cap_descr;
670	char spchar;
671
672	/* Make compiler happy */
673	term = '\0';
674	descr = NULL;
675	cap_descr = NULL;
676
677	switch (cs) {
678	case cs_chr:
679	term = '\'';
680	descr = "character";
681	cap_descr = "Character";
682	break;
683	case cs_str:
684	term = '"';
685	descr = "string";
686	cap_descr = "String";
687	break;
688	}
689
690	bp = lex->ibp + 1;
691	sidx = didx = 0;
692
693	while (bp[sidx] != term) {
694	if (didx >= SLBUF_SIZE) {
695	printf("Error: %s literal too long.\n", cap_descr);
696	exit(1);
697	}
698
699	if (bp[sidx] == '\0') {
700	printf("Error: Unterminated %s literal.\n", descr);
701	exit(1);
702	}
703
704	if (bp[sidx] == '\\') {
705	switch (bp[sidx + 1]) {
706	case '\\':
707	spchar = '\\';
708	break;
709	case '\'':
710	spchar = '\'';
711	break;
712	case '"':
713	spchar = '"';
714	break;
715	case 'n':
716	spchar = '\n';
717	break;
718	case 't':
719	spchar = '\t';
720	break;
721	default:
722	printf("Error: Unknown character escape sequence.\n");
723	exit(1);
724	}
725
726	strlit_buf[didx] = spchar;
727	++didx;
728	sidx += 2;
729	} else {
730	strlit_buf[didx] = bp[sidx];
731	++sidx;
732	++didx;
733	}
734	}
735
736	lex->ibp = bp + sidx + 1;
737
738	strlit_buf[didx] = '\0';
739	}
740
741	/** Lex a single-line comment.
742	*
743	* This does not produce any lem. The comment is just skipped.
744	*
745	* @param lex Lexer object.
746	*/
747	static void lex_skip_comment(lex_t *lex)
748	{
749	char *bp;
750
751	bp = lex->ibp + 2;
752
753	while (bp != '\n' && bp != '\0') {
754	++bp;
755	}
756
757	lex->ibp = bp;
758	}
759
760	/** Skip whitespace characters.
761	*
762	* This does not produce any lem. The whitespace is just skipped.
763	*
764	* @param lex Lexer object.
765	*/
766	static void lex_skip_ws(lex_t *lex)
767	{
768	char *bp;
769	errno_t rc;
770
771	bp = lex->ibp;
772
773	while (b_true) {
774	while (bp == ' ' \|\| bp == '\t') {
775	if (*bp == '\t') {
776	/* XXX This is too simplifed. */
777	lex->col_adj += (TAB_WIDTH - 1);
778	}
779	++bp;
780	}
781
782	if (*bp != '\n')
783	break;
784
785	/* Read next line */
786	rc = input_get_line(lex->input, &lex->inbuf);
787	if (rc != EOK) {
788	printf("Error reading input.\n");
789	exit(1);
790	}
791
792	bp = lex->inbuf;
793	lex->col_adj = 0;
794	}
795
796	lex->ibp = bp;
797	}
798
799	/** Determine if character can start a word.
800	*
801	* @param c Character.
802	* @return @c b_true if @a c can start a word, @c b_false otherwise.
803	*/
804	static bool_t is_wstart(char c)
805	{
806	return ((c >= 'a') && (c <= 'z')) \|\| ((c >= 'A') && (c <= 'Z')) \|\|
807	(c == '_');
808	}
809
810	/** Determine if character can continue a word.
811	*
812	* @param c Character.
813	* @return @c b_true if @a c can start continue word, @c b_false
814	* otherwise.
815	*/
816	static bool_t is_wcont(char c)
817	{
818	return is_digit(c) \|\| is_wstart(c);
819	}
820
821	/** Determine if character is a numeric digit.
822	*
823	* @param c Character.
824	* @return @c b_true if @a c is a numeric digit, @c b_false otherwise.
825	*/
826	static bool_t is_digit(char c)
827	{
828	return ((c >= '0') && (c <= '9'));
829	}
830
831	/** Determine numeric value of digit character.
832	*
833	* @param c Character, must be a valid decimal digit.
834	* @return Value of the digit (0-9).
835	*/
836	static int digit_value(char c)
837	{
838	return (c - '0');
839	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: