Context Navigation

source: mainline/uspace/app/sbi/src/lex.c@ 37c9fc8

Visit:

lfn serial ticket/834-toolchain-update topic/msim-upgrade topic/simplify-dev-export

Last change on this file since 37c9fc8 was 38aaacc2, checked in by Jiri Svoboda <jiri@…>, 16 years ago
Update SBI to rev. 207.
Property mode set to `100644`
File size: 14.4 KB

Line
1	/*
2	* Copyright (c) 2010 Jiri Svoboda
3	* All rights reserved.
4	*
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions
7	* are met:
8	*
9	* - Redistributions of source code must retain the above copyright
10	* notice, this list of conditions and the following disclaimer.
11	* - Redistributions in binary form must reproduce the above copyright
12	* notice, this list of conditions and the following disclaimer in the
13	* documentation and/or other materials provided with the distribution.
14	* - The name of the author may not be used to endorse or promote products
15	* derived from this software without specific prior written permission.
16	*
17	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27	*/
28
29	/** @file Lexer (lexical analyzer).
30	*
31	* Consumes a text file and produces a sequence of lexical elements (lems).
32	*/
33
34	#include <stdio.h>
35	#include <stdlib.h>
36	#include "bigint.h"
37	#include "mytypes.h"
38	#include "input.h"
39	#include "os/os.h"
40	#include "strtab.h"
41
42	#include "lex.h"
43
44	#define TAB_WIDTH 8
45
46	static void lex_touch(lex_t *lex);
47	static bool_t lex_read_try(lex_t *lex);
48
49	static void lex_skip_comment(lex_t *lex);
50	static void lex_skip_ws(lex_t *lex);
51	static bool_t is_wstart(char c);
52	static bool_t is_wcont(char c);
53	static bool_t is_digit(char c);
54	static void lex_word(lex_t *lex);
55	static void lex_char(lex_t *lex);
56	static void lex_number(lex_t *lex);
57	static void lex_string(lex_t *lex);
58	static int digit_value(char c);
59
60	/* Note: This imposes an implementation limit on identifier length. */
61	#define IBUF_SIZE 128
62	static char ident_buf[IBUF_SIZE + 1];
63
64	/* XXX This imposes an implementation limit on string literal length. */
65	#define SLBUF_SIZE 128
66	static char strlit_buf[SLBUF_SIZE + 1];
67
68	/** Lclass-string pair */
69	struct lc_name {
70	lclass_t lclass;
71	const char *name;
72	};
73
74	/** Keyword names. Used both for printing and recognition. */
75	static struct lc_name keywords[] = {
76	{ lc_as, "as" },
77	{ lc_bool, "bool" },
78	{ lc_char, "char" },
79	{ lc_builtin, "builtin" },
80	{ lc_class, "class" },
81	{ lc_constructor, "constructor" },
82	{ lc_deleg, "deleg" },
83	{ lc_do, "do" },
84	{ lc_else, "else" },
85	{ lc_end, "end" },
86	{ lc_except, "except" },
87	{ lc_false, "false" },
88	{ lc_finally, "finally" },
89	{ lc_for, "for" },
90	{ lc_fun, "fun" },
91	{ lc_get, "get" },
92	{ lc_if, "if" },
93	{ lc_in, "in" },
94	{ lc_int, "int" },
95	{ lc_interface, "interface" },
96	{ lc_is, "is" },
97	{ lc_new, "new" },
98	{ lc_nil, "nil" },
99	{ lc_override, "override" },
100	{ lc_packed, "packed" },
101	{ lc_private, "private" },
102	{ lc_prop, "prop" },
103	{ lc_protected, "protected" },
104	{ lc_public, "public" },
105	{ lc_raise, "raise" },
106	{ lc_resource, "resource" },
107	{ lc_return, "return" },
108	{ lc_self, "self" },
109	{ lc_set, "set" },
110	{ lc_static, "static" },
111	{ lc_string, "string" },
112	{ lc_struct, "struct" },
113	{ lc_then, "then" },
114	{ lc_this, "this" },
115	{ lc_true, "true" },
116	{ lc_var, "var" },
117	{ lc_with, "with" },
118	{ lc_while, "while" },
119	{ lc_yield, "yield" },
120
121	{ 0, NULL }
122	};
123
124	/** Other simple lclasses. Only used for printing. */
125	static struct lc_name simple_lc[] = {
126	{ lc_invalid, "INVALID" },
127	{ lc_eof, "EOF" },
128
129	/* Operators */
130	{ lc_period, "." },
131	{ lc_slash, "/" },
132	{ lc_lparen, "(" },
133	{ lc_rparen, ")" },
134	{ lc_lsbr, "[" },
135	{ lc_rsbr, "]" },
136	{ lc_equal, "==" },
137	{ lc_notequal, "!=" },
138	{ lc_lt, "<" },
139	{ lc_gt, ">" },
140	{ lc_lt_equal, "<=" },
141	{ lc_gt_equal, ">=" },
142	{ lc_assign, "=" },
143	{ lc_plus, "+" },
144	{ lc_minus, "-" },
145	{ lc_mult, "*" },
146	{ lc_increase, "+=" },
147
148	/* Punctuators */
149	{ lc_comma, "," },
150	{ lc_colon, ":" },
151	{ lc_scolon, ";" },
152
153	{ 0, NULL },
154	};
155
156	/** Print lclass value.
157	*
158	* Prints lclass (lexical element class) value in human-readable form
159	* (for debugging).
160	*
161	* @param lclass Lclass value for display.
162	*/
163	void lclass_print(lclass_t lclass)
164	{
165	struct lc_name *dp;
166
167	dp = keywords;
168	while (dp->name != NULL) {
169	if (dp->lclass == lclass) {
170	printf("%s", dp->name);
171	return;
172	}
173	++dp;
174	}
175
176	dp = simple_lc;
177	while (dp->name != NULL) {
178	if (dp->lclass == lclass) {
179	printf("%s", dp->name);
180	return;
181	}
182	++dp;
183	}
184
185	switch (lclass) {
186	case lc_ident:
187	printf("ident");
188	break;
189	case lc_lit_int:
190	printf("int_literal");
191	break;
192	case lc_lit_string:
193	printf("string_literal");
194	break;
195	default:
196	printf("<unknown?>");
197	break;
198	}
199	}
200
201	/** Print lexical element.
202	*
203	* Prints lexical element in human-readable form (for debugging).
204	*
205	* @param lem Lexical element for display.
206	*/
207	void lem_print(lem_t *lem)
208	{
209	lclass_print(lem->lclass);
210
211	switch (lem->lclass) {
212	case lc_ident:
213	printf("('%s')", strtab_get_str(lem->u.ident.sid));
214	break;
215	case lc_lit_int:
216	printf("(");
217	bigint_print(&lem->u.lit_int.value);
218	printf(")");
219	break;
220	case lc_lit_string:
221	printf("(\"%s\")", lem->u.lit_string.value);
222	default:
223	break;
224	}
225	}
226
227	/** Print lem coordinates.
228	*
229	* Print the coordinates (line number, column number) of a lexical element.
230	*
231	* @param lem Lexical element for coordinate printing.
232	*/
233	void lem_print_coords(lem_t *lem)
234	{
235	printf("%d:%d", lem->line_no, lem->col_0);
236	}
237
238	/** Initialize lexer instance.
239	*
240	* @param lex Lexer object to initialize.
241	* @param input Input to associate with lexer.
242	*/
243	void lex_init(lex_t lex, struct input input)
244	{
245	int rc;
246
247	lex->input = input;
248
249	rc = input_get_line(lex->input, &lex->inbuf);
250	if (rc != EOK) {
251	printf("Error reading input.\n");
252	exit(1);
253	}
254
255	lex->ibp = lex->inbuf;
256	lex->col_adj = 0;
257	lex->current_valid = b_true;
258	}
259
260	/** Advance to next lexical element.
261	*
262	* The new element is read in lazily then it is actually accessed.
263	*
264	* @param lex Lexer object.
265	*/
266	void lex_next(lex_t *lex)
267	{
268	/* Make sure the current lem has already been read in. */
269	lex_touch(lex);
270
271	/* Force a new lem to be read on next access. */
272	lex->current_valid = b_false;
273	}
274
275	/** Get current lem.
276	*
277	* The returned pointer is invalidated by next call to lex_next()
278	*
279	* @param lex Lexer object.
280	* @return Pointer to current lem. Owned by @a lex and only valid
281	* until next call to lex_next().
282	*/
283	lem_t lex_get_current(lex_t lex)
284	{
285	lex_touch(lex);
286	return &lex->current;
287	}
288
289	/** Read in the current lexical element (unless already read in).
290	*
291	* @param lex Lexer object.
292	*/
293	static void lex_touch(lex_t *lex)
294	{
295	bool_t got_lem;
296
297	if (lex->current_valid == b_true)
298	return;
299
300	do {
301	got_lem = lex_read_try(lex);
302	} while (got_lem == b_false);
303
304	lex->current_valid = b_true;
305	}
306
307	/** Try reading next lexical element.
308	*
309	* Attemps to read the next lexical element. In some cases (such as a comment)
310	* this function will need to give it another try and returns @c b_false
311	* in such case.
312	*
313	* @param lex Lexer object.
314	* @return @c b_true on success or @c b_false if it needs
315	* restarting. On success the lem is stored to
316	* the current lem in @a lex.
317	*/
318	static bool_t lex_read_try(lex_t *lex)
319	{
320	char *bp;
321
322	lex_skip_ws(lex);
323
324	/*
325	* Record lem coordinates. Line number we already have. For column
326	* number we start with position in the input buffer. This works
327	* for all characters except tab. Thus we keep track of tabs
328	* separately using col_adj.
329	*/
330	lex->current.line_no = input_get_line_no(lex->input);
331	lex->current.col_0 = 1 + lex->col_adj + (lex->ibp - lex->inbuf);
332
333	bp = lex->ibp;
334
335	if (bp[0] == '\0') {
336	/* End of input */
337	lex->current.lclass = lc_eof;
338	return b_true;
339	}
340
341	if (is_wstart(bp[0])) {
342	lex_word(lex);
343	return b_true;
344	}
345
346	if (bp[0] == '\'') {
347	lex_char(lex);
348	return b_true;
349	}
350
351	if (is_digit(bp[0])) {
352	lex_number(lex);
353	return b_true;
354	}
355
356	if (bp[0] == '"') {
357	lex_string(lex);
358	return b_true;
359	}
360
361	if (bp[0] == '-' && bp[1] == '-') {
362	lex_skip_comment(lex);
363	return b_false;
364	}
365
366	switch (bp[0]) {
367	case ',': lex->current.lclass = lc_comma; ++bp; break;
368	case ':': lex->current.lclass = lc_colon; ++bp; break;
369	case ';': lex->current.lclass = lc_scolon; ++bp; break;
370
371	case '.': lex->current.lclass = lc_period; ++bp; break;
372	case '/': lex->current.lclass = lc_slash; ++bp; break;
373	case '(': lex->current.lclass = lc_lparen; ++bp; break;
374	case ')': lex->current.lclass = lc_rparen; ++bp; break;
375	case '[': lex->current.lclass = lc_lsbr; ++bp; break;
376	case ']': lex->current.lclass = lc_rsbr; ++bp; break;
377
378	case '=':
379	if (bp[1] == '=') {
380	lex->current.lclass = lc_equal; bp += 2; break;
381	}
382	lex->current.lclass = lc_assign; ++bp; break;
383
384	case '!':
385	if (bp[1] == '=') {
386	lex->current.lclass = lc_notequal; bp += 2; break;
387	}
388	goto invalid;
389
390	case '+':
391	if (bp[1] == '=') {
392	lex->current.lclass = lc_increase; bp += 2; break;
393	}
394	lex->current.lclass = lc_plus; ++bp; break;
395
396	case '-':
397	lex->current.lclass = lc_minus; ++bp; break;
398
399	case '*':
400	lex->current.lclass = lc_mult; ++bp; break;
401
402	case '<':
403	if (bp[1] == '=') {
404	lex->current.lclass = lc_lt_equal; bp += 2; break;
405	}
406	lex->current.lclass = lc_lt; ++bp; break;
407
408	case '>':
409	if (bp[1] == '=') {
410	lex->current.lclass = lc_gt_equal; bp += 2; break;
411	}
412	lex->current.lclass = lc_gt; ++bp; break;
413
414	default:
415	goto invalid;
416	}
417
418	lex->ibp = bp;
419	return b_true;
420
421	invalid:
422	lex->current.lclass = lc_invalid;
423	++bp;
424	lex->ibp = bp;
425
426	return b_true;
427	}
428
429	/** Lex a word (identifier or keyword).
430	*
431	* Read in a word. This may later turn out to be a keyword or a regular
432	* identifier. It is stored in the current lem in @a lex.
433	*
434	* @param lex Lexer object.
435	*/
436	static void lex_word(lex_t *lex)
437	{
438	struct lc_name *dp;
439	char *bp;
440	int idx;
441
442	bp = lex->ibp;
443	ident_buf[0] = bp[0];
444	idx = 1;
445
446	while (is_wcont(bp[idx])) {
447	if (idx >= IBUF_SIZE) {
448	printf("Error: Identifier too long.\n");
449	exit(1);
450	}
451
452	ident_buf[idx] = bp[idx];
453	++idx;
454	}
455
456	lex->ibp = bp + idx;
457
458	ident_buf[idx] = '\0';
459
460	dp = keywords;
461	while (dp->name != NULL) {
462	if (os_str_cmp(ident_buf, dp->name) == 0) {
463	/* Match */
464	lex->current.lclass = dp->lclass;
465	return;
466	}
467	++dp;
468	}
469
470	/* No matching keyword -- it must be an identifier. */
471	lex->current.lclass = lc_ident;
472	lex->current.u.ident.sid = strtab_get_sid(ident_buf);
473	}
474
475	/** Lex a character literal.
476	*
477	* Reads in a character literal and stores it in the current lem in @a lex.
478	*
479	* @param lex Lexer object.
480	*/
481	static void lex_char(lex_t *lex)
482	{
483	char *bp;
484	int idx;
485	size_t len;
486	int char_val;
487
488	bp = lex->ibp + 1;
489	idx = 0;
490
491	while (bp[idx] != '\'') {
492	if (idx >= SLBUF_SIZE) {
493	printf("Error: Character literal too long.\n");
494	exit(1);
495	}
496
497	if (bp[idx] == '\0') {
498	printf("Error: Unterminated character literal.\n");
499	exit(1);
500	}
501
502	strlit_buf[idx] = bp[idx];
503	++idx;
504	}
505
506	lex->ibp = bp + idx + 1;
507
508	strlit_buf[idx] = '\0';
509	len = os_str_length(strlit_buf);
510	if (len != 1) {
511	printf("Character literal should contain one character, "
512	"but contains %u characters instead.\n", (unsigned) len);
513	exit(1);
514	}
515
516	os_str_get_char(strlit_buf, 0, &char_val);
517	lex->current.lclass = lc_lit_char;
518	bigint_init(&lex->current.u.lit_char.value, char_val);
519	}
520
521	/** Lex a numeric literal.
522	*
523	* Reads in a numeric literal and stores it in the current lem in @a lex.
524	*
525	* @param lex Lexer object.
526	*/
527	static void lex_number(lex_t *lex)
528	{
529	char *bp;
530	bigint_t value;
531	bigint_t dgval;
532	bigint_t base;
533	bigint_t tprod;
534
535	bp = lex->ibp;
536
537	bigint_init(&value, 0);
538	bigint_init(&base, 10);
539
540	while (is_digit(*bp)) {
541	bigint_mul(&value, &base, &tprod);
542	bigint_init(&dgval, digit_value(*bp));
543
544	bigint_destroy(&value);
545	bigint_add(&tprod, &dgval, &value);
546	bigint_destroy(&tprod);
547	bigint_destroy(&dgval);
548
549	++bp;
550	}
551
552	bigint_destroy(&base);
553
554	lex->ibp = bp;
555
556	lex->current.lclass = lc_lit_int;
557	bigint_shallow_copy(&value, &lex->current.u.lit_int.value);
558	}
559
560	/** Lex a string literal.
561	*
562	* Reads in a string literal and stores it in the current lem in @a lex.
563	*
564	* @param lex Lexer object.
565	*/
566	static void lex_string(lex_t *lex)
567	{
568	char *bp;
569	int idx;
570
571	bp = lex->ibp + 1;
572	idx = 0;
573
574	while (bp[idx] != '"') {
575	if (idx >= SLBUF_SIZE) {
576	printf("Error: String literal too long.\n");
577	exit(1);
578	}
579
580	if (bp[idx] == '\0') {
581	printf("Error: Unterminated string literal.\n");
582	exit(1);
583	}
584
585	strlit_buf[idx] = bp[idx];
586	++idx;
587	}
588
589	lex->ibp = bp + idx + 1;
590
591	strlit_buf[idx] = '\0';
592
593	lex->current.lclass = lc_lit_string;
594	lex->current.u.lit_string.value = os_str_dup(strlit_buf);
595	}
596
597	/** Lex a single-line comment.
598	*
599	* This does not produce any lem. The comment is just skipped.
600	*
601	* @param lex Lexer object.
602	*/
603	static void lex_skip_comment(lex_t *lex)
604	{
605	char *bp;
606
607	bp = lex->ibp + 2;
608
609	while (bp != '\n' && bp != '\0') {
610	++bp;
611	}
612
613	lex->ibp = bp;
614	}
615
616	/** Skip whitespace characters.
617	*
618	* This does not produce any lem. The whitespace is just skipped.
619	*
620	* @param lex Lexer object.
621	*/
622	static void lex_skip_ws(lex_t *lex)
623	{
624	char *bp;
625	int rc;
626
627	bp = lex->ibp;
628
629	while (b_true) {
630	while (bp == ' ' \|\| bp == '\t') {
631	if (*bp == '\t') {
632	/* XXX This is too simplifed. */
633	lex->col_adj += (TAB_WIDTH - 1);
634	}
635	++bp;
636	}
637
638	if (*bp != '\n')
639	break;
640
641	/* Read next line */
642	rc = input_get_line(lex->input, &lex->inbuf);
643	if (rc != EOK) {
644	printf("Error reading input.\n");
645	exit(1);
646	}
647
648	bp = lex->inbuf;
649	lex->col_adj = 0;
650	}
651
652	lex->ibp = bp;
653	}
654
655	/** Determine if character can start a word.
656	*
657	* @param c Character.
658	* @return @c b_true if @a c can start a word, @c b_false otherwise.
659	*/
660	static bool_t is_wstart(char c)
661	{
662	return ((c >= 'a') && (c <= 'z')) \|\| ((c >= 'A') && (c <= 'Z')) \|\|
663	(c == '_');
664	}
665
666	/** Determine if character can continue a word.
667	*
668	* @param c Character.
669	* @return @c b_true if @a c can start continue word, @c b_false
670	* otherwise.
671	*/
672	static bool_t is_wcont(char c)
673	{
674	return is_digit(c) \|\| is_wstart(c);
675	}
676
677	/** Determine if character is a numeric digit.
678	*
679	* @param c Character.
680	* @return @c b_true if @a c is a numeric digit, @c b_false otherwise.
681	*/
682	static bool_t is_digit(char c)
683	{
684	return ((c >= '0') && (c <= '9'));
685	}
686
687	/** Determine numeric value of digit character.
688	*
689	* @param c Character, must be a valid decimal digit.
690	* @return Value of the digit (0-9).
691	*/
692	static int digit_value(char c)
693	{
694	return (c - '0');
695	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: