source: mainline/uspace/app/sbi/src/lex.c@ 0485135

lfn serial ticket/834-toolchain-update topic/msim-upgrade topic/simplify-dev-export
Last change on this file since 0485135 was 051bc69a, checked in by Jiri Svoboda <jiri@…>, 15 years ago

Update SBI to rev. 244.

  • Property mode set to 100644
File size: 15.5 KB
Line 
1/*
2 * Copyright (c) 2010 Jiri Svoboda
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** @file Lexer (lexical analyzer).
30 *
31 * Consumes a text file and produces a sequence of lexical elements (lems).
32 */
33
34#include <stdio.h>
35#include <stdlib.h>
36#include "bigint.h"
37#include "cspan.h"
38#include "mytypes.h"
39#include "input.h"
40#include "os/os.h"
41#include "strtab.h"
42
43#include "lex.h"
44
45#define TAB_WIDTH 8
46
47static void lex_touch(lex_t *lex);
48static bool_t lex_read_try(lex_t *lex);
49
50static void lex_skip_comment(lex_t *lex);
51static void lex_skip_ws(lex_t *lex);
52static bool_t is_wstart(char c);
53static bool_t is_wcont(char c);
54static bool_t is_digit(char c);
55static void lex_word(lex_t *lex);
56static void lex_char(lex_t *lex);
57static void lex_number(lex_t *lex);
58static void lex_string(lex_t *lex);
59static int digit_value(char c);
60
61/* Note: This imposes an implementation limit on identifier length. */
62#define IBUF_SIZE 128
63static char ident_buf[IBUF_SIZE + 1];
64
65/* XXX This imposes an implementation limit on string literal length. */
66#define SLBUF_SIZE 128
67static char strlit_buf[SLBUF_SIZE + 1];
68
69/** Lclass-string pair */
70struct lc_name {
71 lclass_t lclass;
72 const char *name;
73};
74
75/** Keyword names. Used both for printing and recognition. */
76static struct lc_name keywords[] = {
77 { lc_and, "and" },
78 { lc_as, "as" },
79 { lc_bool, "bool" },
80 { lc_break, "break" },
81 { lc_builtin, "builtin" },
82 { lc_char, "char" },
83 { lc_class, "class" },
84 { lc_deleg, "deleg" },
85 { lc_do, "do" },
86 { lc_elif, "elif" },
87 { lc_else, "else" },
88 { lc_end, "end" },
89 { lc_enum, "enum" },
90 { lc_except, "except" },
91 { lc_false, "false" },
92 { lc_finally, "finally" },
93 { lc_for, "for" },
94 { lc_fun, "fun" },
95 { lc_get, "get" },
96 { lc_if, "if" },
97 { lc_in, "in" },
98 { lc_int, "int" },
99 { lc_interface, "interface" },
100 { lc_is, "is" },
101 { lc_new, "new" },
102 { lc_not, "not" },
103 { lc_nil, "nil" },
104 { lc_or, "or" },
105 { lc_override, "override" },
106 { lc_packed, "packed" },
107 { lc_private, "private" },
108 { lc_prop, "prop" },
109 { lc_protected, "protected" },
110 { lc_public, "public" },
111 { lc_raise, "raise" },
112 { lc_resource, "resource" },
113 { lc_return, "return" },
114 { lc_self, "self" },
115 { lc_set, "set" },
116 { lc_static, "static" },
117 { lc_string, "string" },
118 { lc_struct, "struct" },
119 { lc_then, "then" },
120 { lc_this, "this" },
121 { lc_true, "true" },
122 { lc_var, "var" },
123 { lc_with, "with" },
124 { lc_while, "while" },
125 { lc_yield, "yield" },
126
127 { 0, NULL }
128};
129
130/** Other simple lclasses. Only used for printing. */
131static struct lc_name simple_lc[] = {
132 { lc_invalid, "INVALID" },
133 { lc_eof, "EOF" },
134
135 /* Operators */
136 { lc_period, "." },
137 { lc_slash, "/" },
138 { lc_lparen, "(" },
139 { lc_rparen, ")" },
140 { lc_lsbr, "[" },
141 { lc_rsbr, "]" },
142 { lc_equal, "==" },
143 { lc_notequal, "!=" },
144 { lc_lt, "<" },
145 { lc_gt, ">" },
146 { lc_lt_equal, "<=" },
147 { lc_gt_equal, ">=" },
148 { lc_assign, "=" },
149 { lc_plus, "+" },
150 { lc_minus, "-" },
151 { lc_mult, "*" },
152 { lc_increase, "+=" },
153
154 /* Punctuators */
155 { lc_comma, "," },
156 { lc_colon, ":" },
157 { lc_scolon, ";" },
158
159 { 0, NULL },
160};
161
162/** Print lclass value.
163 *
164 * Prints lclass (lexical element class) value in human-readable form
165 * (for debugging).
166 *
167 * @param lclass Lclass value for display.
168 */
169void lclass_print(lclass_t lclass)
170{
171 struct lc_name *dp;
172
173 dp = keywords;
174 while (dp->name != NULL) {
175 if (dp->lclass == lclass) {
176 printf("%s", dp->name);
177 return;
178 }
179 ++dp;
180 }
181
182 dp = simple_lc;
183 while (dp->name != NULL) {
184 if (dp->lclass == lclass) {
185 printf("%s", dp->name);
186 return;
187 }
188 ++dp;
189 }
190
191 switch (lclass) {
192 case lc_ident:
193 printf("ident");
194 break;
195 case lc_lit_int:
196 printf("int_literal");
197 break;
198 case lc_lit_string:
199 printf("string_literal");
200 break;
201 default:
202 printf("<unknown?>");
203 break;
204 }
205}
206
207/** Print lexical element.
208 *
209 * Prints lexical element in human-readable form (for debugging).
210 *
211 * @param lem Lexical element for display.
212 */
213void lem_print(lem_t *lem)
214{
215 lclass_print(lem->lclass);
216
217 switch (lem->lclass) {
218 case lc_ident:
219 printf("('%s')", strtab_get_str(lem->u.ident.sid));
220 break;
221 case lc_lit_int:
222 printf("(");
223 bigint_print(&lem->u.lit_int.value);
224 printf(")");
225 break;
226 case lc_lit_string:
227 printf("(\"%s\")", lem->u.lit_string.value);
228 default:
229 break;
230 }
231}
232
233/** Print lem coordinates.
234 *
235 * Print the coordinates (line number, column number) of a lexical element.
236 *
237 * @param lem Lexical element for coordinate printing.
238 */
239void lem_print_coords(lem_t *lem)
240{
241 cspan_print(lem->cspan);
242}
243
244/** Initialize lexer instance.
245 *
246 * @param lex Lexer object to initialize.
247 * @param input Input to associate with lexer.
248 */
249void lex_init(lex_t *lex, struct input *input)
250{
251 int rc;
252
253 lex->input = input;
254
255 rc = input_get_line(lex->input, &lex->inbuf);
256 if (rc != EOK) {
257 printf("Error reading input.\n");
258 exit(1);
259 }
260
261 lex->ibp = lex->inbuf;
262 lex->col_adj = 0;
263 lex->prev_valid = b_false;
264 lex->current_valid = b_true;
265}
266
267/** Advance to next lexical element.
268 *
269 * The new element is read in lazily then it is actually accessed.
270 *
271 * @param lex Lexer object.
272 */
273void lex_next(lex_t *lex)
274{
275 /* Make sure the current lem has already been read in. */
276 lex_touch(lex);
277
278 /* Force a new lem to be read on next access. */
279 lex->current_valid = b_false;
280}
281
282/** Get current lem.
283 *
284 * The returned pointer is invalidated by next call to lex_next()
285 *
286 * @param lex Lexer object.
287 * @return Pointer to current lem. Owned by @a lex and only valid
288 * until next call to lex_xxx().
289 */
290lem_t *lex_get_current(lex_t *lex)
291{
292 lex_touch(lex);
293 return &lex->current;
294}
295
296/** Get previous lem if valid.
297 *
298 * The returned pointer is invalidated by next call to lex_next()
299 *
300 * @param lex Lexer object.
301 * @return Pointer to previous lem. Owned by @a lex and only valid
302 * until next call to lex_xxx().
303 */
304lem_t *lex_peek_prev(lex_t *lex)
305{
306 if (lex->current_valid == b_false) {
307 /*
308 * This means the head is advanced but next lem was not read.
309 * Thus the previous lem is still in @a current.
310 */
311 return &lex->current;
312 }
313
314 if (lex->prev_valid != b_true) {
315 /* Looks like we are still at the first lem. */
316 return NULL;
317 }
318
319 /*
320 * Current lem has been read in. Thus the previous lem was moved to
321 * @a previous.
322 */
323 return &lex->prev;
324}
325
326/** Read in the current lexical element (unless already read in).
327 *
328 * @param lex Lexer object.
329 */
330static void lex_touch(lex_t *lex)
331{
332 bool_t got_lem;
333
334 if (lex->current_valid == b_true)
335 return;
336
337 /* Copy previous lem */
338 lex->prev = lex->current;
339 lex->prev_valid = b_true;
340
341 do {
342 got_lem = lex_read_try(lex);
343 } while (got_lem == b_false);
344
345 lex->current_valid = b_true;
346}
347
348/** Try reading next lexical element.
349 *
350 * Attemps to read the next lexical element. In some cases (such as a comment)
351 * this function will need to give it another try and returns @c b_false
352 * in such case.
353 *
354 * @param lex Lexer object.
355 * @return @c b_true on success or @c b_false if it needs
356 * restarting. On success the lem is stored to
357 * the current lem in @a lex.
358 */
359static bool_t lex_read_try(lex_t *lex)
360{
361 char *bp, *lsp;
362 int line0, col0;
363
364 lex_skip_ws(lex);
365
366 /*
367 * Record lem coordinates. Line number we already have. For column
368 * number we start with position in the input buffer. This works
369 * for all characters except tab. Thus we keep track of tabs
370 * separately using col_adj.
371 */
372 line0 = input_get_line_no(lex->input);
373 col0 = 1 + lex->col_adj + (lex->ibp - lex->inbuf);
374
375 lex->current.cspan = cspan_new(lex->input, line0, col0, line0, col0);
376
377 lsp = lex->ibp;
378 bp = lex->ibp;
379
380 if (bp[0] == '\0') {
381 /* End of input */
382 lex->current.lclass = lc_eof;
383 goto finish;
384 }
385
386 if (is_wstart(bp[0])) {
387 lex_word(lex);
388 goto finish;
389 }
390
391 if (bp[0] == '\'') {
392 lex_char(lex);
393 goto finish;
394 }
395
396 if (is_digit(bp[0])) {
397 lex_number(lex);
398 goto finish;
399 }
400
401 if (bp[0] == '"') {
402 lex_string(lex);
403 goto finish;
404 }
405
406 if (bp[0] == '-' && bp[1] == '-') {
407 lex_skip_comment(lex);
408
409 /* Compute ending column number */
410 lex->current.cspan->col1 = col0 + (lex->ibp - lsp) - 1;
411
412 /* Try again */
413 return b_false;
414 }
415
416 switch (bp[0]) {
417 case ',': lex->current.lclass = lc_comma; ++bp; break;
418 case ':': lex->current.lclass = lc_colon; ++bp; break;
419 case ';': lex->current.lclass = lc_scolon; ++bp; break;
420
421 case '.': lex->current.lclass = lc_period; ++bp; break;
422 case '/': lex->current.lclass = lc_slash; ++bp; break;
423 case '(': lex->current.lclass = lc_lparen; ++bp; break;
424 case ')': lex->current.lclass = lc_rparen; ++bp; break;
425 case '[': lex->current.lclass = lc_lsbr; ++bp; break;
426 case ']': lex->current.lclass = lc_rsbr; ++bp; break;
427
428 case '=':
429 if (bp[1] == '=') {
430 lex->current.lclass = lc_equal; bp += 2; break;
431 }
432 lex->current.lclass = lc_assign; ++bp; break;
433
434 case '!':
435 if (bp[1] == '=') {
436 lex->current.lclass = lc_notequal; bp += 2; break;
437 }
438 goto invalid;
439
440 case '+':
441 if (bp[1] == '=') {
442 lex->current.lclass = lc_increase; bp += 2; break;
443 }
444 lex->current.lclass = lc_plus; ++bp; break;
445
446 case '-':
447 lex->current.lclass = lc_minus; ++bp; break;
448
449 case '*':
450 lex->current.lclass = lc_mult; ++bp; break;
451
452 case '<':
453 if (bp[1] == '=') {
454 lex->current.lclass = lc_lt_equal; bp += 2; break;
455 }
456 lex->current.lclass = lc_lt; ++bp; break;
457
458 case '>':
459 if (bp[1] == '=') {
460 lex->current.lclass = lc_gt_equal; bp += 2; break;
461 }
462 lex->current.lclass = lc_gt; ++bp; break;
463
464 default:
465 goto invalid;
466 }
467
468 lex->ibp = bp;
469
470finish:
471 /* Compute ending column number */
472 lex->current.cspan->col1 = col0 + (lex->ibp - lsp) - 1;
473 return b_true;
474
475invalid:
476 lex->current.lclass = lc_invalid;
477 ++bp;
478 lex->ibp = bp;
479
480 return b_true;
481}
482
483/** Lex a word (identifier or keyword).
484 *
485 * Read in a word. This may later turn out to be a keyword or a regular
486 * identifier. It is stored in the current lem in @a lex.
487 *
488 * @param lex Lexer object.
489 */
490static void lex_word(lex_t *lex)
491{
492 struct lc_name *dp;
493 char *bp;
494 int idx;
495
496 bp = lex->ibp;
497 ident_buf[0] = bp[0];
498 idx = 1;
499
500 while (is_wcont(bp[idx])) {
501 if (idx >= IBUF_SIZE) {
502 printf("Error: Identifier too long.\n");
503 exit(1);
504 }
505
506 ident_buf[idx] = bp[idx];
507 ++idx;
508 }
509
510 lex->ibp = bp + idx;
511
512 ident_buf[idx] = '\0';
513
514 dp = keywords;
515 while (dp->name != NULL) {
516 if (os_str_cmp(ident_buf, dp->name) == 0) {
517 /* Match */
518 lex->current.lclass = dp->lclass;
519 return;
520 }
521 ++dp;
522 }
523
524 /* No matching keyword -- it must be an identifier. */
525 lex->current.lclass = lc_ident;
526 lex->current.u.ident.sid = strtab_get_sid(ident_buf);
527}
528
529/** Lex a character literal.
530 *
531 * Reads in a character literal and stores it in the current lem in @a lex.
532 *
533 * @param lex Lexer object.
534 */
535static void lex_char(lex_t *lex)
536{
537 char *bp;
538 int idx;
539 size_t len;
540 int char_val;
541
542 bp = lex->ibp + 1;
543 idx = 0;
544
545 while (bp[idx] != '\'') {
546 if (idx >= SLBUF_SIZE) {
547 printf("Error: Character literal too long.\n");
548 exit(1);
549 }
550
551 if (bp[idx] == '\0') {
552 printf("Error: Unterminated character literal.\n");
553 exit(1);
554 }
555
556 strlit_buf[idx] = bp[idx];
557 ++idx;
558 }
559
560 lex->ibp = bp + idx + 1;
561
562 strlit_buf[idx] = '\0';
563 len = os_str_length(strlit_buf);
564 if (len != 1) {
565 printf("Character literal should contain one character, "
566 "but contains %u characters instead.\n", (unsigned) len);
567 exit(1);
568 }
569
570 os_str_get_char(strlit_buf, 0, &char_val);
571 lex->current.lclass = lc_lit_char;
572 bigint_init(&lex->current.u.lit_char.value, char_val);
573}
574
575/** Lex a numeric literal.
576 *
577 * Reads in a numeric literal and stores it in the current lem in @a lex.
578 *
579 * @param lex Lexer object.
580 */
581static void lex_number(lex_t *lex)
582{
583 char *bp;
584 bigint_t value;
585 bigint_t dgval;
586 bigint_t base;
587 bigint_t tprod;
588
589 bp = lex->ibp;
590
591 bigint_init(&value, 0);
592 bigint_init(&base, 10);
593
594 while (is_digit(*bp)) {
595 bigint_mul(&value, &base, &tprod);
596 bigint_init(&dgval, digit_value(*bp));
597
598 bigint_destroy(&value);
599 bigint_add(&tprod, &dgval, &value);
600 bigint_destroy(&tprod);
601 bigint_destroy(&dgval);
602
603 ++bp;
604 }
605
606 bigint_destroy(&base);
607
608 lex->ibp = bp;
609
610 lex->current.lclass = lc_lit_int;
611 bigint_shallow_copy(&value, &lex->current.u.lit_int.value);
612}
613
614/** Lex a string literal.
615 *
616 * Reads in a string literal and stores it in the current lem in @a lex.
617 *
618 * @param lex Lexer object.
619 */
620static void lex_string(lex_t *lex)
621{
622 char *bp;
623 int idx;
624
625 bp = lex->ibp + 1;
626 idx = 0;
627
628 while (bp[idx] != '"') {
629 if (idx >= SLBUF_SIZE) {
630 printf("Error: String literal too long.\n");
631 exit(1);
632 }
633
634 if (bp[idx] == '\0') {
635 printf("Error: Unterminated string literal.\n");
636 exit(1);
637 }
638
639 strlit_buf[idx] = bp[idx];
640 ++idx;
641 }
642
643 lex->ibp = bp + idx + 1;
644
645 strlit_buf[idx] = '\0';
646
647 lex->current.lclass = lc_lit_string;
648 lex->current.u.lit_string.value = os_str_dup(strlit_buf);
649}
650
651/** Lex a single-line comment.
652 *
653 * This does not produce any lem. The comment is just skipped.
654 *
655 * @param lex Lexer object.
656 */
657static void lex_skip_comment(lex_t *lex)
658{
659 char *bp;
660
661 bp = lex->ibp + 2;
662
663 while (*bp != '\n' && *bp != '\0') {
664 ++bp;
665 }
666
667 lex->ibp = bp;
668}
669
670/** Skip whitespace characters.
671 *
672 * This does not produce any lem. The whitespace is just skipped.
673 *
674 * @param lex Lexer object.
675 */
676static void lex_skip_ws(lex_t *lex)
677{
678 char *bp;
679 int rc;
680
681 bp = lex->ibp;
682
683 while (b_true) {
684 while (*bp == ' ' || *bp == '\t') {
685 if (*bp == '\t') {
686 /* XXX This is too simplifed. */
687 lex->col_adj += (TAB_WIDTH - 1);
688 }
689 ++bp;
690 }
691
692 if (*bp != '\n')
693 break;
694
695 /* Read next line */
696 rc = input_get_line(lex->input, &lex->inbuf);
697 if (rc != EOK) {
698 printf("Error reading input.\n");
699 exit(1);
700 }
701
702 bp = lex->inbuf;
703 lex->col_adj = 0;
704 }
705
706 lex->ibp = bp;
707}
708
709/** Determine if character can start a word.
710 *
711 * @param c Character.
712 * @return @c b_true if @a c can start a word, @c b_false otherwise.
713 */
714static bool_t is_wstart(char c)
715{
716 return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) ||
717 (c == '_');
718}
719
720/** Determine if character can continue a word.
721 *
722 * @param c Character.
723 * @return @c b_true if @a c can start continue word, @c b_false
724 * otherwise.
725 */
726static bool_t is_wcont(char c)
727{
728 return is_digit(c) || is_wstart(c);
729}
730
731/** Determine if character is a numeric digit.
732 *
733 * @param c Character.
734 * @return @c b_true if @a c is a numeric digit, @c b_false otherwise.
735 */
736static bool_t is_digit(char c)
737{
738 return ((c >= '0') && (c <= '9'));
739}
740
741/** Determine numeric value of digit character.
742 *
743 * @param c Character, must be a valid decimal digit.
744 * @return Value of the digit (0-9).
745 */
746static int digit_value(char c)
747{
748 return (c - '0');
749}
Note: See TracBrowser for help on using the repository browser.