source: mainline/uspace/app/sbi/src/lex.c

Last change on this file was 1433ecda, checked in by Jiri Svoboda <jiri@…>, 7 years ago

Fix cstyle: make ccheck-fix and commit only files where all the changes are good.

  • Property mode set to 100644
File size: 16.4 KB
Line 
1/*
2 * Copyright (c) 2011 Jiri Svoboda
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** @file Lexer (lexical analyzer).
30 *
31 * Consumes a text file and produces a sequence of lexical elements (lems).
32 */
33
34#include <stdio.h>
35#include <stdlib.h>
36#include "bigint.h"
37#include "cspan.h"
38#include "mytypes.h"
39#include "input.h"
40#include "os/os.h"
41#include "strtab.h"
42
43#include "lex.h"
44
45#define TAB_WIDTH 8
46
47typedef enum {
48 cs_chr,
49 cs_str
50} chr_str_t;
51
52static void lex_touch(lex_t *lex);
53static bool_t lex_read_try(lex_t *lex);
54
55static void lex_skip_comment(lex_t *lex);
56static void lex_skip_ws(lex_t *lex);
57static bool_t is_wstart(char c);
58static bool_t is_wcont(char c);
59static bool_t is_digit(char c);
60static void lex_word(lex_t *lex);
61static void lex_char(lex_t *lex);
62static void lex_number(lex_t *lex);
63static void lex_string(lex_t *lex);
64static void lex_char_string_core(lex_t *lex, chr_str_t cs);
65static int digit_value(char c);
66
67/* Note: This imposes an implementation limit on identifier length. */
68#define IBUF_SIZE 128
69static char ident_buf[IBUF_SIZE + 1];
70
71/* XXX This imposes an implementation limit on string literal length. */
72#define SLBUF_SIZE 128
73static char strlit_buf[SLBUF_SIZE + 1];
74
75/** Lclass-string pair */
76struct lc_name {
77 lclass_t lclass;
78 const char *name;
79};
80
81/** Keyword names. Used both for printing and recognition. */
82static struct lc_name keywords[] = {
83 { lc_and, "and" },
84 { lc_as, "as" },
85 { lc_bool, "bool" },
86 { lc_break, "break" },
87 { lc_builtin, "builtin" },
88 { lc_char, "char" },
89 { lc_class, "class" },
90 { lc_deleg, "deleg" },
91 { lc_do, "do" },
92 { lc_elif, "elif" },
93 { lc_else, "else" },
94 { lc_end, "end" },
95 { lc_enum, "enum" },
96 { lc_except, "except" },
97 { lc_false, "false" },
98 { lc_finally, "finally" },
99 { lc_for, "for" },
100 { lc_fun, "fun" },
101 { lc_get, "get" },
102 { lc_if, "if" },
103 { lc_in, "in" },
104 { lc_int, "int" },
105 { lc_interface, "interface" },
106 { lc_is, "is" },
107 { lc_new, "new" },
108 { lc_not, "not" },
109 { lc_nil, "nil" },
110 { lc_or, "or" },
111 { lc_override, "override" },
112 { lc_packed, "packed" },
113 { lc_private, "private" },
114 { lc_prop, "prop" },
115 { lc_protected, "protected" },
116 { lc_public, "public" },
117 { lc_raise, "raise" },
118 { lc_resource, "resource" },
119 { lc_return, "return" },
120 { lc_self, "self" },
121 { lc_set, "set" },
122 { lc_static, "static" },
123 { lc_string, "string" },
124 { lc_struct, "struct" },
125 { lc_switch, "switch" },
126 { lc_then, "then" },
127 { lc_this, "this" },
128 { lc_true, "true" },
129 { lc_var, "var" },
130 { lc_with, "with" },
131 { lc_when, "when" },
132 { lc_while, "while" },
133 { lc_yield, "yield" },
134
135 { 0, NULL }
136};
137
138/** Other simple lclasses. Only used for printing. */
139static struct lc_name simple_lc[] = {
140 { lc_invalid, "INVALID" },
141 { lc_eof, "EOF" },
142
143 /* Operators */
144 { lc_period, "." },
145 { lc_slash, "/" },
146 { lc_lparen, "(" },
147 { lc_rparen, ")" },
148 { lc_lsbr, "[" },
149 { lc_rsbr, "]" },
150 { lc_equal, "==" },
151 { lc_notequal, "!=" },
152 { lc_lt, "<" },
153 { lc_gt, ">" },
154 { lc_lt_equal, "<=" },
155 { lc_gt_equal, ">=" },
156 { lc_assign, "=" },
157 { lc_plus, "+" },
158 { lc_minus, "-" },
159 { lc_mult, "*" },
160 { lc_increase, "+=" },
161
162 /* Punctuators */
163 { lc_comma, "," },
164 { lc_colon, ":" },
165 { lc_scolon, ";" },
166
167 { 0, NULL },
168};
169
170/** Print lclass value.
171 *
172 * Prints lclass (lexical element class) value in human-readable form
173 * (for debugging).
174 *
175 * @param lclass Lclass value for display.
176 */
177void lclass_print(lclass_t lclass)
178{
179 struct lc_name *dp;
180
181 dp = keywords;
182 while (dp->name != NULL) {
183 if (dp->lclass == lclass) {
184 printf("%s", dp->name);
185 return;
186 }
187 ++dp;
188 }
189
190 dp = simple_lc;
191 while (dp->name != NULL) {
192 if (dp->lclass == lclass) {
193 printf("%s", dp->name);
194 return;
195 }
196 ++dp;
197 }
198
199 switch (lclass) {
200 case lc_ident:
201 printf("ident");
202 break;
203 case lc_lit_int:
204 printf("int_literal");
205 break;
206 case lc_lit_string:
207 printf("string_literal");
208 break;
209 default:
210 printf("<unknown?>");
211 break;
212 }
213}
214
215/** Print lexical element.
216 *
217 * Prints lexical element in human-readable form (for debugging).
218 *
219 * @param lem Lexical element for display.
220 */
221void lem_print(lem_t *lem)
222{
223 lclass_print(lem->lclass);
224
225 switch (lem->lclass) {
226 case lc_ident:
227 printf("('%s')", strtab_get_str(lem->u.ident.sid));
228 break;
229 case lc_lit_int:
230 printf("(");
231 bigint_print(&lem->u.lit_int.value);
232 printf(")");
233 break;
234 case lc_lit_string:
235 printf("(\"%s\")", lem->u.lit_string.value);
236 default:
237 break;
238 }
239}
240
241/** Print lem coordinates.
242 *
243 * Print the coordinates (line number, column number) of a lexical element.
244 *
245 * @param lem Lexical element for coordinate printing.
246 */
247void lem_print_coords(lem_t *lem)
248{
249 cspan_print(lem->cspan);
250}
251
252/** Initialize lexer instance.
253 *
254 * @param lex Lexer object to initialize.
255 * @param input Input to associate with lexer.
256 */
257void lex_init(lex_t *lex, struct input *input)
258{
259 errno_t rc;
260
261 lex->input = input;
262
263 rc = input_get_line(lex->input, &lex->inbuf);
264 if (rc != EOK) {
265 printf("Error reading input.\n");
266 exit(1);
267 }
268
269 lex->ibp = lex->inbuf;
270 lex->col_adj = 0;
271 lex->prev_valid = b_false;
272 lex->current_valid = b_true;
273}
274
275/** Advance to next lexical element.
276 *
277 * The new element is read in lazily then it is actually accessed.
278 *
279 * @param lex Lexer object.
280 */
281void lex_next(lex_t *lex)
282{
283 /* Make sure the current lem has already been read in. */
284 lex_touch(lex);
285
286 /* Force a new lem to be read on next access. */
287 lex->current_valid = b_false;
288}
289
290/** Get current lem.
291 *
292 * The returned pointer is invalidated by next call to lex_next()
293 *
294 * @param lex Lexer object.
295 * @return Pointer to current lem. Owned by @a lex and only valid
296 * until next call to lex_xxx().
297 */
298lem_t *lex_get_current(lex_t *lex)
299{
300 lex_touch(lex);
301 return &lex->current;
302}
303
304/** Get previous lem if valid.
305 *
306 * The returned pointer is invalidated by next call to lex_next()
307 *
308 * @param lex Lexer object.
309 * @return Pointer to previous lem. Owned by @a lex and only valid
310 * until next call to lex_xxx().
311 */
312lem_t *lex_peek_prev(lex_t *lex)
313{
314 if (lex->current_valid == b_false) {
315 /*
316 * This means the head is advanced but next lem was not read.
317 * Thus the previous lem is still in @a current.
318 */
319 return &lex->current;
320 }
321
322 if (lex->prev_valid != b_true) {
323 /* Looks like we are still at the first lem. */
324 return NULL;
325 }
326
327 /*
328 * Current lem has been read in. Thus the previous lem was moved to
329 * @a previous.
330 */
331 return &lex->prev;
332}
333
334/** Read in the current lexical element (unless already read in).
335 *
336 * @param lex Lexer object.
337 */
338static void lex_touch(lex_t *lex)
339{
340 bool_t got_lem;
341
342 if (lex->current_valid == b_true)
343 return;
344
345 /* Copy previous lem */
346 lex->prev = lex->current;
347 lex->prev_valid = b_true;
348
349 do {
350 got_lem = lex_read_try(lex);
351 } while (got_lem == b_false);
352
353 lex->current_valid = b_true;
354}
355
356/** Try reading next lexical element.
357 *
358 * Attemps to read the next lexical element. In some cases (such as a comment)
359 * this function will need to give it another try and returns @c b_false
360 * in such case.
361 *
362 * @param lex Lexer object.
363 * @return @c b_true on success or @c b_false if it needs
364 * restarting. On success the lem is stored to
365 * the current lem in @a lex.
366 */
367static bool_t lex_read_try(lex_t *lex)
368{
369 char *bp, *lsp;
370 int line0, col0;
371
372 lex_skip_ws(lex);
373
374 /*
375 * Record lem coordinates. Line number we already have. For column
376 * number we start with position in the input buffer. This works
377 * for all characters except tab. Thus we keep track of tabs
378 * separately using col_adj.
379 */
380 line0 = input_get_line_no(lex->input);
381 col0 = 1 + lex->col_adj + (lex->ibp - lex->inbuf);
382
383 lex->current.cspan = cspan_new(lex->input, line0, col0, line0, col0);
384
385 lsp = lex->ibp;
386 bp = lex->ibp;
387
388 if (bp[0] == '\0') {
389 /* End of input */
390 lex->current.lclass = lc_eof;
391 goto finish;
392 }
393
394 if (is_wstart(bp[0])) {
395 lex_word(lex);
396 goto finish;
397 }
398
399 if (bp[0] == '\'') {
400 lex_char(lex);
401 goto finish;
402 }
403
404 if (is_digit(bp[0])) {
405 lex_number(lex);
406 goto finish;
407 }
408
409 if (bp[0] == '"') {
410 lex_string(lex);
411 goto finish;
412 }
413
414 if (bp[0] == '-' && bp[1] == '-') {
415 lex_skip_comment(lex);
416
417 /* Compute ending column number */
418 lex->current.cspan->col1 = col0 + (lex->ibp - lsp) - 1;
419
420 /* Try again */
421 return b_false;
422 }
423
424 switch (bp[0]) {
425 case ',':
426 lex->current.lclass = lc_comma;
427 ++bp;
428 break;
429 case ':':
430 lex->current.lclass = lc_colon;
431 ++bp;
432 break;
433 case ';':
434 lex->current.lclass = lc_scolon;
435 ++bp;
436 break;
437
438 case '.':
439 lex->current.lclass = lc_period;
440 ++bp;
441 break;
442 case '/':
443 lex->current.lclass = lc_slash;
444 ++bp;
445 break;
446 case '(':
447 lex->current.lclass = lc_lparen;
448 ++bp;
449 break;
450 case ')':
451 lex->current.lclass = lc_rparen;
452 ++bp;
453 break;
454 case '[':
455 lex->current.lclass = lc_lsbr;
456 ++bp;
457 break;
458 case ']':
459 lex->current.lclass = lc_rsbr;
460 ++bp;
461 break;
462
463 case '=':
464 if (bp[1] == '=') {
465 lex->current.lclass = lc_equal;
466 bp += 2;
467 break;
468 }
469 lex->current.lclass = lc_assign;
470 ++bp;
471 break;
472
473 case '!':
474 if (bp[1] == '=') {
475 lex->current.lclass = lc_notequal;
476 bp += 2;
477 break;
478 }
479 goto invalid;
480
481 case '+':
482 if (bp[1] == '=') {
483 lex->current.lclass = lc_increase;
484 bp += 2;
485 break;
486 }
487 lex->current.lclass = lc_plus;
488 ++bp;
489 break;
490
491 case '-':
492 lex->current.lclass = lc_minus;
493 ++bp;
494 break;
495
496 case '*':
497 lex->current.lclass = lc_mult;
498 ++bp;
499 break;
500
501 case '<':
502 if (bp[1] == '=') {
503 lex->current.lclass = lc_lt_equal;
504 bp += 2;
505 break;
506 }
507 lex->current.lclass = lc_lt;
508 ++bp;
509 break;
510
511 case '>':
512 if (bp[1] == '=') {
513 lex->current.lclass = lc_gt_equal;
514 bp += 2;
515 break;
516 }
517 lex->current.lclass = lc_gt;
518 ++bp;
519 break;
520
521 default:
522 goto invalid;
523 }
524
525 lex->ibp = bp;
526
527finish:
528 /* Compute ending column number */
529 lex->current.cspan->col1 = col0 + (lex->ibp - lsp) - 1;
530 return b_true;
531
532invalid:
533 lex->current.lclass = lc_invalid;
534 ++bp;
535 lex->ibp = bp;
536
537 return b_true;
538}
539
540/** Lex a word (identifier or keyword).
541 *
542 * Read in a word. This may later turn out to be a keyword or a regular
543 * identifier. It is stored in the current lem in @a lex.
544 *
545 * @param lex Lexer object.
546 */
547static void lex_word(lex_t *lex)
548{
549 struct lc_name *dp;
550 char *bp;
551 int idx;
552
553 bp = lex->ibp;
554 ident_buf[0] = bp[0];
555 idx = 1;
556
557 while (is_wcont(bp[idx])) {
558 if (idx >= IBUF_SIZE) {
559 printf("Error: Identifier too long.\n");
560 exit(1);
561 }
562
563 ident_buf[idx] = bp[idx];
564 ++idx;
565 }
566
567 lex->ibp = bp + idx;
568
569 ident_buf[idx] = '\0';
570
571 dp = keywords;
572 while (dp->name != NULL) {
573 if (os_str_cmp(ident_buf, dp->name) == 0) {
574 /* Match */
575 lex->current.lclass = dp->lclass;
576 return;
577 }
578 ++dp;
579 }
580
581 /* No matching keyword -- it must be an identifier. */
582 lex->current.lclass = lc_ident;
583 lex->current.u.ident.sid = strtab_get_sid(ident_buf);
584}
585
586/** Lex a character literal.
587 *
588 * Reads in a character literal and stores it in the current lem in @a lex.
589 *
590 * @param lex Lexer object.
591 */
592static void lex_char(lex_t *lex)
593{
594 size_t len;
595 int char_val;
596
597 lex_char_string_core(lex, cs_chr);
598
599 len = os_str_length(strlit_buf);
600 if (len != 1) {
601 printf("Character literal should contain one character, "
602 "but contains %u characters instead.\n", (unsigned) len);
603 exit(1);
604 }
605
606 os_str_get_char(strlit_buf, 0, &char_val);
607 lex->current.lclass = lc_lit_char;
608 bigint_init(&lex->current.u.lit_char.value, char_val);
609}
610
611/** Lex a numeric literal.
612 *
613 * Reads in a numeric literal and stores it in the current lem in @a lex.
614 *
615 * @param lex Lexer object.
616 */
617static void lex_number(lex_t *lex)
618{
619 char *bp;
620 bigint_t value;
621 bigint_t dgval;
622 bigint_t base;
623 bigint_t tprod;
624
625 bp = lex->ibp;
626
627 bigint_init(&value, 0);
628 bigint_init(&base, 10);
629
630 while (is_digit(*bp)) {
631 bigint_mul(&value, &base, &tprod);
632 bigint_init(&dgval, digit_value(*bp));
633
634 bigint_destroy(&value);
635 bigint_add(&tprod, &dgval, &value);
636 bigint_destroy(&tprod);
637 bigint_destroy(&dgval);
638
639 ++bp;
640 }
641
642 bigint_destroy(&base);
643
644 lex->ibp = bp;
645
646 lex->current.lclass = lc_lit_int;
647 bigint_shallow_copy(&value, &lex->current.u.lit_int.value);
648}
649
650/** Lex a string literal.
651 *
652 * Reads in a string literal and stores it in the current lem in @a lex.
653 *
654 * @param lex Lexer object.
655 */
656static void lex_string(lex_t *lex)
657{
658 lex_char_string_core(lex, cs_str);
659
660 lex->current.lclass = lc_lit_string;
661 lex->current.u.lit_string.value = os_str_dup(strlit_buf);
662}
663
664static void lex_char_string_core(lex_t *lex, chr_str_t cs)
665{
666 char *bp;
667 int sidx, didx;
668 char term;
669 const char *descr, *cap_descr;
670 char spchar;
671
672 /* Make compiler happy */
673 term = '\0';
674 descr = NULL;
675 cap_descr = NULL;
676
677 switch (cs) {
678 case cs_chr:
679 term = '\'';
680 descr = "character";
681 cap_descr = "Character";
682 break;
683 case cs_str:
684 term = '"';
685 descr = "string";
686 cap_descr = "String";
687 break;
688 }
689
690 bp = lex->ibp + 1;
691 sidx = didx = 0;
692
693 while (bp[sidx] != term) {
694 if (didx >= SLBUF_SIZE) {
695 printf("Error: %s literal too long.\n", cap_descr);
696 exit(1);
697 }
698
699 if (bp[sidx] == '\0') {
700 printf("Error: Unterminated %s literal.\n", descr);
701 exit(1);
702 }
703
704 if (bp[sidx] == '\\') {
705 switch (bp[sidx + 1]) {
706 case '\\':
707 spchar = '\\';
708 break;
709 case '\'':
710 spchar = '\'';
711 break;
712 case '"':
713 spchar = '"';
714 break;
715 case 'n':
716 spchar = '\n';
717 break;
718 case 't':
719 spchar = '\t';
720 break;
721 default:
722 printf("Error: Unknown character escape sequence.\n");
723 exit(1);
724 }
725
726 strlit_buf[didx] = spchar;
727 ++didx;
728 sidx += 2;
729 } else {
730 strlit_buf[didx] = bp[sidx];
731 ++sidx;
732 ++didx;
733 }
734 }
735
736 lex->ibp = bp + sidx + 1;
737
738 strlit_buf[didx] = '\0';
739}
740
741/** Lex a single-line comment.
742 *
743 * This does not produce any lem. The comment is just skipped.
744 *
745 * @param lex Lexer object.
746 */
747static void lex_skip_comment(lex_t *lex)
748{
749 char *bp;
750
751 bp = lex->ibp + 2;
752
753 while (*bp != '\n' && *bp != '\0') {
754 ++bp;
755 }
756
757 lex->ibp = bp;
758}
759
760/** Skip whitespace characters.
761 *
762 * This does not produce any lem. The whitespace is just skipped.
763 *
764 * @param lex Lexer object.
765 */
766static void lex_skip_ws(lex_t *lex)
767{
768 char *bp;
769 errno_t rc;
770
771 bp = lex->ibp;
772
773 while (b_true) {
774 while (*bp == ' ' || *bp == '\t') {
775 if (*bp == '\t') {
776 /* XXX This is too simplifed. */
777 lex->col_adj += (TAB_WIDTH - 1);
778 }
779 ++bp;
780 }
781
782 if (*bp != '\n')
783 break;
784
785 /* Read next line */
786 rc = input_get_line(lex->input, &lex->inbuf);
787 if (rc != EOK) {
788 printf("Error reading input.\n");
789 exit(1);
790 }
791
792 bp = lex->inbuf;
793 lex->col_adj = 0;
794 }
795
796 lex->ibp = bp;
797}
798
799/** Determine if character can start a word.
800 *
801 * @param c Character.
802 * @return @c b_true if @a c can start a word, @c b_false otherwise.
803 */
804static bool_t is_wstart(char c)
805{
806 return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) ||
807 (c == '_');
808}
809
810/** Determine if character can continue a word.
811 *
812 * @param c Character.
813 * @return @c b_true if @a c can start continue word, @c b_false
814 * otherwise.
815 */
816static bool_t is_wcont(char c)
817{
818 return is_digit(c) || is_wstart(c);
819}
820
821/** Determine if character is a numeric digit.
822 *
823 * @param c Character.
824 * @return @c b_true if @a c is a numeric digit, @c b_false otherwise.
825 */
826static bool_t is_digit(char c)
827{
828 return ((c >= '0') && (c <= '9'));
829}
830
831/** Determine numeric value of digit character.
832 *
833 * @param c Character, must be a valid decimal digit.
834 * @return Value of the digit (0-9).
835 */
836static int digit_value(char c)
837{
838 return (c - '0');
839}
Note: See TracBrowser for help on using the repository browser.