source: mainline/uspace/app/sbi/src/lex.c@ 37c9fc8

lfn serial ticket/834-toolchain-update topic/msim-upgrade topic/simplify-dev-export
Last change on this file since 37c9fc8 was 38aaacc2, checked in by Jiri Svoboda <jiri@…>, 16 years ago

Update SBI to rev. 207.

  • Property mode set to 100644
File size: 14.4 KB
Line 
1/*
2 * Copyright (c) 2010 Jiri Svoboda
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** @file Lexer (lexical analyzer).
30 *
31 * Consumes a text file and produces a sequence of lexical elements (lems).
32 */
33
34#include <stdio.h>
35#include <stdlib.h>
36#include "bigint.h"
37#include "mytypes.h"
38#include "input.h"
39#include "os/os.h"
40#include "strtab.h"
41
42#include "lex.h"
43
44#define TAB_WIDTH 8
45
46static void lex_touch(lex_t *lex);
47static bool_t lex_read_try(lex_t *lex);
48
49static void lex_skip_comment(lex_t *lex);
50static void lex_skip_ws(lex_t *lex);
51static bool_t is_wstart(char c);
52static bool_t is_wcont(char c);
53static bool_t is_digit(char c);
54static void lex_word(lex_t *lex);
55static void lex_char(lex_t *lex);
56static void lex_number(lex_t *lex);
57static void lex_string(lex_t *lex);
58static int digit_value(char c);
59
60/* Note: This imposes an implementation limit on identifier length. */
61#define IBUF_SIZE 128
62static char ident_buf[IBUF_SIZE + 1];
63
64/* XXX This imposes an implementation limit on string literal length. */
65#define SLBUF_SIZE 128
66static char strlit_buf[SLBUF_SIZE + 1];
67
68/** Lclass-string pair */
69struct lc_name {
70 lclass_t lclass;
71 const char *name;
72};
73
74/** Keyword names. Used both for printing and recognition. */
75static struct lc_name keywords[] = {
76 { lc_as, "as" },
77 { lc_bool, "bool" },
78 { lc_char, "char" },
79 { lc_builtin, "builtin" },
80 { lc_class, "class" },
81 { lc_constructor, "constructor" },
82 { lc_deleg, "deleg" },
83 { lc_do, "do" },
84 { lc_else, "else" },
85 { lc_end, "end" },
86 { lc_except, "except" },
87 { lc_false, "false" },
88 { lc_finally, "finally" },
89 { lc_for, "for" },
90 { lc_fun, "fun" },
91 { lc_get, "get" },
92 { lc_if, "if" },
93 { lc_in, "in" },
94 { lc_int, "int" },
95 { lc_interface, "interface" },
96 { lc_is, "is" },
97 { lc_new, "new" },
98 { lc_nil, "nil" },
99 { lc_override, "override" },
100 { lc_packed, "packed" },
101 { lc_private, "private" },
102 { lc_prop, "prop" },
103 { lc_protected, "protected" },
104 { lc_public, "public" },
105 { lc_raise, "raise" },
106 { lc_resource, "resource" },
107 { lc_return, "return" },
108 { lc_self, "self" },
109 { lc_set, "set" },
110 { lc_static, "static" },
111 { lc_string, "string" },
112 { lc_struct, "struct" },
113 { lc_then, "then" },
114 { lc_this, "this" },
115 { lc_true, "true" },
116 { lc_var, "var" },
117 { lc_with, "with" },
118 { lc_while, "while" },
119 { lc_yield, "yield" },
120
121 { 0, NULL }
122};
123
124/** Other simple lclasses. Only used for printing. */
125static struct lc_name simple_lc[] = {
126 { lc_invalid, "INVALID" },
127 { lc_eof, "EOF" },
128
129 /* Operators */
130 { lc_period, "." },
131 { lc_slash, "/" },
132 { lc_lparen, "(" },
133 { lc_rparen, ")" },
134 { lc_lsbr, "[" },
135 { lc_rsbr, "]" },
136 { lc_equal, "==" },
137 { lc_notequal, "!=" },
138 { lc_lt, "<" },
139 { lc_gt, ">" },
140 { lc_lt_equal, "<=" },
141 { lc_gt_equal, ">=" },
142 { lc_assign, "=" },
143 { lc_plus, "+" },
144 { lc_minus, "-" },
145 { lc_mult, "*" },
146 { lc_increase, "+=" },
147
148 /* Punctuators */
149 { lc_comma, "," },
150 { lc_colon, ":" },
151 { lc_scolon, ";" },
152
153 { 0, NULL },
154};
155
156/** Print lclass value.
157 *
158 * Prints lclass (lexical element class) value in human-readable form
159 * (for debugging).
160 *
161 * @param lclass Lclass value for display.
162 */
163void lclass_print(lclass_t lclass)
164{
165 struct lc_name *dp;
166
167 dp = keywords;
168 while (dp->name != NULL) {
169 if (dp->lclass == lclass) {
170 printf("%s", dp->name);
171 return;
172 }
173 ++dp;
174 }
175
176 dp = simple_lc;
177 while (dp->name != NULL) {
178 if (dp->lclass == lclass) {
179 printf("%s", dp->name);
180 return;
181 }
182 ++dp;
183 }
184
185 switch (lclass) {
186 case lc_ident:
187 printf("ident");
188 break;
189 case lc_lit_int:
190 printf("int_literal");
191 break;
192 case lc_lit_string:
193 printf("string_literal");
194 break;
195 default:
196 printf("<unknown?>");
197 break;
198 }
199}
200
201/** Print lexical element.
202 *
203 * Prints lexical element in human-readable form (for debugging).
204 *
205 * @param lem Lexical element for display.
206 */
207void lem_print(lem_t *lem)
208{
209 lclass_print(lem->lclass);
210
211 switch (lem->lclass) {
212 case lc_ident:
213 printf("('%s')", strtab_get_str(lem->u.ident.sid));
214 break;
215 case lc_lit_int:
216 printf("(");
217 bigint_print(&lem->u.lit_int.value);
218 printf(")");
219 break;
220 case lc_lit_string:
221 printf("(\"%s\")", lem->u.lit_string.value);
222 default:
223 break;
224 }
225}
226
227/** Print lem coordinates.
228 *
229 * Print the coordinates (line number, column number) of a lexical element.
230 *
231 * @param lem Lexical element for coordinate printing.
232 */
233void lem_print_coords(lem_t *lem)
234{
235 printf("%d:%d", lem->line_no, lem->col_0);
236}
237
238/** Initialize lexer instance.
239 *
240 * @param lex Lexer object to initialize.
241 * @param input Input to associate with lexer.
242 */
243void lex_init(lex_t *lex, struct input *input)
244{
245 int rc;
246
247 lex->input = input;
248
249 rc = input_get_line(lex->input, &lex->inbuf);
250 if (rc != EOK) {
251 printf("Error reading input.\n");
252 exit(1);
253 }
254
255 lex->ibp = lex->inbuf;
256 lex->col_adj = 0;
257 lex->current_valid = b_true;
258}
259
260/** Advance to next lexical element.
261 *
262 * The new element is read in lazily then it is actually accessed.
263 *
264 * @param lex Lexer object.
265 */
266void lex_next(lex_t *lex)
267{
268 /* Make sure the current lem has already been read in. */
269 lex_touch(lex);
270
271 /* Force a new lem to be read on next access. */
272 lex->current_valid = b_false;
273}
274
275/** Get current lem.
276 *
277 * The returned pointer is invalidated by next call to lex_next()
278 *
279 * @param lex Lexer object.
280 * @return Pointer to current lem. Owned by @a lex and only valid
281 * until next call to lex_next().
282 */
283lem_t *lex_get_current(lex_t *lex)
284{
285 lex_touch(lex);
286 return &lex->current;
287}
288
289/** Read in the current lexical element (unless already read in).
290 *
291 * @param lex Lexer object.
292 */
293static void lex_touch(lex_t *lex)
294{
295 bool_t got_lem;
296
297 if (lex->current_valid == b_true)
298 return;
299
300 do {
301 got_lem = lex_read_try(lex);
302 } while (got_lem == b_false);
303
304 lex->current_valid = b_true;
305}
306
307/** Try reading next lexical element.
308 *
309 * Attemps to read the next lexical element. In some cases (such as a comment)
310 * this function will need to give it another try and returns @c b_false
311 * in such case.
312 *
313 * @param lex Lexer object.
314 * @return @c b_true on success or @c b_false if it needs
315 * restarting. On success the lem is stored to
316 * the current lem in @a lex.
317 */
318static bool_t lex_read_try(lex_t *lex)
319{
320 char *bp;
321
322 lex_skip_ws(lex);
323
324 /*
325 * Record lem coordinates. Line number we already have. For column
326 * number we start with position in the input buffer. This works
327 * for all characters except tab. Thus we keep track of tabs
328 * separately using col_adj.
329 */
330 lex->current.line_no = input_get_line_no(lex->input);
331 lex->current.col_0 = 1 + lex->col_adj + (lex->ibp - lex->inbuf);
332
333 bp = lex->ibp;
334
335 if (bp[0] == '\0') {
336 /* End of input */
337 lex->current.lclass = lc_eof;
338 return b_true;
339 }
340
341 if (is_wstart(bp[0])) {
342 lex_word(lex);
343 return b_true;
344 }
345
346 if (bp[0] == '\'') {
347 lex_char(lex);
348 return b_true;
349 }
350
351 if (is_digit(bp[0])) {
352 lex_number(lex);
353 return b_true;
354 }
355
356 if (bp[0] == '"') {
357 lex_string(lex);
358 return b_true;
359 }
360
361 if (bp[0] == '-' && bp[1] == '-') {
362 lex_skip_comment(lex);
363 return b_false;
364 }
365
366 switch (bp[0]) {
367 case ',': lex->current.lclass = lc_comma; ++bp; break;
368 case ':': lex->current.lclass = lc_colon; ++bp; break;
369 case ';': lex->current.lclass = lc_scolon; ++bp; break;
370
371 case '.': lex->current.lclass = lc_period; ++bp; break;
372 case '/': lex->current.lclass = lc_slash; ++bp; break;
373 case '(': lex->current.lclass = lc_lparen; ++bp; break;
374 case ')': lex->current.lclass = lc_rparen; ++bp; break;
375 case '[': lex->current.lclass = lc_lsbr; ++bp; break;
376 case ']': lex->current.lclass = lc_rsbr; ++bp; break;
377
378 case '=':
379 if (bp[1] == '=') {
380 lex->current.lclass = lc_equal; bp += 2; break;
381 }
382 lex->current.lclass = lc_assign; ++bp; break;
383
384 case '!':
385 if (bp[1] == '=') {
386 lex->current.lclass = lc_notequal; bp += 2; break;
387 }
388 goto invalid;
389
390 case '+':
391 if (bp[1] == '=') {
392 lex->current.lclass = lc_increase; bp += 2; break;
393 }
394 lex->current.lclass = lc_plus; ++bp; break;
395
396 case '-':
397 lex->current.lclass = lc_minus; ++bp; break;
398
399 case '*':
400 lex->current.lclass = lc_mult; ++bp; break;
401
402 case '<':
403 if (bp[1] == '=') {
404 lex->current.lclass = lc_lt_equal; bp += 2; break;
405 }
406 lex->current.lclass = lc_lt; ++bp; break;
407
408 case '>':
409 if (bp[1] == '=') {
410 lex->current.lclass = lc_gt_equal; bp += 2; break;
411 }
412 lex->current.lclass = lc_gt; ++bp; break;
413
414 default:
415 goto invalid;
416 }
417
418 lex->ibp = bp;
419 return b_true;
420
421invalid:
422 lex->current.lclass = lc_invalid;
423 ++bp;
424 lex->ibp = bp;
425
426 return b_true;
427}
428
429/** Lex a word (identifier or keyword).
430 *
431 * Read in a word. This may later turn out to be a keyword or a regular
432 * identifier. It is stored in the current lem in @a lex.
433 *
434 * @param lex Lexer object.
435 */
436static void lex_word(lex_t *lex)
437{
438 struct lc_name *dp;
439 char *bp;
440 int idx;
441
442 bp = lex->ibp;
443 ident_buf[0] = bp[0];
444 idx = 1;
445
446 while (is_wcont(bp[idx])) {
447 if (idx >= IBUF_SIZE) {
448 printf("Error: Identifier too long.\n");
449 exit(1);
450 }
451
452 ident_buf[idx] = bp[idx];
453 ++idx;
454 }
455
456 lex->ibp = bp + idx;
457
458 ident_buf[idx] = '\0';
459
460 dp = keywords;
461 while (dp->name != NULL) {
462 if (os_str_cmp(ident_buf, dp->name) == 0) {
463 /* Match */
464 lex->current.lclass = dp->lclass;
465 return;
466 }
467 ++dp;
468 }
469
470 /* No matching keyword -- it must be an identifier. */
471 lex->current.lclass = lc_ident;
472 lex->current.u.ident.sid = strtab_get_sid(ident_buf);
473}
474
475/** Lex a character literal.
476 *
477 * Reads in a character literal and stores it in the current lem in @a lex.
478 *
479 * @param lex Lexer object.
480 */
481static void lex_char(lex_t *lex)
482{
483 char *bp;
484 int idx;
485 size_t len;
486 int char_val;
487
488 bp = lex->ibp + 1;
489 idx = 0;
490
491 while (bp[idx] != '\'') {
492 if (idx >= SLBUF_SIZE) {
493 printf("Error: Character literal too long.\n");
494 exit(1);
495 }
496
497 if (bp[idx] == '\0') {
498 printf("Error: Unterminated character literal.\n");
499 exit(1);
500 }
501
502 strlit_buf[idx] = bp[idx];
503 ++idx;
504 }
505
506 lex->ibp = bp + idx + 1;
507
508 strlit_buf[idx] = '\0';
509 len = os_str_length(strlit_buf);
510 if (len != 1) {
511 printf("Character literal should contain one character, "
512 "but contains %u characters instead.\n", (unsigned) len);
513 exit(1);
514 }
515
516 os_str_get_char(strlit_buf, 0, &char_val);
517 lex->current.lclass = lc_lit_char;
518 bigint_init(&lex->current.u.lit_char.value, char_val);
519}
520
521/** Lex a numeric literal.
522 *
523 * Reads in a numeric literal and stores it in the current lem in @a lex.
524 *
525 * @param lex Lexer object.
526 */
527static void lex_number(lex_t *lex)
528{
529 char *bp;
530 bigint_t value;
531 bigint_t dgval;
532 bigint_t base;
533 bigint_t tprod;
534
535 bp = lex->ibp;
536
537 bigint_init(&value, 0);
538 bigint_init(&base, 10);
539
540 while (is_digit(*bp)) {
541 bigint_mul(&value, &base, &tprod);
542 bigint_init(&dgval, digit_value(*bp));
543
544 bigint_destroy(&value);
545 bigint_add(&tprod, &dgval, &value);
546 bigint_destroy(&tprod);
547 bigint_destroy(&dgval);
548
549 ++bp;
550 }
551
552 bigint_destroy(&base);
553
554 lex->ibp = bp;
555
556 lex->current.lclass = lc_lit_int;
557 bigint_shallow_copy(&value, &lex->current.u.lit_int.value);
558}
559
560/** Lex a string literal.
561 *
562 * Reads in a string literal and stores it in the current lem in @a lex.
563 *
564 * @param lex Lexer object.
565 */
566static void lex_string(lex_t *lex)
567{
568 char *bp;
569 int idx;
570
571 bp = lex->ibp + 1;
572 idx = 0;
573
574 while (bp[idx] != '"') {
575 if (idx >= SLBUF_SIZE) {
576 printf("Error: String literal too long.\n");
577 exit(1);
578 }
579
580 if (bp[idx] == '\0') {
581 printf("Error: Unterminated string literal.\n");
582 exit(1);
583 }
584
585 strlit_buf[idx] = bp[idx];
586 ++idx;
587 }
588
589 lex->ibp = bp + idx + 1;
590
591 strlit_buf[idx] = '\0';
592
593 lex->current.lclass = lc_lit_string;
594 lex->current.u.lit_string.value = os_str_dup(strlit_buf);
595}
596
597/** Lex a single-line comment.
598 *
599 * This does not produce any lem. The comment is just skipped.
600 *
601 * @param lex Lexer object.
602 */
603static void lex_skip_comment(lex_t *lex)
604{
605 char *bp;
606
607 bp = lex->ibp + 2;
608
609 while (*bp != '\n' && *bp != '\0') {
610 ++bp;
611 }
612
613 lex->ibp = bp;
614}
615
616/** Skip whitespace characters.
617 *
618 * This does not produce any lem. The whitespace is just skipped.
619 *
620 * @param lex Lexer object.
621 */
622static void lex_skip_ws(lex_t *lex)
623{
624 char *bp;
625 int rc;
626
627 bp = lex->ibp;
628
629 while (b_true) {
630 while (*bp == ' ' || *bp == '\t') {
631 if (*bp == '\t') {
632 /* XXX This is too simplifed. */
633 lex->col_adj += (TAB_WIDTH - 1);
634 }
635 ++bp;
636 }
637
638 if (*bp != '\n')
639 break;
640
641 /* Read next line */
642 rc = input_get_line(lex->input, &lex->inbuf);
643 if (rc != EOK) {
644 printf("Error reading input.\n");
645 exit(1);
646 }
647
648 bp = lex->inbuf;
649 lex->col_adj = 0;
650 }
651
652 lex->ibp = bp;
653}
654
655/** Determine if character can start a word.
656 *
657 * @param c Character.
658 * @return @c b_true if @a c can start a word, @c b_false otherwise.
659 */
660static bool_t is_wstart(char c)
661{
662 return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) ||
663 (c == '_');
664}
665
666/** Determine if character can continue a word.
667 *
668 * @param c Character.
669 * @return @c b_true if @a c can start continue word, @c b_false
670 * otherwise.
671 */
672static bool_t is_wcont(char c)
673{
674 return is_digit(c) || is_wstart(c);
675}
676
677/** Determine if character is a numeric digit.
678 *
679 * @param c Character.
680 * @return @c b_true if @a c is a numeric digit, @c b_false otherwise.
681 */
682static bool_t is_digit(char c)
683{
684 return ((c >= '0') && (c <= '9'));
685}
686
687/** Determine numeric value of digit character.
688 *
689 * @param c Character, must be a valid decimal digit.
690 * @return Value of the digit (0-9).
691 */
692static int digit_value(char c)
693{
694 return (c - '0');
695}
Note: See TracBrowser for help on using the repository browser.