source: mainline/uspace/app/sbi/src/lex.c@ 074444f

lfn serial ticket/834-toolchain-update topic/msim-upgrade topic/simplify-dev-export
Last change on this file since 074444f was 074444f, checked in by Jiri Svoboda <jiri@…>, 16 years ago

Update SBI to rev. 184.

  • Property mode set to 100644
File size: 14.3 KB
Line 
1/*
2 * Copyright (c) 2010 Jiri Svoboda
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** @file Lexer (lexical analyzer).
30 *
31 * Consumes a text file and produces a sequence of lexical elements (lems).
32 */
33
34#include <stdio.h>
35#include <stdlib.h>
36#include "bigint.h"
37#include "mytypes.h"
38#include "input.h"
39#include "os/os.h"
40#include "strtab.h"
41
42#include "lex.h"
43
44#define TAB_WIDTH 8
45
46static void lex_touch(lex_t *lex);
47static bool_t lex_read_try(lex_t *lex);
48
49static void lex_skip_comment(lex_t *lex);
50static void lex_skip_ws(lex_t *lex);
51static bool_t is_wstart(char c);
52static bool_t is_wcont(char c);
53static bool_t is_digit(char c);
54static void lex_word(lex_t *lex);
55static void lex_char(lex_t *lex);
56static void lex_number(lex_t *lex);
57static void lex_string(lex_t *lex);
58static int digit_value(char c);
59
60/* Note: This imposes an implementation limit on identifier length. */
61#define IBUF_SIZE 128
62static char ident_buf[IBUF_SIZE + 1];
63
64/* XXX This imposes an implementation limit on string literal length. */
65#define SLBUF_SIZE 128
66static char strlit_buf[SLBUF_SIZE + 1];
67
68/** Lclass-string pair */
69struct lc_name {
70 lclass_t lclass;
71 const char *name;
72};
73
74/** Keyword names. Used both for printing and recognition. */
75static struct lc_name keywords[] = {
76 { lc_as, "as" },
77 { lc_bool, "bool" },
78 { lc_char, "char" },
79 { lc_builtin, "builtin" },
80 { lc_class, "class" },
81 { lc_constructor, "constructor" },
82 { lc_do, "do" },
83 { lc_else, "else" },
84 { lc_end, "end" },
85 { lc_except, "except" },
86 { lc_false, "false" },
87 { lc_finally, "finally" },
88 { lc_for, "for" },
89 { lc_fun, "fun" },
90 { lc_get, "get" },
91 { lc_if, "if" },
92 { lc_in, "in" },
93 { lc_int, "int" },
94 { lc_interface, "interface" },
95 { lc_is, "is" },
96 { lc_new, "new" },
97 { lc_nil, "nil" },
98 { lc_override, "override" },
99 { lc_packed, "packed" },
100 { lc_private, "private" },
101 { lc_prop, "prop" },
102 { lc_protected, "protected" },
103 { lc_public, "public" },
104 { lc_raise, "raise" },
105 { lc_resource, "resource" },
106 { lc_return, "return" },
107 { lc_self, "self" },
108 { lc_set, "set" },
109 { lc_static, "static" },
110 { lc_string, "string" },
111 { lc_struct, "struct" },
112 { lc_then, "then" },
113 { lc_this, "this" },
114 { lc_true, "true" },
115 { lc_var, "var" },
116 { lc_with, "with" },
117 { lc_while, "while" },
118 { lc_yield, "yield" },
119
120 { 0, NULL }
121};
122
123/** Other simple lclasses. Only used for printing. */
124static struct lc_name simple_lc[] = {
125 { lc_invalid, "INVALID" },
126 { lc_eof, "EOF" },
127
128 /* Operators */
129 { lc_period, "." },
130 { lc_slash, "/" },
131 { lc_lparen, "(" },
132 { lc_rparen, ")" },
133 { lc_lsbr, "[" },
134 { lc_rsbr, "]" },
135 { lc_equal, "==" },
136 { lc_notequal, "!=" },
137 { lc_lt, "<" },
138 { lc_gt, ">" },
139 { lc_lt_equal, "<=" },
140 { lc_gt_equal, ">=" },
141 { lc_assign, "=" },
142 { lc_plus, "+" },
143 { lc_minus, "-" },
144 { lc_mult, "*" },
145 { lc_increase, "+=" },
146
147 /* Punctuators */
148 { lc_comma, "," },
149 { lc_colon, ":" },
150 { lc_scolon, ";" },
151
152 { 0, NULL },
153};
154
155/** Print lclass value.
156 *
157 * Prints lclass (lexical element class) value in human-readable form
158 * (for debugging).
159 *
160 * @param lclass Lclass value for display.
161 */
162void lclass_print(lclass_t lclass)
163{
164 struct lc_name *dp;
165
166 dp = keywords;
167 while (dp->name != NULL) {
168 if (dp->lclass == lclass) {
169 printf("%s", dp->name);
170 return;
171 }
172 ++dp;
173 }
174
175 dp = simple_lc;
176 while (dp->name != NULL) {
177 if (dp->lclass == lclass) {
178 printf("%s", dp->name);
179 return;
180 }
181 ++dp;
182 }
183
184 switch (lclass) {
185 case lc_ident:
186 printf("ident");
187 break;
188 case lc_lit_int:
189 printf("int_literal");
190 break;
191 case lc_lit_string:
192 printf("string_literal");
193 break;
194 default:
195 printf("<unknown?>");
196 break;
197 }
198}
199
200/** Print lexical element.
201 *
202 * Prints lexical element in human-readable form (for debugging).
203 *
204 * @param lem Lexical element for display.
205 */
206void lem_print(lem_t *lem)
207{
208 lclass_print(lem->lclass);
209
210 switch (lem->lclass) {
211 case lc_ident:
212 printf("(%d)", lem->u.ident.sid);
213 break;
214 case lc_lit_int:
215 printf("(");
216 bigint_print(&lem->u.lit_int.value);
217 printf(")");
218 break;
219 case lc_lit_string:
220 printf("(\"%s\")", lem->u.lit_string.value);
221 default:
222 break;
223 }
224}
225
226/** Print lem coordinates.
227 *
228 * Print the coordinates (line number, column number) of a lexical element.
229 *
230 * @param lem Lexical element for coordinate printing.
231 */
232void lem_print_coords(lem_t *lem)
233{
234 printf("%d:%d", lem->line_no, lem->col_0);
235}
236
237/** Initialize lexer instance.
238 *
239 * @param lex Lexer object to initialize.
240 * @param input Input to associate with lexer.
241 */
242void lex_init(lex_t *lex, struct input *input)
243{
244 int rc;
245
246 lex->input = input;
247
248 rc = input_get_line(lex->input, &lex->inbuf);
249 if (rc != EOK) {
250 printf("Error reading input.\n");
251 exit(1);
252 }
253
254 lex->ibp = lex->inbuf;
255 lex->col_adj = 0;
256 lex->current_valid = b_true;
257}
258
259/** Advance to next lexical element.
260 *
261 * The new element is read in lazily then it is actually accessed.
262 *
263 * @param lex Lexer object.
264 */
265void lex_next(lex_t *lex)
266{
267 /* Make sure the current lem has already been read in. */
268 lex_touch(lex);
269
270 /* Force a new lem to be read on next access. */
271 lex->current_valid = b_false;
272}
273
274/** Get current lem.
275 *
276 * The returned pointer is invalidated by next call to lex_next()
277 *
278 * @param lex Lexer object.
279 * @return Pointer to current lem. Owned by @a lex and only valid
280 * until next call to lex_next().
281 */
282lem_t *lex_get_current(lex_t *lex)
283{
284 lex_touch(lex);
285 return &lex->current;
286}
287
288/** Read in the current lexical element (unless already read in).
289 *
290 * @param lex Lexer object.
291 */
292static void lex_touch(lex_t *lex)
293{
294 bool_t got_lem;
295
296 if (lex->current_valid == b_true)
297 return;
298
299 do {
300 got_lem = lex_read_try(lex);
301 } while (got_lem == b_false);
302
303 lex->current_valid = b_true;
304}
305
306/** Try reading next lexical element.
307 *
308 * Attemps to read the next lexical element. In some cases (such as a comment)
309 * this function will need to give it another try and returns @c b_false
310 * in such case.
311 *
312 * @param lex Lexer object.
313 * @return @c b_true on success or @c b_false if it needs
314 * restarting. On success the lem is stored to
315 * the current lem in @a lex.
316 */
317static bool_t lex_read_try(lex_t *lex)
318{
319 char *bp;
320
321 lex_skip_ws(lex);
322
323 /*
324 * Record lem coordinates. Line number we already have. For column
325 * number we start with position in the input buffer. This works
326 * for all characters except tab. Thus we keep track of tabs
327 * separately using col_adj.
328 */
329 lex->current.line_no = input_get_line_no(lex->input);
330 lex->current.col_0 = 1 + lex->col_adj + (lex->ibp - lex->inbuf);
331
332 bp = lex->ibp;
333
334 if (bp[0] == '\0') {
335 /* End of input */
336 lex->current.lclass = lc_eof;
337 return b_true;
338 }
339
340 if (is_wstart(bp[0])) {
341 lex_word(lex);
342 return b_true;
343 }
344
345 if (bp[0] == '\'') {
346 lex_char(lex);
347 return b_true;
348 }
349
350 if (is_digit(bp[0])) {
351 lex_number(lex);
352 return b_true;
353 }
354
355 if (bp[0] == '"') {
356 lex_string(lex);
357 return b_true;
358 }
359
360 if (bp[0] == '-' && bp[1] == '-') {
361 lex_skip_comment(lex);
362 return b_false;
363 }
364
365 switch (bp[0]) {
366 case ',': lex->current.lclass = lc_comma; ++bp; break;
367 case ':': lex->current.lclass = lc_colon; ++bp; break;
368 case ';': lex->current.lclass = lc_scolon; ++bp; break;
369
370 case '.': lex->current.lclass = lc_period; ++bp; break;
371 case '/': lex->current.lclass = lc_slash; ++bp; break;
372 case '(': lex->current.lclass = lc_lparen; ++bp; break;
373 case ')': lex->current.lclass = lc_rparen; ++bp; break;
374 case '[': lex->current.lclass = lc_lsbr; ++bp; break;
375 case ']': lex->current.lclass = lc_rsbr; ++bp; break;
376
377 case '=':
378 if (bp[1] == '=') {
379 lex->current.lclass = lc_equal; bp += 2; break;
380 }
381 lex->current.lclass = lc_assign; ++bp; break;
382
383 case '!':
384 if (bp[1] == '=') {
385 lex->current.lclass = lc_notequal; bp += 2; break;
386 }
387 goto invalid;
388
389 case '+':
390 if (bp[1] == '=') {
391 lex->current.lclass = lc_increase; bp += 2; break;
392 }
393 lex->current.lclass = lc_plus; ++bp; break;
394
395 case '-':
396 lex->current.lclass = lc_minus; ++bp; break;
397
398 case '*':
399 lex->current.lclass = lc_mult; ++bp; break;
400
401 case '<':
402 if (bp[1] == '=') {
403 lex->current.lclass = lc_lt_equal; bp += 2; break;
404 }
405 lex->current.lclass = lc_lt; ++bp; break;
406
407 case '>':
408 if (bp[1] == '=') {
409 lex->current.lclass = lc_gt_equal; bp += 2; break;
410 }
411 lex->current.lclass = lc_gt; ++bp; break;
412
413 default:
414 goto invalid;
415 }
416
417 lex->ibp = bp;
418 return b_true;
419
420invalid:
421 lex->current.lclass = lc_invalid;
422 ++bp;
423 lex->ibp = bp;
424
425 return b_true;
426}
427
428/** Lex a word (identifier or keyword).
429 *
430 * Read in a word. This may later turn out to be a keyword or a regular
431 * identifier. It is stored in the current lem in @a lex.
432 *
433 * @param lex Lexer object.
434 */
435static void lex_word(lex_t *lex)
436{
437 struct lc_name *dp;
438 char *bp;
439 int idx;
440
441 bp = lex->ibp;
442 ident_buf[0] = bp[0];
443 idx = 1;
444
445 while (is_wcont(bp[idx])) {
446 if (idx >= IBUF_SIZE) {
447 printf("Error: Identifier too long.\n");
448 exit(1);
449 }
450
451 ident_buf[idx] = bp[idx];
452 ++idx;
453 }
454
455 lex->ibp = bp + idx;
456
457 ident_buf[idx] = '\0';
458
459 dp = keywords;
460 while (dp->name != NULL) {
461 if (os_str_cmp(ident_buf, dp->name) == 0) {
462 /* Match */
463 lex->current.lclass = dp->lclass;
464 return;
465 }
466 ++dp;
467 }
468
469 /* No matching keyword -- it must be an identifier. */
470 lex->current.lclass = lc_ident;
471 lex->current.u.ident.sid = strtab_get_sid(ident_buf);
472}
473
474/** Lex a character literal.
475 *
476 * Reads in a character literal and stores it in the current lem in @a lex.
477 *
478 * @param lex Lexer object.
479 */
480static void lex_char(lex_t *lex)
481{
482 char *bp;
483 int idx;
484 size_t len;
485 int char_val;
486
487 bp = lex->ibp + 1;
488 idx = 0;
489
490 while (bp[idx] != '\'') {
491 if (idx >= SLBUF_SIZE) {
492 printf("Error: Character literal too long.\n");
493 exit(1);
494 }
495
496 if (bp[idx] == '\0') {
497 printf("Error: Unterminated character literal.\n");
498 exit(1);
499 }
500
501 strlit_buf[idx] = bp[idx];
502 ++idx;
503 }
504
505 lex->ibp = bp + idx + 1;
506
507 strlit_buf[idx] = '\0';
508 len = os_str_length(strlit_buf);
509 if (len != 1) {
510 printf("Character literal should contain one character, "
511 "but contains %u characters instead.\n", (unsigned) len);
512 exit(1);
513 }
514
515 os_str_get_char(strlit_buf, 0, &char_val);
516 lex->current.lclass = lc_lit_char;
517 bigint_init(&lex->current.u.lit_char.value, char_val);
518}
519
520/** Lex a numeric literal.
521 *
522 * Reads in a numeric literal and stores it in the current lem in @a lex.
523 *
524 * @param lex Lexer object.
525 */
526static void lex_number(lex_t *lex)
527{
528 char *bp;
529 bigint_t value;
530 bigint_t dgval;
531 bigint_t base;
532 bigint_t tprod;
533
534 bp = lex->ibp;
535
536 bigint_init(&value, 0);
537 bigint_init(&base, 10);
538
539 while (is_digit(*bp)) {
540 bigint_mul(&value, &base, &tprod);
541 bigint_init(&dgval, digit_value(*bp));
542
543 bigint_destroy(&value);
544 bigint_add(&tprod, &dgval, &value);
545 bigint_destroy(&tprod);
546 bigint_destroy(&dgval);
547
548 ++bp;
549 }
550
551 bigint_destroy(&base);
552
553 lex->ibp = bp;
554
555 lex->current.lclass = lc_lit_int;
556 bigint_shallow_copy(&value, &lex->current.u.lit_int.value);
557}
558
559/** Lex a string literal.
560 *
561 * Reads in a string literal and stores it in the current lem in @a lex.
562 *
563 * @param lex Lexer object.
564 */
565static void lex_string(lex_t *lex)
566{
567 char *bp;
568 int idx;
569
570 bp = lex->ibp + 1;
571 idx = 0;
572
573 while (bp[idx] != '"') {
574 if (idx >= SLBUF_SIZE) {
575 printf("Error: String literal too long.\n");
576 exit(1);
577 }
578
579 if (bp[idx] == '\0') {
580 printf("Error: Unterminated string literal.\n");
581 exit(1);
582 }
583
584 strlit_buf[idx] = bp[idx];
585 ++idx;
586 }
587
588 lex->ibp = bp + idx + 1;
589
590 strlit_buf[idx] = '\0';
591
592 lex->current.lclass = lc_lit_string;
593 lex->current.u.lit_string.value = os_str_dup(strlit_buf);
594}
595
596/** Lex a single-line comment.
597 *
598 * This does not produce any lem. The comment is just skipped.
599 *
600 * @param lex Lexer object.
601 */
602static void lex_skip_comment(lex_t *lex)
603{
604 char *bp;
605
606 bp = lex->ibp + 2;
607
608 while (*bp != '\n' && *bp != '\0') {
609 ++bp;
610 }
611
612 lex->ibp = bp;
613}
614
615/** Skip whitespace characters.
616 *
617 * This does not produce any lem. The whitespace is just skipped.
618 *
619 * @param lex Lexer object.
620 */
621static void lex_skip_ws(lex_t *lex)
622{
623 char *bp;
624 int rc;
625
626 bp = lex->ibp;
627
628 while (b_true) {
629 while (*bp == ' ' || *bp == '\t') {
630 if (*bp == '\t') {
631 /* XXX This is too simplifed. */
632 lex->col_adj += (TAB_WIDTH - 1);
633 }
634 ++bp;
635 }
636
637 if (*bp != '\n')
638 break;
639
640 /* Read next line */
641 rc = input_get_line(lex->input, &lex->inbuf);
642 if (rc != EOK) {
643 printf("Error reading input.\n");
644 exit(1);
645 }
646
647 bp = lex->inbuf;
648 lex->col_adj = 0;
649 }
650
651 lex->ibp = bp;
652}
653
654/** Determine if character can start a word.
655 *
656 * @param c Character.
657 * @return @c b_true if @a c can start a word, @c b_false otherwise.
658 */
659static bool_t is_wstart(char c)
660{
661 return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) ||
662 (c == '_');
663}
664
665/** Determine if character can continue a word.
666 *
667 * @param c Character.
668 * @return @c b_true if @a c can start continue word, @c b_false
669 * otherwise.
670 */
671static bool_t is_wcont(char c)
672{
673 return is_digit(c) || is_wstart(c);
674}
675
676/** Determine if character is a numeric digit.
677 *
678 * @param c Character.
679 * @return @c b_true if @a c is a numeric digit, @c b_false otherwise.
680 */
681static bool_t is_digit(char c)
682{
683 return ((c >= '0') && (c <= '9'));
684}
685
686/** Determine numeric value of digit character.
687 *
688 * @param c Character, must be a valid decimal digit.
689 * @return Value of the digit (0-9).
690 */
691static int digit_value(char c)
692{
693 return (c - '0');
694}
Note: See TracBrowser for help on using the repository browser.