source: mainline/uspace/app/sbi/src/lex.c@ 23de644

lfn serial ticket/834-toolchain-update topic/msim-upgrade topic/simplify-dev-export
Last change on this file since 23de644 was 23de644, checked in by Jiri Svoboda <jiri@…>, 16 years ago

Update SBI to rev. 174.

  • Property mode set to 100644
File size: 13.3 KB
Line 
1/*
2 * Copyright (c) 2010 Jiri Svoboda
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** @file Lexer (lexical analyzer).
30 *
31 * Consumes a text file and produces a sequence of lexical elements (lems).
32 */
33
34#include <stdio.h>
35#include <stdlib.h>
36#include "bigint.h"
37#include "mytypes.h"
38#include "input.h"
39#include "os/os.h"
40#include "strtab.h"
41
42#include "lex.h"
43
44#define TAB_WIDTH 8
45
46static void lex_touch(lex_t *lex);
47static bool_t lex_read_try(lex_t *lex);
48
49static void lex_skip_comment(lex_t *lex);
50static void lex_skip_ws(lex_t *lex);
51static bool_t is_wstart(char c);
52static bool_t is_wcont(char c);
53static bool_t is_digit(char c);
54static void lex_word(lex_t *lex);
55static void lex_number(lex_t *lex);
56static void lex_string(lex_t *lex);
57static int digit_value(char c);
58
59/* Note: This imposes an implementation limit on identifier length. */
60#define IBUF_SIZE 128
61static char ident_buf[IBUF_SIZE + 1];
62
63/* XXX This imposes an implementation limit on string literal length. */
64#define SLBUF_SIZE 128
65static char strlit_buf[SLBUF_SIZE + 1];
66
67/** Lclass-string pair */
68struct lc_name {
69 lclass_t lclass;
70 const char *name;
71};
72
73/** Keyword names. Used both for printing and recognition. */
74static struct lc_name keywords[] = {
75 { lc_as, "as" },
76 { lc_builtin, "builtin" },
77 { lc_class, "class" },
78 { lc_constructor, "constructor" },
79 { lc_do, "do" },
80 { lc_else, "else" },
81 { lc_end, "end" },
82 { lc_except, "except" },
83 { lc_finally, "finally" },
84 { lc_for, "for" },
85 { lc_fun, "fun" },
86 { lc_get, "get" },
87 { lc_if, "if" },
88 { lc_in, "in" },
89 { lc_int, "int" },
90 { lc_interface, "interface" },
91 { lc_is, "is" },
92 { lc_new, "new" },
93 { lc_nil, "nil" },
94 { lc_override, "override" },
95 { lc_packed, "packed" },
96 { lc_private, "private" },
97 { lc_prop, "prop" },
98 { lc_protected, "protected" },
99 { lc_public, "public" },
100 { lc_raise, "raise" },
101 { lc_resource, "resource" },
102 { lc_return, "return" },
103 { lc_self, "self" },
104 { lc_set, "set" },
105 { lc_static, "static" },
106 { lc_string, "string" },
107 { lc_struct, "struct" },
108 { lc_then, "then" },
109 { lc_this, "this" },
110 { lc_var, "var" },
111 { lc_with, "with" },
112 { lc_while, "while" },
113 { lc_yield, "yield" },
114
115 { 0, NULL }
116};
117
118/** Other simple lclasses. Only used for printing. */
119static struct lc_name simple_lc[] = {
120 { lc_invalid, "INVALID" },
121 { lc_eof, "EOF" },
122
123 /* Operators */
124 { lc_period, "." },
125 { lc_slash, "/" },
126 { lc_lparen, "(" },
127 { lc_rparen, ")" },
128 { lc_lsbr, "[" },
129 { lc_rsbr, "]" },
130 { lc_equal, "==" },
131 { lc_notequal, "!=" },
132 { lc_lt, "<" },
133 { lc_gt, ">" },
134 { lc_lt_equal, "<=" },
135 { lc_gt_equal, ">=" },
136 { lc_assign, "=" },
137 { lc_plus, "+" },
138 { lc_minus, "-" },
139 { lc_mult, "*" },
140 { lc_increase, "+=" },
141
142 /* Punctuators */
143 { lc_comma, "," },
144 { lc_colon, ":" },
145 { lc_scolon, ";" },
146
147 { 0, NULL },
148};
149
150/** Print lclass value.
151 *
152 * Prints lclass (lexical element class) value in human-readable form
153 * (for debugging).
154 *
155 * @param lclass Lclass value for display.
156 */
157void lclass_print(lclass_t lclass)
158{
159 struct lc_name *dp;
160
161 dp = keywords;
162 while (dp->name != NULL) {
163 if (dp->lclass == lclass) {
164 printf("%s", dp->name);
165 return;
166 }
167 ++dp;
168 }
169
170 dp = simple_lc;
171 while (dp->name != NULL) {
172 if (dp->lclass == lclass) {
173 printf("%s", dp->name);
174 return;
175 }
176 ++dp;
177 }
178
179 switch (lclass) {
180 case lc_ident:
181 printf("ident");
182 break;
183 case lc_lit_int:
184 printf("int_literal");
185 break;
186 case lc_lit_string:
187 printf("string_literal");
188 break;
189 default:
190 printf("<unknown?>");
191 break;
192 }
193}
194
195/** Print lexical element.
196 *
197 * Prints lexical element in human-readable form (for debugging).
198 *
199 * @param lem Lexical element for display.
200 */
201void lem_print(lem_t *lem)
202{
203 lclass_print(lem->lclass);
204
205 switch (lem->lclass) {
206 case lc_ident:
207 printf("(%d)", lem->u.ident.sid);
208 break;
209 case lc_lit_int:
210 printf("(");
211 bigint_print(&lem->u.lit_int.value);
212 printf(")");
213 break;
214 case lc_lit_string:
215 printf("(\"%s\")", lem->u.lit_string.value);
216 default:
217 break;
218 }
219}
220
221/** Print lem coordinates.
222 *
223 * Print the coordinates (line number, column number) of a lexical element.
224 *
225 * @param lem Lexical element for coordinate printing.
226 */
227void lem_print_coords(lem_t *lem)
228{
229 printf("%d:%d", lem->line_no, lem->col_0);
230}
231
232/** Initialize lexer instance.
233 *
234 * @param lex Lexer object to initialize.
235 * @param input Input to associate with lexer.
236 */
237void lex_init(lex_t *lex, struct input *input)
238{
239 int rc;
240
241 lex->input = input;
242
243 rc = input_get_line(lex->input, &lex->inbuf);
244 if (rc != EOK) {
245 printf("Error reading input.\n");
246 exit(1);
247 }
248
249 lex->ibp = lex->inbuf;
250 lex->col_adj = 0;
251 lex->current_valid = b_true;
252}
253
254/** Advance to next lexical element.
255 *
256 * The new element is read in lazily then it is actually accessed.
257 *
258 * @param lex Lexer object.
259 */
260void lex_next(lex_t *lex)
261{
262 /* Make sure the current lem has already been read in. */
263 lex_touch(lex);
264
265 /* Force a new lem to be read on next access. */
266 lex->current_valid = b_false;
267}
268
269/** Get current lem.
270 *
271 * The returned pointer is invalidated by next call to lex_next()
272 *
273 * @param lex Lexer object.
274 * @return Pointer to current lem. Owned by @a lex and only valid
275 * until next call to lex_next().
276 */
277lem_t *lex_get_current(lex_t *lex)
278{
279 lex_touch(lex);
280 return &lex->current;
281}
282
283/** Read in the current lexical element (unless already read in).
284 *
285 * @param lex Lexer object.
286 */
287static void lex_touch(lex_t *lex)
288{
289 bool_t got_lem;
290
291 if (lex->current_valid == b_true)
292 return;
293
294 do {
295 got_lem = lex_read_try(lex);
296 } while (got_lem == b_false);
297
298 lex->current_valid = b_true;
299}
300
301/** Try reading next lexical element.
302 *
303 * Attemps to read the next lexical element. In some cases (such as a comment)
304 * this function will need to give it another try and returns @c b_false
305 * in such case.
306 *
307 * @param lex Lexer object.
308 * @return @c b_true on success or @c b_false if it needs
309 * restarting. On success the lem is stored to
310 * the current lem in @a lex.
311 */
312static bool_t lex_read_try(lex_t *lex)
313{
314 char *bp;
315
316 lex_skip_ws(lex);
317
318 /*
319 * Record lem coordinates. Line number we already have. For column
320 * number we start with position in the input buffer. This works
321 * for all characters except tab. Thus we keep track of tabs
322 * separately using col_adj.
323 */
324 lex->current.line_no = input_get_line_no(lex->input);
325 lex->current.col_0 = 1 + lex->col_adj + (lex->ibp - lex->inbuf);
326
327 bp = lex->ibp;
328
329 if (bp[0] == '\0') {
330 /* End of input */
331 lex->current.lclass = lc_eof;
332 return b_true;
333 }
334
335 if (is_wstart(bp[0])) {
336 lex_word(lex);
337 return b_true;
338 }
339
340 if (is_digit(bp[0])) {
341 lex_number(lex);
342 return b_true;
343 }
344
345 if (bp[0] == '"') {
346 lex_string(lex);
347 return b_true;
348 }
349
350 if (bp[0] == '-' && bp[1] == '-') {
351 lex_skip_comment(lex);
352 return b_false;
353 }
354
355 switch (bp[0]) {
356 case ',': lex->current.lclass = lc_comma; ++bp; break;
357 case ':': lex->current.lclass = lc_colon; ++bp; break;
358 case ';': lex->current.lclass = lc_scolon; ++bp; break;
359
360 case '.': lex->current.lclass = lc_period; ++bp; break;
361 case '/': lex->current.lclass = lc_slash; ++bp; break;
362 case '(': lex->current.lclass = lc_lparen; ++bp; break;
363 case ')': lex->current.lclass = lc_rparen; ++bp; break;
364 case '[': lex->current.lclass = lc_lsbr; ++bp; break;
365 case ']': lex->current.lclass = lc_rsbr; ++bp; break;
366
367 case '=':
368 if (bp[1] == '=') {
369 lex->current.lclass = lc_equal; bp += 2; break;
370 }
371 lex->current.lclass = lc_assign; ++bp; break;
372
373 case '!':
374 if (bp[1] == '=') {
375 lex->current.lclass = lc_notequal; bp += 2; break;
376 }
377 goto invalid;
378
379 case '+':
380 if (bp[1] == '=') {
381 lex->current.lclass = lc_increase; bp += 2; break;
382 }
383 lex->current.lclass = lc_plus; ++bp; break;
384
385 case '-':
386 lex->current.lclass = lc_minus; ++bp; break;
387
388 case '*':
389 lex->current.lclass = lc_mult; ++bp; break;
390
391 case '<':
392 if (bp[1] == '=') {
393 lex->current.lclass = lc_lt_equal; bp += 2; break;
394 }
395 lex->current.lclass = lc_lt; ++bp; break;
396
397 case '>':
398 if (bp[1] == '=') {
399 lex->current.lclass = lc_gt_equal; bp += 2; break;
400 }
401 lex->current.lclass = lc_gt; ++bp; break;
402
403 default:
404 goto invalid;
405 }
406
407 lex->ibp = bp;
408 return b_true;
409
410invalid:
411 lex->current.lclass = lc_invalid;
412 ++bp;
413 lex->ibp = bp;
414
415 return b_true;
416}
417
418/** Lex a word (identifier or keyword).
419 *
420 * Read in a word. This may later turn out to be a keyword or a regular
421 * identifier. It is stored in the current lem in @a lex.
422 *
423 * @param lex Lexer object.
424 */
425static void lex_word(lex_t *lex)
426{
427 struct lc_name *dp;
428 char *bp;
429 int idx;
430
431 bp = lex->ibp;
432 ident_buf[0] = bp[0];
433 idx = 1;
434
435 while (is_wcont(bp[idx])) {
436 if (idx >= IBUF_SIZE) {
437 printf("Error: Identifier too long.\n");
438 exit(1);
439 }
440
441 ident_buf[idx] = bp[idx];
442 ++idx;
443 }
444
445 lex->ibp = bp + idx;
446
447 ident_buf[idx] = '\0';
448
449 dp = keywords;
450 while (dp->name != NULL) {
451 if (os_str_cmp(ident_buf, dp->name) == 0) {
452 /* Match */
453 lex->current.lclass = dp->lclass;
454 return;
455 }
456 ++dp;
457 }
458
459 /* No matching keyword -- it must be an identifier. */
460 lex->current.lclass = lc_ident;
461 lex->current.u.ident.sid = strtab_get_sid(ident_buf);
462}
463
464/** Lex a numeric literal.
465 *
466 * Reads in a numeric literal and stores it in the current lem in @a lex.
467 *
468 * @param lex Lexer object.
469 */
470static void lex_number(lex_t *lex)
471{
472 char *bp;
473 bigint_t value;
474 bigint_t dgval;
475 bigint_t base;
476 bigint_t tprod;
477
478 bp = lex->ibp;
479
480 bigint_init(&value, 0);
481 bigint_init(&base, 10);
482
483 while (is_digit(*bp)) {
484 bigint_mul(&value, &base, &tprod);
485 bigint_init(&dgval, digit_value(*bp));
486
487 bigint_destroy(&value);
488 bigint_add(&tprod, &dgval, &value);
489 bigint_destroy(&tprod);
490 bigint_destroy(&dgval);
491
492 ++bp;
493 }
494
495 bigint_destroy(&base);
496
497 lex->ibp = bp;
498
499 lex->current.lclass = lc_lit_int;
500 bigint_shallow_copy(&value, &lex->current.u.lit_int.value);
501}
502
503/** Lex a string literal.
504 *
505 * Reads in a string literal and stores it in the current lem in @a lex.
506 *
507 * @param lex Lexer object.
508 */
509static void lex_string(lex_t *lex)
510{
511 char *bp;
512 int idx;
513
514 bp = lex->ibp + 1;
515 idx = 0;
516
517 while (bp[idx] != '"') {
518 if (idx >= SLBUF_SIZE) {
519 printf("Error: String literal too long.\n");
520 exit(1);
521 }
522
523 if (bp[idx] == '\0') {
524 printf("Error: Unterminated string literal.\n");
525 exit(1);
526 }
527
528 strlit_buf[idx] = bp[idx];
529 ++idx;
530 }
531
532 lex->ibp = bp + idx + 1;
533
534 strlit_buf[idx] = '\0';
535
536 lex->current.lclass = lc_lit_string;
537 lex->current.u.lit_string.value = os_str_dup(strlit_buf);
538}
539
540/** Lex a single-line comment.
541 *
542 * This does not produce any lem. The comment is just skipped.
543 *
544 * @param lex Lexer object.
545 */
546static void lex_skip_comment(lex_t *lex)
547{
548 char *bp;
549
550 bp = lex->ibp + 2;
551
552 while (*bp != '\n' && *bp != '\0') {
553 ++bp;
554 }
555
556 lex->ibp = bp;
557}
558
559/** Skip whitespace characters.
560 *
561 * This does not produce any lem. The whitespace is just skipped.
562 *
563 * @param lex Lexer object.
564 */
565static void lex_skip_ws(lex_t *lex)
566{
567 char *bp;
568 int rc;
569
570 bp = lex->ibp;
571
572 while (b_true) {
573 while (*bp == ' ' || *bp == '\t') {
574 if (*bp == '\t') {
575 /* XXX This is too simplifed. */
576 lex->col_adj += (TAB_WIDTH - 1);
577 }
578 ++bp;
579 }
580
581 if (*bp != '\n')
582 break;
583
584 /* Read next line */
585 rc = input_get_line(lex->input, &lex->inbuf);
586 if (rc != EOK) {
587 printf("Error reading input.\n");
588 exit(1);
589 }
590
591 bp = lex->inbuf;
592 lex->col_adj = 0;
593 }
594
595 lex->ibp = bp;
596}
597
598/** Determine if character can start a word.
599 *
600 * @param c Character.
601 * @return @c b_true if @a c can start a word, @c b_false otherwise.
602 */
603static bool_t is_wstart(char c)
604{
605 return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) ||
606 (c == '_');
607}
608
609/** Determine if character can continue a word.
610 *
611 * @param c Character.
612 * @return @c b_true if @a c can start continue word, @c b_false
613 * otherwise.
614 */
615static bool_t is_wcont(char c)
616{
617 return is_digit(c) || is_wstart(c);
618}
619
620/** Determine if character is a numeric digit.
621 *
622 * @param c Character.
623 * @return @c b_true if @a c is a numeric digit, @c b_false otherwise.
624 */
625static bool_t is_digit(char c)
626{
627 return ((c >= '0') && (c <= '9'));
628}
629
630/** Determine numeric value of digit character.
631 *
632 * @param c Character, must be a valid decimal digit.
633 * @return Value of the digit (0-9).
634 */
635static int digit_value(char c)
636{
637 return (c - '0');
638}
Note: See TracBrowser for help on using the repository browser.