1: /*
   2:  * The lexical analyzer.
   3:  */
   4: 
   5: #include "itran.h"
   6: #include "token.h"
   7: #include "lex.h"
   8: #include "char.h"
   9: #include "tree.h"
  10: 
  11: int tline;	/* line number yylex associates with the token it is returning */
  12: int tcol;		/* column of that token; yylex sets it to 0 for an inserted semicolon */
  13: 
  14: /*
  15:  * yylex - find the next token in the input stream, and return its token
  16:  *  type and value to the parser.
  17:  *
  18:  * Variables of interest:
  19:  *
  20:  *  cc - character following last token.
  21:  *  comflag - set if in a comment.
  22:  *  nlflag - set if a newline was between the last token and the current token
  23:  *  lastend - set if the last token was an ENDER.
  24:  *  lastval - when a semicolon is inserted and returned, lastval gets the
  25:  *   token value that would have been returned if the semicolon hadn't
  26:  *   been inserted.
  27:  */
  28: 
  29: yylex()
  30:    {
  31:    register struct toktab *t;
  32:    register int c;
  33:    int nlflag;
  34:    int comflag;
  35:    static struct toktab *lasttok = NULL;
  36:    static nodeptr lastval;
  37:    static int lastend = 0;
  38:    static int eofflag = 0;
  39:    static int lastline = 0;
  40:    static int cc = '\n';
  41:    extern struct toktab *getident(), *getnum(), *getstring(), *getop();
  42: 
  43:    if (lasttok != NULL) {
  44:       /*
  45:        * A semicolon was inserted and returned on the last call to yylex,
  46:        *  instead of going to the input, return lasttok and set the
  47:        *  appropriate variables.
  48:        */
  49:       yylval = lastval;
  50:       tline = LINE(lastval);
  51:       tcol = COL(lastval);
  52:       t = lasttok;
  53:       goto ret;
  54:       }
  55:    nlflag = 0;
  56:    comflag = 0;
  57: loop:
  58:    c = cc;
  59:    /*
  60:     * Skip whitespace and comments.
  61:     */
  62:    while (c != EOF && (comflag || c == COMMENT || isspace(c))) {
  63:       if (c == '\n') {
  64:          nlflag++;
  65:          comflag = 0;
  66:          }
  67:       else if (c == COMMENT)
  68:          comflag++;
  69:       c = NEXTCHAR;
  70:       }
  71:    /*
  72:     * A token is the next thing in the input.  Record the last line number
  73:     *  and set tline and tcol to the current line and column.
  74:     */
  75:    lastline = tline;
  76:    tline = inline;
  77:    tcol = incol;
  78: 
  79:    if (c == EOF) {
  80:       /*
  81:        * End of file has been reached.  Set eofflag, return T_EOF, and
  82:        *  set cc to EOF so that any subsequent scans also return T_EOF.
  83:        */
  84:       if (eofflag++) {
  85:          eofflag = 0;
  86:          cc = '\n';
  87:          return (int) (yylval = 0);
  88:          }
  89:       cc = EOF;
  90:       t = T_EOF;
  91:       yylval = 0;
  92:       goto ret;
  93:       }
  94: 
  95:    /*
  96:     * Look at current input character to determine what class of token
  97:     *  is next and take the appropriate action.  Note that the various
  98:     *  token gathering routines write a value into cc.
  99:     */
 100:    c = ctran[c];
 101:    if (isalpha(c)) {                    /* gather ident or reserved word */
 102:       if ((t = getident(c, &cc)) == NULL)
 103:          goto loop;
 104:       }
 105:    else if (isdigit(c)) {               /* gather numeric literal */
 106:       if ((t = getnum(c, &cc)) == NULL)
 107:          goto loop;
 108:       }
 109:    else if (c == '"' || c == '\'') {    /* gather string or cset literal */
 110:       if ((t = getstring(c, &cc)) == NULL)
 111:          goto loop;
 112:       }
 113:    else {           /* gather longest legal operator */
 114:       if ((t = getop(c, &cc)) == NULL)
 115:          goto loop;
 116:       yylval = OPNODE(t->t_type);
 117:       }
 118:    if (nlflag && lastend && (t->t_flags & BEGINNER)) {
 119:       /*
 120:        * A newline was encountered between the current token and the last,
 121:        *  the last token was an ENDER, and the current token is a BEGINNER.
 122:        *  Return a semicolon and save the current token in lastval.
 123:        */
 124:       lastval = yylval;
 125:       lasttok = t;
 126:       tline = lastline;
 127:       tcol = 0;
 128:       yylval = OPNODE(SEMICOL);
 129:       return (SEMICOL);
 130:       }
 131: ret:
 132:    /*
 133:     * Clear lasttok, set lastend if the token being returned is an
 134:     *  ENDER, and return the token.
 135:     */
 136:    lasttok = 0;
 137:    lastend = t->t_flags & ENDER;
 138:    return (t->t_type);
 139:    }
 140: 
 141: /*
 142:  * getident - gather an identifier beginning with ac.  The character
 143:  *  following identifier goes in cc.
 144:  */
 145: 
 146: struct toktab *getident(ac, cc)
 147: char ac;
 148: int *cc;
 149:    {
 150:    register c;
 151:    register char *p;
 152:    register struct toktab *t;
 153:    extern char *putident();
 154:    extern struct toktab *findres();
 155: 
 156:    c = ac;
 157:    p = sfree;
 158:    /*
 159:     * Copy characters into string space until a non-alphanumeric character
 160:     *  is found.
 161:     */
 162:    do {
 163:       if (p >= send)
 164:          syserr("out of string space");
 165:       *p++ = c;
 166:       c = ctran[NEXTCHAR];
 167:       } while (isalnum(c));
 168:    if (p >= send)
 169:       syserr("out of string space");
 170:    *p++ = 0;
 171:    *cc = c;
 172:    /*
 173:     * If the identifier is a reserved word, make a RESNODE for it and return
 174:     *  the token value.  Otherwise, install it with putident, make an
 175:     *  IDNODE for it, and return.
 176:     */
 177:    if ((t = findres()) != NULL) {
 178:       yylval = RESNODE(t->t_type);
 179:       return (t);
 180:       }
 181:    else {
 182:       yylval = IDNODE((int)putident(p-sfree));
 183:       return (T_IDENT);
 184:       }
 185:    }
 186: 
 187: /*
 188:  * findres - if the string just copied into the string space by getident
 189:  *  is a reserved word, return a pointer to its entry in the token table.
 190:  *  Return NULL if the string isn't a reserved word.
 191:  */
 192: 
 193: struct toktab *findres()
 194:    {
 195:    register struct toktab *t;
 196:    register char c, *p;
 197: 
 198:    p = sfree;
 199:    c = *p;
 200:    if (!islower(c))
 201:       return (NULL);
 202:    /*
 203:     * Point t at first reserved word that starts with c (if any).
 204:     */
 205:    if ((t = restab[c - '_']) == NULL)
 206:       return (NULL);
 207:    /*
 208:     * Search through reserved words, stopping when a match is found
 209:     *  or when the current reserved word doesn't start with c.
 210:     */
 211:    while (t->t_word[0] == c) {
 212:       if (strcmp(t->t_word, p) == 0)
 213:          return (t);
 214:       t++;
 215:       }
 216:    return (NULL);
 217:    }
 218: 
 219: /*
 220:  * getnum - gather a numeric literal starting with ac and put the
 221:  *  character following the literal into *cc.
 222:  */
 223: 
 224: struct toktab *getnum(ac, cc)
 225: char ac;
 226: int *cc;
 227:    {
 228:    register c;
 229:    register r;
 230:    register state;
 231:    char *p;
 232:    int realflag;
 233:    extern char *putident();
 234: 
 235:    c = ac;
 236:    r = tonum(c);
 237:    p = sfree;
 238:    state = 0;
 239:    realflag = 0;
 240:    for (;;) {
 241:       if (p >= send)
 242:          syserr("out of string space");
 243:       *p++ = c;
 244:       c = ctran[NEXTCHAR];
 245:       switch (state) {
 246:          case 0:        /* integer part */
 247:             if (isdigit(c))         { r = r * 10 + tonum(c); continue; }
 248:             if (c == '.')           { state = 1; realflag++; continue; }
 249:             if (tolower(c) == 'e')  { state = 2; realflag++; continue; }
 250:             if (tolower(c) == 'r')  {
 251:                state = 5;
 252:                if (r < 2 || r > 36)
 253:                   err("invalid radix for integer literal", 0);
 254:                continue;
 255:                }
 256:             break;
 257:          case 1:        /* fractional part */
 258:             if (isdigit(c))   continue;
 259:             if (tolower(c) == 'e')   { state = 2; continue; }
 260:             break;
 261:          case 2:        /* optional exponent sign */
 262:             if (c == '+' || c == '-') { state = 3; continue; }
 263:          case 3:        /* first digit after e, e+, or e- */
 264:             if (isdigit(c)) { state = 4; continue; }
 265:             err("invalid real literal", 0);
 266:             break;
 267:          case 4:        /* remaining digits after e */
 268:             if (isdigit(c))   continue;
 269:             break;
 270:          case 5:        /* first digit after r */
 271:             if ((isdigit(c) || isletter(c)) && tonum(c) < r)
 272:                { state = 6; continue; }
 273:             err("invalid integer literal", 0);
 274:             break;
 275:          case 6:        /* remaining digits after r */
 276:             if (isdigit(c) || isletter(c)) {
 277:                if (tonum(c) >= r) { /* illegal digit for radix r */
 278:                   err("invalid digit in integer literal", 0);
 279:                   r = tonum('z');   /* prevent more messages */
 280:                   }
 281:                continue;
 282:                }
 283:             break;
 284:          }
 285:       break;
 286:       }
 287:    if (p >= send)
 288:       syserr("out of string space");
 289:    *p++ = 0;
 290:    *cc = c;
 291:    if (realflag) {
 292:       yylval = REALNODE((int)putident(p-sfree));
 293:       return (T_REAL);
 294:       }
 295:    yylval = INTNODE((int)putident(p-sfree));
 296:    return (T_INT);
 297:    }
 298: 
 299: /*
 300:  * getstring - gather a string literal starting with ac and place the
 301:  *  character following the literal in *cc.
 302:  */
 303: 
 304: struct toktab *getstring(ac, cc)
 305: char ac;
 306: int *cc;
 307:    {
 308:    register c, sc;
 309:    register char *p;
 310:    char *lc;
 311:    extern char *putident();
 312: 
 313:    sc = c = ac;
 314:    p = sfree;
 315:    lc = 0;
 316:    while ((c = NEXTCHAR) != sc && c != '\n' && c != EOF) {
 317:    contin:
 318:       if (c == '_')
 319:          lc = p;
 320:       else if (!isspace(c))
 321:          lc = 0;
 322:       if (ctran[c] == ESCAPE) {
 323:          c = NEXTCHAR;
 324:          if (isoctal(c))
 325:             c = octesc(c);
 326:          else if (ctran[c] == 'x')
 327:             c = hexesc();
 328:          else if (ctran[c] == '^')
 329:             c = ctlesc();
 330:          else
 331:             c = esctab[c];
 332:          if (c == EOF)
 333:             goto noquote;
 334:          }
 335:       if (p >= send)
 336:          syserr("out of string space");
 337:       *p++ = c;
 338:       }
 339:    if (p >= send)
 340:       syserr("out of string space");
 341:    *p++ = 0;
 342:    if (c == sc)
 343:       *cc = ' ';
 344:    else {
 345:       if (c == '\n' && lc) {
 346:          p = lc;
 347:          while ((c = NEXTCHAR) != EOF && isspace(c)) ;
 348:          if (c != EOF)
 349:             goto contin;
 350:          }
 351: noquote:
 352:       err("unclosed quote", 0);
 353:       *cc = c;
 354:       }
 355:    if (ac == '"') { /* a string literal */
 356:       yylval = STRNODE((int)putident(p-sfree), p-sfree);
 357:       return (T_STRING);
 358:       }
 359:    else {       /* a cset literal */
 360:       yylval = CSETNODE((int)putident(p-sfree), p-sfree);
 361:       return (T_CSET);
 362:       }
 363:    }
 364: 
 365: /*
 366:  * ctlesc - translate a control escape -- backslash followed by
 367:  *  caret and one character.
 368:  */
 369: 
 370: ctlesc()
 371:    {
 372:    register c;
 373: 
 374:    c = NEXTCHAR;
 375:    if (c == EOF)
 376:       return (EOF);
 377:    return (c & 037);
 378:    }
 379: 
 380: /*
 381:  * octesc - translate an octal escape -- backslash followed by
 382:  *  one, two, or three octal digits.
 383:  */
 384: 
 385: octesc(ac)
 386: char ac;
 387:    {
 388:    register c, nc, i;
 389: 
 390:    c = 0;
 391:    nc = ac;
 392:    i = 1;
 393:    do {
 394:       c = (c << 3) | (nc - '0');
 395:       nc = NEXTCHAR;
 396:       if (nc == EOF)
 397:          return (EOF);
 398:       } while (isoctal(nc) && i++ < 3);
 399:    PUSHCHAR(nc);
 400:    return (c & 0377);
 401:    }
 402: 
 403: /*
 404:  * hexesc - translate a hexadecimal escape -- backslash-x
 405:  *  followed by one or two hexadecimal digits.
 406:  */
 407: 
 408: hexesc()
 409:    {
 410:    register c, nc, i;
 411: 
 412:    c = 0;
 413:    i = 0;
 414:    while (i++ < 2) {
 415:       nc = NEXTCHAR;
 416:       if (nc == EOF)
 417:          return (EOF);
 418:       if (nc >= 'a' && nc <= 'f')
 419:          nc -= 'a' - 10;
 420:       else if (nc >= 'A' && nc <= 'F')
 421:          nc -= 'A' - 10;
 422:       else if (isdigit(nc))
 423:          nc -= '0';
 424:       else {
 425:          PUSHCHAR(nc);
 426:          break;
 427:          }
 428:       c = (c << 4) | nc;
 429:       }
 430:    return (c);
 431:    }
 432: 
 433: /*
 434:  * getop - find the longest legal operator and return a pointer
 435:  *  to its entry in the token table.  The tour describes the
 436:  *  operator recognition process in detail.
 437:  */
 438: 
 439: struct toktab *getop(ac, cc)
 440: char ac;
 441: int *cc;
 442:    {
 443:    register struct optab *state;
 444:    register char c, i;
 445: 
 446:    state = state0;
 447:    c = ac;
 448:    for (;;) {
 449:       while ((i = state->o_input) && c != i)
 450:          state++;
 451:       switch (state->o_action) {
 452:          case A_GOTO:
 453:             state = (struct optab *) state->o_val;
 454:             c = ctran[NEXTCHAR];
 455:             continue;
 456:          case A_ERROR:
 457:             err("invalid character", 0);
 458:             *cc = ' ';
 459:             return (NULL);
 460:          case A_RETURN:
 461:             *cc = c;
 462:             return (struct toktab *) (state->o_val);
 463:          case A_IMMRET:
 464:             *cc = ' ';
 465:             return (struct toktab *) (state->o_val);
 466:          }
 467:       }
 468:    }
 469: 
 470: /*
 471:  * nextchar - return the next character in the input.
 472:  */
 473: 
 474: nextchar()
 475:    {
 476:    register char c;
 477: 
 478:    if (c = peekc) {
 479:       peekc = 0;
 480:       return (c);
 481:       }
 482:    c = getc(infile);
 483:    switch (c) {
 484:       case EOF:
 485:          inline = 0;
 486:          incol = 0;
 487:          break;
 488:       case '\n':
 489:          inline++;
 490:          incol = 0;
 491:          break;
 492:       case '\t':
 493:          incol = (incol | 7) + 1;
 494:          break;
 495:       case '\b':
 496:          if (incol)
 497:             incol--;
 498:          break;
 499:       default:
 500:          incol++;
 501:       }
 502:    return (c);
 503:    }

Defined functions

ctlesc defined in line 370; used 1 times
findres defined in line 193; used 2 times
getident defined in line 146; used 2 times
getnum defined in line 224; used 2 times
getop defined in line 439; used 2 times
getstring defined in line 304; used 2 times
hexesc defined in line 408; used 1 times
nextchar defined in line 474; used 1 times
octesc defined in line 385; used 1 times
yylex defined in line 29; used 2 times

Defined variables

tcol defined in line 12; used 6 times
tline defined in line 11; used 8 times
Last modified: 1984-11-18
Generated: 2016-12-26
Generated by src2html V0.67
page hit count: 1818
Valid CSS Valid XHTML 1.0 Strict