1: /* 2: * The lexical analyzer. 3: */ 4: 5: #include "itran.h" 6: #include "token.h" 7: #include "lex.h" 8: #include "char.h" 9: #include "tree.h" 10: 11: int tline; 12: int tcol; 13: 14: /* 15: * yylex - find the next token in the input stream, and return its token 16: * type and value to the parser. 17: * 18: * Variables of interest: 19: * 20: * cc - character following last token. 21: * comflag - set if in a comment. 22: * nlflag - set if a newline was between the last token and the current token 23: * lastend - set if the last token was an ENDER. 24: * lastval - when a semicolon is inserted and returned, lastval gets the 25: * token value that would have been returned if the semicolon hadn't 26: * been inserted. 27: */ 28: 29: yylex() 30: { 31: register struct toktab *t; 32: register int c; 33: int nlflag; 34: int comflag; 35: static struct toktab *lasttok = NULL; 36: static nodeptr lastval; 37: static int lastend = 0; 38: static int eofflag = 0; 39: static int lastline = 0; 40: static int cc = '\n'; 41: extern struct toktab *getident(), *getnum(), *getstring(), *getop(); 42: 43: if (lasttok != NULL) { 44: /* 45: * A semicolon was inserted and returned on the last call to yylex, 46: * instead of going to the input, return lasttok and set the 47: * appropriate variables. 48: */ 49: yylval = lastval; 50: tline = LINE(lastval); 51: tcol = COL(lastval); 52: t = lasttok; 53: goto ret; 54: } 55: nlflag = 0; 56: comflag = 0; 57: loop: 58: c = cc; 59: /* 60: * Skip whitespace and comments. 61: */ 62: while (c != EOF && (comflag || c == COMMENT || isspace(c))) { 63: if (c == '\n') { 64: nlflag++; 65: comflag = 0; 66: } 67: else if (c == COMMENT) 68: comflag++; 69: c = NEXTCHAR; 70: } 71: /* 72: * A token is the next thing in the input. Record the last line number 73: * and set tline and tcol to the current line and column. 74: */ 75: lastline = tline; 76: tline = inline; 77: tcol = incol; 78: 79: if (c == EOF) { 80: /* 81: * End of file has been reached. Set eofflag, return T_EOF, and 82: * set cc to EOF so that any subsequent scans also return T_EOF. 83: */ 84: if (eofflag++) { 85: eofflag = 0; 86: cc = '\n'; 87: return (int) (yylval = 0); 88: } 89: cc = EOF; 90: t = T_EOF; 91: yylval = 0; 92: goto ret; 93: } 94: 95: /* 96: * Look at current input character to determine what class of token 97: * is next and take the appropriate action. Note that the various 98: * token gathering routines write a value into cc. 99: */ 100: c = ctran[c]; 101: if (isalpha(c)) { /* gather ident or reserved word */ 102: if ((t = getident(c, &cc)) == NULL) 103: goto loop; 104: } 105: else if (isdigit(c)) { /* gather numeric literal */ 106: if ((t = getnum(c, &cc)) == NULL) 107: goto loop; 108: } 109: else if (c == '"' || c == '\'') { /* gather string or cset literal */ 110: if ((t = getstring(c, &cc)) == NULL) 111: goto loop; 112: } 113: else { /* gather longest legal operator */ 114: if ((t = getop(c, &cc)) == NULL) 115: goto loop; 116: yylval = OPNODE(t->t_type); 117: } 118: if (nlflag && lastend && (t->t_flags & BEGINNER)) { 119: /* 120: * A newline was encountered between the current token and the last, 121: * the last token was an ENDER, and the current token is a BEGINNER. 122: * Return a semicolon and save the current token in lastval. 123: */ 124: lastval = yylval; 125: lasttok = t; 126: tline = lastline; 127: tcol = 0; 128: yylval = OPNODE(SEMICOL); 129: return (SEMICOL); 130: } 131: ret: 132: /* 133: * Clear lasttok, set lastend if the token being returned is an 134: * ENDER, and return the token. 135: */ 136: lasttok = 0; 137: lastend = t->t_flags & ENDER; 138: return (t->t_type); 139: } 140: 141: /* 142: * getident - gather an identifier beginning with ac. The character 143: * following identifier goes in cc. 144: */ 145: 146: struct toktab *getident(ac, cc) 147: char ac; 148: int *cc; 149: { 150: register c; 151: register char *p; 152: register struct toktab *t; 153: extern char *putident(); 154: extern struct toktab *findres(); 155: 156: c = ac; 157: p = sfree; 158: /* 159: * Copy characters into string space until a non-alphanumeric character 160: * is found. 161: */ 162: do { 163: if (p >= send) 164: syserr("out of string space"); 165: *p++ = c; 166: c = ctran[NEXTCHAR]; 167: } while (isalnum(c)); 168: if (p >= send) 169: syserr("out of string space"); 170: *p++ = 0; 171: *cc = c; 172: /* 173: * If the identifier is a reserved word, make a RESNODE for it and return 174: * the token value. Otherwise, install it with putident, make an 175: * IDNODE for it, and return. 176: */ 177: if ((t = findres()) != NULL) { 178: yylval = RESNODE(t->t_type); 179: return (t); 180: } 181: else { 182: yylval = IDNODE((int)putident(p-sfree)); 183: return (T_IDENT); 184: } 185: } 186: 187: /* 188: * findres - if the string just copied into the string space by getident 189: * is a reserved word, return a pointer to its entry in the token table. 190: * Return NULL if the string isn't a reserved word. 191: */ 192: 193: struct toktab *findres() 194: { 195: register struct toktab *t; 196: register char c, *p; 197: 198: p = sfree; 199: c = *p; 200: if (!islower(c)) 201: return (NULL); 202: /* 203: * Point t at first reserved word that starts with c (if any). 204: */ 205: if ((t = restab[c - '_']) == NULL) 206: return (NULL); 207: /* 208: * Search through reserved words, stopping when a match is found 209: * or when the current reserved word doesn't start with c. 210: */ 211: while (t->t_word[0] == c) { 212: if (strcmp(t->t_word, p) == 0) 213: return (t); 214: t++; 215: } 216: return (NULL); 217: } 218: 219: /* 220: * getnum - gather a numeric literal starting with ac and put the 221: * character following the literal into *cc. 222: */ 223: 224: struct toktab *getnum(ac, cc) 225: char ac; 226: int *cc; 227: { 228: register c; 229: register r; 230: register state; 231: char *p; 232: int realflag; 233: extern char *putident(); 234: 235: c = ac; 236: r = tonum(c); 237: p = sfree; 238: state = 0; 239: realflag = 0; 240: for (;;) { 241: if (p >= send) 242: syserr("out of string space"); 243: *p++ = c; 244: c = ctran[NEXTCHAR]; 245: switch (state) { 246: case 0: /* integer part */ 247: if (isdigit(c)) { r = r * 10 + tonum(c); continue; } 248: if (c == '.') { state = 1; realflag++; continue; } 249: if (tolower(c) == 'e') { state = 2; realflag++; continue; } 250: if (tolower(c) == 'r') { 251: state = 5; 252: if (r < 2 || r > 36) 253: err("invalid radix for integer literal", 0); 254: continue; 255: } 256: break; 257: case 1: /* fractional part */ 258: if (isdigit(c)) continue; 259: if (tolower(c) == 'e') { state = 2; continue; } 260: break; 261: case 2: /* optional exponent sign */ 262: if (c == '+' || c == '-') { state = 3; continue; } 263: case 3: /* first digit after e, e+, or e- */ 264: if (isdigit(c)) { state = 4; continue; } 265: err("invalid real literal", 0); 266: break; 267: case 4: /* remaining digits after e */ 268: if (isdigit(c)) continue; 269: break; 270: case 5: /* first digit after r */ 271: if ((isdigit(c) || isletter(c)) && tonum(c) < r) 272: { state = 6; continue; } 273: err("invalid integer literal", 0); 274: break; 275: case 6: /* remaining digits after r */ 276: if (isdigit(c) || isletter(c)) { 277: if (tonum(c) >= r) { /* illegal digit for radix r */ 278: err("invalid digit in integer literal", 0); 279: r = tonum('z'); /* prevent more messages */ 280: } 281: continue; 282: } 283: break; 284: } 285: break; 286: } 287: if (p >= send) 288: syserr("out of string space"); 289: *p++ = 0; 290: *cc = c; 291: if (realflag) { 292: yylval = REALNODE((int)putident(p-sfree)); 293: return (T_REAL); 294: } 295: yylval = INTNODE((int)putident(p-sfree)); 296: return (T_INT); 297: } 298: 299: /* 300: * getstring - gather a string literal starting with ac and place the 301: * character following the literal in *cc. 302: */ 303: 304: struct toktab *getstring(ac, cc) 305: char ac; 306: int *cc; 307: { 308: register c, sc; 309: register char *p; 310: char *lc; 311: extern char *putident(); 312: 313: sc = c = ac; 314: p = sfree; 315: lc = 0; 316: while ((c = NEXTCHAR) != sc && c != '\n' && c != EOF) { 317: contin: 318: if (c == '_') 319: lc = p; 320: else if (!isspace(c)) 321: lc = 0; 322: if (ctran[c] == ESCAPE) { 323: c = NEXTCHAR; 324: if (isoctal(c)) 325: c = octesc(c); 326: else if (ctran[c] == 'x') 327: c = hexesc(); 328: else if (ctran[c] == '^') 329: c = ctlesc(); 330: else 331: c = esctab[c]; 332: if (c == EOF) 333: goto noquote; 334: } 335: if (p >= send) 336: syserr("out of string space"); 337: *p++ = c; 338: } 339: if (p >= send) 340: syserr("out of string space"); 341: *p++ = 0; 342: if (c == sc) 343: *cc = ' '; 344: else { 345: if (c == '\n' && lc) { 346: p = lc; 347: while ((c = NEXTCHAR) != EOF && isspace(c)) ; 348: if (c != EOF) 349: goto contin; 350: } 351: noquote: 352: err("unclosed quote", 0); 353: *cc = c; 354: } 355: if (ac == '"') { /* a string literal */ 356: yylval = STRNODE((int)putident(p-sfree), p-sfree); 357: return (T_STRING); 358: } 359: else { /* a cset literal */ 360: yylval = CSETNODE((int)putident(p-sfree), p-sfree); 361: return (T_CSET); 362: } 363: } 364: 365: /* 366: * ctlesc - translate a control escape -- backslash followed by 367: * caret and one character. 368: */ 369: 370: ctlesc() 371: { 372: register c; 373: 374: c = NEXTCHAR; 375: if (c == EOF) 376: return (EOF); 377: return (c & 037); 378: } 379: 380: /* 381: * octesc - translate an octal escape -- backslash followed by 382: * one, two, or three octal digits. 383: */ 384: 385: octesc(ac) 386: char ac; 387: { 388: register c, nc, i; 389: 390: c = 0; 391: nc = ac; 392: i = 1; 393: do { 394: c = (c << 3) | (nc - '0'); 395: nc = NEXTCHAR; 396: if (nc == EOF) 397: return (EOF); 398: } while (isoctal(nc) && i++ < 3); 399: PUSHCHAR(nc); 400: return (c & 0377); 401: } 402: 403: /* 404: * hexesc - translate a hexadecimal escape -- backslash-x 405: * followed by one or two hexadecimal digits. 406: */ 407: 408: hexesc() 409: { 410: register c, nc, i; 411: 412: c = 0; 413: i = 0; 414: while (i++ < 2) { 415: nc = NEXTCHAR; 416: if (nc == EOF) 417: return (EOF); 418: if (nc >= 'a' && nc <= 'f') 419: nc -= 'a' - 10; 420: else if (nc >= 'A' && nc <= 'F') 421: nc -= 'A' - 10; 422: else if (isdigit(nc)) 423: nc -= '0'; 424: else { 425: PUSHCHAR(nc); 426: break; 427: } 428: c = (c << 4) | nc; 429: } 430: return (c); 431: } 432: 433: /* 434: * getop - find the longest legal operator and return a pointer 435: * to its entry in the token table. The tour describes the 436: * operator recognition process in detail. 437: */ 438: 439: struct toktab *getop(ac, cc) 440: char ac; 441: int *cc; 442: { 443: register struct optab *state; 444: register char c, i; 445: 446: state = state0; 447: c = ac; 448: for (;;) { 449: while ((i = state->o_input) && c != i) 450: state++; 451: switch (state->o_action) { 452: case A_GOTO: 453: state = (struct optab *) state->o_val; 454: c = ctran[NEXTCHAR]; 455: continue; 456: case A_ERROR: 457: err("invalid character", 0); 458: *cc = ' '; 459: return (NULL); 460: case A_RETURN: 461: *cc = c; 462: return (struct toktab *) (state->o_val); 463: case A_IMMRET: 464: *cc = ' '; 465: return (struct toktab *) (state->o_val); 466: } 467: } 468: } 469: 470: /* 471: * nextchar - return the next character in the input. 472: */ 473: 474: nextchar() 475: { 476: register char c; 477: 478: if (c = peekc) { 479: peekc = 0; 480: return (c); 481: } 482: c = getc(infile); 483: switch (c) { 484: case EOF: 485: inline = 0; 486: incol = 0; 487: break; 488: case '\n': 489: inline++; 490: incol = 0; 491: break; 492: case '\t': 493: incol = (incol | 7) + 1; 494: break; 495: case '\b': 496: if (incol) 497: incol--; 498: break; 499: default: 500: incol++; 501: } 502: return (c); 503: }