1: /* 2: * Copyright (c) 1980 Regents of the University of California. 3: * All rights reserved. The Berkeley software License Agreement 4: * specifies the terms and conditions for redistribution. 5: */ 6: 7: #ifndef lint 8: static char sccsid[] = "@(#)lexi.c 5.4 (Berkeley) 9/10/85"; 9: #endif not lint 10: 11: /*- 12: * 13: * Copyright (C) 1976 14: * by the 15: * Board of Trustees 16: * of the 17: * University of Illinois 18: * 19: * All rights reserved 20: * 21: * 22: * NAME: 23: * lexi 24: * 25: * FUNCTION: 26: * This is the token scanner for indent 27: * 28: * ALGORITHM: 29: * 1) Strip off intervening blanks and/or tabs. 30: * 2) If it is an alphanumeric token, move it to the token buffer "token". 31: * Check if it is a special reserved word that indent will want to 32: * know about. 33: * 3) Non-alphanumeric tokens are handled with a big switch statement. A 34: * flag is kept to remember if the last token was a "unary delimiter", 35: * which forces a following operator to be unary as opposed to binary. 36: * 37: * PARAMETERS: 38: * None 39: * 40: * RETURNS: 41: * An integer code indicating the type of token scanned. 42: * 43: * GLOBALS: 44: * buf_ptr = 45: * had_eof 46: * ps.last_u_d = Set to true iff this token is a "unary delimiter" 47: * 48: * CALLS: 49: * fill_buffer 50: * printf (lib) 51: * 52: * CALLED BY: 53: * main 54: * 55: * NOTES: 56: * Start of comment is passed back so that the comment can be scanned by 57: * pr_comment. 58: * 59: * Strings and character literals are returned just like identifiers. 60: * 61: * HISTORY: 62: * initial coding November 1976 D A Willcox of CAC 63: * 1/7/77 D A Willcox of CAC Fix to provide proper handling 64: * of "int a -1;" 65: * 66: */ 67: 68: /* 69: * Here we have the token scanner for indent. It scans off one token and 70: * puts it in the global variable "token". It returns a code, indicating 71: * the type of token scanned. 72: */ 73: 74: #include "indent_globs.h"; 75: #include "indent_codes.h"; 76: #include "ctype.h" 77: 78: #define alphanum 1 79: #define opchar 3 80: 81: struct templ { 82: char *rwd; 83: int rwcode; 84: }; 85: 86: struct templ specials[100] = 87: { 88: "switch", 1, 89: "case", 2, 90: "break", 0, 91: "struct", 3, 92: "union", 3, 93: "enum", 3, 94: "default", 2, 95: "int", 4, 96: "char", 4, 97: "float", 4, 98: "double", 4, 99: "long", 4, 100: "short", 4, 101: "typdef", 4, 102: "unsigned", 4, 103: "register", 4, 104: "static", 4, 105: "global", 4, 106: "extern", 4, 107: "void", 4, 108: "goto", 0, 109: "return", 0, 110: "if", 5, 111: "while", 5, 112: "for", 5, 113: "else", 6, 114: "do", 6, 115: "sizeof", 7, 116: 0, 0 117: }; 118: 119: char chartype[128] = 120: { /* this is used to facilitate the decision 121: * of what type (alphanumeric, operator) 122: * each character is */ 123: 0, 0, 0, 0, 0, 0, 0, 0, 124: 0, 0, 0, 0, 0, 0, 0, 0, 125: 0, 0, 0, 0, 0, 0, 0, 0, 126: 0, 0, 0, 0, 0, 0, 0, 0, 127: 0, 3, 0, 0, 0, 3, 3, 0, 128: 0, 0, 3, 3, 0, 3, 3, 3, 129: 1, 1, 1, 1, 1, 1, 1, 1, 130: 1, 1, 0, 0, 3, 3, 3, 3, 131: 0, 1, 1, 1, 1, 1, 1, 1, 132: 1, 1, 1, 1, 1, 1, 1, 1, 133: 1, 1, 1, 1, 1, 1, 1, 1, 134: 1, 1, 1, 0, 0, 0, 3, 1, 135: 0, 1, 1, 1, 1, 1, 1, 1, 136: 1, 1, 1, 1, 1, 1, 1, 1, 137: 1, 1, 1, 1, 1, 1, 1, 1, 138: 1, 1, 1, 0, 3, 0, 3, 0 139: }; 140: 141: 142: 143: 144: int 145: lexi() 146: { 147: register char *tok; /* local pointer to next char in token */ 148: int unary_delim; /* this is set to 1 if the current token 149: * 150: * forces a following operator to be unary */ 151: static int last_code; /* the last token type returned */ 152: static int l_struct; /* set to 1 if the last token was 'struct' */ 153: int code; /* internal code to be returned */ 154: char qchar; /* the delimiter character for a string */ 155: 156: tok = token; /* point to start of place to save token */ 157: unary_delim = false; 158: ps.col_1 = ps.last_nl; /* tell world that this token started in 159: * column 1 iff the last thing scanned was 160: * nl */ 161: ps.last_nl = false; 162: 163: while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 164: ps.col_1 = false; /* leading blanks imply token is not in 165: * column 1 */ 166: if (++buf_ptr >= buf_end) 167: fill_buffer(); 168: } 169: 170: /* Scan an alphanumeric token. Note that we must also handle 171: * stuff like "1.0e+03" and "7e-6". */ 172: if (chartype[*buf_ptr & 0177] == alphanum) { /* we have a character 173: * or number */ 174: register char *j; /* used for searching thru list of 175: * reserved words */ 176: register struct templ *p; 177: register int c; 178: 179: do { /* copy it over */ 180: *tok++ = *buf_ptr++; 181: if (buf_ptr >= buf_end) 182: fill_buffer(); 183: } while (chartype[c = *buf_ptr & 0177] == alphanum || 184: isdigit(token[0]) && (c == '+' || c == '-') && 185: (tok[-1] == 'e' || tok[-1] == 'E')); 186: *tok++ = '\0'; 187: while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 188: if (++buf_ptr >= buf_end) 189: fill_buffer(); 190: } 191: ps.its_a_keyword = false; 192: ps.sizeof_keyword = false; 193: if (l_struct) { /* if last token was 'struct', then this 194: * token should be treated as a 195: * declaration */ 196: l_struct = false; 197: last_code = ident; 198: ps.last_u_d = true; 199: return (decl); 200: } 201: ps.last_u_d = false; /* Operator after indentifier is binary */ 202: last_code = ident; /* Remember that this is the code we will 203: * return */ 204: 205: /* 206: * This loop will check if the token is a keyword. 207: */ 208: for (p = specials; (j = p->rwd) != 0; p++) { 209: tok = token; /* point at scanned token */ 210: if (*j++ != *tok++ || *j++ != *tok++) 211: continue; /* This test depends on the fact that 212: * identifiers are always at least 1 213: * character long (ie. the first two bytes 214: * of the identifier are always 215: * meaningful) */ 216: if (tok[-1] == 0) 217: break; /* If its a one-character identifier */ 218: while (*tok++ == *j) 219: if (*j++ == 0) 220: goto found_keyword; /* I wish that C had a multi-level 221: * break... */ 222: } 223: if (p->rwd) { /* we have a keyword */ 224: found_keyword: 225: ps.its_a_keyword = true; 226: ps.last_u_d = true; 227: switch (p->rwcode) { 228: case 1: /* it is a switch */ 229: return (swstmt); 230: case 2: /* a case or default */ 231: return (casestmt); 232: 233: case 3: /* a "struct" */ 234: if (ps.p_l_follow) 235: break; /* inside parens: cast */ 236: l_struct = true; 237: 238: /* 239: * Next time around, we will want to know that we have 240: * had a 'struct' 241: */ 242: case 4: /* one of the declaration keywords */ 243: if (ps.p_l_follow) { 244: ps.cast_mask |= 1 << ps.p_l_follow; 245: break; /* inside parens: cast */ 246: } 247: last_code = decl; 248: return (decl); 249: 250: case 5: /* if, while, for */ 251: return (sp_paren); 252: 253: case 6: /* do, else */ 254: return (sp_nparen); 255: 256: case 7: 257: ps.sizeof_keyword = true; 258: default: /* all others are treated like any other 259: * identifier */ 260: return (ident); 261: } /* end of switch */ 262: } /* end of if (found_it) */ 263: if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0 264: && (buf_ptr[1] != ')' || buf_ptr[2] != ';')) { 265: strncpy(ps.procname, token, sizeof ps.procname - 1); 266: ps.in_parameter_declaration = 1; 267: } 268: 269: /* 270: * The following hack attempts to guess whether or not the current 271: * token is in fact a declaration keyword -- one that has been 272: * typedefd 273: */ 274: if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr)) 275: && !ps.p_l_follow 276: && (ps.last_token == rparen || ps.last_token == semicolon || 277: ps.last_token == decl || 278: ps.last_token == lbrace || ps.last_token == rbrace)) { 279: ps.its_a_keyword = true; 280: ps.last_u_d = true; 281: last_code = decl; 282: return decl; 283: } 284: if (last_code == decl) /* if this is a declared variable, then 285: * following sign is unary */ 286: ps.last_u_d = true; /* will make "int a -1" work */ 287: last_code = ident; 288: return (ident); /* the ident is not in the list */ 289: } /* end of procesing for alpanum character */ 290: /* Scan a non-alphanumeric token */ 291: 292: *tok++ = *buf_ptr; /* if it is only a one-character token, it 293: * is moved here */ 294: *tok = '\0'; 295: if (++buf_ptr >= buf_end) 296: fill_buffer(); 297: 298: switch (*token) { 299: case '\n': 300: unary_delim = ps.last_u_d; 301: ps.last_nl = true; /* remember that we just had a newline */ 302: code = (had_eof ? 0 : newline); 303: 304: /* 305: * if data has been exausted, the newline is a dummy, and we 306: * should return code to stop 307: */ 308: break; 309: 310: case '\'': /* start of quoted character */ 311: case '"': /* start of string */ 312: qchar = *token; 313: if (troff) { 314: tok[-1] = '`'; 315: if (qchar == '"') 316: *tok++ = '`'; 317: *tok++ = BACKSLASH; 318: *tok++ = 'f'; 319: *tok++ = 'L'; 320: } 321: do { /* copy the string */ 322: while (1) { /* move one character or [/<char>]<char> */ 323: if (*buf_ptr == '\n') { 324: printf("%d: Unterminated literal\n", line_no); 325: goto stop_lit; 326: } 327: *tok = *buf_ptr++; 328: if (buf_ptr >= buf_end) 329: fill_buffer(); 330: if (had_eof || ((tok - token) > (bufsize - 2))) { 331: printf("Unterminated literal\n"); 332: ++tok; 333: goto stop_lit; 334: /* get outof literal copying loop */ 335: } 336: if (*tok == BACKSLASH) { /* if escape, copy extra 337: * char */ 338: if (*buf_ptr == '\n') /* check for escaped 339: * newline */ 340: ++line_no; 341: if (troff) { 342: *++tok = BACKSLASH; 343: if (*buf_ptr == BACKSLASH) 344: *++tok = BACKSLASH; 345: } 346: *++tok = *buf_ptr++; 347: ++tok; /* we must increment this again because we 348: * copied two chars */ 349: if (buf_ptr >= buf_end) 350: fill_buffer(); 351: } 352: else 353: break; /* we copied one character */ 354: } /* end of while (1) */ 355: } while (*tok++ != qchar); 356: if (troff) { 357: tok[-1] = BACKSLASH; 358: *tok++ = 'f'; 359: *tok++ = 'R'; 360: *tok++ = '\''; 361: if (qchar == '"') 362: *tok++ = '\''; 363: } 364: stop_lit: 365: code = ident; 366: break; 367: 368: case ('('): 369: case ('['): 370: unary_delim = true; 371: code = lparen; 372: break; 373: 374: case (')'): 375: case (']'): 376: code = rparen; 377: break; 378: 379: case '#': 380: unary_delim = ps.last_u_d; 381: code = preesc; 382: break; 383: 384: case '?': 385: unary_delim = true; 386: code = question; 387: break; 388: 389: case (':'): 390: code = colon; 391: unary_delim = true; 392: break; 393: 394: case (';'): 395: unary_delim = true; 396: code = semicolon; 397: break; 398: 399: case ('{'): 400: unary_delim = true; 401: 402: /* 403: * if (ps.in_or_st) ps.block_init = 1; 404: */ 405: code = ps.block_init ? lparen : lbrace; 406: break; 407: 408: case ('}'): 409: unary_delim = true; 410: code = ps.block_init ? rparen : rbrace; 411: break; 412: 413: case 014: /* a form feed */ 414: unary_delim = ps.last_u_d; 415: ps.last_nl = true; /* remember this so we can set 'ps.col_1' 416: * right */ 417: code = form_feed; 418: break; 419: 420: case (','): 421: unary_delim = true; 422: code = comma; 423: break; 424: 425: case '.': 426: unary_delim = false; 427: code = period; 428: break; 429: 430: case '-': 431: case '+': /* check for -, +, --, ++ */ 432: code = (ps.last_u_d ? unary_op : binary_op); 433: unary_delim = true; 434: 435: if (*buf_ptr == token[0]) { 436: /* check for doubled character */ 437: *tok++ = *buf_ptr++; 438: /* buffer overflow will be checked at end of loop */ 439: if (last_code == ident || last_code == rparen) { 440: code = (ps.last_u_d ? unary_op : postop); 441: /* check for following ++ or -- */ 442: unary_delim = false; 443: } 444: } 445: else if (*buf_ptr == '=') 446: /* check for operator += */ 447: *tok++ = *buf_ptr++; 448: else if (token[0] == '-' && *buf_ptr == '>') { 449: /* check for operator -> */ 450: *tok++ = *buf_ptr++; 451: if (!pointer_as_binop) { 452: code = unary_op; 453: unary_delim = false; 454: ps.want_blank = false; 455: } 456: } 457: /* buffer overflow will be checked at end of switch */ 458: 459: break; 460: 461: case '=': 462: if (ps.in_or_st) 463: ps.block_init = 1; 464: if (chartype[*buf_ptr] == opchar) { /* we have two char 465: * assignment */ 466: tok[-1] = *buf_ptr++; 467: if ((tok[-1] == '<' || tok[-1] == '>') && tok[-1] == *buf_ptr) 468: *tok++ = *buf_ptr++; 469: *tok++ = '='; /* Flip =+ to += */ 470: *tok = 0; 471: } 472: code = binary_op; 473: unary_delim = true; 474: break; 475: /* can drop thru!!! */ 476: 477: case '>': 478: case '<': 479: case '!': /* ops like <, <<, <=, !=, etc */ 480: if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 481: *tok++ = *buf_ptr; 482: if (++buf_ptr >= buf_end) 483: fill_buffer(); 484: } 485: if (*buf_ptr == '=') 486: *tok++ = *buf_ptr++; 487: code = (ps.last_u_d ? unary_op : binary_op); 488: unary_delim = true; 489: break; 490: 491: default: 492: if (token[0] == '/' && *buf_ptr == '*') { 493: /* it is start of comment */ 494: *tok++ = '*'; 495: 496: if (++buf_ptr >= buf_end) 497: fill_buffer(); 498: 499: code = comment; 500: unary_delim = ps.last_u_d; 501: break; 502: } 503: while (*(tok - 1) == *buf_ptr || *buf_ptr == '=') { 504: /* handle ||, &&, etc, and also things as in int *****i */ 505: *tok++ = *buf_ptr; 506: if (++buf_ptr >= buf_end) 507: fill_buffer(); 508: } 509: code = (ps.last_u_d ? unary_op : binary_op); 510: unary_delim = true; 511: 512: 513: } /* end of switch */ 514: if (code != newline) { 515: l_struct = false; 516: last_code = code; 517: } 518: if (buf_ptr >= buf_end) /* check for input buffer empty */ 519: fill_buffer(); 520: ps.last_u_d = unary_delim; 521: *tok = '\0'; /* null terminate the token */ 522: return (code); 523: }; 524: 525: /* Add the given keyword to the keyword table, using val as the keyword type 526: */ 527: addkey (key, val) 528: char *key; 529: { 530: register struct templ *p = specials; 531: while (p->rwd) 532: if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0) 533: return; 534: else 535: p++; 536: if (p >= specials + sizeof specials / sizeof specials[0]) 537: return; /* For now, table overflows are silently 538: ignored */ 539: p->rwd = key; 540: p->rwcode = val; 541: p[1].rwd = 0; 542: p[1].rwcode = 0; 543: return; 544: }