1: static char sccsid[] = "@(#)lexi.c 4.1 (Berkeley) 10/21/82"; 2: 3: /* 4: 5: Copyright (C) 1976 6: by the 7: Board of Trustees 8: of the 9: University of Illinois 10: 11: All rights reserved 12: 13: 14: NAME: 15: lexi 16: 17: FUNCTION: 18: This is the token scanner for indent 19: 20: ALGORITHM: 21: 1) Strip off intervening blanks and/or tabs. 22: 2) If it is an alphanumeric token, move it to the token buffer "token". 23: Check if it is a special reserved word that indent will want to 24: know about. 25: 3) Non-alphanumeric tokens are handled with a big switch statement. A 26: flag is kept to remember if the last token was a "unary delimiter", 27: which forces a following operator to be unary as opposed to binary. 28: 29: PARAMETERS: 30: None 31: 32: RETURNS: 33: An integer code indicating the type of token scanned. 34: 35: GLOBALS: 36: buf_ptr = 37: had_eof 38: last_u_d = Set to true iff this token is a "unary delimiter" 39: 40: CALLS: 41: fill_buffer 42: printf (lib) 43: 44: CALLED BY: 45: main 46: 47: NOTES: 48: Start of comment is passed back so that the comment can be scanned by 49: pr_comment. 50: 51: Strings and character literals are returned just like identifiers. 52: 53: HISTORY: 54: initial coding November 1976 D A Willcox of CAC 55: 1/7/77 D A Willcox of CAC Fix to provide proper handling 56: of "int a -1;" 57: 58: */ 59: 60: /* Here we have the token scanner for indent. It scans off one token and 61: puts it in the global variable "token". It returns a code, indicating the 62: type of token scanned. */ 63: 64: #include "indent_globs.h"; 65: #include "indent_codes.h"; 66: 67: 68: 69: #define alphanum 1 70: #define opchar 3 71: 72: struct templ { 73: char *rwd; 74: int rwcode; 75: }; 76: 77: struct templ specials[] = 78: { 79: "switch", 1, 80: "case", 2, 81: "struct", 3, 82: "default", 2, 83: "int", 4, 84: "char", 4, 85: "float", 4, 86: "double", 4, 87: "long", 4, 88: "short", 4, 89: "typdef", 4, 90: "unsigned", 4, 91: "register", 4, 92: "static", 4, 93: "global", 4, 94: "extern", 4, 95: "if", 5, 96: "while", 5, 97: "for", 5, 98: "else", 6, 99: "do", 6, 100: "sizeof", 0, 101: 0, 0 102: }; 103: 104: char chartype[128] = 105: { /* this is used to facilitate the decision of what type 106: (alphanumeric, operator) each character is */ 107: 0, 0, 0, 0, 0, 0, 0, 0, 108: 0, 0, 0, 0, 0, 0, 0, 0, 109: 0, 0, 0, 0, 0, 0, 0, 0, 110: 0, 0, 0, 0, 0, 0, 0, 0, 111: 0, 3, 0, 0, 0, 3, 3, 0, 112: 0, 0, 3, 3, 0, 3, 3, 3, 113: 1, 1, 1, 1, 1, 1, 1, 1, 114: 1, 1, 0, 0, 3, 3, 3, 3, 115: 0, 1, 1, 1, 1, 1, 1, 1, 116: 1, 1, 1, 1, 1, 1, 1, 1, 117: 1, 1, 1, 1, 1, 1, 1, 1, 118: 1, 1, 1, 0, 0, 0, 3, 1, 119: 0, 1, 1, 1, 1, 1, 1, 1, 120: 1, 1, 1, 1, 1, 1, 1, 1, 121: 1, 1, 1, 1, 1, 1, 1, 1, 122: 1, 1, 1, 0, 3, 0, 3, 0 123: }; 124: 125: int last_nl = true; 126: /* this is true if the last thing scanned was a newline */ 127: 128: 129: 130: int lexi () { 131: register char *tok; 132: /* local pointer to next char in token */ 133: register int i; 134: /* local loop counter */ 135: register char *j; 136: /* used for searching thru list of reserved words */ 137: int unary_delim; 138: /* this is set to 1 if the current token forces a following operator to be 139: unary */ 140: static int last_code; 141: /* the last token type returned */ 142: static int l_struct; 143: /* set to 1 if the last token was 'struct' */ 144: int found_it; 145: int code; /* internal code to be returned */ 146: char qchar; /* the delimiter character for a string */ 147: 148: tok = token; /* point to start of place to save token */ 149: unary_delim = false; 150: col_1 = last_nl; /* tell world that this token started in column 151: 1 iff the last thing scanned was nl */ 152: last_nl = false; 153: 154: while (*buf_ptr == ' ' || *buf_ptr == '\t') { 155: /* get rid of blanks */ 156: col_1 = false; /* leading blanks imply token is not in column 1 157: */ 158: if (++buf_ptr >= buf_end) 159: fill_buffer (); 160: } 161: 162: /*----------------------------------------------------------*\ 163: | Scan an alphanumeric token 164: \*----------------------------------------------------------*/ 165: 166: if (chartype[*buf_ptr & 0177] == alphanum) { 167: /* we have a character or number */ 168: while (chartype[*buf_ptr & 0177] == alphanum) { 169: /* copy it over */ 170: *tok++ = *buf_ptr++; 171: if (buf_ptr >= buf_end) 172: fill_buffer (); 173: } 174: 175: *tok++ = '\0'; 176: 177: if (l_struct) { /* if last token was 'struct', then this token 178: should be treated as a declaration */ 179: l_struct = false; 180: last_code = ident; 181: last_u_d = true; 182: return (decl); 183: } 184: 185: last_u_d = false; /* operator after indentifier is binary */ 186: 187: for (i = 0; specials[i].rwd != 0; ++i) { 188: /* this loop will check if the token is a keyword. if so, a following 189: operator is unary */ 190: last_code = ident; /* remember that this is the code we will return 191: */ 192: j = specials[i].rwd; 193: /* point at ith reserved word */ 194: tok = token; /* point at scanned toekn */ 195: found_it = true; /* set to false if not found */ 196: do { 197: if (*tok++ != *j) { 198: found_it = false; 199: break; 200: } 201: } while (*j++); 202: 203: if (found_it) { /* we have a keyword */ 204: last_u_d = true; 205: switch (specials[i].rwcode) { 206: case 1: /* it is a switch */ 207: return (swstmt); 208: case 2: /* a case or default */ 209: return (casestmt); 210: 211: case 3: /* a "struct" */ 212: l_struct = true; 213: /* Next time around, we will want to know that we have had 214: a 'struct' */ 215: case 4: /* one of the declaration keywords */ 216: if(p_l_follow) break; /* inside parens: cast */ 217: last_code = decl; 218: return (decl); 219: 220: case 5: /* if, while, for */ 221: return (sp_paren); 222: 223: case 6: /* do, else */ 224: return (sp_nparen); 225: 226: default: /* all others are treated like any other 227: identifier */ 228: return (ident); 229: } /* end of switch */ 230: } /* end of if (found_it) */ 231: 232: } 233: 234: if (last_code == decl) /* if this is a declared variable, then 235: following sign is unary */ 236: last_u_d = true; /* will make "int a -1" work */ 237: last_code = ident; 238: return (ident); /* the ident is not in the list */ 239: } /* end of procesing for alpanum character */ 240: 241: 242: 243: /*----------------------------------------------------------*\ 244: | Scan a non-alphanumeric token 245: \*----------------------------------------------------------*/ 246: 247: *tok++ = *buf_ptr; /* if it is only a one-character token, it is 248: moved here */ 249: *tok = '\0'; 250: if (++buf_ptr >= buf_end) 251: fill_buffer (); 252: 253: switch (*token) { 254: case '\n': 255: unary_delim = last_u_d; 256: last_nl = true; /* remember that we just had a newline */ 257: code = (had_eof ? 0 : newline); 258: /* if data has been exausted, the newline is a dummy, and we should 259: return code to stop */ 260: break; 261: 262: case '\'': /* start of quoted character */ 263: qchar = '\''; /* remember final delimiter */ 264: goto copy_lit; /* and go to common literal code */ 265: 266: case '"': /* start of string */ 267: qchar = '"'; 268: 269: copy_lit: 270: do { /* copy the string */ 271: while (1) { /* move one character or [/<char>]<char> */ 272: if (*buf_ptr == '\n') { 273: /* check for unterminated literal */ 274: printf ("%d: Unterminated literal\n", line_no); 275: goto stop_lit; 276: /* Don't copy any more */ 277: } 278: 279: *tok = *buf_ptr++; 280: if (buf_ptr >= buf_end) 281: fill_buffer (); 282: if (had_eof || ((tok - token) > (bufsize - 2))) { 283: printf ("Unterminated literal\n"); 284: ++tok; 285: goto stop_lit; 286: /* get outof literal copying loop */ 287: } 288: 289: if (*tok == '\\') { 290: /* if escape, copy extra char */ 291: if (*buf_ptr == '\n') 292: /* check for escaped newline */ 293: ++line_no; 294: *(++tok) = *buf_ptr++; 295: ++tok; /* we must increment this again because we 296: copied two chars */ 297: if (buf_ptr >= buf_end) 298: fill_buffer (); 299: } 300: else 301: break; /* we copied one character */ 302: } /* end of while (1) */ 303: } while (*tok++ != qchar); 304: 305: stop_lit: 306: code = ident; 307: break; 308: 309: case ('('): 310: case ('['): 311: unary_delim = true; 312: code = lparen; 313: break; 314: 315: case (')'): 316: case (']'): 317: code = rparen; 318: break; 319: 320: case '#': 321: unary_delim = last_u_d; 322: code = preesc; 323: break; 324: 325: case '?': 326: unary_delim = true; 327: code = question; 328: break; 329: 330: case (':'): 331: code = colon; 332: unary_delim = true; 333: break; 334: 335: case (';'): 336: unary_delim = true; 337: code = semicolon; 338: break; 339: 340: case ('{'): 341: unary_delim = true; 342: code = lbrace; 343: break; 344: 345: case ('}'): 346: unary_delim = true; 347: code = rbrace; 348: break; 349: 350: case 014: /* a form feed */ 351: unary_delim = last_u_d; 352: last_nl = true; /* remember this so we can set 'col_1' right */ 353: code = form_feed; 354: break; 355: 356: case (','): 357: unary_delim = true; 358: code = comma; 359: break; 360: 361: case '.': 362: unary_delim = false; 363: code = period; 364: break; 365: 366: case '-': 367: case '+': /* check for -, +, --, ++ */ 368: code = (last_u_d ? unary_op : binary_op); 369: unary_delim = true; 370: 371: if (*buf_ptr == token[0]) { 372: /* check for doubled character */ 373: *tok++ = *buf_ptr++; 374: /* buffer overflow will be checked at end of loop */ 375: if (last_code == ident || last_code == rparen) { 376: code = (last_u_d ? unary_op : postop); 377: /* check for following ++ or -- */ 378: unary_delim = false; 379: } 380: } 381: else 382: if (*buf_ptr == '>' || *buf_ptr == '=') 383: /* check for operator -> or += */ 384: *tok++ = *buf_ptr++; 385: /* buffer overflow will be checked at end of switch */ 386: 387: break; 388: 389: case '=': 390: if (chartype[*buf_ptr] == opchar) { 391: /* we have two char assignment */ 392: *tok++ = *buf_ptr; 393: /* move second character */ 394: if (++buf_ptr >= buf_end) 395: fill_buffer (); 396: } 397: 398: code = binary_op; 399: unary_delim = true; 400: if (token[1] != '<' && token[1] != '>') 401: /* check for possible 3 char operator */ 402: break; 403: /* can drop thru!!! */ 404: 405: case '>': 406: case '<': 407: case '!': /* ops like <, <<, <=, !=, etc */ 408: if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 409: *tok++ = *buf_ptr; 410: if (++buf_ptr >= buf_end) 411: fill_buffer (); 412: } 413: 414: if (*buf_ptr == '=') 415: *tok++ = *buf_ptr++; 416: code = (last_u_d ? unary_op : binary_op); 417: unary_delim = true; 418: break; 419: 420: default: 421: if (token[0] == '/' && *buf_ptr == '*') { 422: /* it is start of comment */ 423: *tok++ = '*'; 424: 425: if (++buf_ptr >= buf_end) 426: fill_buffer (); 427: 428: code = comment; 429: unary_delim = last_u_d; 430: break; 431: } 432: 433: while (*(tok - 1) == *buf_ptr || *buf_ptr=='=') { 434: /* handle ||, &&, etc, and also things as in int *****i */ 435: *tok++ = *buf_ptr; 436: if (++buf_ptr >= buf_end) 437: fill_buffer (); 438: } 439: 440: 441: code = (last_u_d ? unary_op : binary_op); 442: unary_delim = true; 443: 444: 445: } /* end of switch */ 446: 447: if (code != newline) { 448: l_struct = false; 449: last_code = code; 450: } 451: 452: if (buf_ptr >= buf_end) /* check for input buffer empty */ 453: fill_buffer (); 454: last_u_d = unary_delim; 455: *tok = '\0'; /* null terminate the token */ 456: return (code); 457: };