1: /* $Header: search.c,v 1.0 87/12/18 13:05:59 root Exp $ 2: * 3: * $Log: search.c,v $ 4: * Revision 1.0 87/12/18 13:05:59 root 5: * Initial revision 6: * 7: */ 8: 9: /* string search routines */ 10: 11: #include <stdio.h> 12: #include <ctype.h> 13: 14: #include "EXTERN.h" 15: #include "handy.h" 16: #include "util.h" 17: #include "INTERN.h" 18: #include "search.h" 19: 20: #define VERBOSE 21: #define FLUSH 22: #define MEM_SIZE int 23: 24: #ifndef BITSPERBYTE 25: #define BITSPERBYTE 8 26: #endif 27: 28: #define BMAPSIZ (127 / BITSPERBYTE + 1) 29: 30: #define CHAR 0 /* a normal character */ 31: #define ANY 1 /* . matches anything except newline */ 32: #define CCL 2 /* [..] character class */ 33: #define NCCL 3 /* [^..]negated character class */ 34: #define BEG 4 /* ^ beginning of a line */ 35: #define END 5 /* $ end of a line */ 36: #define LPAR 6 /* ( begin sub-match */ 37: #define RPAR 7 /* ) end sub-match */ 38: #define REF 8 /* \N backreference to the Nth submatch */ 39: #define WORD 9 /* \w matches alphanumeric character */ 40: #define NWORD 10 /* \W matches non-alphanumeric character */ 41: #define WBOUND 11 /* \b matches word boundary */ 42: #define NWBOUND 12 /* \B matches non-boundary */ 43: #define FINIS 13 /* the end of the pattern */ 44: 45: #define CODEMASK 15 46: 47: /* Quantifiers: */ 48: 49: #define MINZERO 16 /* minimum is 0, not 1 */ 50: #define MAXINF 32 /* maximum is infinity, not 1 */ 51: 52: #define ASCSIZ 0200 53: typedef char TRANSTABLE[ASCSIZ]; 54: 55: static TRANSTABLE trans = { 56: 0000,0001,0002,0003,0004,0005,0006,0007, 57: 0010,0011,0012,0013,0014,0015,0016,0017, 58: 0020,0021,0022,0023,0024,0025,0026,0027, 59: 0030,0031,0032,0033,0034,0035,0036,0037, 60: 0040,0041,0042,0043,0044,0045,0046,0047, 61: 0050,0051,0052,0053,0054,0055,0056,0057, 62: 0060,0061,0062,0063,0064,0065,0066,0067, 63: 0070,0071,0072,0073,0074,0075,0076,0077, 64: 0100,0101,0102,0103,0104,0105,0106,0107, 65: 0110,0111,0112,0113,0114,0115,0116,0117, 66: 0120,0121,0122,0123,0124,0125,0126,0127, 67: 0130,0131,0132,0133,0134,0135,0136,0137, 68: 0140,0141,0142,0143,0144,0145,0146,0147, 69: 0150,0151,0152,0153,0154,0155,0156,0157, 70: 0160,0161,0162,0163,0164,0165,0166,0167, 71: 0170,0171,0172,0173,0174,0175,0176,0177, 72: }; 73: static bool folding = FALSE; 74: 75: static int err; 76: #define NOERR 0 77: #define BEGFAIL 1 78: #define FATAL 2 79: 80: static char *FirstCharacter; 81: static char *matchend; 82: static char *matchtill; 83: 84: void 85: search_init() 86: { 87: #ifdef UNDEF 88: register int i; 89: 90: for (i = 0; i < ASCSIZ; i++) 91: trans[i] = i; 92: #else 93: ; 94: #endif 95: } 96: 97: void 98: init_compex(compex) 99: register COMPEX *compex; 100: { 101: /* the following must start off zeroed */ 102: 103: compex->precomp = Nullch; 104: compex->complen = 0; 105: compex->subbase = Nullch; 106: } 107: 108: #ifdef NOTUSED 109: void 110: free_compex(compex) 111: register COMPEX *compex; 112: { 113: if (compex->complen) { 114: safefree(compex->compbuf); 115: compex->complen = 0; 116: } 117: if (compex->subbase) { 118: safefree(compex->subbase); 119: compex->subbase = Nullch; 120: } 121: } 122: #endif 123: 124: static char *gbr_str = Nullch; 125: static int gbr_siz = 0; 126: 127: char * 128: getparen(compex,n) 129: register COMPEX *compex; 130: int n; 131: { 132: int length = compex->subend[n] - compex->subbeg[n]; 133: 134: if (!n && 135: (!compex->numsubs || n > compex->numsubs || !compex->subend[n] || length<0)) 136: return ""; 137: growstr(&gbr_str, &gbr_siz, length+1); 138: safecpy(gbr_str, compex->subbeg[n], length+1); 139: return gbr_str; 140: } 141: 142: void 143: case_fold(which) 144: int which; 145: { 146: register int i; 147: 148: if (which != folding) { 149: if (which) { 150: for (i = 'A'; i <= 'Z'; i++) 151: trans[i] = tolower(i); 152: } 153: else { 154: for (i = 'A'; i <= 'Z'; i++) 155: trans[i] = i; 156: } 157: folding = which; 158: } 159: } 160: 161: /* Compile the regular expression into internal form */ 162: 163: char * 164: compile(compex, sp, regex, fold) 165: register COMPEX *compex; 166: register char *sp; 167: int regex; 168: int fold; 169: { 170: register int c; 171: register char *cp; 172: char *lastcp; 173: char paren[MAXSUB], 174: *parenp; 175: char **alt = compex->alternatives; 176: char *retmes = "Badly formed search string"; 177: 178: case_fold(compex->do_folding = fold); 179: if (compex->precomp) 180: safefree(compex->precomp); 181: compex->precomp = savestr(sp); 182: if (!compex->complen) { 183: compex->compbuf = safemalloc(84); 184: compex->complen = 80; 185: } 186: cp = compex->compbuf; /* point at compiled buffer */ 187: *alt++ = cp; /* first alternative starts here */ 188: parenp = paren; /* first paren goes here */ 189: if (*sp == 0) { /* nothing to compile? */ 190: #ifdef NOTDEF 191: if (*cp == 0) /* nothing there yet? */ 192: return "Null search string"; 193: #endif 194: if (*cp) 195: return Nullch; /* just keep old expression */ 196: } 197: compex->numsubs = 0; /* no parens yet */ 198: lastcp = 0; 199: for (;;) { 200: if (cp - compex->compbuf >= compex->complen) { 201: char *ocompbuf = compex->compbuf; 202: 203: grow_comp(compex); 204: if (ocompbuf != compex->compbuf) { /* adjust pointers? */ 205: char **tmpalt; 206: 207: cp = compex->compbuf + (cp - ocompbuf); 208: if (lastcp) 209: lastcp = compex->compbuf + (lastcp - ocompbuf); 210: for (tmpalt = compex->alternatives; tmpalt < alt; tmpalt++) 211: if (*tmpalt) 212: *tmpalt = compex->compbuf + (*tmpalt - ocompbuf); 213: } 214: } 215: c = *sp++; /* get next char of pattern */ 216: if (c == 0) { /* end of pattern? */ 217: if (parenp != paren) { /* balanced parentheses? */ 218: #ifdef VERBOSE 219: retmes = "Missing right parenthesis"; 220: #endif 221: goto badcomp; 222: } 223: *cp++ = FINIS; /* append a stopper */ 224: *alt++ = 0; /* terminate alternative list */ 225: /* 226: compex->complen = cp - compex->compbuf + 1; 227: compex->compbuf = saferealloc(compex->compbuf,compex->complen+4); */ 228: return Nullch; /* return success */ 229: } 230: if (c != '*' && c != '?' && c != '+') 231: lastcp = cp; 232: if (!regex) { /* just a normal search string? */ 233: *cp++ = CHAR; /* everything is a normal char */ 234: *cp++ = trans[c]; 235: } 236: else /* it is a regular expression */ 237: switch (c) { 238: 239: default: 240: normal_char: 241: *cp++ = CHAR; 242: *cp++ = trans[c]; 243: continue; 244: 245: case '.': 246: *cp++ = ANY; 247: continue; 248: 249: case '[': { /* character class */ 250: register int i; 251: 252: if (cp - compex->compbuf >= compex->complen - BMAPSIZ) { 253: char *ocompbuf = compex->compbuf; 254: 255: grow_comp(compex); /* reserve bitmap */ 256: if (ocompbuf != compex->compbuf) {/* adjust pointers? */ 257: char **tmpalt; 258: 259: cp = compex->compbuf + (cp - ocompbuf); 260: if (lastcp) 261: lastcp = compex->compbuf + (lastcp - ocompbuf); 262: for (tmpalt = compex->alternatives; tmpalt < alt; 263: tmpalt++) 264: if (*tmpalt) 265: *tmpalt = 266: compex->compbuf + (*tmpalt - ocompbuf); 267: } 268: } 269: for (i = BMAPSIZ; i; --i) 270: cp[i] = 0; 271: 272: if ((c = *sp++) == '^') { 273: c = *sp++; 274: *cp++ = NCCL; /* negated */ 275: } 276: else 277: *cp++ = CCL; /* normal */ 278: 279: i = 0; /* remember oldchar */ 280: do { 281: if (c == '\0') { 282: #ifdef VERBOSE 283: retmes = "Missing ]"; 284: #endif 285: goto badcomp; 286: } 287: if (c == '\\' && *sp) { 288: switch (*sp) { 289: default: 290: c = *sp++; 291: break; 292: case '0': case '1': case '2': case '3': 293: case '4': case '5': case '6': case '7': 294: c = *sp++ - '0'; 295: if (index("01234567",*sp)) { 296: c <<= 3; 297: c += *sp++ - '0'; 298: } 299: if (index("01234567",*sp)) { 300: c <<= 3; 301: c += *sp++ - '0'; 302: } 303: break; 304: case 'b': 305: c = '\b'; 306: sp++; 307: break; 308: case 'n': 309: c = '\n'; 310: sp++; 311: break; 312: case 'r': 313: c = '\r'; 314: sp++; 315: break; 316: case 'f': 317: c = '\f'; 318: sp++; 319: break; 320: case 't': 321: c = '\t'; 322: sp++; 323: break; 324: } 325: } 326: if (*sp == '-' && *(++sp)) 327: i = *sp++; 328: else 329: i = c; 330: while (c <= i) { 331: cp[c / BITSPERBYTE] |= 1 << (c % BITSPERBYTE); 332: if (fold && isalpha(c)) 333: cp[(c ^ 32) / BITSPERBYTE] |= 334: 1 << ((c ^ 32) % BITSPERBYTE); 335: /* set the other bit too */ 336: c++; 337: } 338: } while ((c = *sp++) != ']'); 339: if (cp[-1] == NCCL) 340: cp[0] |= 1; 341: cp += BMAPSIZ; 342: continue; 343: } 344: 345: case '^': 346: if (cp != compex->compbuf && cp[-1] != FINIS) 347: goto normal_char; 348: *cp++ = BEG; 349: continue; 350: 351: case '$': 352: if (isdigit(*sp)) { 353: *cp++ = REF; 354: *cp++ = *sp - '0'; 355: break; 356: } 357: if (*sp && *sp != '|') 358: goto normal_char; 359: *cp++ = END; 360: continue; 361: 362: case '*': case '?': case '+': 363: if (lastcp == 0 || 364: (*lastcp & (MINZERO|MAXINF)) || 365: *lastcp == LPAR || 366: *lastcp == RPAR || 367: *lastcp == BEG || 368: *lastcp == END || 369: *lastcp == WBOUND || 370: *lastcp == NWBOUND ) 371: goto normal_char; 372: if (c != '+') 373: *lastcp |= MINZERO; 374: if (c != '?') 375: *lastcp |= MAXINF; 376: continue; 377: 378: case '(': 379: if (compex->numsubs >= MAXSUB) { 380: #ifdef VERBOSE 381: retmes = "Too many parens"; 382: #endif 383: goto badcomp; 384: } 385: *parenp++ = ++compex->numsubs; 386: *cp++ = LPAR; 387: *cp++ = compex->numsubs; 388: break; 389: case ')': 390: if (parenp <= paren) { 391: #ifdef VERBOSE 392: retmes = "Unmatched right paren"; 393: #endif 394: goto badcomp; 395: } 396: *cp++ = RPAR; 397: *cp++ = *--parenp; 398: break; 399: case '|': 400: if (parenp>paren) { 401: #ifdef VERBOSE 402: retmes = "No | in subpattern"; /* Sigh! */ 403: #endif 404: goto badcomp; 405: } 406: *cp++ = FINIS; 407: if (alt - compex->alternatives >= MAXALT) { 408: #ifdef VERBOSE 409: retmes = "Too many alternatives"; 410: #endif 411: goto badcomp; 412: } 413: *alt++ = cp; 414: break; 415: case '\\': /* backslashed thingie */ 416: switch (c = *sp++) { 417: case '0': case '1': case '2': case '3': case '4': 418: case '5': case '6': case '7': case '8': case '9': 419: *cp++ = REF; 420: *cp++ = c - '0'; 421: break; 422: case 'w': 423: *cp++ = WORD; 424: break; 425: case 'W': 426: *cp++ = NWORD; 427: break; 428: case 'b': 429: *cp++ = WBOUND; 430: break; 431: case 'B': 432: *cp++ = NWBOUND; 433: break; 434: default: 435: *cp++ = CHAR; 436: if (c == '\0') 437: goto badcomp; 438: switch (c) { 439: case 'n': 440: c = '\n'; 441: break; 442: case 'r': 443: c = '\r'; 444: break; 445: case 'f': 446: c = '\f'; 447: break; 448: case 't': 449: c = '\t'; 450: break; 451: } 452: *cp++ = c; 453: break; 454: } 455: break; 456: } 457: } 458: badcomp: 459: compex->compbuf[0] = 0; 460: compex->numsubs = 0; 461: return retmes; 462: } 463: 464: void 465: grow_comp(compex) 466: register COMPEX *compex; 467: { 468: compex->complen += 80; 469: compex->compbuf = saferealloc(compex->compbuf, (MEM_SIZE)compex->complen + 4); 470: } 471: 472: char * 473: execute(compex, addr, beginning, minend) 474: register COMPEX *compex; 475: char *addr; 476: bool beginning; 477: int minend; 478: { 479: register char *p1 = addr; 480: register char *trt = trans; 481: register int c; 482: register int scr; 483: register int c2; 484: 485: if (addr == Nullch) 486: return Nullch; 487: if (compex->numsubs) { /* any submatches? */ 488: for (c = 0; c <= compex->numsubs; c++) 489: compex->subbeg[c] = compex->subend[c] = Nullch; 490: } 491: case_fold(compex->do_folding); /* make sure table is correct */ 492: if (beginning) 493: FirstCharacter = p1; /* for ^ tests */ 494: else { 495: if (multiline || compex->alternatives[1] || compex->compbuf[0] != BEG) 496: FirstCharacter = Nullch; 497: else 498: return Nullch; /* can't match */ 499: } 500: matchend = Nullch; 501: matchtill = addr + minend; 502: err = 0; 503: if (compex->compbuf[0] == CHAR && !compex->alternatives[1]) { 504: if (compex->do_folding) { 505: c = compex->compbuf[1]; /* fast check for first character */ 506: do { 507: if (trt[*p1] == c && try(compex, p1, compex->compbuf)) 508: goto got_it; 509: } while (*p1++ && !err); 510: } 511: else { 512: c = compex->compbuf[1]; /* faster check for first character */ 513: if (compex->compbuf[2] == CHAR) 514: c2 = compex->compbuf[3]; 515: else 516: c2 = 0; 517: do { 518: false_alarm: 519: while (scr = *p1++, scr && scr != c) ; 520: if (!scr) 521: break; 522: if (c2 && *p1 != c2) /* and maybe even second character */ 523: goto false_alarm; 524: if (try(compex, p1, compex->compbuf+2)) { 525: p1--; 526: goto got_it; 527: } 528: } while (!err); 529: } 530: return Nullch; 531: } 532: else { /* normal algorithm */ 533: do { 534: register char **alt = compex->alternatives; 535: while (*alt) { 536: if (try(compex, p1, *alt++)) 537: goto got_it; 538: } 539: } while (*p1++ && err < FATAL); 540: return Nullch; 541: } 542: 543: got_it: 544: if (compex->numsubs) { /* any parens? */ 545: trt = savestr(addr); /* in case addr is not static */ 546: if (compex->subbase) 547: safefree(compex->subbase); /* (may be freeing addr!) */ 548: compex->subbase = trt; 549: scr = compex->subbase - addr; 550: p1 += scr; 551: matchend += scr; 552: for (c = 0; c <= compex->numsubs; c++) { 553: if (compex->subend[c]) { 554: compex->subbeg[c] += scr; 555: compex->subend[c] += scr; 556: } 557: } 558: } 559: compex->subend[0] = matchend; 560: compex->subbeg[0] = p1; 561: return p1; 562: } 563: 564: bool 565: try(compex, sp, cp) 566: COMPEX *compex; 567: register char *cp; 568: register char *sp; 569: { 570: register char *basesp; 571: register char *trt = trans; 572: register int i; 573: register int backlen; 574: register int code; 575: 576: while (*sp || (*cp & MAXINF) || *cp == BEG || *cp == RPAR || 577: *cp == WBOUND || *cp == NWBOUND) { 578: switch ((code = *cp++) & CODEMASK) { 579: 580: case CHAR: 581: basesp = sp; 582: i = *cp++; 583: if (code & MAXINF) 584: while (*sp && trt[*sp] == i) sp++; 585: else 586: if (*sp && trt[*sp] == i) sp++; 587: backlen = 1; 588: goto backoff; 589: 590: backoff: 591: while (sp > basesp) { 592: if (try(compex, sp, cp)) 593: goto right; 594: sp -= backlen; 595: } 596: if (code & MINZERO) 597: continue; 598: goto wrong; 599: 600: case ANY: 601: basesp = sp; 602: if (code & MAXINF) 603: while (*sp && *sp != '\n') sp++; 604: else 605: if (*sp && *sp != '\n') sp++; 606: backlen = 1; 607: goto backoff; 608: 609: case CCL: 610: basesp = sp; 611: if (code & MAXINF) 612: while (*sp && cclass(cp, *sp, 1)) sp++; 613: else 614: if (*sp && cclass(cp, *sp, 1)) sp++; 615: cp += BMAPSIZ; 616: backlen = 1; 617: goto backoff; 618: 619: case NCCL: 620: basesp = sp; 621: if (code & MAXINF) 622: while (*sp && cclass(cp, *sp, 0)) sp++; 623: else 624: if (*sp && cclass(cp, *sp, 0)) sp++; 625: cp += BMAPSIZ; 626: backlen = 1; 627: goto backoff; 628: 629: case END: 630: if (!*sp || *sp == '\n') { 631: matchtill--; 632: continue; 633: } 634: goto wrong; 635: 636: case BEG: 637: if (sp == FirstCharacter || ( 638: *sp && sp[-1] == '\n') ) { 639: matchtill--; 640: continue; 641: } 642: if (!multiline) /* no point in advancing more */ 643: err = BEGFAIL; 644: goto wrong; 645: 646: case WORD: 647: basesp = sp; 648: if (code & MAXINF) 649: while (*sp && isalnum(*sp)) sp++; 650: else 651: if (*sp && isalnum(*sp)) sp++; 652: backlen = 1; 653: goto backoff; 654: 655: case NWORD: 656: basesp = sp; 657: if (code & MAXINF) 658: while (*sp && !isalnum(*sp)) sp++; 659: else 660: if (*sp && !isalnum(*sp)) sp++; 661: backlen = 1; 662: goto backoff; 663: 664: case WBOUND: 665: if ((sp == FirstCharacter || !isalnum(sp[-1])) != 666: (!*sp || !isalnum(*sp)) ) 667: continue; 668: goto wrong; 669: 670: case NWBOUND: 671: if ((sp == FirstCharacter || !isalnum(sp[-1])) == 672: (!*sp || !isalnum(*sp))) 673: continue; 674: goto wrong; 675: 676: case FINIS: 677: goto right; 678: 679: case LPAR: 680: compex->subbeg[*cp++] = sp; 681: continue; 682: 683: case RPAR: 684: i = *cp++; 685: compex->subend[i] = sp; 686: compex->lastparen = i; 687: continue; 688: 689: case REF: 690: if (compex->subend[i = *cp++] == 0) { 691: fputs("Bad subpattern reference\n",stdout) FLUSH; 692: err = FATAL; 693: goto wrong; 694: } 695: basesp = sp; 696: backlen = compex->subend[i] - compex->subbeg[i]; 697: if (code & MAXINF) 698: while (*sp && subpat(compex, i, sp)) sp += backlen; 699: else 700: if (*sp && subpat(compex, i, sp)) sp += backlen; 701: goto backoff; 702: 703: default: 704: fputs("Botched pattern compilation\n",stdout) FLUSH; 705: err = FATAL; 706: return -1; 707: } 708: } 709: if (*cp == FINIS || *cp == END) { 710: right: 711: if (matchend == Nullch || sp > matchend) 712: matchend = sp; 713: return matchend >= matchtill; 714: } 715: wrong: 716: matchend = Nullch; 717: return FALSE; 718: } 719: 720: bool 721: subpat(compex, i, sp) 722: register COMPEX *compex; 723: register int i; 724: register char *sp; 725: { 726: register char *bp; 727: 728: bp = compex->subbeg[i]; 729: while (*sp && *bp == *sp) { 730: bp++; 731: sp++; 732: if (bp >= compex->subend[i]) 733: return TRUE; 734: } 735: return FALSE; 736: } 737: 738: bool 739: cclass(set, c, af) 740: register char *set; 741: register int c; 742: { 743: c &= 0177; 744: #if BITSPERBYTE == 8 745: if (set[c >> 3] & 1 << (c & 7)) 746: #else 747: if (set[c / BITSPERBYTE] & 1 << (c % BITSPERBYTE)) 748: #endif 749: return af; 750: return !af; 751: }