1: /* $Header: search.c,v 1.0.1.2 88/01/28 10:30:46 root Exp $ 2: * 3: * $Log: search.c,v $ 4: * Revision 1.0.1.2 88/01/28 10:30:46 root 5: * patch8: uncommented free_compex for use with eval operator. 6: * 7: * Revision 1.0.1.1 88/01/24 03:55:05 root 8: * patch 2: made depend on perl.h. 9: * 10: * Revision 1.0 87/12/18 13:05:59 root 11: * Initial revision 12: * 13: */ 14: 15: /* string search routines */ 16: 17: #include "EXTERN.h" 18: #include "handy.h" 19: #include "util.h" 20: #include "INTERN.h" 21: #include "search.h" 22: #include "EXTERN.h" 23: #include "perl.h" 24: 25: #define VERBOSE 26: #define FLUSH 27: #define MEM_SIZE int 28: 29: #ifndef BITSPERBYTE 30: #define BITSPERBYTE 8 31: #endif 32: 33: #define BMAPSIZ (127 / BITSPERBYTE + 1) 34: 35: #define CHAR 0 /* a normal character */ 36: #define ANY 1 /* . matches anything except newline */ 37: #define CCL 2 /* [..] character class */ 38: #define NCCL 3 /* [^..]negated character class */ 39: #define BEG 4 /* ^ beginning of a line */ 40: #define END 5 /* $ end of a line */ 41: #define LPAR 6 /* ( begin sub-match */ 42: #define RPAR 7 /* ) end sub-match */ 43: #define REF 8 /* \N backreference to the Nth submatch */ 44: #define WORD 9 /* \w matches alphanumeric character */ 45: #define NWORD 10 /* \W matches non-alphanumeric character */ 46: #define WBOUND 11 /* \b matches word boundary */ 47: #define NWBOUND 12 /* \B matches non-boundary */ 48: #define FINIS 13 /* the end of the pattern */ 49: 50: #define CODEMASK 15 51: 52: /* Quantifiers: */ 53: 54: #define MINZERO 16 /* minimum is 0, not 1 */ 55: #define MAXINF 32 /* maximum is infinity, not 1 */ 56: 57: #define ASCSIZ 0200 58: typedef char TRANSTABLE[ASCSIZ]; 59: 60: static TRANSTABLE trans = { 61: 0000,0001,0002,0003,0004,0005,0006,0007, 62: 0010,0011,0012,0013,0014,0015,0016,0017, 63: 0020,0021,0022,0023,0024,0025,0026,0027, 64: 0030,0031,0032,0033,0034,0035,0036,0037, 65: 0040,0041,0042,0043,0044,0045,0046,0047, 66: 0050,0051,0052,0053,0054,0055,0056,0057, 67: 0060,0061,0062,0063,0064,0065,0066,0067, 68: 0070,0071,0072,0073,0074,0075,0076,0077, 69: 0100,0101,0102,0103,0104,0105,0106,0107, 70: 0110,0111,0112,0113,0114,0115,0116,0117, 71: 0120,0121,0122,0123,0124,0125,0126,0127, 72: 0130,0131,0132,0133,0134,0135,0136,0137, 73: 0140,0141,0142,0143,0144,0145,0146,0147, 74: 0150,0151,0152,0153,0154,0155,0156,0157, 75: 0160,0161,0162,0163,0164,0165,0166,0167, 76: 0170,0171,0172,0173,0174,0175,0176,0177, 77: }; 78: static bool folding = FALSE; 79: 80: static int err; 81: #define NOERR 0 82: #define BEGFAIL 1 83: #define FATAL 2 84: 85: static char *FirstCharacter; 86: static char *matchend; 87: static char *matchtill; 88: 89: void 90: search_init() 91: { 92: #ifdef UNDEF 93: register int i; 94: 95: for (i = 0; i < ASCSIZ; i++) 96: trans[i] = i; 97: #else 98: ; 99: #endif 100: } 101: 102: void 103: init_compex(compex) 104: register COMPEX *compex; 105: { 106: /* the following must start off zeroed */ 107: 108: compex->precomp = Nullch; 109: compex->complen = 0; 110: compex->subbase = Nullch; 111: } 112: 113: void 114: free_compex(compex) 115: register COMPEX *compex; 116: { 117: if (compex->complen) { 118: safefree(compex->compbuf); 119: compex->complen = 0; 120: } 121: if (compex->subbase) { 122: safefree(compex->subbase); 123: compex->subbase = Nullch; 124: } 125: } 126: 127: static char *gbr_str = Nullch; 128: static int gbr_siz = 0; 129: 130: char * 131: getparen(compex,n) 132: register COMPEX *compex; 133: int n; 134: { 135: int length = compex->subend[n] - compex->subbeg[n]; 136: 137: if (!n && 138: (!compex->numsubs || n > compex->numsubs || !compex->subend[n] || length<0)) 139: return ""; 140: growstr(&gbr_str, &gbr_siz, length+1); 141: safecpy(gbr_str, compex->subbeg[n], length+1); 142: return gbr_str; 143: } 144: 145: void 146: case_fold(which) 147: int which; 148: { 149: register int i; 150: 151: if (which != folding) { 152: if (which) { 153: for (i = 'A'; i <= 'Z'; i++) 154: trans[i] = tolower(i); 155: } 156: else { 157: for (i = 'A'; i <= 'Z'; i++) 158: trans[i] = i; 159: } 160: folding = which; 161: } 162: } 163: 164: /* Compile the regular expression into internal form */ 165: 166: char * 167: compile(compex, sp, regex, fold) 168: register COMPEX *compex; 169: register char *sp; 170: int regex; 171: int fold; 172: { 173: register int c; 174: register char *cp; 175: char *lastcp; 176: char paren[MAXSUB], 177: *parenp; 178: char **alt = compex->alternatives; 179: char *retmes = "Badly formed search string"; 180: 181: case_fold(compex->do_folding = fold); 182: if (compex->precomp) 183: safefree(compex->precomp); 184: compex->precomp = savestr(sp); 185: if (!compex->complen) { 186: compex->compbuf = safemalloc(84); 187: compex->complen = 80; 188: } 189: cp = compex->compbuf; /* point at compiled buffer */ 190: *alt++ = cp; /* first alternative starts here */ 191: parenp = paren; /* first paren goes here */ 192: if (*sp == 0) { /* nothing to compile? */ 193: #ifdef NOTDEF 194: if (*cp == 0) /* nothing there yet? */ 195: return "Null search string"; 196: #endif 197: if (*cp) 198: return Nullch; /* just keep old expression */ 199: } 200: compex->numsubs = 0; /* no parens yet */ 201: lastcp = 0; 202: for (;;) { 203: if (cp - compex->compbuf >= compex->complen) { 204: char *ocompbuf = compex->compbuf; 205: 206: grow_comp(compex); 207: if (ocompbuf != compex->compbuf) { /* adjust pointers? */ 208: char **tmpalt; 209: 210: cp = compex->compbuf + (cp - ocompbuf); 211: if (lastcp) 212: lastcp = compex->compbuf + (lastcp - ocompbuf); 213: for (tmpalt = compex->alternatives; tmpalt < alt; tmpalt++) 214: if (*tmpalt) 215: *tmpalt = compex->compbuf + (*tmpalt - ocompbuf); 216: } 217: } 218: c = *sp++; /* get next char of pattern */ 219: if (c == 0) { /* end of pattern? */ 220: if (parenp != paren) { /* balanced parentheses? */ 221: #ifdef VERBOSE 222: retmes = "Missing right parenthesis"; 223: #endif 224: goto badcomp; 225: } 226: *cp++ = FINIS; /* append a stopper */ 227: *alt++ = 0; /* terminate alternative list */ 228: /* 229: compex->complen = cp - compex->compbuf + 1; 230: compex->compbuf = saferealloc(compex->compbuf,compex->complen+4); */ 231: return Nullch; /* return success */ 232: } 233: if (c != '*' && c != '?' && c != '+') 234: lastcp = cp; 235: if (!regex) { /* just a normal search string? */ 236: *cp++ = CHAR; /* everything is a normal char */ 237: *cp++ = trans[c]; 238: } 239: else /* it is a regular expression */ 240: switch (c) { 241: 242: default: 243: normal_char: 244: *cp++ = CHAR; 245: *cp++ = trans[c]; 246: continue; 247: 248: case '.': 249: *cp++ = ANY; 250: continue; 251: 252: case '[': { /* character class */ 253: register int i; 254: 255: if (cp - compex->compbuf >= compex->complen - BMAPSIZ) { 256: char *ocompbuf = compex->compbuf; 257: 258: grow_comp(compex); /* reserve bitmap */ 259: if (ocompbuf != compex->compbuf) {/* adjust pointers? */ 260: char **tmpalt; 261: 262: cp = compex->compbuf + (cp - ocompbuf); 263: if (lastcp) 264: lastcp = compex->compbuf + (lastcp - ocompbuf); 265: for (tmpalt = compex->alternatives; tmpalt < alt; 266: tmpalt++) 267: if (*tmpalt) 268: *tmpalt = 269: compex->compbuf + (*tmpalt - ocompbuf); 270: } 271: } 272: for (i = BMAPSIZ; i; --i) 273: cp[i] = 0; 274: 275: if ((c = *sp++) == '^') { 276: c = *sp++; 277: *cp++ = NCCL; /* negated */ 278: } 279: else 280: *cp++ = CCL; /* normal */ 281: 282: i = 0; /* remember oldchar */ 283: do { 284: if (c == '\0') { 285: #ifdef VERBOSE 286: retmes = "Missing ]"; 287: #endif 288: goto badcomp; 289: } 290: if (c == '\\' && *sp) { 291: switch (*sp) { 292: default: 293: c = *sp++; 294: break; 295: case '0': case '1': case '2': case '3': 296: case '4': case '5': case '6': case '7': 297: c = *sp++ - '0'; 298: if (index("01234567",*sp)) { 299: c <<= 3; 300: c += *sp++ - '0'; 301: } 302: if (index("01234567",*sp)) { 303: c <<= 3; 304: c += *sp++ - '0'; 305: } 306: break; 307: case 'b': 308: c = '\b'; 309: sp++; 310: break; 311: case 'n': 312: c = '\n'; 313: sp++; 314: break; 315: case 'r': 316: c = '\r'; 317: sp++; 318: break; 319: case 'f': 320: c = '\f'; 321: sp++; 322: break; 323: case 't': 324: c = '\t'; 325: sp++; 326: break; 327: } 328: } 329: if (*sp == '-' && *(++sp)) 330: i = *sp++; 331: else 332: i = c; 333: while (c <= i) { 334: cp[c / BITSPERBYTE] |= 1 << (c % BITSPERBYTE); 335: if (fold && isalpha(c)) 336: cp[(c ^ 32) / BITSPERBYTE] |= 337: 1 << ((c ^ 32) % BITSPERBYTE); 338: /* set the other bit too */ 339: c++; 340: } 341: } while ((c = *sp++) != ']'); 342: if (cp[-1] == NCCL) 343: cp[0] |= 1; 344: cp += BMAPSIZ; 345: continue; 346: } 347: 348: case '^': 349: if (cp != compex->compbuf && cp[-1] != FINIS) 350: goto normal_char; 351: *cp++ = BEG; 352: continue; 353: 354: case '$': 355: if (isdigit(*sp)) { 356: *cp++ = REF; 357: *cp++ = *sp - '0'; 358: break; 359: } 360: if (*sp && *sp != '|') 361: goto normal_char; 362: *cp++ = END; 363: continue; 364: 365: case '*': case '?': case '+': 366: if (lastcp == 0 || 367: (*lastcp & (MINZERO|MAXINF)) || 368: *lastcp == LPAR || 369: *lastcp == RPAR || 370: *lastcp == BEG || 371: *lastcp == END || 372: *lastcp == WBOUND || 373: *lastcp == NWBOUND ) 374: goto normal_char; 375: if (c != '+') 376: *lastcp |= MINZERO; 377: if (c != '?') 378: *lastcp |= MAXINF; 379: continue; 380: 381: case '(': 382: if (compex->numsubs >= MAXSUB) { 383: #ifdef VERBOSE 384: retmes = "Too many parens"; 385: #endif 386: goto badcomp; 387: } 388: *parenp++ = ++compex->numsubs; 389: *cp++ = LPAR; 390: *cp++ = compex->numsubs; 391: break; 392: case ')': 393: if (parenp <= paren) { 394: #ifdef VERBOSE 395: retmes = "Unmatched right paren"; 396: #endif 397: goto badcomp; 398: } 399: *cp++ = RPAR; 400: *cp++ = *--parenp; 401: break; 402: case '|': 403: if (parenp>paren) { 404: #ifdef VERBOSE 405: retmes = "No | in subpattern"; /* Sigh! */ 406: #endif 407: goto badcomp; 408: } 409: *cp++ = FINIS; 410: if (alt - compex->alternatives >= MAXALT) { 411: #ifdef VERBOSE 412: retmes = "Too many alternatives"; 413: #endif 414: goto badcomp; 415: } 416: *alt++ = cp; 417: break; 418: case '\\': /* backslashed thingie */ 419: switch (c = *sp++) { 420: case '0': case '1': case '2': case '3': case '4': 421: case '5': case '6': case '7': case '8': case '9': 422: *cp++ = REF; 423: *cp++ = c - '0'; 424: break; 425: case 'w': 426: *cp++ = WORD; 427: break; 428: case 'W': 429: *cp++ = NWORD; 430: break; 431: case 'b': 432: *cp++ = WBOUND; 433: break; 434: case 'B': 435: *cp++ = NWBOUND; 436: break; 437: default: 438: *cp++ = CHAR; 439: if (c == '\0') 440: goto badcomp; 441: switch (c) { 442: case 'n': 443: c = '\n'; 444: break; 445: case 'r': 446: c = '\r'; 447: break; 448: case 'f': 449: c = '\f'; 450: break; 451: case 't': 452: c = '\t'; 453: break; 454: } 455: *cp++ = c; 456: break; 457: } 458: break; 459: } 460: } 461: badcomp: 462: compex->compbuf[0] = 0; 463: compex->numsubs = 0; 464: return retmes; 465: } 466: 467: void 468: grow_comp(compex) 469: register COMPEX *compex; 470: { 471: compex->complen += 80; 472: compex->compbuf = saferealloc(compex->compbuf, (MEM_SIZE)compex->complen + 4); 473: } 474: 475: char * 476: execute(compex, addr, beginning, minend) 477: register COMPEX *compex; 478: char *addr; 479: bool beginning; 480: int minend; 481: { 482: register char *p1 = addr; 483: register char *trt = trans; 484: register int c; 485: register int scr; 486: register int c2; 487: 488: if (addr == Nullch) 489: return Nullch; 490: if (compex->numsubs) { /* any submatches? */ 491: for (c = 0; c <= compex->numsubs; c++) 492: compex->subbeg[c] = compex->subend[c] = Nullch; 493: } 494: case_fold(compex->do_folding); /* make sure table is correct */ 495: if (beginning) 496: FirstCharacter = p1; /* for ^ tests */ 497: else { 498: if (multiline || compex->alternatives[1] || compex->compbuf[0] != BEG) 499: FirstCharacter = Nullch; 500: else 501: return Nullch; /* can't match */ 502: } 503: matchend = Nullch; 504: matchtill = addr + minend; 505: err = 0; 506: if (compex->compbuf[0] == CHAR && !compex->alternatives[1]) { 507: if (compex->do_folding) { 508: c = compex->compbuf[1]; /* fast check for first character */ 509: do { 510: if (trt[*p1] == c && try(compex, p1, compex->compbuf)) 511: goto got_it; 512: } while (*p1++ && !err); 513: } 514: else { 515: c = compex->compbuf[1]; /* faster check for first character */ 516: if (compex->compbuf[2] == CHAR) 517: c2 = compex->compbuf[3]; 518: else 519: c2 = 0; 520: do { 521: false_alarm: 522: while (scr = *p1++, scr && scr != c) ; 523: if (!scr) 524: break; 525: if (c2 && *p1 != c2) /* and maybe even second character */ 526: goto false_alarm; 527: if (try(compex, p1, compex->compbuf+2)) { 528: p1--; 529: goto got_it; 530: } 531: } while (!err); 532: } 533: return Nullch; 534: } 535: else { /* normal algorithm */ 536: do { 537: register char **alt = compex->alternatives; 538: while (*alt) { 539: if (try(compex, p1, *alt++)) 540: goto got_it; 541: } 542: } while (*p1++ && err < FATAL); 543: return Nullch; 544: } 545: 546: got_it: 547: if (compex->numsubs) { /* any parens? */ 548: trt = savestr(addr); /* in case addr is not static */ 549: if (compex->subbase) 550: safefree(compex->subbase); /* (may be freeing addr!) */ 551: compex->subbase = trt; 552: scr = compex->subbase - addr; 553: p1 += scr; 554: matchend += scr; 555: for (c = 0; c <= compex->numsubs; c++) { 556: if (compex->subend[c]) { 557: compex->subbeg[c] += scr; 558: compex->subend[c] += scr; 559: } 560: } 561: } 562: compex->subend[0] = matchend; 563: compex->subbeg[0] = p1; 564: return p1; 565: } 566: 567: bool 568: try(compex, sp, cp) 569: COMPEX *compex; 570: register char *cp; 571: register char *sp; 572: { 573: register char *basesp; 574: register char *trt = trans; 575: register int i; 576: register int backlen; 577: register int code; 578: 579: while (*sp || (*cp & MAXINF) || *cp == BEG || *cp == RPAR || 580: *cp == WBOUND || *cp == NWBOUND) { 581: switch ((code = *cp++) & CODEMASK) { 582: 583: case CHAR: 584: basesp = sp; 585: i = *cp++; 586: if (code & MAXINF) 587: while (*sp && trt[*sp] == i) sp++; 588: else 589: if (*sp && trt[*sp] == i) sp++; 590: backlen = 1; 591: goto backoff; 592: 593: backoff: 594: while (sp > basesp) { 595: if (try(compex, sp, cp)) 596: goto right; 597: sp -= backlen; 598: } 599: if (code & MINZERO) 600: continue; 601: goto wrong; 602: 603: case ANY: 604: basesp = sp; 605: if (code & MAXINF) 606: while (*sp && *sp != '\n') sp++; 607: else 608: if (*sp && *sp != '\n') sp++; 609: backlen = 1; 610: goto backoff; 611: 612: case CCL: 613: basesp = sp; 614: if (code & MAXINF) 615: while (*sp && cclass(cp, *sp, 1)) sp++; 616: else 617: if (*sp && cclass(cp, *sp, 1)) sp++; 618: cp += BMAPSIZ; 619: backlen = 1; 620: goto backoff; 621: 622: case NCCL: 623: basesp = sp; 624: if (code & MAXINF) 625: while (*sp && cclass(cp, *sp, 0)) sp++; 626: else 627: if (*sp && cclass(cp, *sp, 0)) sp++; 628: cp += BMAPSIZ; 629: backlen = 1; 630: goto backoff; 631: 632: case END: 633: if (!*sp || *sp == '\n') { 634: matchtill--; 635: continue; 636: } 637: goto wrong; 638: 639: case BEG: 640: if (sp == FirstCharacter || ( 641: *sp && sp[-1] == '\n') ) { 642: matchtill--; 643: continue; 644: } 645: if (!multiline) /* no point in advancing more */ 646: err = BEGFAIL; 647: goto wrong; 648: 649: case WORD: 650: basesp = sp; 651: if (code & MAXINF) 652: while (*sp && isalnum(*sp)) sp++; 653: else 654: if (*sp && isalnum(*sp)) sp++; 655: backlen = 1; 656: goto backoff; 657: 658: case NWORD: 659: basesp = sp; 660: if (code & MAXINF) 661: while (*sp && !isalnum(*sp)) sp++; 662: else 663: if (*sp && !isalnum(*sp)) sp++; 664: backlen = 1; 665: goto backoff; 666: 667: case WBOUND: 668: if ((sp == FirstCharacter || !isalnum(sp[-1])) != 669: (!*sp || !isalnum(*sp)) ) 670: continue; 671: goto wrong; 672: 673: case NWBOUND: 674: if ((sp == FirstCharacter || !isalnum(sp[-1])) == 675: (!*sp || !isalnum(*sp))) 676: continue; 677: goto wrong; 678: 679: case FINIS: 680: goto right; 681: 682: case LPAR: 683: compex->subbeg[*cp++] = sp; 684: continue; 685: 686: case RPAR: 687: i = *cp++; 688: compex->subend[i] = sp; 689: compex->lastparen = i; 690: continue; 691: 692: case REF: 693: if (compex->subend[i = *cp++] == 0) { 694: fputs("Bad subpattern reference\n",stdout) FLUSH; 695: err = FATAL; 696: goto wrong; 697: } 698: basesp = sp; 699: backlen = compex->subend[i] - compex->subbeg[i]; 700: if (code & MAXINF) 701: while (*sp && subpat(compex, i, sp)) sp += backlen; 702: else 703: if (*sp && subpat(compex, i, sp)) sp += backlen; 704: goto backoff; 705: 706: default: 707: fputs("Botched pattern compilation\n",stdout) FLUSH; 708: err = FATAL; 709: return -1; 710: } 711: } 712: if (*cp == FINIS || *cp == END) { 713: right: 714: if (matchend == Nullch || sp > matchend) 715: matchend = sp; 716: return matchend >= matchtill; 717: } 718: wrong: 719: matchend = Nullch; 720: return FALSE; 721: } 722: 723: bool 724: subpat(compex, i, sp) 725: register COMPEX *compex; 726: register int i; 727: register char *sp; 728: { 729: register char *bp; 730: 731: bp = compex->subbeg[i]; 732: while (*sp && *bp == *sp) { 733: bp++; 734: sp++; 735: if (bp >= compex->subend[i]) 736: return TRUE; 737: } 738: return FALSE; 739: } 740: 741: bool 742: cclass(set, c, af) 743: register char *set; 744: register int c; 745: { 746: c &= 0177; 747: #if BITSPERBYTE == 8 748: if (set[c >> 3] & 1 << (c & 7)) 749: #else 750: if (set[c / BITSPERBYTE] & 1 << (c % BITSPERBYTE)) 751: #endif 752: return af; 753: return !af; 754: }