1: /* $Header: search.c,v 4.3 85/05/01 11:50:16 lwall Exp $ 2: * 3: * $Log: search.c,v $ 4: * Revision 4.3 85/05/01 11:50:16 lwall 5: * Baseline for release with 4.3bsd. 6: * 7: */ 8: 9: /* string search routines */ 10: 11: /* Copyright (c) 1981,1980 James Gosling */ 12: 13: /* Modified Aug. 12, 1981 by Tom London to include regular expressions 14: as in ed. RE stuff hacked over by jag to correct a few major problems, 15: mainly dealing with searching within the buffer rather than copying 16: each line to a separate array. Newlines can now appear in RE's */ 17: 18: /* Ripped to shreds and glued back together to make a search package, 19: * July 6, 1984, by Larry Wall. (If it doesn't work, it's probably my fault.) 20: * Changes include: 21: * Buffer, window, and mlisp stuff gone. 22: * Translation tables reduced to 1 table. 23: * Expression buffer is now dynamically allocated. 24: * Character classes now implemented with a bitmap. 25: */ 26: 27: #include "EXTERN.h" 28: #include "common.h" 29: #include "util.h" 30: #include "INTERN.h" 31: #include "search.h" 32: 33: #ifndef BITSPERBYTE 34: #define BITSPERBYTE 8 35: #endif 36: 37: #define BMAPSIZ (127 / BITSPERBYTE + 1) 38: 39: /* meta characters in the "compiled" form of a regular expression */ 40: #define CBRA 2 /* \( -- begin bracket */ 41: #define CCHR 4 /* a vanilla character */ 42: #define CDOT 6 /* . -- match anything except a newline */ 43: #define CCL 8 /* [...] -- character class */ 44: #define NCCL 10 /* [^...] -- negated character class */ 45: #define CDOL 12 /* $ -- matches the end of a line */ 46: #define CEND 14 /* The end of the pattern */ 47: #define CKET 16 /* \) -- close bracket */ 48: #define CBACK 18 /* \N -- backreference to the Nth bracketed 49: string */ 50: #define CIRC 20 /* ^ matches the beginning of a line */ 51: 52: #define WORD 32 /* matches word character \w */ 53: #define NWORD 34 /* matches non-word characer \W */ 54: #define WBOUND 36 /* matches word boundary \b */ 55: #define NWBOUND 38 /* matches non-(word boundary) \B */ 56: 57: #define STAR 01 /* * -- Kleene star, repeats the previous 58: REas many times as possible; the value 59: ORs with the other operator types */ 60: 61: #define ASCSIZ 0200 62: typedef char TRANSTABLE[ASCSIZ]; 63: 64: static TRANSTABLE trans = { 65: 0000,0001,0002,0003,0004,0005,0006,0007, 66: 0010,0011,0012,0013,0014,0015,0016,0017, 67: 0020,0021,0022,0023,0024,0025,0026,0027, 68: 0030,0031,0032,0033,0034,0035,0036,0037, 69: 0040,0041,0042,0043,0044,0045,0046,0047, 70: 0050,0051,0052,0053,0054,0055,0056,0057, 71: 0060,0061,0062,0063,0064,0065,0066,0067, 72: 0070,0071,0072,0073,0074,0075,0076,0077, 73: 0100,0101,0102,0103,0104,0105,0106,0107, 74: 0110,0111,0112,0113,0114,0115,0116,0117, 75: 0120,0121,0122,0123,0124,0125,0126,0127, 76: 0130,0131,0132,0133,0134,0135,0136,0137, 77: 0140,0141,0142,0143,0144,0145,0146,0147, 78: 0150,0151,0152,0153,0154,0155,0156,0157, 79: 0160,0161,0162,0163,0164,0165,0166,0167, 80: 0170,0171,0172,0173,0174,0175,0176,0177, 81: }; 82: static bool folding = FALSE; 83: 84: static int err; 85: static char *FirstCharacter; 86: 87: void 88: search_init() 89: { 90: #ifdef UNDEF 91: register int i; 92: 93: for (i = 0; i < ASCSIZ; i++) 94: trans[i] = i; 95: #else 96: ; 97: #endif 98: } 99: 100: void 101: init_compex(compex) 102: register COMPEX *compex; 103: { 104: /* the following must start off zeroed */ 105: 106: compex->eblen = 0; 107: compex->brastr = Nullch; 108: } 109: 110: void 111: free_compex(compex) 112: register COMPEX *compex; 113: { 114: if (compex->eblen) { 115: free(compex->expbuf); 116: compex->eblen = 0; 117: } 118: if (compex->brastr) { 119: free(compex->brastr); 120: compex->brastr = Nullch; 121: } 122: } 123: 124: static char *gbr_str = Nullch; 125: static int gbr_siz = 0; 126: 127: char * 128: getbracket(compex,n) 129: register COMPEX *compex; 130: int n; 131: { 132: int length = compex->braelist[n] - compex->braslist[n]; 133: 134: if (!compex->nbra || n > compex->nbra || !compex->braelist[n] || length<0) 135: return nullstr; 136: growstr(&gbr_str, &gbr_siz, length+1); 137: safecpy(gbr_str, compex->braslist[n], length+1); 138: return gbr_str; 139: } 140: 141: void 142: case_fold(which) 143: int which; 144: { 145: register int i; 146: 147: if (which != folding) { 148: if (which) { 149: for (i = 'A'; i <= 'Z'; i++) 150: trans[i] = tolower(i); 151: } 152: else { 153: for (i = 'A'; i <= 'Z'; i++) 154: trans[i] = i; 155: } 156: folding = which; 157: } 158: } 159: 160: /* Compile the given regular expression into a [secret] internal format */ 161: 162: char * 163: compile (compex, strp, RE, fold) 164: register COMPEX *compex; 165: register char *strp; 166: int RE; 167: int fold; 168: { 169: register int c; 170: register char *ep; 171: char *lastep; 172: char bracket[NBRA], 173: *bracketp; 174: char **alt = compex->alternatives; 175: char *retmes = "Badly formed search string"; 176: 177: case_fold(compex->do_folding = fold); 178: if (!compex->eblen) { 179: compex->expbuf = safemalloc(84); 180: compex->eblen = 80; 181: } 182: ep = compex->expbuf; /* point at expression buffer */ 183: *alt++ = ep; /* first alternative starts here */ 184: bracketp = bracket; /* first bracket goes here */ 185: if (*strp == 0) { /* nothing to compile? */ 186: if (*ep == 0) /* nothing there yet? */ 187: return "Null search string"; 188: return Nullch; /* just keep old expression */ 189: } 190: compex->nbra = 0; /* no brackets yet */ 191: lastep = 0; 192: for (;;) { 193: if (ep - compex->expbuf >= compex->eblen) 194: grow_eb(compex); 195: c = *strp++; /* fetch next char of pattern */ 196: if (c == 0) { /* end of pattern? */ 197: if (bracketp != bracket) { /* balanced brackets? */ 198: #ifdef VERBOSE 199: retmes = "Unbalanced parens"; 200: #endif 201: goto cerror; 202: } 203: *ep++ = CEND; /* terminate expression */ 204: *alt++ = 0; /* terminal alternative list */ 205: /* 206: compex->eblen = ep - compex->expbuf + 1; 207: compex->expbuf = saferealloc(compex->expbuf,compex->eblen+4); */ 208: return Nullch; /* return success */ 209: } 210: if (c != '*') 211: lastep = ep; 212: if (!RE) { /* just a normal search string? */ 213: *ep++ = CCHR; /* everything is a normal char */ 214: *ep++ = c; 215: } 216: else /* it is a regular expression */ 217: switch (c) { 218: 219: case '\\': /* meta something */ 220: switch (c = *strp++) { 221: case '(': 222: if (compex->nbra >= NBRA) { 223: #ifdef VERBOSE 224: retmes = "Too many parens"; 225: #endif 226: goto cerror; 227: } 228: *bracketp++ = ++compex->nbra; 229: *ep++ = CBRA; 230: *ep++ = compex->nbra; 231: break; 232: case '|': 233: if (bracketp>bracket) { 234: #ifdef VERBOSE 235: retmes = "No \\| in parens"; /* Alas! */ 236: #endif 237: goto cerror; 238: } 239: *ep++ = CEND; 240: *alt++ = ep; 241: break; 242: case ')': 243: if (bracketp <= bracket) { 244: #ifdef VERBOSE 245: retmes = "Unmatched right paren"; 246: #endif 247: goto cerror; 248: } 249: *ep++ = CKET; 250: *ep++ = *--bracketp; 251: break; 252: case 'w': 253: *ep++ = WORD; 254: break; 255: case 'W': 256: *ep++ = NWORD; 257: break; 258: case 'b': 259: *ep++ = WBOUND; 260: break; 261: case 'B': 262: *ep++ = NWBOUND; 263: break; 264: case '0': case '1': case '2': case '3': case '4': 265: case '5': case '6': case '7': case '8': case '9': 266: *ep++ = CBACK; 267: *ep++ = c - '0'; 268: break; 269: default: 270: *ep++ = CCHR; 271: if (c == '\0') 272: goto cerror; 273: *ep++ = c; 274: break; 275: } 276: break; 277: case '.': 278: *ep++ = CDOT; 279: continue; 280: 281: case '*': 282: if (lastep == 0 || *lastep == CBRA || *lastep == CKET 283: || *lastep == CIRC 284: || (*lastep&STAR)|| *lastep>NWORD) 285: goto defchar; 286: *lastep |= STAR; 287: continue; 288: 289: case '^': 290: if (ep != compex->expbuf && ep[-1] != CEND) 291: goto defchar; 292: *ep++ = CIRC; 293: continue; 294: 295: case '$': 296: if (*strp != 0 && (*strp != '\\' || strp[1] != '|')) 297: goto defchar; 298: *ep++ = CDOL; 299: continue; 300: 301: case '[': { /* character class */ 302: register int i; 303: 304: if (ep - compex->expbuf >= compex->eblen - BMAPSIZ) 305: grow_eb(compex); /* reserve bitmap */ 306: for (i = BMAPSIZ; i; --i) 307: ep[i] = 0; 308: 309: if ((c = *strp++) == '^') { 310: c = *strp++; 311: *ep++ = NCCL; /* negated */ 312: } 313: else 314: *ep++ = CCL; /* normal */ 315: 316: i = 0; /* remember oldchar */ 317: do { 318: if (c == '\0') { 319: #ifdef VERBOSE 320: retmes = "Missing ]"; 321: #endif 322: goto cerror; 323: } 324: if (*strp == '-' && *(++strp)) 325: i = *strp++; 326: else 327: i = c; 328: while (c <= i) { 329: ep[c / BITSPERBYTE] |= 1 << (c % BITSPERBYTE); 330: if (fold && isalpha(c)) 331: ep[(c ^ 32) / BITSPERBYTE] |= 332: 1 << ((c ^ 32) % BITSPERBYTE); 333: /* set the other bit too */ 334: c++; 335: } 336: } while ((c = *strp++) != ']'); 337: ep += BMAPSIZ; 338: continue; 339: } 340: 341: defchar: 342: default: 343: *ep++ = CCHR; 344: *ep++ = c; 345: } 346: } 347: cerror: 348: compex->expbuf[0] = 0; 349: compex->nbra = 0; 350: return retmes; 351: } 352: 353: void 354: grow_eb(compex) 355: register COMPEX *compex; 356: { 357: compex->eblen += 80; 358: compex->expbuf = saferealloc(compex->expbuf, (MEM_SIZE)compex->eblen + 4); 359: } 360: 361: char * 362: execute (compex, addr) 363: register COMPEX *compex; 364: char *addr; 365: { 366: register char *p1 = addr; 367: register char *trt = trans; 368: register int c; 369: 370: if (addr == Nullch) 371: return Nullch; 372: if (compex->nbra) { /* any brackets? */ 373: for (c = 0; c <= compex->nbra; c++) 374: compex->braslist[c] = compex->braelist[c] = Nullch; 375: if (compex->brastr) 376: free(compex->brastr); 377: compex->brastr = savestr(p1); /* in case p1 is not static */ 378: p1 = compex->brastr; /* ! */ 379: } 380: case_fold(compex->do_folding); /* make sure table is correct */ 381: FirstCharacter = p1; /* for ^ tests */ 382: if (compex->expbuf[0] == CCHR && !compex->alternatives[1]) { 383: c = trt[compex->expbuf[1]]; /* fast check for first character */ 384: do { 385: if (trt[*p1] == c && advance (compex, p1, compex->expbuf)) 386: return p1; 387: p1++; 388: } while (*p1 && !err); 389: return Nullch; 390: } 391: else { /* regular algorithm */ 392: do { 393: register char **alt = compex->alternatives; 394: while (*alt) { 395: if (advance (compex, p1, *alt++)) 396: return p1; 397: } 398: p1++; 399: } while (*p1 && !err); 400: return Nullch; 401: } 402: } 403: 404: /* advance the match of the regular expression starting at ep along the 405: string lp, simulates an NDFSA */ 406: bool 407: advance (compex, lp, ep) 408: register COMPEX *compex; 409: register char *ep; 410: register char *lp; 411: { 412: register char *curlp; 413: register char *trt = trans; 414: register int i; 415: 416: while ((*ep & STAR) || *lp || *ep == CIRC || *ep == CKET) 417: switch (*ep++) { 418: 419: case CCHR: 420: if (trt[*ep++] != trt[*lp]) return FALSE; 421: lp++; 422: continue; 423: 424: case CDOT: 425: if (*lp == '\n') return FALSE; 426: lp++; 427: continue; 428: 429: case CDOL: 430: if (!*lp || *lp == '\n') 431: continue; 432: return FALSE; 433: 434: case CIRC: 435: if (lp == FirstCharacter || lp[-1]=='\n') 436: continue; 437: return FALSE; 438: 439: case WORD: 440: if (isalnum(*lp)) { 441: lp++; 442: continue; 443: } 444: return FALSE; 445: 446: case NWORD: 447: if (!isalnum(*lp)) { 448: lp++; 449: continue; 450: } 451: return FALSE; 452: 453: case WBOUND: 454: if ((lp == FirstCharacter || !isalnum(lp[-1])) != 455: (!*lp || !isalnum(*lp)) ) 456: continue; 457: return FALSE; 458: 459: case NWBOUND: 460: if ((lp == FirstCharacter || !isalnum(lp[-1])) == 461: (!*lp || !isalnum(*lp))) 462: continue; 463: return FALSE; 464: 465: case CEND: 466: return TRUE; 467: 468: case CCL: 469: if (cclass (ep, *lp, 1)) { 470: ep += BMAPSIZ; 471: lp++; 472: continue; 473: } 474: return FALSE; 475: 476: case NCCL: 477: if (cclass (ep, *lp, 0)) { 478: ep += BMAPSIZ; 479: lp++; 480: continue; 481: } 482: return FALSE; 483: 484: case CBRA: 485: compex->braslist[*ep++] = lp; 486: continue; 487: 488: case CKET: 489: i = *ep++; 490: compex->braelist[i] = lp; 491: compex->braelist[0] = lp; 492: compex->braslist[0] = compex->braslist[i]; 493: continue; 494: 495: case CBACK: 496: if (compex->braelist[i = *ep++] == 0) { 497: fputs("bad braces\n",stdout) FLUSH; 498: err = TRUE; 499: return FALSE; 500: } 501: if (backref (compex, i, lp)) { 502: lp += compex->braelist[i] - compex->braslist[i]; 503: continue; 504: } 505: return FALSE; 506: 507: case CBACK | STAR: 508: if (compex->braelist[i = *ep++] == 0) { 509: fputs("bad braces\n",stdout) FLUSH; 510: err = TRUE; 511: return FALSE; 512: } 513: curlp = lp; 514: while (backref (compex, i, lp)) { 515: lp += compex->braelist[i] - compex->braslist[i]; 516: } 517: while (lp >= curlp) { 518: if (advance (compex, lp, ep)) 519: return TRUE; 520: lp -= compex->braelist[i] - compex->braslist[i]; 521: } 522: continue; 523: 524: case CDOT | STAR: 525: curlp = lp; 526: while (*lp++ && lp[-1] != '\n'); 527: goto star; 528: 529: case WORD | STAR: 530: curlp = lp; 531: while (*lp++ && isalnum(lp[-1])); 532: goto star; 533: 534: case NWORD | STAR: 535: curlp = lp; 536: while (*lp++ && !isalnum(lp[-1])); 537: goto star; 538: 539: case CCHR | STAR: 540: curlp = lp; 541: while (*lp++ && trt[lp[-1]] == trt[*ep]); 542: ep++; 543: goto star; 544: 545: case CCL | STAR: 546: case NCCL | STAR: 547: curlp = lp; 548: while (*lp++ && cclass (ep, lp[-1], ep[-1] == (CCL | STAR))); 549: ep += BMAPSIZ; 550: goto star; 551: 552: star: 553: do { 554: lp--; 555: if (advance (compex, lp, ep)) 556: return TRUE; 557: } while (lp > curlp); 558: return FALSE; 559: 560: default: 561: fputs("Badly compiled pattern\n",stdout) FLUSH; 562: err = TRUE; 563: return -1; 564: } 565: if (*ep == CEND || *ep == CDOL) { 566: return TRUE; 567: } 568: return FALSE; 569: } 570: 571: bool 572: backref (compex, i, lp) 573: register COMPEX *compex; 574: register int i; 575: register char *lp; 576: { 577: register char *bp; 578: 579: bp = compex->braslist[i]; 580: while (*lp && *bp == *lp) { 581: bp++; 582: lp++; 583: if (bp >= compex->braelist[i]) 584: return TRUE; 585: } 586: return FALSE; 587: } 588: 589: bool 590: cclass (set, c, af) 591: register char *set; 592: register int c; 593: { 594: c &= 0177; 595: #if BITSPERBYTE == 8 596: if (set[c >> 3] & 1 << (c & 7)) 597: #else 598: if (set[c / BITSPERBYTE] & 1 << (c % BITSPERBYTE)) 599: #endif 600: return af; 601: return !af; 602: }