1: #
   2: 
   3: /*
   4:  * routines to do regular expression matching
   5:  *
   6:  * Entry points:
   7:  *
   8:  *	re_comp(s)
   9:  *		char *s;
  10:  *	 ... returns 0 if the string s was compiled successfully,
  11:  *		     a pointer to an error message otherwise.
  12:  *	     If passed 0 or a null string returns without changing
  13:  *           the currently compiled re (see note 11 below).
  14:  *
  15:  *	re_exec(s)
  16:  *		char *s;
  17:  *	 ... returns 1 if the string s matches the last compiled regular
  18:  *		       expression,
  19:  *		     0 if the string s failed to match the last compiled
  20:  *		       regular expression, and
  21:  *		    -1 if the compiled regular expression was invalid
  22:  *		       (indicating an internal error).
  23:  *
  24:  * The strings passed to both re_comp and re_exec may have trailing or
  25:  * embedded newline characters; they are terminated by nulls.
  26:  *
  27:  * The identity of the author of these routines is lost in antiquity;
  28:  * this is essentially the same as the re code in the original V6 ed.
  29:  *
  30:  * The regular expressions recognized are described below. This description
  31:  * is essentially the same as that for ed.
  32:  *
  33:  *	A regular expression specifies a set of strings of characters.
  34:  *	A member of this set of strings is said to be matched by
  35:  *	the regular expression.  In the following specification for
  36:  *	regular expressions the word `character' means any character but NUL.
  37:  *
  38:  *	1.  Any character except a special character matches itself.
  39:  *	    Special characters are the regular expression delimiter plus
  40:  *	    \ [ . and sometimes ^ * $.
  41:  *	2.  A . matches any character.
  42:  *	3.  A \ followed by any character except a digit or ( )
  43:  *	    matches that character.
  44:  *	4.  A nonempty string s bracketed [s] (or [^s]) matches any
  45:  *	    character in (or not in) s. In s, \ has no special meaning,
  46:  *	    and ] may only appear as the first letter. A substring
  47:  *	    a-b, with a and b in ascending ASCII order, stands for
  48:  *	    the inclusive range of ASCII characters.
  49:  *	5.  A regular expression of form 1-4 followed by * matches a
  50:  *	    sequence of 0 or more matches of the regular expression.
  51:  *	6.  A regular expression, x, of form 1-8, bracketed \(x\)
  52:  *	    matches what x matches.
  53:  *	7.  A \ followed by a digit n matches a copy of the string that the
  54:  *	    bracketed regular expression beginning with the nth \( matched.
  55:  *	8.  A regular expression of form 1-8, x, followed by a regular
  56:  *	    expression of form 1-7, y matches a match for x followed by
  57:  *	    a match for y, with the x match being as long as possible
  58:  *	    while still permitting a y match.
  59:  *	9.  A regular expression of form 1-8 preceded by ^ (or followed
  60:  *	    by $), is constrained to matches that begin at the left
  61:  *	    (or end at the right) end of a line.
  62:  *	10. A regular expression of form 1-9 picks out the longest among
  63:  *	    the leftmost matches in a line.
  64:  *	11. An empty regular expression stands for a copy of the last
  65:  *	    regular expression encountered.
  66:  */
  67: 
  68: /*
  69:  * constants for re's
  70:  */
  71: #define CBRA    1
  72: #define CCHR    2
  73: #define CDOT    4
  74: #define CCL 6
  75: #define NCCL    8
  76: #define CDOL    10
  77: #define CEOF    11
  78: #define CKET    12
  79: #define CBACK   18
  80: 
  81: #define CSTAR   01
  82: 
  83: #define ESIZE   512
  84: #define NBRA    9
  85: 
  86: static  char    expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
  87: static  char    circf;
  88: 
  89: /*
  90:  * compile the regular expression argument into a dfa
  91:  */
  92: char *
  93: re_comp(sp)
  94:     register char   *sp;
  95: {
  96:     register int    c;
  97:     register char   *ep = expbuf;
  98:     int cclcnt, numbra = 0;
  99:     char    *lastep = 0;
 100:     char    bracket[NBRA];
 101:     char    *bracketp = &bracket[0];
 102:     static  char    *retoolong = "Regular expression too long";
 103: 
 104: #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
 105: 
 106:     if (sp == 0 || *sp == '\0') {
 107:         if (*ep == 0)
 108:             return("No previous regular expression");
 109:         return(0);
 110:     }
 111:     if (*sp == '^') {
 112:         circf = 1;
 113:         sp++;
 114:     }
 115:     else
 116:         circf = 0;
 117:     for (;;) {
 118:         if (ep >= &expbuf[ESIZE])
 119:             comerr(retoolong);
 120:         if ((c = *sp++) == '\0') {
 121:             if (bracketp != bracket)
 122:                 comerr("unmatched \\(");
 123:             *ep++ = CEOF;
 124:             *ep++ = 0;
 125:             return(0);
 126:         }
 127:         if (c != '*')
 128:             lastep = ep;
 129:         switch (c) {
 130: 
 131:         case '.':
 132:             *ep++ = CDOT;
 133:             continue;
 134: 
 135:         case '*':
 136:             if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
 137:                 goto defchar;
 138:             *lastep |= CSTAR;
 139:             continue;
 140: 
 141:         case '$':
 142:             if (*sp != '\0')
 143:                 goto defchar;
 144:             *ep++ = CDOL;
 145:             continue;
 146: 
 147:         case '[':
 148:             *ep++ = CCL;
 149:             *ep++ = 0;
 150:             cclcnt = 1;
 151:             if ((c = *sp++) == '^') {
 152:                 c = *sp++;
 153:                 ep[-2] = NCCL;
 154:             }
 155:             do {
 156:                 if (c == '\0')
 157:                     comerr("missing ]");
 158:                 if (c == '-' && ep [-1] != 0) {
 159:                     if ((c = *sp++) == ']') {
 160:                         *ep++ = '-';
 161:                         cclcnt++;
 162:                         break;
 163:                     }
 164:                     while (ep[-1] < c) {
 165:                         *ep = ep[-1] + 1;
 166:                         ep++;
 167:                         cclcnt++;
 168:                         if (ep >= &expbuf[ESIZE])
 169:                             comerr(retoolong);
 170:                     }
 171:                 }
 172:                 *ep++ = c;
 173:                 cclcnt++;
 174:                 if (ep >= &expbuf[ESIZE])
 175:                     comerr(retoolong);
 176:             } while ((c = *sp++) != ']');
 177:             lastep[1] = cclcnt;
 178:             continue;
 179: 
 180:         case '\\':
 181:             if ((c = *sp++) == '(') {
 182:                 if (numbra >= NBRA)
 183:                     comerr("too many \\(\\) pairs");
 184:                 *bracketp++ = numbra;
 185:                 *ep++ = CBRA;
 186:                 *ep++ = numbra++;
 187:                 continue;
 188:             }
 189:             if (c == ')') {
 190:                 if (bracketp <= bracket)
 191:                     comerr("unmatched \\)");
 192:                 *ep++ = CKET;
 193:                 *ep++ = *--bracketp;
 194:                 continue;
 195:             }
 196:             if (c >= '1' && c < ('1' + NBRA)) {
 197:                 *ep++ = CBACK;
 198:                 *ep++ = c - '1';
 199:                 continue;
 200:             }
 201:             *ep++ = CCHR;
 202:             *ep++ = c;
 203:             continue;
 204: 
 205:         defchar:
 206:         default:
 207:             *ep++ = CCHR;
 208:             *ep++ = c;
 209:         }
 210:     }
 211: }
 212: 
 213: /*
 214:  * match the argument string against the compiled re
 215:  */
 216: int
 217: re_exec(p1)
 218:     register char   *p1;
 219: {
 220:     register char   *p2 = expbuf;
 221:     register int    c;
 222:     int rv;
 223: 
 224:     for (c = 0; c < NBRA; c++) {
 225:         braslist[c] = 0;
 226:         braelist[c] = 0;
 227:     }
 228:     if (circf)
 229:         return((advance(p1, p2)));
 230:     /*
 231: 	 * fast check for first character
 232: 	 */
 233:     if (*p2 == CCHR) {
 234:         c = p2[1];
 235:         do {
 236:             if (*p1 != c)
 237:                 continue;
 238:             if (rv = advance(p1, p2))
 239:                 return(rv);
 240:         } while (*p1++);
 241:         return(0);
 242:     }
 243:     /*
 244: 	 * regular algorithm
 245: 	 */
 246:     do
 247:         if (rv = advance(p1, p2))
 248:             return(rv);
 249:     while (*p1++);
 250:     return(0);
 251: }
 252: 
 253: /*
 254:  * try to match the next thing in the dfa
 255:  */
 256: static  int
 257: advance(lp, ep)
 258:     register char   *lp, *ep;
 259: {
 260:     register char   *curlp;
 261:     int ct, i;
 262:     int rv;
 263: 
 264:     for (;;)
 265:         switch (*ep++) {
 266: 
 267:         case CCHR:
 268:             if (*ep++ == *lp++)
 269:                 continue;
 270:             return(0);
 271: 
 272:         case CDOT:
 273:             if (*lp++)
 274:                 continue;
 275:             return(0);
 276: 
 277:         case CDOL:
 278:             if (*lp == '\0')
 279:                 continue;
 280:             return(0);
 281: 
 282:         case CEOF:
 283:             return(1);
 284: 
 285:         case CCL:
 286:             if (cclass(ep, *lp++, 1)) {
 287:                 ep += *ep;
 288:                 continue;
 289:             }
 290:             return(0);
 291: 
 292:         case NCCL:
 293:             if (cclass(ep, *lp++, 0)) {
 294:                 ep += *ep;
 295:                 continue;
 296:             }
 297:             return(0);
 298: 
 299:         case CBRA:
 300:             braslist[*ep++] = lp;
 301:             continue;
 302: 
 303:         case CKET:
 304:             braelist[*ep++] = lp;
 305:             continue;
 306: 
 307:         case CBACK:
 308:             if (braelist[i = *ep++] == 0)
 309:                 return(-1);
 310:             if (backref(i, lp)) {
 311:                 lp += braelist[i] - braslist[i];
 312:                 continue;
 313:             }
 314:             return(0);
 315: 
 316:         case CBACK|CSTAR:
 317:             if (braelist[i = *ep++] == 0)
 318:                 return(-1);
 319:             curlp = lp;
 320:             ct = braelist[i] - braslist[i];
 321:             while (backref(i, lp))
 322:                 lp += ct;
 323:             while (lp >= curlp) {
 324:                 if (rv = advance(lp, ep))
 325:                     return(rv);
 326:                 lp -= ct;
 327:             }
 328:             continue;
 329: 
 330:         case CDOT|CSTAR:
 331:             curlp = lp;
 332:             while (*lp++)
 333:                 ;
 334:             goto star;
 335: 
 336:         case CCHR|CSTAR:
 337:             curlp = lp;
 338:             while (*lp++ == *ep)
 339:                 ;
 340:             ep++;
 341:             goto star;
 342: 
 343:         case CCL|CSTAR:
 344:         case NCCL|CSTAR:
 345:             curlp = lp;
 346:             while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
 347:                 ;
 348:             ep += *ep;
 349:             goto star;
 350: 
 351:         star:
 352:             do {
 353:                 lp--;
 354:                 if (rv = advance(lp, ep))
 355:                     return(rv);
 356:             } while (lp > curlp);
 357:             return(0);
 358: 
 359:         default:
 360:             return(-1);
 361:         }
 362: }
 363: 
 364: backref(i, lp)
 365:     register int    i;
 366:     register char   *lp;
 367: {
 368:     register char   *bp;
 369: 
 370:     bp = braslist[i];
 371:     while (*bp++ == *lp++)
 372:         if (bp >= braelist[i])
 373:             return(1);
 374:     return(0);
 375: }
 376: 
 377: int
 378: cclass(set, c, af)
 379:     register char   *set, c;
 380:     int af;
 381: {
 382:     register int    n;
 383: 
 384:     if (c == 0)
 385:         return(0);
 386:     n = *set++;
 387:     while (--n)
 388:         if (*set++ == c)
 389:             return(af);
 390:     return(! af);
 391: }

Defined functions

advance defined in line 256; used 5 times
backref defined in line 364; used 2 times
cclass defined in line 377; used 3 times
re_comp defined in line 92; used 2 times
re_exec defined in line 216; used 2 times

Defined variables

braelist defined in line 86; used 7 times
braslist defined in line 86; used 5 times
circf defined in line 87; used 3 times
expbuf defined in line 86; used 6 times

Defined macros

CBACK defined in line 79; used 2 times
CBRA defined in line 71; used 2 times
CCHR defined in line 72; used 4 times
CCL defined in line 74; used 3 times
CDOL defined in line 76; used 1 times
CDOT defined in line 73; used 2 times
CEOF defined in line 77; used 1 times
CKET defined in line 78; used 2 times
CSTAR defined in line 81; used 2 times
ESIZE defined in line 83; used 4 times
NBRA defined in line 84; used 6 times
NCCL defined in line 75; used 2 times
comerr defined in line 104; used 7 times
Last modified: 1980-10-31
Generated: 2016-12-26
Generated by src2html V0.67
page hit count: 1006
Valid CSS Valid XHTML 1.0 Strict