1: /*
   2:  * Copyright (c) 1980 Regents of the University of California.
   3:  * All rights reserved.  The Berkeley software License Agreement
   4:  * specifies the terms and conditions for redistribution.
   5:  */
   6: 
   7: #if defined(LIBC_SCCS) && !defined(lint)
   8: static char sccsid[] = "@(#)regex.c	5.2 (Berkeley) 3/9/86";
   9: #endif LIBC_SCCS and not lint
  10: 
  11: #
  12: 
  13: /*
  14:  * routines to do regular expression matching
  15:  *
  16:  * Entry points:
  17:  *
  18:  *	re_comp(s)
  19:  *		char *s;
  20:  *	 ... returns 0 if the string s was compiled successfully,
  21:  *		     a pointer to an error message otherwise.
  22:  *	     If passed 0 or a null string returns without changing
  23:  *           the currently compiled re (see note 11 below).
  24:  *
  25:  *	re_exec(s)
  26:  *		char *s;
  27:  *	 ... returns 1 if the string s matches the last compiled regular
  28:  *		       expression,
  29:  *		     0 if the string s failed to match the last compiled
  30:  *		       regular expression, and
  31:  *		    -1 if the compiled regular expression was invalid
  32:  *		       (indicating an internal error).
  33:  *
  34:  * The strings passed to both re_comp and re_exec may have trailing or
  35:  * embedded newline characters; they are terminated by nulls.
  36:  *
  37:  * The identity of the author of these routines is lost in antiquity;
  38:  * this is essentially the same as the re code in the original V6 ed.
  39:  *
  40:  * The regular expressions recognized are described below. This description
  41:  * is essentially the same as that for ed.
  42:  *
  43:  *	A regular expression specifies a set of strings of characters.
  44:  *	A member of this set of strings is said to be matched by
  45:  *	the regular expression.  In the following specification for
  46:  *	regular expressions the word `character' means any character but NUL.
  47:  *
  48:  *	1.  Any character except a special character matches itself.
  49:  *	    Special characters are the regular expression delimiter plus
  50:  *	    \ [ . and sometimes ^ * $.
  51:  *	2.  A . matches any character.
  52:  *	3.  A \ followed by any character except a digit or ( )
  53:  *	    matches that character.
  54:  *	4.  A nonempty string s bracketed [s] (or [^s]) matches any
  55:  *	    character in (or not in) s. In s, \ has no special meaning,
  56:  *	    and ] may only appear as the first letter. A substring
  57:  *	    a-b, with a and b in ascending ASCII order, stands for
  58:  *	    the inclusive range of ASCII characters.
  59:  *	5.  A regular expression of form 1-4 followed by * matches a
  60:  *	    sequence of 0 or more matches of the regular expression.
  61:  *	6.  A regular expression, x, of form 1-8, bracketed \(x\)
  62:  *	    matches what x matches.
  63:  *	7.  A \ followed by a digit n matches a copy of the string that the
  64:  *	    bracketed regular expression beginning with the nth \( matched.
  65:  *	8.  A regular expression of form 1-8, x, followed by a regular
  66:  *	    expression of form 1-7, y matches a match for x followed by
  67:  *	    a match for y, with the x match being as long as possible
  68:  *	    while still permitting a y match.
  69:  *	9.  A regular expression of form 1-8 preceded by ^ (or followed
  70:  *	    by $), is constrained to matches that begin at the left
  71:  *	    (or end at the right) end of a line.
  72:  *	10. A regular expression of form 1-9 picks out the longest among
  73:  *	    the leftmost matches in a line.
  74:  *	11. An empty regular expression stands for a copy of the last
  75:  *	    regular expression encountered.
  76:  */
  77: 
  78: /*
  79:  * constants for re's
  80:  */
  81: #define CBRA    1
  82: #define CCHR    2
  83: #define CDOT    4
  84: #define CCL 6
  85: #define NCCL    8
  86: #define CDOL    10
  87: #define CEOF    11
  88: #define CKET    12
  89: #define CBACK   18
  90: 
  91: #define CSTAR   01
  92: 
  93: #define ESIZE   512
  94: #define NBRA    9
  95: 
  96: static  char    expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
  97: static  char    circf;
  98: 
  99: /*
 100:  * compile the regular expression argument into a dfa
 101:  */
 102: char *
 103: re_comp(sp)
 104:     register char   *sp;
 105: {
 106:     register int    c;
 107:     register char   *ep = expbuf;
 108:     int cclcnt, numbra = 0;
 109:     char    *lastep = 0;
 110:     char    bracket[NBRA];
 111:     char    *bracketp = &bracket[0];
 112:     static  char    *retoolong = "Regular expression too long";
 113: 
 114: #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
 115: 
 116:     if (sp == 0 || *sp == '\0') {
 117:         if (*ep == 0)
 118:             return("No previous regular expression");
 119:         return(0);
 120:     }
 121:     if (*sp == '^') {
 122:         circf = 1;
 123:         sp++;
 124:     }
 125:     else
 126:         circf = 0;
 127:     for (;;) {
 128:         if (ep >= &expbuf[ESIZE])
 129:             comerr(retoolong);
 130:         if ((c = *sp++) == '\0') {
 131:             if (bracketp != bracket)
 132:                 comerr("unmatched \\(");
 133:             *ep++ = CEOF;
 134:             *ep++ = 0;
 135:             return(0);
 136:         }
 137:         if (c != '*')
 138:             lastep = ep;
 139:         switch (c) {
 140: 
 141:         case '.':
 142:             *ep++ = CDOT;
 143:             continue;
 144: 
 145:         case '*':
 146:             if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
 147:                 goto defchar;
 148:             *lastep |= CSTAR;
 149:             continue;
 150: 
 151:         case '$':
 152:             if (*sp != '\0')
 153:                 goto defchar;
 154:             *ep++ = CDOL;
 155:             continue;
 156: 
 157:         case '[':
 158:             *ep++ = CCL;
 159:             *ep++ = 0;
 160:             cclcnt = 1;
 161:             if ((c = *sp++) == '^') {
 162:                 c = *sp++;
 163:                 ep[-2] = NCCL;
 164:             }
 165:             do {
 166:                 if (c == '\0')
 167:                     comerr("missing ]");
 168:                 if (c == '-' && ep [-1] != 0) {
 169:                     if ((c = *sp++) == ']') {
 170:                         *ep++ = '-';
 171:                         cclcnt++;
 172:                         break;
 173:                     }
 174:                     while (ep[-1] < c) {
 175:                         *ep = ep[-1] + 1;
 176:                         ep++;
 177:                         cclcnt++;
 178:                         if (ep >= &expbuf[ESIZE])
 179:                             comerr(retoolong);
 180:                     }
 181:                 }
 182:                 *ep++ = c;
 183:                 cclcnt++;
 184:                 if (ep >= &expbuf[ESIZE])
 185:                     comerr(retoolong);
 186:             } while ((c = *sp++) != ']');
 187:             lastep[1] = cclcnt;
 188:             continue;
 189: 
 190:         case '\\':
 191:             if ((c = *sp++) == '(') {
 192:                 if (numbra >= NBRA)
 193:                     comerr("too many \\(\\) pairs");
 194:                 *bracketp++ = numbra;
 195:                 *ep++ = CBRA;
 196:                 *ep++ = numbra++;
 197:                 continue;
 198:             }
 199:             if (c == ')') {
 200:                 if (bracketp <= bracket)
 201:                     comerr("unmatched \\)");
 202:                 *ep++ = CKET;
 203:                 *ep++ = *--bracketp;
 204:                 continue;
 205:             }
 206:             if (c >= '1' && c < ('1' + NBRA)) {
 207:                 *ep++ = CBACK;
 208:                 *ep++ = c - '1';
 209:                 continue;
 210:             }
 211:             *ep++ = CCHR;
 212:             *ep++ = c;
 213:             continue;
 214: 
 215:         defchar:
 216:         default:
 217:             *ep++ = CCHR;
 218:             *ep++ = c;
 219:         }
 220:     }
 221: }
 222: 
 223: /*
 224:  * match the argument string against the compiled re
 225:  */
 226: int
 227: re_exec(p1)
 228:     register char   *p1;
 229: {
 230:     register char   *p2 = expbuf;
 231:     register int    c;
 232:     int rv;
 233: 
 234:     for (c = 0; c < NBRA; c++) {
 235:         braslist[c] = 0;
 236:         braelist[c] = 0;
 237:     }
 238:     if (circf)
 239:         return((advance(p1, p2)));
 240:     /*
 241: 	 * fast check for first character
 242: 	 */
 243:     if (*p2 == CCHR) {
 244:         c = p2[1];
 245:         do {
 246:             if (*p1 != c)
 247:                 continue;
 248:             if (rv = advance(p1, p2))
 249:                 return(rv);
 250:         } while (*p1++);
 251:         return(0);
 252:     }
 253:     /*
 254: 	 * regular algorithm
 255: 	 */
 256:     do
 257:         if (rv = advance(p1, p2))
 258:             return(rv);
 259:     while (*p1++);
 260:     return(0);
 261: }
 262: 
 263: /*
 264:  * try to match the next thing in the dfa
 265:  */
 266: static  int
 267: advance(lp, ep)
 268:     register char   *lp, *ep;
 269: {
 270:     register char   *curlp;
 271:     int ct, i;
 272:     int rv;
 273: 
 274:     for (;;)
 275:         switch (*ep++) {
 276: 
 277:         case CCHR:
 278:             if (*ep++ == *lp++)
 279:                 continue;
 280:             return(0);
 281: 
 282:         case CDOT:
 283:             if (*lp++)
 284:                 continue;
 285:             return(0);
 286: 
 287:         case CDOL:
 288:             if (*lp == '\0')
 289:                 continue;
 290:             return(0);
 291: 
 292:         case CEOF:
 293:             return(1);
 294: 
 295:         case CCL:
 296:             if (cclass(ep, *lp++, 1)) {
 297:                 ep += *ep;
 298:                 continue;
 299:             }
 300:             return(0);
 301: 
 302:         case NCCL:
 303:             if (cclass(ep, *lp++, 0)) {
 304:                 ep += *ep;
 305:                 continue;
 306:             }
 307:             return(0);
 308: 
 309:         case CBRA:
 310:             braslist[*ep++] = lp;
 311:             continue;
 312: 
 313:         case CKET:
 314:             braelist[*ep++] = lp;
 315:             continue;
 316: 
 317:         case CBACK:
 318:             if (braelist[i = *ep++] == 0)
 319:                 return(-1);
 320:             if (backref(i, lp)) {
 321:                 lp += braelist[i] - braslist[i];
 322:                 continue;
 323:             }
 324:             return(0);
 325: 
 326:         case CBACK|CSTAR:
 327:             if (braelist[i = *ep++] == 0)
 328:                 return(-1);
 329:             curlp = lp;
 330:             ct = braelist[i] - braslist[i];
 331:             while (backref(i, lp))
 332:                 lp += ct;
 333:             while (lp >= curlp) {
 334:                 if (rv = advance(lp, ep))
 335:                     return(rv);
 336:                 lp -= ct;
 337:             }
 338:             continue;
 339: 
 340:         case CDOT|CSTAR:
 341:             curlp = lp;
 342:             while (*lp++)
 343:                 ;
 344:             goto star;
 345: 
 346:         case CCHR|CSTAR:
 347:             curlp = lp;
 348:             while (*lp++ == *ep)
 349:                 ;
 350:             ep++;
 351:             goto star;
 352: 
 353:         case CCL|CSTAR:
 354:         case NCCL|CSTAR:
 355:             curlp = lp;
 356:             while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
 357:                 ;
 358:             ep += *ep;
 359:             goto star;
 360: 
 361:         star:
 362:             do {
 363:                 lp--;
 364:                 if (rv = advance(lp, ep))
 365:                     return(rv);
 366:             } while (lp > curlp);
 367:             return(0);
 368: 
 369:         default:
 370:             return(-1);
 371:         }
 372: }
 373: 
 374: backref(i, lp)
 375:     register int    i;
 376:     register char   *lp;
 377: {
 378:     register char   *bp;
 379: 
 380:     bp = braslist[i];
 381:     while (*bp++ == *lp++)
 382:         if (bp >= braelist[i])
 383:             return(1);
 384:     return(0);
 385: }
 386: 
 387: int
 388: cclass(set, c, af)
 389:     register char   *set, c;
 390:     int af;
 391: {
 392:     register int    n;
 393: 
 394:     if (c == 0)
 395:         return(0);
 396:     n = *set++;
 397:     while (--n)
 398:         if (*set++ == c)
 399:             return(af);
 400:     return(! af);
 401: }

Defined functions

advance defined in line 266; used 5 times
backref defined in line 374; used 2 times
cclass defined in line 387; used 3 times

Defined variables

braelist defined in line 96; used 7 times
braslist defined in line 96; used 5 times
circf defined in line 97; used 3 times
expbuf defined in line 96; used 6 times
sccsid defined in line 8; never used

Defined macros

CBACK defined in line 89; used 2 times
CBRA defined in line 81; used 2 times
CCHR defined in line 82; used 4 times
CCL defined in line 84; used 3 times
CDOL defined in line 86; used 1 times
CDOT defined in line 83; used 2 times
CEOF defined in line 87; used 1 times
CKET defined in line 88; used 2 times
CSTAR defined in line 91; used 2 times
ESIZE defined in line 93; used 4 times
NBRA defined in line 94; used 6 times
NCCL defined in line 85; used 2 times
comerr defined in line 114; used 7 times
Last modified: 1986-03-10
Generated: 2016-12-26
Generated by src2html V0.67
page hit count: 3261
Valid CSS Valid XHTML 1.0 Strict