1: /*
2: * Copyright (c) 1980 Regents of the University of California.
3: * All rights reserved. The Berkeley software License Agreement
4: * specifies the terms and conditions for redistribution.
5: */
6:
7: #if defined(LIBC_SCCS) && !defined(lint)
8: static char sccsid[] = "@(#)regex.c 5.2 (Berkeley) 3/9/86";
9: #endif LIBC_SCCS and not lint
10:
11: #
12:
13: /*
14: * routines to do regular expression matching
15: *
16: * Entry points:
17: *
18: * re_comp(s)
19: * char *s;
20: * ... returns 0 if the string s was compiled successfully,
21: * a pointer to an error message otherwise.
22: * If passed 0 or a null string returns without changing
23: * the currently compiled re (see note 11 below).
24: *
25: * re_exec(s)
26: * char *s;
27: * ... returns 1 if the string s matches the last compiled regular
28: * expression,
29: * 0 if the string s failed to match the last compiled
30: * regular expression, and
31: * -1 if the compiled regular expression was invalid
32: * (indicating an internal error).
33: *
34: * The strings passed to both re_comp and re_exec may have trailing or
35: * embedded newline characters; they are terminated by nulls.
36: *
37: * The identity of the author of these routines is lost in antiquity;
38: * this is essentially the same as the re code in the original V6 ed.
39: *
40: * The regular expressions recognized are described below. This description
41: * is essentially the same as that for ed.
42: *
43: * A regular expression specifies a set of strings of characters.
44: * A member of this set of strings is said to be matched by
45: * the regular expression. In the following specification for
46: * regular expressions the word `character' means any character but NUL.
47: *
48: * 1. Any character except a special character matches itself.
49: * Special characters are the regular expression delimiter plus
50: * \ [ . and sometimes ^ * $.
51: * 2. A . matches any character.
52: * 3. A \ followed by any character except a digit or ( )
53: * matches that character.
54: * 4. A nonempty string s bracketed [s] (or [^s]) matches any
55: * character in (or not in) s. In s, \ has no special meaning,
56: * and ] may only appear as the first letter. A substring
57: * a-b, with a and b in ascending ASCII order, stands for
58: * the inclusive range of ASCII characters.
59: * 5. A regular expression of form 1-4 followed by * matches a
60: * sequence of 0 or more matches of the regular expression.
61: * 6. A regular expression, x, of form 1-8, bracketed \(x\)
62: * matches what x matches.
63: * 7. A \ followed by a digit n matches a copy of the string that the
64: * bracketed regular expression beginning with the nth \( matched.
65: * 8. A regular expression of form 1-8, x, followed by a regular
66: * expression of form 1-7, y matches a match for x followed by
67: * a match for y, with the x match being as long as possible
68: * while still permitting a y match.
69: * 9. A regular expression of form 1-8 preceded by ^ (or followed
70: * by $), is constrained to matches that begin at the left
71: * (or end at the right) end of a line.
72: * 10. A regular expression of form 1-9 picks out the longest among
73: * the leftmost matches in a line.
74: * 11. An empty regular expression stands for a copy of the last
75: * regular expression encountered.
76: */
77:
78: /*
79: * constants for re's
80: */
81: #define CBRA 1
82: #define CCHR 2
83: #define CDOT 4
84: #define CCL 6
85: #define NCCL 8
86: #define CDOL 10
87: #define CEOF 11
88: #define CKET 12
89: #define CBACK 18
90:
91: #define CSTAR 01
92:
93: #define ESIZE 512
94: #define NBRA 9
95:
96: static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
97: static char circf;
98:
99: /*
100: * compile the regular expression argument into a dfa
101: */
102: char *
103: re_comp(sp)
104: register char *sp;
105: {
106: register int c;
107: register char *ep = expbuf;
108: int cclcnt, numbra = 0;
109: char *lastep = 0;
110: char bracket[NBRA];
111: char *bracketp = &bracket[0];
112: static char *retoolong = "Regular expression too long";
113:
114: #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
115:
116: if (sp == 0 || *sp == '\0') {
117: if (*ep == 0)
118: return("No previous regular expression");
119: return(0);
120: }
121: if (*sp == '^') {
122: circf = 1;
123: sp++;
124: }
125: else
126: circf = 0;
127: for (;;) {
128: if (ep >= &expbuf[ESIZE])
129: comerr(retoolong);
130: if ((c = *sp++) == '\0') {
131: if (bracketp != bracket)
132: comerr("unmatched \\(");
133: *ep++ = CEOF;
134: *ep++ = 0;
135: return(0);
136: }
137: if (c != '*')
138: lastep = ep;
139: switch (c) {
140:
141: case '.':
142: *ep++ = CDOT;
143: continue;
144:
145: case '*':
146: if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
147: goto defchar;
148: *lastep |= CSTAR;
149: continue;
150:
151: case '$':
152: if (*sp != '\0')
153: goto defchar;
154: *ep++ = CDOL;
155: continue;
156:
157: case '[':
158: *ep++ = CCL;
159: *ep++ = 0;
160: cclcnt = 1;
161: if ((c = *sp++) == '^') {
162: c = *sp++;
163: ep[-2] = NCCL;
164: }
165: do {
166: if (c == '\0')
167: comerr("missing ]");
168: if (c == '-' && ep [-1] != 0) {
169: if ((c = *sp++) == ']') {
170: *ep++ = '-';
171: cclcnt++;
172: break;
173: }
174: while (ep[-1] < c) {
175: *ep = ep[-1] + 1;
176: ep++;
177: cclcnt++;
178: if (ep >= &expbuf[ESIZE])
179: comerr(retoolong);
180: }
181: }
182: *ep++ = c;
183: cclcnt++;
184: if (ep >= &expbuf[ESIZE])
185: comerr(retoolong);
186: } while ((c = *sp++) != ']');
187: lastep[1] = cclcnt;
188: continue;
189:
190: case '\\':
191: if ((c = *sp++) == '(') {
192: if (numbra >= NBRA)
193: comerr("too many \\(\\) pairs");
194: *bracketp++ = numbra;
195: *ep++ = CBRA;
196: *ep++ = numbra++;
197: continue;
198: }
199: if (c == ')') {
200: if (bracketp <= bracket)
201: comerr("unmatched \\)");
202: *ep++ = CKET;
203: *ep++ = *--bracketp;
204: continue;
205: }
206: if (c >= '1' && c < ('1' + NBRA)) {
207: *ep++ = CBACK;
208: *ep++ = c - '1';
209: continue;
210: }
211: *ep++ = CCHR;
212: *ep++ = c;
213: continue;
214:
215: defchar:
216: default:
217: *ep++ = CCHR;
218: *ep++ = c;
219: }
220: }
221: }
222:
223: /*
224: * match the argument string against the compiled re
225: */
226: int
227: re_exec(p1)
228: register char *p1;
229: {
230: register char *p2 = expbuf;
231: register int c;
232: int rv;
233:
234: for (c = 0; c < NBRA; c++) {
235: braslist[c] = 0;
236: braelist[c] = 0;
237: }
238: if (circf)
239: return((advance(p1, p2)));
240: /*
241: * fast check for first character
242: */
243: if (*p2 == CCHR) {
244: c = p2[1];
245: do {
246: if (*p1 != c)
247: continue;
248: if (rv = advance(p1, p2))
249: return(rv);
250: } while (*p1++);
251: return(0);
252: }
253: /*
254: * regular algorithm
255: */
256: do
257: if (rv = advance(p1, p2))
258: return(rv);
259: while (*p1++);
260: return(0);
261: }
262:
263: /*
264: * try to match the next thing in the dfa
265: */
266: static int
267: advance(lp, ep)
268: register char *lp, *ep;
269: {
270: register char *curlp;
271: int ct, i;
272: int rv;
273:
274: for (;;)
275: switch (*ep++) {
276:
277: case CCHR:
278: if (*ep++ == *lp++)
279: continue;
280: return(0);
281:
282: case CDOT:
283: if (*lp++)
284: continue;
285: return(0);
286:
287: case CDOL:
288: if (*lp == '\0')
289: continue;
290: return(0);
291:
292: case CEOF:
293: return(1);
294:
295: case CCL:
296: if (cclass(ep, *lp++, 1)) {
297: ep += *ep;
298: continue;
299: }
300: return(0);
301:
302: case NCCL:
303: if (cclass(ep, *lp++, 0)) {
304: ep += *ep;
305: continue;
306: }
307: return(0);
308:
309: case CBRA:
310: braslist[*ep++] = lp;
311: continue;
312:
313: case CKET:
314: braelist[*ep++] = lp;
315: continue;
316:
317: case CBACK:
318: if (braelist[i = *ep++] == 0)
319: return(-1);
320: if (backref(i, lp)) {
321: lp += braelist[i] - braslist[i];
322: continue;
323: }
324: return(0);
325:
326: case CBACK|CSTAR:
327: if (braelist[i = *ep++] == 0)
328: return(-1);
329: curlp = lp;
330: ct = braelist[i] - braslist[i];
331: while (backref(i, lp))
332: lp += ct;
333: while (lp >= curlp) {
334: if (rv = advance(lp, ep))
335: return(rv);
336: lp -= ct;
337: }
338: continue;
339:
340: case CDOT|CSTAR:
341: curlp = lp;
342: while (*lp++)
343: ;
344: goto star;
345:
346: case CCHR|CSTAR:
347: curlp = lp;
348: while (*lp++ == *ep)
349: ;
350: ep++;
351: goto star;
352:
353: case CCL|CSTAR:
354: case NCCL|CSTAR:
355: curlp = lp;
356: while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
357: ;
358: ep += *ep;
359: goto star;
360:
361: star:
362: do {
363: lp--;
364: if (rv = advance(lp, ep))
365: return(rv);
366: } while (lp > curlp);
367: return(0);
368:
369: default:
370: return(-1);
371: }
372: }
373:
374: backref(i, lp)
375: register int i;
376: register char *lp;
377: {
378: register char *bp;
379:
380: bp = braslist[i];
381: while (*bp++ == *lp++)
382: if (bp >= braelist[i])
383: return(1);
384: return(0);
385: }
386:
387: int
388: cclass(set, c, af)
389: register char *set, c;
390: int af;
391: {
392: register int n;
393:
394: if (c == 0)
395: return(0);
396: n = *set++;
397: while (--n)
398: if (*set++ == c)
399: return(af);
400: return(! af);
401: }
Defined functions
Defined variables
circf
defined in line
97; used 3 times
sccsid
defined in line
8;
never used
Defined macros
CBACK
defined in line
89; used 2 times
CBRA
defined in line
81; used 2 times
CCHR
defined in line
82; used 4 times
CCL
defined in line
84; used 3 times
CDOL
defined in line
86; used 1 times
CDOT
defined in line
83; used 2 times
CEOF
defined in line
87; used 1 times
CKET
defined in line
88; used 2 times
CSTAR
defined in line
91; used 2 times
ESIZE
defined in line
93; used 4 times
NBRA
defined in line
94; used 6 times
NCCL
defined in line
85; used 2 times