1: #
2:
3: /*
4: * routines to do regular expression matching
5: *
6: * Entry points:
7: *
8: * re_comp(s)
9: * char *s;
10: * ... returns 0 if the string s was compiled successfully,
11: * a pointer to an error message otherwise.
12: * If passed 0 or a null string returns without changing
13: * the currently compiled re (see note 11 below).
14: *
15: * re_exec(s)
16: * char *s;
17: * ... returns 1 if the string s matches the last compiled regular
18: * expression,
19: * 0 if the string s failed to match the last compiled
20: * regular expression, and
21: * -1 if the compiled regular expression was invalid
22: * (indicating an internal error).
23: *
24: * The strings passed to both re_comp and re_exec may have trailing or
25: * embedded newline characters; they are terminated by nulls.
26: *
27: * The identity of the author of these routines is lost in antiquity;
28: * this is essentially the same as the re code in the original V6 ed.
29: *
30: * The regular expressions recognized are described below. This description
31: * is essentially the same as that for ed.
32: *
33: * A regular expression specifies a set of strings of characters.
34: * A member of this set of strings is said to be matched by
35: * the regular expression. In the following specification for
36: * regular expressions the word `character' means any character but NUL.
37: *
38: * 1. Any character except a special character matches itself.
39: * Special characters are the regular expression delimiter plus
40: * \ [ . and sometimes ^ * $.
41: * 2. A . matches any character.
42: * 3. A \ followed by any character except a digit or ( )
43: * matches that character.
44: * 4. A nonempty string s bracketed [s] (or [^s]) matches any
45: * character in (or not in) s. In s, \ has no special meaning,
46: * and ] may only appear as the first letter. A substring
47: * a-b, with a and b in ascending ASCII order, stands for
48: * the inclusive range of ASCII characters.
49: * 5. A regular expression of form 1-4 followed by * matches a
50: * sequence of 0 or more matches of the regular expression.
51: * 6. A regular expression, x, of form 1-8, bracketed \(x\)
52: * matches what x matches.
53: * 7. A \ followed by a digit n matches a copy of the string that the
54: * bracketed regular expression beginning with the nth \( matched.
55: * 8. A regular expression of form 1-8, x, followed by a regular
56: * expression of form 1-7, y matches a match for x followed by
57: * a match for y, with the x match being as long as possible
58: * while still permitting a y match.
59: * 9. A regular expression of form 1-8 preceded by ^ (or followed
60: * by $), is constrained to matches that begin at the left
61: * (or end at the right) end of a line.
62: * 10. A regular expression of form 1-9 picks out the longest among
63: * the leftmost matches in a line.
64: * 11. An empty regular expression stands for a copy of the last
65: * regular expression encountered.
66: */
67:
68: /*
69: * constants for re's
70: */
71: #define CBRA 1
72: #define CCHR 2
73: #define CDOT 4
74: #define CCL 6
75: #define NCCL 8
76: #define CDOL 10
77: #define CEOF 11
78: #define CKET 12
79: #define CBACK 18
80:
81: #define CSTAR 01
82:
83: #define ESIZE 512
84: #define NBRA 9
85:
86: static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
87: static char circf;
88:
89: /*
90: * compile the regular expression argument into a dfa
91: */
92: char *
93: re_comp(sp)
94: register char *sp;
95: {
96: register int c;
97: register char *ep = expbuf;
98: int cclcnt, numbra = 0;
99: char *lastep = 0;
100: char bracket[NBRA];
101: char *bracketp = &bracket[0];
102: static char *retoolong = "Regular expression too long";
103:
104: #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
105:
106: if (sp == 0 || *sp == '\0') {
107: if (*ep == 0)
108: return("No previous regular expression");
109: return(0);
110: }
111: if (*sp == '^') {
112: circf = 1;
113: sp++;
114: }
115: else
116: circf = 0;
117: for (;;) {
118: if (ep >= &expbuf[ESIZE])
119: comerr(retoolong);
120: if ((c = *sp++) == '\0') {
121: if (bracketp != bracket)
122: comerr("unmatched \\(");
123: *ep++ = CEOF;
124: *ep++ = 0;
125: return(0);
126: }
127: if (c != '*')
128: lastep = ep;
129: switch (c) {
130:
131: case '.':
132: *ep++ = CDOT;
133: continue;
134:
135: case '*':
136: if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
137: goto defchar;
138: *lastep |= CSTAR;
139: continue;
140:
141: case '$':
142: if (*sp != '\0')
143: goto defchar;
144: *ep++ = CDOL;
145: continue;
146:
147: case '[':
148: *ep++ = CCL;
149: *ep++ = 0;
150: cclcnt = 1;
151: if ((c = *sp++) == '^') {
152: c = *sp++;
153: ep[-2] = NCCL;
154: }
155: do {
156: if (c == '\0')
157: comerr("missing ]");
158: if (c == '-' && ep [-1] != 0) {
159: if ((c = *sp++) == ']') {
160: *ep++ = '-';
161: cclcnt++;
162: break;
163: }
164: while (ep[-1] < c) {
165: *ep = ep[-1] + 1;
166: ep++;
167: cclcnt++;
168: if (ep >= &expbuf[ESIZE])
169: comerr(retoolong);
170: }
171: }
172: *ep++ = c;
173: cclcnt++;
174: if (ep >= &expbuf[ESIZE])
175: comerr(retoolong);
176: } while ((c = *sp++) != ']');
177: lastep[1] = cclcnt;
178: continue;
179:
180: case '\\':
181: if ((c = *sp++) == '(') {
182: if (numbra >= NBRA)
183: comerr("too many \\(\\) pairs");
184: *bracketp++ = numbra;
185: *ep++ = CBRA;
186: *ep++ = numbra++;
187: continue;
188: }
189: if (c == ')') {
190: if (bracketp <= bracket)
191: comerr("unmatched \\)");
192: *ep++ = CKET;
193: *ep++ = *--bracketp;
194: continue;
195: }
196: if (c >= '1' && c < ('1' + NBRA)) {
197: *ep++ = CBACK;
198: *ep++ = c - '1';
199: continue;
200: }
201: *ep++ = CCHR;
202: *ep++ = c;
203: continue;
204:
205: defchar:
206: default:
207: *ep++ = CCHR;
208: *ep++ = c;
209: }
210: }
211: }
212:
213: /*
214: * match the argument string against the compiled re
215: */
216: int
217: re_exec(p1)
218: register char *p1;
219: {
220: register char *p2 = expbuf;
221: register int c;
222: int rv;
223:
224: for (c = 0; c < NBRA; c++) {
225: braslist[c] = 0;
226: braelist[c] = 0;
227: }
228: if (circf)
229: return((advance(p1, p2)));
230: /*
231: * fast check for first character
232: */
233: if (*p2 == CCHR) {
234: c = p2[1];
235: do {
236: if (*p1 != c)
237: continue;
238: if (rv = advance(p1, p2))
239: return(rv);
240: } while (*p1++);
241: return(0);
242: }
243: /*
244: * regular algorithm
245: */
246: do
247: if (rv = advance(p1, p2))
248: return(rv);
249: while (*p1++);
250: return(0);
251: }
252:
253: /*
254: * try to match the next thing in the dfa
255: */
256: static int
257: advance(lp, ep)
258: register char *lp, *ep;
259: {
260: register char *curlp;
261: int ct, i;
262: int rv;
263:
264: for (;;)
265: switch (*ep++) {
266:
267: case CCHR:
268: if (*ep++ == *lp++)
269: continue;
270: return(0);
271:
272: case CDOT:
273: if (*lp++)
274: continue;
275: return(0);
276:
277: case CDOL:
278: if (*lp == '\0')
279: continue;
280: return(0);
281:
282: case CEOF:
283: return(1);
284:
285: case CCL:
286: if (cclass(ep, *lp++, 1)) {
287: ep += *ep;
288: continue;
289: }
290: return(0);
291:
292: case NCCL:
293: if (cclass(ep, *lp++, 0)) {
294: ep += *ep;
295: continue;
296: }
297: return(0);
298:
299: case CBRA:
300: braslist[*ep++] = lp;
301: continue;
302:
303: case CKET:
304: braelist[*ep++] = lp;
305: continue;
306:
307: case CBACK:
308: if (braelist[i = *ep++] == 0)
309: return(-1);
310: if (backref(i, lp)) {
311: lp += braelist[i] - braslist[i];
312: continue;
313: }
314: return(0);
315:
316: case CBACK|CSTAR:
317: if (braelist[i = *ep++] == 0)
318: return(-1);
319: curlp = lp;
320: ct = braelist[i] - braslist[i];
321: while (backref(i, lp))
322: lp += ct;
323: while (lp >= curlp) {
324: if (rv = advance(lp, ep))
325: return(rv);
326: lp -= ct;
327: }
328: continue;
329:
330: case CDOT|CSTAR:
331: curlp = lp;
332: while (*lp++)
333: ;
334: goto star;
335:
336: case CCHR|CSTAR:
337: curlp = lp;
338: while (*lp++ == *ep)
339: ;
340: ep++;
341: goto star;
342:
343: case CCL|CSTAR:
344: case NCCL|CSTAR:
345: curlp = lp;
346: while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
347: ;
348: ep += *ep;
349: goto star;
350:
351: star:
352: do {
353: lp--;
354: if (rv = advance(lp, ep))
355: return(rv);
356: } while (lp > curlp);
357: return(0);
358:
359: default:
360: return(-1);
361: }
362: }
363:
364: backref(i, lp)
365: register int i;
366: register char *lp;
367: {
368: register char *bp;
369:
370: bp = braslist[i];
371: while (*bp++ == *lp++)
372: if (bp >= braelist[i])
373: return(1);
374: return(0);
375: }
376:
377: int
378: cclass(set, c, af)
379: register char *set, c;
380: int af;
381: {
382: register int n;
383:
384: if (c == 0)
385: return(0);
386: n = *set++;
387: while (--n)
388: if (*set++ == c)
389: return(af);
390: return(! af);
391: }
Defined functions
Defined variables
circf
defined in line
87; used 3 times
Defined macros
CBACK
defined in line
79; used 2 times
CBRA
defined in line
71; used 2 times
CCHR
defined in line
72; used 4 times
CCL
defined in line
74; used 3 times
CDOL
defined in line
76; used 1 times
CDOT
defined in line
73; used 2 times
CEOF
defined in line
77; used 1 times
CKET
defined in line
78; used 2 times
CSTAR
defined in line
81; used 2 times
ESIZE
defined in line
83; used 4 times
NBRA
defined in line
84; used 6 times
NCCL
defined in line
75; used 2 times