1: /*
2: * Copyright (c) 1980 Regents of the University of California.
3: * All rights reserved. The Berkeley software License Agreement
4: * specifies the terms and conditions for redistribution.
5: */
6:
7: #ifndef lint
8: static char sccsid[] = "@(#)regexp.c 5.1 (Berkeley) 6/5/85";
9: #endif not lint
10:
11: #include <ctype.h>
12:
13: typedef int boolean;
14: #define TRUE 1
15: #define FALSE 0
16: #define NIL 0
17:
18: boolean l_onecase; /* true if upper and lower equivalent */
19:
20: #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
21:
22: /* STRNCMP - like strncmp except that we convert the
23: * first string to lower case before comparing
24: * if l_onecase is set.
25: */
26:
27: STRNCMP(s1, s2, len)
28: register char *s1,*s2;
29: register int len;
30: {
31: if (l_onecase) {
32: do
33: if (*s2 - makelower(*s1))
34: return (*s2 - makelower(*s1));
35: else {
36: s2++;
37: s1++;
38: }
39: while (--len);
40: } else {
41: do
42: if (*s2 - *s1)
43: return (*s2 - *s1);
44: else {
45: s2++;
46: s1++;
47: }
48: while (--len);
49: }
50: return(0);
51: }
52:
53: /* The following routine converts an irregular expression to
54: * internal format.
55: *
56: * Either meta symbols (\a \d or \p) or character strings or
57: * operations ( alternation or perenthesizing ) can be
58: * specified. Each starts with a descriptor byte. The descriptor
59: * byte has STR set for strings, META set for meta symbols
60: * and OPER set for operations.
61: * The descriptor byte can also have the OPT bit set if the object
62: * defined is optional. Also ALT can be set to indicate an alternation.
63: *
64: * For metasymbols the byte following the descriptor byte identities
65: * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
66: * strings the byte after the descriptor is a character count for
67: * the string:
68: *
69: * meta symbols := descriptor
70: * symbol
71: *
72: * strings := descriptor
73: * character count
74: * the string
75: *
76: * operatins := descriptor
77: * symbol
78: * character count
79: */
80:
81: /*
82: * handy macros for accessing parts of match blocks
83: */
84: #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
85: #define MNEXT(A) (A+2) /* character following a metasymbol block */
86:
87: #define OSYM(A) (*(A+1)) /* symbol in an operation block */
88: #define OCNT(A) (*(A+2)) /* character count */
89: #define ONEXT(A) (A+3) /* next character after the operation */
90: #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
91:
92: #define SCNT(A) (*(A+1)) /* byte count of a string */
93: #define SSTR(A) (A+2) /* address of the string */
94: #define SNEXT(A) (A+2+*(A+1)) /* character following the string */
95:
96: /*
97: * bit flags in the descriptor
98: */
99: #define OPT 1
100: #define STR 2
101: #define META 4
102: #define ALT 8
103: #define OPER 16
104:
105: char *ure; /* pointer current position in unconverted exp */
106: char *ccre; /* pointer to current position in converted exp*/
107: char *malloc();
108:
109: char *
110: convexp(re)
111: char *re; /* unconverted irregular expression */
112: {
113: register char *cre; /* pointer to converted regular expression */
114:
115: /* allocate room for the converted expression */
116: if (re == NIL)
117: return (NIL);
118: if (*re == '\0')
119: return (NIL);
120: cre = malloc (4 * strlen(re) + 3);
121: ccre = cre;
122: ure = re;
123:
124: /* start the conversion with a \a */
125: *cre = META | OPT;
126: MSYM(cre) = 'a';
127: ccre = MNEXT(cre);
128:
129: /* start the conversion (its recursive) */
130: expconv ();
131: *ccre = 0;
132: return (cre);
133: }
134:
135: expconv()
136: {
137: register char *cs; /* pointer to current symbol in converted exp */
138: register char c; /* character being processed */
139: register char *acs; /* pinter to last alternate */
140: register int temp;
141:
142: /* let the conversion begin */
143: acs = NIL;
144: cs = NIL;
145: while (*ure != NIL) {
146: switch (c = *ure++) {
147:
148: case '\\':
149: switch (c = *ure++) {
150:
151: /* escaped characters are just characters */
152: default:
153: if (cs == NIL || (*cs & STR) == 0) {
154: cs = ccre;
155: *cs = STR;
156: SCNT(cs) = 1;
157: ccre += 2;
158: } else
159: SCNT(cs)++;
160: *ccre++ = c;
161: break;
162:
163: /* normal(?) metacharacters */
164: case 'a':
165: case 'd':
166: case 'e':
167: case 'p':
168: if (acs != NIL && acs != cs) {
169: do {
170: temp = OCNT(acs);
171: OCNT(acs) = ccre - acs;
172: acs -= temp;
173: } while (temp != 0);
174: acs = NIL;
175: }
176: cs = ccre;
177: *cs = META;
178: MSYM(cs) = c;
179: ccre = MNEXT(cs);
180: break;
181: }
182: break;
183:
184: /* just put the symbol in */
185: case '^':
186: case '$':
187: if (acs != NIL && acs != cs) {
188: do {
189: temp = OCNT(acs);
190: OCNT(acs) = ccre - acs;
191: acs -= temp;
192: } while (temp != 0);
193: acs = NIL;
194: }
195: cs = ccre;
196: *cs = META;
197: MSYM(cs) = c;
198: ccre = MNEXT(cs);
199: break;
200:
201: /* mark the last match sequence as optional */
202: case '?':
203: if (cs)
204: *cs = *cs | OPT;
205: break;
206:
207: /* recurse and define a subexpression */
208: case '(':
209: if (acs != NIL && acs != cs) {
210: do {
211: temp = OCNT(acs);
212: OCNT(acs) = ccre - acs;
213: acs -= temp;
214: } while (temp != 0);
215: acs = NIL;
216: }
217: cs = ccre;
218: *cs = OPER;
219: OSYM(cs) = '(';
220: ccre = ONEXT(cs);
221: expconv ();
222: OCNT(cs) = ccre - cs; /* offset to next symbol */
223: break;
224:
225: /* return from a recursion */
226: case ')':
227: if (acs != NIL) {
228: do {
229: temp = OCNT(acs);
230: OCNT(acs) = ccre - acs;
231: acs -= temp;
232: } while (temp != 0);
233: acs = NIL;
234: }
235: cs = ccre;
236: *cs = META;
237: MSYM(cs) = c;
238: ccre = MNEXT(cs);
239: return;
240:
241: /* mark the last match sequence as having an alternate */
242: /* the third byte will contain an offset to jump over the */
243: /* alternate match in case the first did not fail */
244: case '|':
245: if (acs != NIL && acs != cs)
246: OCNT(ccre) = ccre - acs; /* make a back pointer */
247: else
248: OCNT(ccre) = 0;
249: *cs |= ALT;
250: cs = ccre;
251: *cs = OPER;
252: OSYM(cs) = '|';
253: ccre = ONEXT(cs);
254: acs = cs; /* remember that the pointer is to be filles */
255: break;
256:
257: /* if its not a metasymbol just build a scharacter string */
258: default:
259: if (cs == NIL || (*cs & STR) == 0) {
260: cs = ccre;
261: *cs = STR;
262: SCNT(cs) = 1;
263: ccre = SSTR(cs);
264: } else
265: SCNT(cs)++;
266: *ccre++ = c;
267: break;
268: }
269: }
270: if (acs != NIL) {
271: do {
272: temp = OCNT(acs);
273: OCNT(acs) = ccre - acs;
274: acs -= temp;
275: } while (temp != 0);
276: acs = NIL;
277: }
278: return;
279: }
280: /* end of convertre */
281:
282:
283: /*
284: * The following routine recognises an irregular expresion
285: * with the following special characters:
286: *
287: * \? - means last match was optional
288: * \a - matches any number of characters
289: * \d - matches any number of spaces and tabs
290: * \p - matches any number of alphanumeric
291: * characters. The
292: * characters matched will be copied into
293: * the area pointed to by 'name'.
294: * \| - alternation
295: * \( \) - grouping used mostly for alternation and
296: * optionality
297: *
298: * The irregular expression must be translated to internal form
299: * prior to calling this routine
300: *
301: * The value returned is the pointer to the first non \a
302: * character matched.
303: */
304:
305: boolean _escaped; /* true if we are currently _escaped */
306: char *_start; /* start of string */
307:
308: char *
309: expmatch (s, re, mstring)
310: register char *s; /* string to check for a match in */
311: register char *re; /* a converted irregular expression */
312: register char *mstring; /* where to put whatever matches a \p */
313: {
314: register char *cs; /* the current symbol */
315: register char *ptr,*s1; /* temporary pointer */
316: boolean matched; /* a temporary boolean */
317:
318: /* initial conditions */
319: if (re == NIL)
320: return (NIL);
321: cs = re;
322: matched = FALSE;
323:
324: /* loop till expression string is exhausted (or at least pretty tired) */
325: while (*cs) {
326: switch (*cs & (OPER | STR | META)) {
327:
328: /* try to match a string */
329: case STR:
330: matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
331: if (matched) {
332:
333: /* hoorah it matches */
334: s += SCNT(cs);
335: cs = SNEXT(cs);
336: } else if (*cs & ALT) {
337:
338: /* alternation, skip to next expression */
339: cs = SNEXT(cs);
340: } else if (*cs & OPT) {
341:
342: /* the match is optional */
343: cs = SNEXT(cs);
344: matched = 1; /* indicate a successful match */
345: } else {
346:
347: /* no match, error return */
348: return (NIL);
349: }
350: break;
351:
352: /* an operator, do something fancy */
353: case OPER:
354: switch (OSYM(cs)) {
355:
356: /* this is an alternation */
357: case '|':
358: if (matched)
359:
360: /* last thing in the alternation was a match, skip ahead */
361: cs = OPTR(cs);
362: else
363:
364: /* no match, keep trying */
365: cs = ONEXT(cs);
366: break;
367:
368: /* this is a grouping, recurse */
369: case '(':
370: ptr = expmatch (s, ONEXT(cs), mstring);
371: if (ptr != NIL) {
372:
373: /* the subexpression matched */
374: matched = 1;
375: s = ptr;
376: } else if (*cs & ALT) {
377:
378: /* alternation, skip to next expression */
379: matched = 0;
380: } else if (*cs & OPT) {
381:
382: /* the match is optional */
383: matched = 1; /* indicate a successful match */
384: } else {
385:
386: /* no match, error return */
387: return (NIL);
388: }
389: cs = OPTR(cs);
390: break;
391: }
392: break;
393:
394: /* try to match a metasymbol */
395: case META:
396: switch (MSYM(cs)) {
397:
398: /* try to match anything and remember what was matched */
399: case 'p':
400: /*
401: * This is really the same as trying the match the
402: * remaining parts of the expression to any subset
403: * of the string.
404: */
405: s1 = s;
406: do {
407: ptr = expmatch (s1, MNEXT(cs), mstring);
408: if (ptr != NIL && s1 != s) {
409:
410: /* we have a match, remember the match */
411: strncpy (mstring, s, s1 - s);
412: mstring[s1 - s] = '\0';
413: return (ptr);
414: } else if (ptr != NIL && (*cs & OPT)) {
415:
416: /* it was aoptional so no match is ok */
417: return (ptr);
418: } else if (ptr != NIL) {
419:
420: /* not optional and we still matched */
421: return (NIL);
422: }
423: if (!isalnum(*s1) && *s1 != '_')
424: return (NIL);
425: if (*s1 == '\\')
426: _escaped = _escaped ? FALSE : TRUE;
427: else
428: _escaped = FALSE;
429: } while (*s1++);
430: return (NIL);
431:
432: /* try to match anything */
433: case 'a':
434: /*
435: * This is really the same as trying the match the
436: * remaining parts of the expression to any subset
437: * of the string.
438: */
439: s1 = s;
440: do {
441: ptr = expmatch (s1, MNEXT(cs), mstring);
442: if (ptr != NIL && s1 != s) {
443:
444: /* we have a match */
445: return (ptr);
446: } else if (ptr != NIL && (*cs & OPT)) {
447:
448: /* it was aoptional so no match is ok */
449: return (ptr);
450: } else if (ptr != NIL) {
451:
452: /* not optional and we still matched */
453: return (NIL);
454: }
455: if (*s1 == '\\')
456: _escaped = _escaped ? FALSE : TRUE;
457: else
458: _escaped = FALSE;
459: } while (*s1++);
460: return (NIL);
461:
462: /* fail if we are currently _escaped */
463: case 'e':
464: if (_escaped)
465: return(NIL);
466: cs = MNEXT(cs);
467: break;
468:
469: /* match any number of tabs and spaces */
470: case 'd':
471: ptr = s;
472: while (*s == ' ' || *s == '\t')
473: s++;
474: if (s != ptr || s == _start) {
475:
476: /* match, be happy */
477: matched = 1;
478: cs = MNEXT(cs);
479: } else if (*s == '\n' || *s == '\0') {
480:
481: /* match, be happy */
482: matched = 1;
483: cs = MNEXT(cs);
484: } else if (*cs & ALT) {
485:
486: /* try the next part */
487: matched = 0;
488: cs = MNEXT(cs);
489: } else if (*cs & OPT) {
490:
491: /* doesn't matter */
492: matched = 1;
493: cs = MNEXT(cs);
494: } else
495:
496: /* no match, error return */
497: return (NIL);
498: break;
499:
500: /* check for end of line */
501: case '$':
502: if (*s == '\0' || *s == '\n') {
503:
504: /* match, be happy */
505: s++;
506: matched = 1;
507: cs = MNEXT(cs);
508: } else if (*cs & ALT) {
509:
510: /* try the next part */
511: matched = 0;
512: cs = MNEXT(cs);
513: } else if (*cs & OPT) {
514:
515: /* doesn't matter */
516: matched = 1;
517: cs = MNEXT(cs);
518: } else
519:
520: /* no match, error return */
521: return (NIL);
522: break;
523:
524: /* check for start of line */
525: case '^':
526: if (s == _start) {
527:
528: /* match, be happy */
529: matched = 1;
530: cs = MNEXT(cs);
531: } else if (*cs & ALT) {
532:
533: /* try the next part */
534: matched = 0;
535: cs = MNEXT(cs);
536: } else if (*cs & OPT) {
537:
538: /* doesn't matter */
539: matched = 1;
540: cs = MNEXT(cs);
541: } else
542:
543: /* no match, error return */
544: return (NIL);
545: break;
546:
547: /* end of a subexpression, return success */
548: case ')':
549: return (s);
550: }
551: break;
552: }
553: }
554: return (s);
555: }
Defined functions
Defined variables
ccre
defined in line
106; used 28 times
sccsid
defined in line
8;
never used
ure
defined in line
105; used 4 times
Defined typedef's
Defined macros
ALT
defined in line
102; used 6 times
FALSE
defined in line
15; used 5 times
META
defined in line
101; used 5 times
MNEXT
defined in line
85; used 17 times
MSYM
defined in line
84; used 5 times
NIL
defined in line
16; used 39 times
- in line 116-119(3),
143-145(3),
153,
168,
174,
187,
193,
209,
215,
227,
233,
245,
259,
270,
276,
319-320(2),
348,
371,
387,
408,
414-424(4),
430,
442-453(4),
460-465(2),
497,
521,
544
OCNT
defined in line
88; used 13 times
ONEXT
defined in line
89; used 4 times
OPER
defined in line
103; used 3 times
OPT
defined in line
99; used 9 times
OPTR
defined in line
90; used 2 times
OSYM
defined in line
87; used 3 times
SCNT
defined in line
92; used 6 times
SNEXT
defined in line
94; used 3 times
SSTR
defined in line
93; used 2 times
STR
defined in line
100; used 5 times
TRUE
defined in line
14; used 2 times