#ifndef lint static char *sccsid="@(#)regexp.c 1.2 (LBL) 4/12/85"; #endif /* * regular expression matching routines for tgrind/tfontedpr. * * These routines were written by Dave Presotto (I think) for vgrind. * Minor mods & attempts to improve performance by Van Jacobson (van@lbl-rtsg) * and Chris Torek (chris@maryland). * * Modifications. * -------------- * 30Mar85 Van & Chris Changed expmatch to return pointer to start of what * was matched in addition to pointer to match end. * Several changes to improve performance (too numerous * to mention). * 11Dec84 Dave Presotto Written. */ #include typedef int boolean; #define TRUE 1 #define FALSE 0 #define NIL 0 #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) int (*re_strncmp)(); /* function used by expmatch to compare * strings. The caller should make it point to * strncmp if case is significant & * lc_strncmp otherwise. */ /* lc_strncmp - like strncmp except that we convert the * first string to lower case before comparing. */ lc_strncmp(s1, s2, len) register char *s1,*s2; register int len; { while (len-- > 0) if (*s2 - makelower(*s1)) return(1); else s2++, s1++; return(0); } /* The following routine converts an irregular expression to * internal format. * * Either meta symbols (\a \d or \p) or character strings or * operations ( alternation or perenthesizing ) can be * specified. Each starts with a descriptor byte. The descriptor * byte has STR set for strings, META set for meta symbols * and OPER set for operations. * The descriptor byte can also have the OPT bit set if the object * defined is optional. Also ALT can be set to indicate an alternation. * * For metasymbols the byte following the descriptor byte identities * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For * strings the byte after the descriptor is a character count for * the string: * * meta symbols := descriptor * symbol * * strings := descriptor * character count * the string * * operatins := descriptor * symbol * character count */ /* * handy macros for accessing parts of match blocks */ #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ #define MNEXT(A) (A+2) /* character following a metasymbol block */ #define OSYM(A) (*(A+1)) /* symbol in an operation block */ #define OCNT(A) (*(A+2)) /* character count */ #define ONEXT(A) (A+3) /* next character after the operation */ #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ #define SCNT(A) (*(A+1)) /* byte count of a string */ #define SSTR(A) (A+2) /* address of the string */ #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ /* * bit flags in the descriptor */ #define OPT 1 #define STR 2 #define META 4 #define ALT 8 #define OPER 16 char *ure; /* pointer current position in unconverted exp */ char *ccre; /* pointer to current position in converted exp*/ char *malloc(); char * convexp(re) char *re; /* unconverted irregular expression */ { register char *cre; /* pointer to converted regular expression */ /* allocate room for the converted expression */ if (re == NIL) return (NIL); if (*re == '\0') return (NIL); cre = malloc (4 * strlen(re) + 3); ccre = cre; ure = re; /* start the conversion with a \a */ *cre = META | OPT; MSYM(cre) = 'a'; ccre = MNEXT(cre); /* start the conversion (its recursive) */ expconv (); *ccre = 0; return (cre); } expconv() { register char *cs; /* pointer to current symbol in converted exp */ register char c; /* character being processed */ register char *acs; /* pinter to last alternate */ register int temp; /* let the conversion begin */ acs = NIL; cs = NIL; while (*ure != NIL) { switch (c = *ure++) { case '\\': switch (c = *ure++) { /* escaped characters are just characters */ default: if (cs == NIL || (*cs & STR) == 0) { cs = ccre; *cs = STR; SCNT(cs) = 1; ccre += 2; } else SCNT(cs)++; *ccre++ = c; break; /* normal(?) metacharacters */ case 'a': case 'd': case 'e': case 'p': if (acs != NIL && acs != cs) { do { temp = OCNT(acs); OCNT(acs) = ccre - acs; acs -= temp; } while (temp != 0); acs = NIL; } cs = ccre; *cs = META; MSYM(cs) = c; ccre = MNEXT(cs); break; } break; /* just put the symbol in */ case '^': case '$': if (acs != NIL && acs != cs) { do { temp = OCNT(acs); OCNT(acs) = ccre - acs; acs -= temp; } while (temp != 0); acs = NIL; } cs = ccre; *cs = META; MSYM(cs) = c; ccre = MNEXT(cs); break; /* mark the last match sequence as optional */ case '?': if (cs) *cs = *cs | OPT; break; /* recurse and define a subexpression */ case '(': if (acs != NIL && acs != cs) { do { temp = OCNT(acs); OCNT(acs) = ccre - acs; acs -= temp; } while (temp != 0); acs = NIL; } cs = ccre; *cs = OPER; OSYM(cs) = '('; ccre = ONEXT(cs); expconv (); OCNT(cs) = ccre - cs; /* offset to next symbol */ break; /* return from a recursion */ case ')': if (acs != NIL) { do { temp = OCNT(acs); OCNT(acs) = ccre - acs; acs -= temp; } while (temp != 0); acs = NIL; } cs = ccre; *cs = META; MSYM(cs) = c; ccre = MNEXT(cs); return; /* mark the last match sequence as having an alternate */ /* the third byte will contain an offset to jump over the */ /* alternate match in case the first did not fail */ case '|': if (acs != NIL && acs != cs) OCNT(ccre) = ccre - acs; /* make a back pointer */ else OCNT(ccre) = 0; *cs |= ALT; cs = ccre; *cs = OPER; OSYM(cs) = '|'; ccre = ONEXT(cs); acs = cs; /* remember that the pointer is to be filles */ break; /* if its not a metasymbol just build a scharacter string */ default: if (cs == NIL || (*cs & STR) == 0) { cs = ccre; *cs = STR; SCNT(cs) = 1; ccre = SSTR(cs); } else SCNT(cs)++; *ccre++ = c; break; } } if (acs != NIL) { do { temp = OCNT(acs); OCNT(acs) = ccre - acs; acs -= temp; } while (temp != 0); acs = NIL; } return; } /* end of convertre */ /* * The following routine recognises an irregular expresion * with the following special characters: * * \? - means last match was optional * \a - matches any number of characters * \d - matches any number of spaces and tabs * \p - matches any number of alphanumeric * characters. The * characters matched will be copied into * the area pointed to by 'name'. * \| - alternation * \( \) - grouping used mostly for alternation and * optionality * * The irregular expression must be translated to internal form * prior to calling this routine * * The value returned is the pointer to the last character matched. * If strtptr is non-null, a pointer to the start of the string matched * (excluding \a matches) will be returned at *strtptr. * If mstring is non-null, the string matched by a \p will be copied * into mstring. */ boolean _escaped; /* true if we are currently _escaped */ char *_start; /* start of string */ char * expmatch (s, re, strtptr, mstring) register char *s; /* string to check for a match in */ register char *re; /* a converted irregular expression */ char **strtptr; /* where to put ptr to start of match */ char *mstring; /* where to put whatever matches a \p */ { register char *cs; /* the current symbol */ register char *ptr, *s1; /* temporary pointer */ register char c; /* temporary */ boolean matched; /* a temporary boolean */ /* initial conditions */ if ( strtptr ) *strtptr = NIL; if (re == NIL) return (NIL); cs = re; matched = FALSE; /* loop till expression string is exhausted (or at least pretty tired) */ while (*cs) { switch (*cs & (OPER | STR | META)) { /* try to match a string */ case STR: matched = !((*re_strncmp)(s, SSTR(cs), SCNT(cs))); if (matched) { /* hoorah it matches */ s += SCNT(cs); cs = SNEXT(cs); } else if (*cs & ALT) { /* alternation, skip to next expression */ cs = SNEXT(cs); } else if (*cs & OPT) { /* the match is optional */ cs = SNEXT(cs); matched = 1; /* indicate a successful match */ } else { /* no match, error return */ return (NIL); } break; /* an operator, do something fancy */ case OPER: switch (OSYM(cs)) { /* this is an alternation */ case '|': if (matched) /* last thing in the alternation was a match, skip ahead */ cs = OPTR(cs); else /* no match, keep trying */ cs = ONEXT(cs); break; /* this is a grouping, recurse */ case '(': ptr = expmatch (s, ONEXT(cs), strtptr, mstring); if (ptr != NIL) { /* the subexpression matched */ matched = 1; s = ptr; } else if (*cs & ALT) { /* alternation, skip to next expression */ matched = 0; } else if (*cs & OPT) { /* the match is optional */ matched = 1; /* indicate a successful match */ } else { /* no match, error return */ return (NIL); } cs = OPTR(cs); break; } break; /* try to match a metasymbol */ case META: switch (MSYM(cs)) { /* try to match anything and remember what was matched */ case 'p': /* * This is really the same as trying the match the * remaining parts of the expression to any subset * of the string. */ s1 = s; do { ptr = expmatch (s1, MNEXT(cs), strtptr, mstring); if (ptr != NIL && s1 != s) { /* we have a match, remember the match */ if ( mstring ) { strncpy (mstring, s, s1 - s); mstring[s1 - s] = '\0'; } return (ptr); } else if (ptr != NIL && (*cs & OPT)) { /* \p was optional so no match is ok */ return (ptr); } else if (ptr != NIL) { /* not optional and we still matched */ return (NIL); } if (!isalnum(*s1) && *s1 != '_') return (NIL); if (*s1 == '\\') _escaped = _escaped ? FALSE : TRUE; else _escaped = FALSE; } while (*s1++); return (NIL); /* try to match anything */ case 'a': /* * This is really the same as trying the match the * remaining parts of the expression to any subset * of the string. */ s1 = s; do { /* * Hack for an important special case: if the next thing * in the pattern is a string, just gobble characters until * we find something that matches that string (this saves * the cost of a recursive call on expmatch while scanning * for the start of comments or strings). Since many * patterns end with a string, we also treat that as a * special case. */ if( *(ptr=MNEXT(cs)) == STR ) { c = *SSTR(ptr); while( *s1 && *s1 != c ) s1++; if ( *s1 == 0 ) return(NIL); if ( SNEXT(ptr) == 0 && (s1 != s || *cs & OPT)) { /* next item is a string, it's the last item and * the \a match is ok - just loop to try & match * the string. */ if ( strtptr ) *strtptr = s1; cs = ptr; s = s1; break; } } ptr = expmatch (s1, MNEXT(cs), strtptr, mstring); if (ptr != NIL && (s1 != s || *cs & OPT)) { /* we have a match */ if ( strtptr ) *strtptr = s1; return(ptr); } else if (ptr != NIL) { /* not optional and we still matched */ return (NIL); } if (*s1 == '\\') _escaped = _escaped ? FALSE : TRUE; else _escaped = FALSE; } while (*s1++); return (NIL); /* fail if we are currently _escaped */ case 'e': if (_escaped) return(NIL); cs = MNEXT(cs); break; /* match any number of tabs and spaces */ case 'd': ptr = s; while (*s == ' ' || *s == '\t') s++; if (s != ptr || s == _start) { /* match, be happy */ matched = 1; cs = MNEXT(cs); } else if (*s == '\n' || *s == '\0') { /* match, be happy */ matched = 1; cs = MNEXT(cs); } else if (*cs & ALT) { /* try the next part */ matched = 0; cs = MNEXT(cs); } else if (*cs & OPT) { /* doesn't matter */ matched = 1; cs = MNEXT(cs); } else /* no match, error return */ return (NIL); break; /* check for end of line */ case '$': if (*s == '\0' || *s == '\n') { /* match, be happy */ s++; matched = 1; cs = MNEXT(cs); } else if (*cs & ALT) { /* try the next part */ matched = 0; cs = MNEXT(cs); } else if (*cs & OPT) { /* doesn't matter */ matched = 1; cs = MNEXT(cs); } else /* no match, error return */ return (NIL); break; /* check for start of line */ case '^': if (s == _start) { /* match, be happy */ matched = 1; cs = MNEXT(cs); } else if (*cs & ALT) { /* try the next part */ matched = 0; cs = MNEXT(cs); } else if (*cs & OPT) { /* doesn't matter */ matched = 1; cs = MNEXT(cs); } else /* no match, error return */ return (NIL); break; /* end of a subexpression, return success */ case ')': return (s); } break; } } return (s); }