2 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
3 * Copyright (c) 1992, 1993, 1994
4 * The Regents of the University of California. All rights reserved.
6 * This code is derived from software contributed to Berkeley by
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the University of
20 * California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
39 * $FreeBSD: src/lib/libc/regex/regcomp.c,v 1.13.2.1 2000/07/31 06:30:37 dcs Exp $
42 #if defined(LIBC_SCCS) && !defined(lint)
43 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
44 #endif /* LIBC_SCCS and not lint */
46 #include <sys/types.h>
54 // removed collate stuff --SDM
55 // #include "collate.h"
64 * parse structure, passed up and down to avoid global variables and
68 char *next; /* next character in RE */
69 char *end; /* end of string (-> NUL normally) */
70 int error; /* has an error been seen? */
71 sop *strip; /* malloced strip */
72 sopno ssize; /* malloced strip size (allocated) */
73 sopno slen; /* malloced strip length (used) */
74 int ncsalloc; /* number of csets allocated */
76 # define NPAREN 10 /* we need to remember () 1-9 for back refs */
77 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
78 sopno pend[NPAREN]; /* -> ) ([0] unused) */
81 /* ========= begin header generated by ./mkh ========= */
86 /* === regcomp.c === */
87 static void p_ere __P((struct parse *p, int stop));
88 static void p_ere_exp __P((struct parse *p));
89 static void p_str __P((struct parse *p));
90 static void p_bre __P((struct parse *p, int end1, int end2));
91 static int p_simp_re __P((struct parse *p, int starordinary));
92 static int p_count __P((struct parse *p));
93 static void p_bracket __P((struct parse *p));
94 static void p_b_term __P((struct parse *p, cset *cs));
95 static void p_b_cclass __P((struct parse *p, cset *cs));
96 static void p_b_eclass __P((struct parse *p, cset *cs));
97 static char p_b_symbol __P((struct parse *p));
98 static char p_b_coll_elem __P((struct parse *p, int endc));
99 static char othercase __P((int ch));
100 static void bothcases __P((struct parse *p, int ch));
101 static void ordinary __P((struct parse *p, int ch));
102 static void nonnewline __P((struct parse *p));
103 static void repeat __P((struct parse *p, sopno start, int from, int to));
104 static int seterr __P((struct parse *p, int e));
105 static cset *allocset __P((struct parse *p));
106 static void freeset __P((struct parse *p, cset *cs));
107 static int freezeset __P((struct parse *p, cset *cs));
108 static int firstch __P((struct parse *p, cset *cs));
109 static int nch __P((struct parse *p, cset *cs));
110 static void mcadd __P((struct parse *p, cset *cs, char *cp));
112 static void mcsub __P((cset *cs, char *cp));
113 static int mcin __P((cset *cs, char *cp));
114 static char *mcfind __P((cset *cs, char *cp));
116 static void mcinvert __P((struct parse *p, cset *cs));
117 static void mccase __P((struct parse *p, cset *cs));
118 static int isinsets __P((struct re_guts *g, int c));
119 static int samesets __P((struct re_guts *g, int c1, int c2));
120 static void categorize __P((struct parse *p, struct re_guts *g));
121 static sopno dupl __P((struct parse *p, sopno start, sopno finish));
122 static void doemit __P((struct parse *p, sop op, size_t opnd));
123 static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos));
124 static void dofwd __P((struct parse *p, sopno pos, sop value));
125 static void enlarge __P((struct parse *p, sopno size));
126 static void stripsnug __P((struct parse *p, struct re_guts *g));
127 static void findmust __P((struct parse *p, struct re_guts *g));
128 static int altoffset __P((sop *scan, int offset, int mccs));
129 static void computejumps __P((struct parse *p, struct re_guts *g));
130 static void computematchjumps __P((struct parse *p, struct re_guts *g));
131 static sopno pluscount __P((struct parse *p, struct re_guts *g));
136 /* ========= end header generated by ./mkh ========= */
138 static char nuls[10]; /* place to point scanner in event of error */
141 * macros for use with parse structure
142 * BEWARE: these know that the parse structure is named `p' !!!
144 #define PEEK() (*p->next)
145 #define PEEK2() (*(p->next+1))
146 #define MORE() (p->next < p->end)
147 #define MORE2() (p->next+1 < p->end)
148 #define SEE(c) (MORE() && PEEK() == (c))
149 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
150 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
151 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
152 #define NEXT() (p->next++)
153 #define NEXT2() (p->next += 2)
154 #define NEXTn(n) (p->next += (n))
155 #define GETNEXT() (*p->next++)
156 #define SETERROR(e) seterr(p, (e))
157 #define REQUIRE(co, e) ((co) || SETERROR(e))
158 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
159 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
160 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
161 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
162 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
163 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
164 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
165 #define HERE() (p->slen)
166 #define THERE() (p->slen - 1)
167 #define THERETHERE() (p->slen - 2)
168 #define DROP(n) (p->slen -= (n))
171 static int never = 0; /* for use in asserts; shuts lint up */
173 #define never 0 /* some <assert.h>s have bugs too */
176 /* Macro used by computejump()/computematchjump() */
177 #define MIN(a,b) ((a)<(b)?(a):(b))
180 - regcomp - interface for parser and compilation
181 = extern int regcomp(regex_t *, const char *, int);
182 = #define REG_BASIC 0000
183 = #define REG_EXTENDED 0001
184 = #define REG_ICASE 0002
185 = #define REG_NOSUB 0004
186 = #define REG_NEWLINE 0010
187 = #define REG_NOSPEC 0020
188 = #define REG_PEND 0040
189 = #define REG_DUMP 0200
191 int /* 0 success, otherwise REG_something */
192 regcomp(preg, pattern, cflags)
198 register struct re_guts *g;
199 register struct parse *p = &pa;
203 # define GOODFLAGS(f) (f)
205 # define GOODFLAGS(f) ((f)&~REG_DUMP)
208 cflags = GOODFLAGS(cflags);
209 if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
212 if (cflags®_PEND) {
213 if (preg->re_endp < pattern)
215 len = preg->re_endp - pattern;
217 len = strlen((char *)pattern);
219 /* do the mallocs early so failure handling is easy */
220 g = (struct re_guts *)malloc(sizeof(struct re_guts) +
221 (NC-1)*sizeof(cat_t));
224 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
225 p->strip = (sop *)malloc(p->ssize * sizeof(sop));
227 if (p->strip == NULL) {
234 p->next = (char *)pattern; /* convenience; we do not modify it */
235 p->end = p->next + len;
238 for (i = 0; i < NPAREN; i++) {
256 g->ncategories = 1; /* category 0 is "everything else" */
257 g->categories = &g->catspace[-(CHAR_MIN)];
258 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
263 g->firststate = THERE();
264 if (cflags®_EXTENDED)
266 else if (cflags®_NOSPEC)
271 g->laststate = THERE();
273 /* tidy up loose ends and fill things in */
277 /* only use Boyer-Moore algorithm if the pattern is bigger
278 * than three characters
282 computematchjumps(p, g);
283 if(g->matchjump == NULL && g->charjump != NULL) {
288 g->nplus = pluscount(p, g);
290 preg->re_nsub = g->nsub;
292 preg->re_magic = MAGIC1;
294 /* not debugging, so can't rely on the assert() in regexec() */
296 SETERROR(REG_ASSERT);
299 /* win or lose, we're done */
300 if (p->error != 0) /* lose */
306 - p_ere - ERE parser top level, concatenation and alternation
307 == static void p_ere(register struct parse *p, int stop);
311 register struct parse *p;
312 int stop; /* character this ERE should end at */
315 register sopno prevback;
316 register sopno prevfwd;
318 register int first = 1; /* is this the first alternative? */
321 /* do a bunch of concatenated expressions */
323 while (MORE() && (c = PEEK()) != '|' && c != stop)
325 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
328 break; /* NOTE BREAK OUT */
331 INSERT(OCH_, conc); /* offset is wrong */
336 ASTERN(OOR1, prevback);
338 AHEAD(prevfwd); /* fix previous offset */
340 EMIT(OOR2, 0); /* offset is very wrong */
343 if (!first) { /* tail-end fixups */
345 ASTERN(O_CH, prevback);
348 assert(!MORE() || SEE(stop));
352 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
353 == static void p_ere_exp(register struct parse *p);
357 register struct parse *p;
363 register sopno subno;
366 assert(MORE()); /* caller should have ensured this */
372 (void)REQUIRE(MORE(), REG_EPAREN);
376 p->pbegin[subno] = HERE();
377 EMIT(OLPAREN, subno);
380 if (subno < NPAREN) {
381 p->pend[subno] = HERE();
382 assert(p->pend[subno] != 0);
384 EMIT(ORPAREN, subno);
385 (void)MUSTEAT(')', REG_EPAREN);
387 #ifndef POSIX_MISTAKE
388 case ')': /* happens only if no current unmatched ( */
390 * You may ask, why the ifndef? Because I didn't notice
391 * this until slightly too late for 1003.2, and none of the
392 * other 1003.2 regular-expression reviewers noticed it at
393 * all. So an unmatched ) is legal POSIX, at least until
394 * we can get it fixed.
396 SETERROR(REG_EPAREN);
401 p->g->iflags |= USEBOL;
407 p->g->iflags |= USEEOL;
416 SETERROR(REG_BADRPT);
419 if (p->g->cflags®_NEWLINE)
428 (void)REQUIRE(MORE(), REG_EESCAPE);
432 case '{': /* okay as ordinary except if digit follows */
433 (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
443 /* we call { a repetition if followed by a digit */
444 if (!( c == '*' || c == '+' || c == '?' ||
445 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
446 return; /* no repetition, we're done */
449 (void)REQUIRE(!wascaret, REG_BADRPT);
451 case '*': /* implemented as +? */
452 /* this case does not require the (y|) trick, noKLUDGE */
455 INSERT(OQUEST_, pos);
456 ASTERN(O_QUEST, pos);
463 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
464 INSERT(OCH_, pos); /* offset slightly wrong */
465 ASTERN(OOR1, pos); /* this one's right */
466 AHEAD(pos); /* fix the OCH_ */
467 EMIT(OOR2, 0); /* offset very wrong... */
468 AHEAD(THERE()); /* ...so fix it */
469 ASTERN(O_CH, THERETHERE());
474 if (isdigit((uch)PEEK())) {
476 (void)REQUIRE(count <= count2, REG_BADBR);
477 } else /* single number with comma */
479 } else /* just a single number */
481 repeat(p, pos, count, count2);
482 if (!EAT('}')) { /* error heuristics */
483 while (MORE() && PEEK() != '}')
485 (void)REQUIRE(MORE(), REG_EBRACE);
494 if (!( c == '*' || c == '+' || c == '?' ||
495 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
497 SETERROR(REG_BADRPT);
501 - p_str - string (no metacharacters) "parser"
502 == static void p_str(register struct parse *p);
506 register struct parse *p;
508 (void)REQUIRE(MORE(), REG_EMPTY);
510 ordinary(p, GETNEXT());
514 - p_bre - BRE parser top level, anchoring and concatenation
515 == static void p_bre(register struct parse *p, register int end1, \
516 == register int end2);
517 * Giving end1 as OUT essentially eliminates the end1/end2 check.
519 * This implementation is a bit of a kludge, in that a trailing $ is first
520 * taken as an ordinary character and then revised to be an anchor. The
521 * only undesirable side effect is that '$' gets included as a character
522 * category in such cases. This is fairly harmless; not worth fixing.
523 * The amount of lookahead needed to avoid this kludge is excessive.
527 register struct parse *p;
528 register int end1; /* first terminating character */
529 register int end2; /* second terminating character */
531 register sopno start = HERE();
532 register int first = 1; /* first subexpression? */
533 register int wasdollar = 0;
537 p->g->iflags |= USEBOL;
540 while (MORE() && !SEETWO(end1, end2)) {
541 wasdollar = p_simp_re(p, first);
544 if (wasdollar) { /* oops, that was a trailing anchor */
547 p->g->iflags |= USEEOL;
551 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
555 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
556 == static int p_simp_re(register struct parse *p, int starordinary);
558 static int /* was the simple RE an unbackslashed $? */
559 p_simp_re(p, starordinary)
560 register struct parse *p;
561 int starordinary; /* is a leading * an ordinary character? */
568 register sopno subno;
569 # define BACKSL (1<<CHAR_BIT)
571 pos = HERE(); /* repetion op, if any, covers from here */
573 assert(MORE()); /* caller should have ensured this */
576 (void)REQUIRE(MORE(), REG_EESCAPE);
577 c = BACKSL | GETNEXT();
581 if (p->g->cflags®_NEWLINE)
590 SETERROR(REG_BADRPT);
596 p->pbegin[subno] = HERE();
597 EMIT(OLPAREN, subno);
598 /* the MORE here is an error heuristic */
599 if (MORE() && !SEETWO('\\', ')'))
601 if (subno < NPAREN) {
602 p->pend[subno] = HERE();
603 assert(p->pend[subno] != 0);
605 EMIT(ORPAREN, subno);
606 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
608 case BACKSL|')': /* should not get here -- must be user */
610 SETERROR(REG_EPAREN);
621 i = (c&~BACKSL) - '0';
623 if (p->pend[i] != 0) {
624 assert(i <= p->g->nsub);
626 assert(p->pbegin[i] != 0);
627 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
628 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
629 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
632 SETERROR(REG_ESUBREG);
636 (void)REQUIRE(starordinary, REG_BADRPT);
639 ordinary(p, (char)c);
643 if (EAT('*')) { /* implemented as +? */
644 /* this case does not require the (y|) trick, noKLUDGE */
647 INSERT(OQUEST_, pos);
648 ASTERN(O_QUEST, pos);
649 } else if (EATTWO('\\', '{')) {
652 if (MORE() && isdigit((uch)PEEK())) {
654 (void)REQUIRE(count <= count2, REG_BADBR);
655 } else /* single number with comma */
657 } else /* just a single number */
659 repeat(p, pos, count, count2);
660 if (!EATTWO('\\', '}')) { /* error heuristics */
661 while (MORE() && !SEETWO('\\', '}'))
663 (void)REQUIRE(MORE(), REG_EBRACE);
666 } else if (c == '$') /* $ (but not \$) ends it */
673 - p_count - parse a repetition count
674 == static int p_count(register struct parse *p);
676 static int /* the value */
678 register struct parse *p;
680 register int count = 0;
681 register int ndigits = 0;
683 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
684 count = count*10 + (GETNEXT() - '0');
688 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
693 - p_bracket - parse a bracketed character list
694 == static void p_bracket(register struct parse *p);
696 * Note a significant property of this code: if the allocset() did SETERROR,
697 * no set operations are done.
701 register struct parse *p;
703 register cset *cs = allocset(p);
704 register int invert = 0;
706 /* Dept of Truly Sickening Special-Case Kludges */
707 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
712 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
719 invert++; /* make note to invert set at end */
724 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
728 (void)MUSTEAT(']', REG_EBRACK);
730 if (p->error != 0) /* don't mess things up further */
733 if (p->g->cflags®_ICASE) {
737 for (i = p->g->csetsize - 1; i >= 0; i--)
738 if (CHIN(cs, i) && isalpha(i)) {
743 if (cs->multis != NULL)
749 for (i = p->g->csetsize - 1; i >= 0; i--)
754 if (p->g->cflags®_NEWLINE)
756 if (cs->multis != NULL)
760 assert(cs->multis == NULL); /* xxx */
762 if (nch(p, cs) == 1) { /* optimize singleton sets */
763 ordinary(p, firstch(p, cs));
766 EMIT(OANYOF, freezeset(p, cs));
770 - p_b_term - parse one term of a bracketed character list
771 == static void p_b_term(register struct parse *p, register cset *cs);
775 register struct parse *p;
779 register char start, finish;
782 /* classify what we've got */
783 switch ((MORE()) ? PEEK() : '\0') {
785 c = (MORE2()) ? PEEK2() : '\0';
788 SETERROR(REG_ERANGE);
789 return; /* NOTE RETURN */
797 case ':': /* character class */
799 (void)REQUIRE(MORE(), REG_EBRACK);
801 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
803 (void)REQUIRE(MORE(), REG_EBRACK);
804 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
806 case '=': /* equivalence class */
808 (void)REQUIRE(MORE(), REG_EBRACK);
810 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
812 (void)REQUIRE(MORE(), REG_EBRACK);
813 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
815 default: /* symbol, ordinary character, or range */
816 /* xxx revision needed for multichar stuff */
817 start = p_b_symbol(p);
818 if (SEE('-') && MORE2() && PEEK2() != ']') {
824 finish = p_b_symbol(p);
830 // remove collate stuff --SDM
832 if (__collate_load_error) {
833 (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE);
835 for (i = (uch)start; i <= (uch)finish; i++)
839 (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE);
840 for (i = CHAR_MIN; i <= CHAR_MAX; i++) {
841 if ( __collate_range_cmp(start, i) <= 0
842 && __collate_range_cmp(i, finish) <= 0
854 - p_b_cclass - parse a character-class name and deal with it
855 == static void p_b_cclass(register struct parse *p, register cset *cs);
859 register struct parse *p;
863 register char *sp = p->next;
864 register struct cclass *cp;
867 while (MORE() && isalpha((uch)PEEK()))
870 for (cp = cclasses; cp->name != NULL; cp++)
871 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
873 if (cp->name == NULL) {
874 /* oops, didn't find it */
875 SETERROR(REG_ECTYPE);
881 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
886 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
891 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
896 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
901 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
906 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
911 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
916 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
921 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
926 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
931 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
936 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
937 if (isxdigit((uch)c))
942 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
948 - p_b_eclass - parse an equivalence-class name and deal with it
949 == static void p_b_eclass(register struct parse *p, register cset *cs);
951 * This implementation is incomplete. xxx
955 register struct parse *p;
960 c = p_b_coll_elem(p, '=');
965 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
966 == static char p_b_symbol(register struct parse *p);
968 static char /* value of symbol */
970 register struct parse *p;
974 (void)REQUIRE(MORE(), REG_EBRACK);
975 if (!EATTWO('[', '.'))
978 /* collating symbol */
979 value = p_b_coll_elem(p, '.');
980 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
985 - p_b_coll_elem - parse a collating-element name and look it up
986 == static char p_b_coll_elem(register struct parse *p, int endc);
988 static char /* value of collating element */
989 p_b_coll_elem(p, endc)
990 register struct parse *p;
991 int endc; /* name ended by endc,']' */
993 register char *sp = p->next;
994 register struct cname *cp;
997 while (MORE() && !SEETWO(endc, ']'))
1000 SETERROR(REG_EBRACK);
1004 for (cp = cnames; cp->name != NULL; cp++)
1005 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
1006 return(cp->code); /* known name */
1008 return(*sp); /* single character */
1009 SETERROR(REG_ECOLLATE); /* neither */
1014 - othercase - return the case counterpart of an alphabetic
1015 == static char othercase(int ch);
1017 static char /* if no counterpart, return ch */
1022 assert(isalpha(ch));
1024 return(tolower(ch));
1025 else if (islower(ch))
1026 return(toupper(ch));
1027 else /* peculiar, but could happen */
1032 - bothcases - emit a dualcase version of a two-case character
1033 == static void bothcases(register struct parse *p, int ch);
1035 * Boy, is this implementation ever a kludge...
1039 register struct parse *p;
1042 register char *oldnext = p->next;
1043 register char *oldend = p->end;
1047 assert(othercase(ch) != ch); /* p_bracket() would recurse */
1054 assert(p->next == bracket+2);
1060 - ordinary - emit an ordinary character
1061 == static void ordinary(register struct parse *p, register int ch);
1065 register struct parse *p;
1068 register cat_t *cap = p->g->categories;
1070 if ((p->g->cflags®_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
1073 EMIT(OCHAR, (uch)ch);
1075 cap[ch] = p->g->ncategories++;
1080 - nonnewline - emit REG_NEWLINE version of OANY
1081 == static void nonnewline(register struct parse *p);
1083 * Boy, is this implementation ever a kludge...
1087 register struct parse *p;
1089 register char *oldnext = p->next;
1090 register char *oldend = p->end;
1100 assert(p->next == bracket+3);
1106 - repeat - generate code for a bounded repetition, recursively if needed
1107 == static void repeat(register struct parse *p, sopno start, int from, int to);
1110 repeat(p, start, from, to)
1111 register struct parse *p;
1112 sopno start; /* operand from here to end of strip */
1113 int from; /* repeated from this number */
1114 int to; /* to this number of times (maybe INFINITY) */
1116 register sopno finish = HERE();
1119 # define REP(f, t) ((f)*8 + (t))
1120 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1121 register sopno copy;
1123 if (p->error != 0) /* head off possible runaway recursion */
1128 switch (REP(MAP(from), MAP(to))) {
1129 case REP(0, 0): /* must be user doing this */
1130 DROP(finish-start); /* drop the operand */
1132 case REP(0, 1): /* as x{1,1}? */
1133 case REP(0, N): /* as x{1,n}? */
1134 case REP(0, INF): /* as x{1,}? */
1135 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1136 INSERT(OCH_, start); /* offset is wrong... */
1137 repeat(p, start+1, 1, to);
1138 ASTERN(OOR1, start);
1139 AHEAD(start); /* ... fix it */
1142 ASTERN(O_CH, THERETHERE());
1144 case REP(1, 1): /* trivial case */
1147 case REP(1, N): /* as x?x{1,n-1} */
1148 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1149 INSERT(OCH_, start);
1150 ASTERN(OOR1, start);
1152 EMIT(OOR2, 0); /* offset very wrong... */
1153 AHEAD(THERE()); /* ...so fix it */
1154 ASTERN(O_CH, THERETHERE());
1155 copy = dupl(p, start+1, finish+1);
1156 assert(copy == finish+4);
1157 repeat(p, copy, 1, to-1);
1159 case REP(1, INF): /* as x+ */
1160 INSERT(OPLUS_, start);
1161 ASTERN(O_PLUS, start);
1163 case REP(N, N): /* as xx{m-1,n-1} */
1164 copy = dupl(p, start, finish);
1165 repeat(p, copy, from-1, to-1);
1167 case REP(N, INF): /* as xx{n-1,INF} */
1168 copy = dupl(p, start, finish);
1169 repeat(p, copy, from-1, to);
1171 default: /* "can't happen" */
1172 SETERROR(REG_ASSERT); /* just in case */
1178 - seterr - set an error condition
1179 == static int seterr(register struct parse *p, int e);
1181 static int /* useless but makes type checking happy */
1183 register struct parse *p;
1186 if (p->error == 0) /* keep earliest error condition */
1188 p->next = nuls; /* try to bring things to a halt */
1190 return(0); /* make the return value well-defined */
1194 - allocset - allocate a set of characters for []
1195 == static cset *allocset(register struct parse *p);
1199 register struct parse *p;
1201 register int no = p->g->ncsets++;
1203 register size_t nbytes;
1205 register size_t css = (size_t)p->g->csetsize;
1208 if (no >= p->ncsalloc) { /* need another column of space */
1209 p->ncsalloc += CHAR_BIT;
1211 assert(nc % CHAR_BIT == 0);
1212 nbytes = nc / CHAR_BIT * css;
1213 if (p->g->sets == NULL)
1214 p->g->sets = (cset *)malloc(nc * sizeof(cset));
1216 p->g->sets = (cset *)reallocf((char *)p->g->sets,
1218 if (p->g->setbits == NULL)
1219 p->g->setbits = (uch *)malloc(nbytes);
1221 p->g->setbits = (uch *)reallocf((char *)p->g->setbits,
1223 /* xxx this isn't right if setbits is now NULL */
1224 for (i = 0; i < no; i++)
1225 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1227 if (p->g->sets != NULL && p->g->setbits != NULL)
1228 (void) memset((char *)p->g->setbits + (nbytes - css),
1232 SETERROR(REG_ESPACE);
1233 /* caller's responsibility not to do set ops */
1237 assert(p->g->sets != NULL); /* xxx */
1238 cs = &p->g->sets[no];
1239 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1240 cs->mask = 1 << ((no) % CHAR_BIT);
1249 - freeset - free a now-unused set
1250 == static void freeset(register struct parse *p, register cset *cs);
1254 register struct parse *p;
1258 register cset *top = &p->g->sets[p->g->ncsets];
1259 register size_t css = (size_t)p->g->csetsize;
1261 for (i = 0; i < css; i++)
1263 if (cs == top-1) /* recover only the easy case */
1268 - freezeset - final processing on a set of characters
1269 == static int freezeset(register struct parse *p, register cset *cs);
1271 * The main task here is merging identical sets. This is usually a waste
1272 * of time (although the hash code minimizes the overhead), but can win
1273 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
1274 * is done using addition rather than xor -- all ASCII [aA] sets xor to
1277 static int /* set number */
1279 register struct parse *p;
1282 register short h = cs->hash;
1284 register cset *top = &p->g->sets[p->g->ncsets];
1286 register size_t css = (size_t)p->g->csetsize;
1288 /* look for an earlier one which is the same */
1289 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1290 if (cs2->hash == h && cs2 != cs) {
1292 for (i = 0; i < css; i++)
1293 if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1299 if (cs2 < top) { /* found one */
1304 return((int)(cs - p->g->sets));
1308 - firstch - return first character in a set (which must have at least one)
1309 == static int firstch(register struct parse *p, register cset *cs);
1311 static int /* character; there is no "none" value */
1313 register struct parse *p;
1317 register size_t css = (size_t)p->g->csetsize;
1319 for (i = 0; i < css; i++)
1323 return(0); /* arbitrary */
1327 - nch - number of characters in a set
1328 == static int nch(register struct parse *p, register cset *cs);
1332 register struct parse *p;
1336 register size_t css = (size_t)p->g->csetsize;
1339 for (i = 0; i < css; i++)
1346 - mcadd - add a collating element to a cset
1347 == static void mcadd(register struct parse *p, register cset *cs, \
1348 == register char *cp);
1352 register struct parse *p;
1356 register size_t oldend = cs->smultis;
1358 cs->smultis += strlen(cp) + 1;
1359 if (cs->multis == NULL)
1360 cs->multis = malloc(cs->smultis);
1362 cs->multis = reallocf(cs->multis, cs->smultis);
1363 if (cs->multis == NULL) {
1364 SETERROR(REG_ESPACE);
1368 (void) strcpy(cs->multis + oldend - 1, cp);
1369 cs->multis[cs->smultis - 1] = '\0';
1374 - mcsub - subtract a collating element from a cset
1375 == static void mcsub(register cset *cs, register char *cp);
1382 register char *fp = mcfind(cs, cp);
1383 register size_t len = strlen(fp);
1386 (void) memmove(fp, fp + len + 1,
1387 cs->smultis - (fp + len + 1 - cs->multis));
1390 if (cs->smultis == 0) {
1396 cs->multis = reallocf(cs->multis, cs->smultis);
1397 assert(cs->multis != NULL);
1401 - mcin - is a collating element in a cset?
1402 == static int mcin(register cset *cs, register char *cp);
1409 return(mcfind(cs, cp) != NULL);
1413 - mcfind - find a collating element in a cset
1414 == static char *mcfind(register cset *cs, register char *cp);
1423 if (cs->multis == NULL)
1425 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1426 if (strcmp(cp, p) == 0)
1433 - mcinvert - invert the list of collating elements in a cset
1434 == static void mcinvert(register struct parse *p, register cset *cs);
1436 * This would have to know the set of possibilities. Implementation
1441 register struct parse *p;
1444 assert(cs->multis == NULL); /* xxx */
1448 - mccase - add case counterparts of the list of collating elements in a cset
1449 == static void mccase(register struct parse *p, register cset *cs);
1451 * This would have to know the set of possibilities. Implementation
1456 register struct parse *p;
1459 assert(cs->multis == NULL); /* xxx */
1463 - isinsets - is this character in any sets?
1464 == static int isinsets(register struct re_guts *g, int c);
1466 static int /* predicate */
1468 register struct re_guts *g;
1473 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1474 register unsigned uc = (uch)c;
1476 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1483 - samesets - are these two characters in exactly the same sets?
1484 == static int samesets(register struct re_guts *g, int c1, int c2);
1486 static int /* predicate */
1488 register struct re_guts *g;
1494 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1495 register unsigned uc1 = (uch)c1;
1496 register unsigned uc2 = (uch)c2;
1498 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1499 if (col[uc1] != col[uc2])
1505 - categorize - sort out character categories
1506 == static void categorize(struct parse *p, register struct re_guts *g);
1511 register struct re_guts *g;
1513 register cat_t *cats = g->categories;
1518 /* avoid making error situations worse */
1522 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1523 if (cats[c] == 0 && isinsets(g, c)) {
1524 cat = g->ncategories++;
1526 for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1527 if (cats[c2] == 0 && samesets(g, c, c2))
1533 - dupl - emit a duplicate of a bunch of sops
1534 == static sopno dupl(register struct parse *p, sopno start, sopno finish);
1536 static sopno /* start of duplicate */
1537 dupl(p, start, finish)
1538 register struct parse *p;
1539 sopno start; /* from here */
1540 sopno finish; /* to this less one */
1542 register sopno ret = HERE();
1543 register sopno len = finish - start;
1545 assert(finish >= start);
1548 enlarge(p, p->ssize + len); /* this many unexpected additions */
1549 assert(p->ssize >= p->slen + len);
1550 (void) memcpy((char *)(p->strip + p->slen),
1551 (char *)(p->strip + start), (size_t)len*sizeof(sop));
1557 - doemit - emit a strip operator
1558 == static void doemit(register struct parse *p, sop op, size_t opnd);
1560 * It might seem better to implement this as a macro with a function as
1561 * hard-case backup, but it's just too big and messy unless there are
1562 * some changes to the data structures. Maybe later.
1566 register struct parse *p;
1570 /* avoid making error situations worse */
1574 /* deal with oversize operands ("can't happen", more or less) */
1575 assert(opnd < 1<<OPSHIFT);
1577 /* deal with undersized strip */
1578 if (p->slen >= p->ssize)
1579 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
1580 assert(p->slen < p->ssize);
1582 /* finally, it's all reduced to the easy case */
1583 p->strip[p->slen++] = SOP(op, opnd);
1587 - doinsert - insert a sop into the strip
1588 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
1591 doinsert(p, op, opnd, pos)
1592 register struct parse *p;
1601 /* avoid making error situations worse */
1606 EMIT(op, opnd); /* do checks, ensure space */
1607 assert(HERE() == sn+1);
1610 /* adjust paren pointers */
1612 for (i = 1; i < NPAREN; i++) {
1613 if (p->pbegin[i] >= pos) {
1616 if (p->pend[i] >= pos) {
1621 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1622 (HERE()-pos-1)*sizeof(sop));
1627 - dofwd - complete a forward reference
1628 == static void dofwd(register struct parse *p, sopno pos, sop value);
1631 dofwd(p, pos, value)
1632 register struct parse *p;
1636 /* avoid making error situations worse */
1640 assert(value < 1<<OPSHIFT);
1641 p->strip[pos] = OP(p->strip[pos]) | value;
1645 - enlarge - enlarge the strip
1646 == static void enlarge(register struct parse *p, sopno size);
1650 register struct parse *p;
1651 register sopno size;
1655 if (p->ssize >= size)
1658 sp = (sop *)realloc(p->strip, size*sizeof(sop));
1660 SETERROR(REG_ESPACE);
1668 - stripsnug - compact the strip
1669 == static void stripsnug(register struct parse *p, register struct re_guts *g);
1673 register struct parse *p;
1674 register struct re_guts *g;
1676 g->nstates = p->slen;
1677 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1678 if (g->strip == NULL) {
1679 SETERROR(REG_ESPACE);
1680 g->strip = p->strip;
1685 - findmust - fill in must and mlen with longest mandatory literal string
1686 == static void findmust(register struct parse *p, register struct re_guts *g);
1688 * This algorithm could do fancy things like analyzing the operands of |
1689 * for common subsequences. Someday. This code is simple and finds most
1690 * of the interesting cases.
1692 * Note that must and mlen got initialized during setup.
1697 register struct re_guts *g;
1701 register sop *newstart;
1702 register sopno newlen;
1709 /* avoid making error situations worse */
1713 /* Find out if we can handle OANYOF or not */
1715 for (cs = 0; cs < g->ncsets; cs++)
1716 if (g->sets[cs].multis != NULL)
1719 /* find the longest OCHAR sequence in strip */
1723 scan = g->strip + 1;
1727 case OCHAR: /* sequence member */
1728 if (newlen == 0) /* new sequence */
1729 newstart = scan - 1;
1732 case OPLUS_: /* things that don't break one */
1736 case OQUEST_: /* things that must be skipped */
1738 offset = altoffset(scan, offset, mccs);
1743 /* assert() interferes w debug printouts */
1744 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1749 } while (OP(s) != O_QUEST && OP(s) != O_CH);
1751 case OBOW: /* things that break a sequence */
1758 if (newlen > g->mlen) { /* ends one */
1762 g->moffset += offset;
1765 g->moffset = offset;
1773 if (newlen > g->mlen) { /* ends one */
1777 g->moffset += offset;
1780 g->moffset = offset;
1789 case OANYOF: /* may or may not invalidate offset */
1790 /* First, everything as OANY */
1791 if (newlen > g->mlen) { /* ends one */
1795 g->moffset += offset;
1798 g->moffset = offset;
1806 /* And, now, if we found out we can't deal with
1807 * it, make offset = -1.
1813 /* Anything here makes it impossible or too hard
1814 * to calculate the offset -- so we give up;
1815 * save the last known good offset, in case the
1816 * must sequence doesn't occur later.
1818 if (newlen > g->mlen) { /* ends one */
1822 g->moffset += offset;
1824 g->moffset = offset;
1830 } while (OP(s) != OEND);
1832 if (g->mlen == 0) { /* there isn't one */
1837 /* turn it into a character string */
1838 g->must = malloc((size_t)g->mlen + 1);
1839 if (g->must == NULL) { /* argh; just forget it */
1846 for (i = g->mlen; i > 0; i--) {
1847 while (OP(s = *scan++) != OCHAR)
1849 assert(cp < g->must + g->mlen);
1850 *cp++ = (char)OPND(s);
1852 assert(cp == g->must + g->mlen);
1853 *cp++ = '\0'; /* just on general principles */
1857 - altoffset - choose biggest offset among multiple choices
1858 == static int altoffset(sop *scan, int offset, int mccs);
1860 * Compute, recursively if necessary, the largest offset among multiple
1864 altoffset(scan, offset, mccs)
1873 /* If we gave up already on offsets, return */
1880 while (OP(s) != O_QUEST && OP(s) != O_CH) {
1889 try = altoffset(scan, try, mccs);
1896 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1899 } while (OP(s) != O_QUEST && OP(s) != O_CH);
1900 /* We must skip to the next position, or we'll
1901 * leave altoffset() too early.
1929 return largest+offset;
1933 - computejumps - compute char jumps for BM scan
1934 == static void computejumps(register struct parse *p, register struct re_guts *g);
1936 * This algorithm assumes g->must exists and is has size greater than
1937 * zero. It's based on the algorithm found on Computer Algorithms by
1940 * A char jump is the number of characters one needs to jump based on
1941 * the value of the character from the text that was mismatched.
1951 /* Avoid making errors worse */
1955 g->charjump = (int*) malloc((NC + 1) * sizeof(int));
1956 if (g->charjump == NULL) /* Not a fatal error */
1958 /* Adjust for signed chars, if necessary */
1959 g->charjump = &g->charjump[-(CHAR_MIN)];
1961 /* If the character does not exist in the pattern, the jump
1962 * is equal to the number of characters in the pattern.
1964 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1965 g->charjump[ch] = g->mlen;
1967 /* If the character does exist, compute the jump that would
1968 * take us to the last character in the pattern equal to it
1969 * (notice that we match right to left, so that last character
1970 * is the first one that would be matched).
1972 for (mindex = 0; mindex < g->mlen; mindex++)
1973 g->charjump[g->must[mindex]] = g->mlen - mindex - 1;
1977 - computematchjumps - compute match jumps for BM scan
1978 == static void computematchjumps(register struct parse *p, register struct re_guts *g);
1980 * This algorithm assumes g->must exists and is has size greater than
1981 * zero. It's based on the algorithm found on Computer Algorithms by
1984 * A match jump is the number of characters one needs to advance based
1985 * on the already-matched suffix.
1986 * Notice that all values here are minus (g->mlen-1), because of the way
1987 * the search algorithm works.
1990 computematchjumps(p, g)
1994 int mindex; /* General "must" iterator */
1995 int suffix; /* Keeps track of matching suffix */
1996 int ssuffix; /* Keeps track of suffixes' suffix */
1997 int* pmatches; /* pmatches[k] points to the next i
1998 * such that i+1...mlen is a substring
1999 * of k+1...k+mlen-i-1
2002 /* Avoid making errors worse */
2006 pmatches = (int*) malloc(g->mlen * sizeof(unsigned int));
2007 if (pmatches == NULL) {
2008 g->matchjump = NULL;
2012 g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int));
2013 if (g->matchjump == NULL) /* Not a fatal error */
2016 /* Set maximum possible jump for each character in the pattern */
2017 for (mindex = 0; mindex < g->mlen; mindex++)
2018 g->matchjump[mindex] = 2*g->mlen - mindex - 1;
2020 /* Compute pmatches[] */
2021 for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
2022 mindex--, suffix--) {
2023 pmatches[mindex] = suffix;
2025 /* If a mismatch is found, interrupting the substring,
2026 * compute the matchjump for that position. If no
2027 * mismatch is found, then a text substring mismatched
2028 * against the suffix will also mismatch against the
2031 while (suffix < g->mlen
2032 && g->must[mindex] != g->must[suffix]) {
2033 g->matchjump[suffix] = MIN(g->matchjump[suffix],
2034 g->mlen - mindex - 1);
2035 suffix = pmatches[suffix];
2039 /* Compute the matchjump up to the last substring found to jump
2040 * to the beginning of the largest must pattern prefix matching
2043 for (mindex = 0; mindex <= suffix; mindex++)
2044 g->matchjump[mindex] = MIN(g->matchjump[mindex],
2045 g->mlen + suffix - mindex);
2047 ssuffix = pmatches[suffix];
2048 while (suffix < g->mlen) {
2049 while (suffix <= ssuffix && suffix < g->mlen) {
2050 g->matchjump[suffix] = MIN(g->matchjump[suffix],
2051 g->mlen + ssuffix - suffix);
2054 ssuffix = pmatches[ssuffix];
2061 - pluscount - count + nesting
2062 == static sopno pluscount(register struct parse *p, register struct re_guts *g);
2064 static sopno /* nesting depth */
2067 register struct re_guts *g;
2071 register sopno plusnest = 0;
2072 register sopno maxnest = 0;
2075 return(0); /* there may not be an OEND */
2077 scan = g->strip + 1;
2085 if (plusnest > maxnest)
2090 } while (OP(s) != OEND);