1 // This file is (approximately) the concatenation of all the source
2 // files in the src/gnu/regexp directory of GNU Regexp, version 1.1.4
4 // Although doing this is a huge hack, it simplifies the deployment of
5 // applications that depend on org.ibex.js.
12 public class GnuRegexp {
14 * gnu/regexp/CharIndexed.java
15 * Copyright (C) 1998-2001 Wes Biggs
17 * This library is free software; you can redistribute it and/or modify
18 * it under the terms of the GNU Lesser General Public License as published
19 * by the Free Software Foundation; either version 2.1 of the License, or
20 * (at your option) any later version.
22 * This library is distributed in the hope that it will be useful,
23 * but WITHOUT ANY WARRANTY; without even the implied warranty of
24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 * GNU Lesser General Public License for more details.
27 * You should have received a copy of the GNU Lesser General Public License
28 * along with this program; if not, write to the Free Software
29 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33 * Defines the interface used internally so that different types of source
34 * text can be accessed in the same way. Built-in concrete classes provide
35 * support for String, StringBuffer, InputStream and char[] types.
36 * A class that is CharIndexed supports the notion of a cursor within a
37 * block of text. The cursor must be able to be advanced via the move()
38 * method. The charAt() method returns the character at the cursor position
39 * plus a given offset.
41 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
43 public static interface CharIndexed {
45 * Defines a constant (0xFFFF was somewhat arbitrarily chosen)
46 * that can be returned by the charAt() function indicating that
47 * the specified index is out of range.
49 char OUT_OF_BOUNDS = '\uFFFF';
52 * Returns the character at the given offset past the current cursor
53 * position in the input. The index of the current position is zero.
54 * It is possible for this method to be called with a negative index.
55 * This happens when using the '^' operator in multiline matching mode
56 * or the '\b' or '\<' word boundary operators. In any case, the lower
57 * bound is currently fixed at -2 (for '^' with a two-character newline).
59 * @param index the offset position in the character field to examine
60 * @return the character at the specified index, or the OUT_OF_BOUNDS
61 * character defined by this interface.
63 char charAt(int index);
66 * Shifts the input buffer by a given number of positions. Returns
67 * true if the new cursor position is valid.
69 boolean move(int index);
72 * Returns true if the most recent move() operation placed the cursor
73 * position at a valid position in the input.
78 * gnu/regexp/CharIndexedCharArray.java
79 * Copyright (C) 1998-2001 Wes Biggs
81 * This library is free software; you can redistribute it and/or modify
82 * it under the terms of the GNU Lesser General Public License as published
83 * by the Free Software Foundation; either version 2.1 of the License, or
84 * (at your option) any later version.
86 * This library is distributed in the hope that it will be useful,
87 * but WITHOUT ANY WARRANTY; without even the implied warranty of
88 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
89 * GNU Lesser General Public License for more details.
91 * You should have received a copy of the GNU Lesser General Public License
92 * along with this program; if not, write to the Free Software
93 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
96 static class CharIndexedCharArray implements CharIndexed, Serializable {
100 CharIndexedCharArray(char[] str, int index) {
105 public char charAt(int index) {
106 int pos = anchor + index;
107 return ((pos < s.length) && (pos >= 0)) ? s[pos] : OUT_OF_BOUNDS;
110 public boolean isValid() {
111 return (anchor < s.length);
114 public boolean move(int index) {
115 return ((anchor += index) < s.length);
119 * gnu/regexp/CharIndexedReader.java
120 * Copyright (C) 1998-2001 Wes Biggs
122 * This library is free software; you can redistribute it and/or modify
123 * it under the terms of the GNU Lesser General Public License as published
124 * by the Free Software Foundation; either version 2.1 of the License, or
125 * (at your option) any later version.
127 * This library is distributed in the hope that it will be useful,
128 * but WITHOUT ANY WARRANTY; without even the implied warranty of
129 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
130 * GNU Lesser General Public License for more details.
132 * You should have received a copy of the GNU Lesser General Public License
133 * along with this program; if not, write to the Free Software
134 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
138 // TODO: move(x) shouldn't rely on calling next() x times
140 static class CharIndexedInputStream implements CharIndexed {
141 private static final int BUFFER_INCREMENT = 1024;
142 private static final int UNKNOWN = Integer.MAX_VALUE; // value for end
144 private BufferedInputStream br;
146 // so that we don't try to reset() right away
147 private int index = -1;
149 private int bufsize = BUFFER_INCREMENT;
151 private int end = UNKNOWN;
153 private char cached = OUT_OF_BOUNDS;
155 // Big enough for a \r\n pair
156 // lookBehind[0] = most recent
157 // lookBehind[1] = second most recent
158 private char[] lookBehind = new char[] { OUT_OF_BOUNDS, OUT_OF_BOUNDS };
160 CharIndexedInputStream(InputStream str, int index) {
161 if (str instanceof BufferedInputStream) br = (BufferedInputStream) str;
162 else br = new BufferedInputStream(str,BUFFER_INCREMENT);
164 if (index > 0) move(index);
167 private boolean next() {
168 if (end == 1) return false;
169 end--; // closer to end
179 cached = OUT_OF_BOUNDS;
184 } catch (IOException e) {
186 cached = OUT_OF_BOUNDS;
192 public char charAt(int index) {
195 } else if (index >= end) {
196 return OUT_OF_BOUNDS;
197 } else if (index == -1) {
198 return lookBehind[0];
199 } else if (index == -2) {
200 return lookBehind[1];
201 } else if (index < -2) {
202 return OUT_OF_BOUNDS;
203 } else if (index >= bufsize) {
204 // Allocate more space in the buffer.
206 while (bufsize <= index) bufsize += BUFFER_INCREMENT;
210 } catch (IOException e) { }
211 } else if (this.index != index) {
215 } catch (IOException e) { }
217 char ch = OUT_OF_BOUNDS;
221 this.index = index+1; // this.index is index of next pos relative to charAt(0)
223 // set flag that next should fail next time?
228 } catch (IOException ie) { }
233 public boolean move(int index) {
234 // move read position [index] clicks from 'charAt(0)'
235 boolean retval = true;
236 while (retval && (index-- > 0)) retval = next();
240 public boolean isValid() {
241 return (cached != OUT_OF_BOUNDS);
246 * gnu/regexp/CharIndexedReader.java
247 * Copyright (C) 2001 Lee Sau Dan
248 * Based on gnu.regexp.CharIndexedInputStream by Wes Biggs
250 * This library is free software; you can redistribute it and/or modify
251 * it under the terms of the GNU Lesser General Public License as published
252 * by the Free Software Foundation; either version 2.1 of the License, or
253 * (at your option) any later version.
255 * This library is distributed in the hope that it will be useful,
256 * but WITHOUT ANY WARRANTY; without even the implied warranty of
257 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
258 * GNU Lesser General Public License for more details.
260 * You should have received a copy of the GNU Lesser General Public License
261 * along with this program; if not, write to the Free Software
262 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
266 // TODO: move(x) shouldn't rely on calling next() x times
268 static class CharIndexedReader implements CharIndexed {
269 private static final int BUFFER_INCREMENT = 1024;
270 private static final int UNKNOWN = Integer.MAX_VALUE; // value for end
272 private final BufferedReader br;
273 // so that we don't try to reset() right away
274 private int index = -1;
276 private int bufsize = BUFFER_INCREMENT;
278 private int end = UNKNOWN;
280 private char cached = OUT_OF_BOUNDS;
282 // Big enough for a \r\n pair
283 // lookBehind[0] = most recent
284 // lookBehind[1] = second most recent
285 private char[] lookBehind = new char[] { OUT_OF_BOUNDS, OUT_OF_BOUNDS };
287 CharIndexedReader(Reader reader, int index) {
288 if (reader instanceof BufferedReader) {
289 br = (BufferedReader) reader;
291 br = new BufferedReader(reader,BUFFER_INCREMENT);
294 if (index > 0) move(index);
297 private boolean next() {
298 lookBehind[1] = lookBehind[0];
299 lookBehind[0] = cached;
302 cached = OUT_OF_BOUNDS;
305 end--; // closer to end
315 cached = OUT_OF_BOUNDS;
319 // convert the byte read into a char
322 } catch (IOException e) {
324 cached = OUT_OF_BOUNDS;
330 public char charAt(int index) {
333 } else if (index >= end) {
334 return OUT_OF_BOUNDS;
335 } else if (index >= bufsize) {
336 // Allocate more space in the buffer.
338 while (bufsize <= index) bufsize += BUFFER_INCREMENT;
342 } catch (IOException e) { }
343 } else if (this.index != index) {
347 } catch (IOException e) { }
348 } else if (index == -1) {
349 return lookBehind[0];
350 } else if (index == -2) {
351 return lookBehind[1];
352 } else if (index < -2) {
353 return OUT_OF_BOUNDS;
356 char ch = OUT_OF_BOUNDS;
360 this.index = index+1; // this.index is index of next pos relative to charAt(0)
362 // set flag that next should fail next time?
367 } catch (IOException ie) { }
372 public boolean move(int index) {
373 // move read position [index] clicks from 'charAt(0)'
374 boolean retval = true;
375 while (retval && (index-- > 0)) retval = next();
379 public boolean isValid() {
380 return (cached != OUT_OF_BOUNDS);
384 * gnu/regexp/CharIndexedString.java
385 * Copyright (C) 1998-2001 Wes Biggs
387 * This library is free software; you can redistribute it and/or modify
388 * it under the terms of the GNU Lesser General Public License as published
389 * by the Free Software Foundation; either version 2.1 of the License, or
390 * (at your option) any later version.
392 * This library is distributed in the hope that it will be useful,
393 * but WITHOUT ANY WARRANTY; without even the implied warranty of
394 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
395 * GNU Lesser General Public License for more details.
397 * You should have received a copy of the GNU Lesser General Public License
398 * along with this program; if not, write to the Free Software
399 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
402 static class CharIndexedString implements CharIndexed, Serializable {
407 CharIndexedString(String str, int index) {
413 public char charAt(int index) {
414 int pos = anchor + index;
415 return ((pos < len) && (pos >= 0)) ? s.charAt(pos) : OUT_OF_BOUNDS;
418 public boolean isValid() {
419 return (anchor < len);
422 public boolean move(int index) {
423 return ((anchor += index) < len);
427 * gnu/regexp/CharIndexedStringBuffer.java
428 * Copyright (C) 1998-2001 Wes Biggs
430 * This library is free software; you can redistribute it and/or modify
431 * it under the terms of the GNU Lesser General Public License as published
432 * by the Free Software Foundation; either version 2.1 of the License, or
433 * (at your option) any later version.
435 * This library is distributed in the hope that it will be useful,
436 * but WITHOUT ANY WARRANTY; without even the implied warranty of
437 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
438 * GNU Lesser General Public License for more details.
440 * You should have received a copy of the GNU Lesser General Public License
441 * along with this program; if not, write to the Free Software
442 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
445 static class CharIndexedStringBuffer implements CharIndexed, Serializable {
446 private StringBuffer s;
449 CharIndexedStringBuffer(StringBuffer str, int index) {
454 public char charAt(int index) {
455 int pos = anchor + index;
456 return ((pos < s.length()) && (pos >= 0)) ? s.charAt(pos) : OUT_OF_BOUNDS;
459 public boolean isValid() {
460 return (anchor < s.length());
463 public boolean move(int index) {
464 return ((anchor += index) < s.length());
469 * Copyright (C) 1998-2001 Wes Biggs
471 * This library is free software; you can redistribute it and/or modify
472 * it under the terms of the GNU Lesser General Public License as published
473 * by the Free Software Foundation; either version 2.1 of the License, or
474 * (at your option) any later version.
476 * This library is distributed in the hope that it will be useful,
477 * but WITHOUT ANY WARRANTY; without even the implied warranty of
478 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
479 * GNU Lesser General Public License for more details.
481 * You should have received a copy of the GNU Lesser General Public License
482 * along with this program; if not, write to the Free Software
483 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
487 static class IntPair implements Serializable {
488 public int first, second;
491 static class CharUnit implements Serializable {
497 * RE provides the user interface for compiling and matching regular
500 * A regular expression object (class RE) is compiled by constructing it
501 * from a String, StringBuffer or character array, with optional
502 * compilation flags (below)
503 * and an optional syntax specification (see RESyntax; if not specified,
504 * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
506 * Various methods attempt to match input text against a compiled
507 * regular expression. These methods are:
508 * <LI><code>isMatch</code>: returns true if the input text in its entirety
509 * matches the regular expression pattern.
510 * <LI><code>getMatch</code>: returns the first match found in the input text,
511 * or null if no match is found.
512 * <LI><code>getAllMatches</code>: returns an array of all non-overlapping
513 * matches found in the input text. If no matches are found, the array is
515 * <LI><code>substitute</code>: substitute the first occurence of the pattern
516 * in the input text with a replacement string (which may include
517 * metacharacters $0-$9, see REMatch.substituteInto).
518 * <LI><code>substituteAll</code>: same as above, but repeat for each match
520 * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration object
521 * that allows iteration over the matches (see REMatchEnumeration for some
522 * reasons why you may want to do this instead of using <code>getAllMatches</code>.
525 * These methods all have similar argument lists. The input can be a
526 * String, a character array, a StringBuffer, a Reader or an
527 * InputStream of some sort. Note that when using a Reader or
528 * InputStream, the stream read position cannot be guaranteed after
529 * attempting a match (this is not a bug, but a consequence of the way
530 * regular expressions work). Using an REMatchEnumeration can
531 * eliminate most positioning problems.
535 * The optional index argument specifies the offset from the beginning
536 * of the text at which the search should start (see the descriptions
537 * of some of the execution flags for how this can affect positional
538 * pattern operators). For a Reader or InputStream, this means an
539 * offset from the current read position, so subsequent calls with the
540 * same index argument on a Reader or an InputStream will not
541 * necessarily access the same position on the stream, whereas
542 * repeated searches at a given index in a fixed string will return
543 * consistent results.
546 * You can optionally affect the execution environment by using a
547 * combination of execution flags (constants listed below).
550 * All operations on a regular expression are performed in a
551 * thread-safe manner.
553 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
554 * @version 1.1.4-dev, to be released
557 public static class RE extends REToken {
558 // This String will be returned by getVersion()
559 private static final String VERSION = "1.1.4-dev";
561 // The localized strings are kept in a separate file
562 private static ResourceBundle messages =
563 new ListResourceBundle() {
564 public Object[][] getContents() { return stuff; }
565 private Object[][] stuff = new Object[][] {
566 { "error.prefix", "At position {0} in regular expression pattern:" },
567 { "repeat.assertion", "repeated token is zero-width assertion" },
568 { "repeat.chained", "attempted to repeat a token that is already repeated" },
569 { "repeat.no.token", "quantifier (?*+{}) without preceding token" },
570 { "repeat.empty.token", "repeated token may be empty" },
571 { "unmatched.brace", "unmatched brace" },
572 { "unmatched.bracket", "unmatched bracket" },
573 { "unmatched.paren", "unmatched parenthesis" },
574 { "interval.no.end", "expected end of interval" },
575 { "class.no.end", "expected end of character class" },
576 { "subexpr.no.end", "expected end of subexpression" },
577 { "interval.order", "interval minimum is greater than maximum" },
578 { "interval.error", "interval is empty or contains illegal chracters" },
579 { "ends.with.backslash", "backslash at end of pattern" },
580 { "syntax.final", "Syntax has been declared final and cannot be modified" } }; };
582 // These are, respectively, the first and last tokens in our linked list
583 // If there is only one token, firstToken == lastToken
584 private REToken firstToken, lastToken;
586 // This is the number of subexpressions in this regular expression,
587 // with a minimum value of zero. Returned by getNumSubs()
590 /** Minimum length, in characters, of any possible match. */
591 private int minimumLength;
594 * Compilation flag. Do not differentiate case. Subsequent
595 * searches using this RE will be case insensitive.
597 public static final int REG_ICASE = 2;
600 * Compilation flag. The match-any-character operator (dot)
601 * will match a newline character. When set this overrides the syntax
602 * bit RE_DOT_NEWLINE (see RESyntax for details). This is equivalent to
603 * the "/s" operator in Perl.
605 public static final int REG_DOT_NEWLINE = 4;
608 * Compilation flag. Use multiline mode. In this mode, the ^ and $
609 * anchors will match based on newlines within the input. This is
610 * equivalent to the "/m" operator in Perl.
612 public static final int REG_MULTILINE = 8;
616 * The match-beginning operator (^) will not match at the beginning
617 * of the input string. Useful for matching on a substring when you
618 * know the context of the input is such that position zero of the
619 * input to the match test is not actually position zero of the text.
621 * This example demonstrates the results of various ways of matching on
625 * String s = "food bar fool";<BR>
626 * RE exp = new RE("^foo.");<BR>
627 * REMatch m0 = exp.getMatch(s);<BR>
628 * REMatch m1 = exp.getMatch(s.substring(8));<BR>
629 * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR>
630 * REMatch m3 = exp.getMatch(s,8); <BR>
631 * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX); <BR>
641 public static final int REG_NOTBOL = 16;
645 * The match-end operator ($) does not match at the end
646 * of the input string. Useful for matching on substrings.
648 public static final int REG_NOTEOL = 32;
652 * When a match method is invoked that starts matching at a non-zero
653 * index into the input, treat the input as if it begins at the index
654 * given. The effect of this flag is that the engine does not "see"
655 * any text in the input before the given index. This is useful so
656 * that the match-beginning operator (^) matches not at position 0
657 * in the input string, but at the position the search started at
658 * (based on the index input given to the getMatch function). See
659 * the example under REG_NOTBOL. It also affects the use of the \<
662 public static final int REG_ANCHORINDEX = 64;
666 * The substitute and substituteAll methods will not attempt to
667 * interpolate occurrences of $1-$9 in the replacement text with
668 * the corresponding subexpressions. For example, you may want to
669 * replace all matches of "one dollar" with "$1".
671 public static final int REG_NO_INTERPOLATE = 128;
673 /** Returns a string representing the version of the gnu.regexp package. */
674 public static final String version() {
678 // Retrieves a message from the ResourceBundle
679 static final String getLocalizedMessage(String key) {
680 return messages.getString(key);
684 * Constructs a regular expression pattern buffer without any compilation
685 * flags set, and using the default syntax (RESyntax.RE_SYNTAX_PERL5).
687 * @param pattern A regular expression pattern, in the form of a String,
688 * StringBuffer or char[]. Other input types will be converted to
689 * strings using the toString() method.
690 * @exception REException The input pattern could not be parsed.
691 * @exception NullPointerException The pattern was null.
693 public RE(Object pattern) throws REException {
694 this(pattern,0,RESyntax.RE_SYNTAX_PERL5,0,0);
698 * Constructs a regular expression pattern buffer using the specified
699 * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5).
701 * @param pattern A regular expression pattern, in the form of a String,
702 * StringBuffer, or char[]. Other input types will be converted to
703 * strings using the toString() method.
704 * @param cflags The logical OR of any combination of the compilation flags listed above.
705 * @exception REException The input pattern could not be parsed.
706 * @exception NullPointerException The pattern was null.
708 public RE(Object pattern, int cflags) throws REException {
709 this(pattern,cflags,RESyntax.RE_SYNTAX_PERL5,0,0);
713 * Constructs a regular expression pattern buffer using the specified
714 * compilation flags and regular expression syntax.
716 * @param pattern A regular expression pattern, in the form of a String,
717 * StringBuffer, or char[]. Other input types will be converted to
718 * strings using the toString() method.
719 * @param cflags The logical OR of any combination of the compilation flags listed above.
720 * @param syntax The type of regular expression syntax to use.
721 * @exception REException The input pattern could not be parsed.
722 * @exception NullPointerException The pattern was null.
724 public RE(Object pattern, int cflags, RESyntax syntax) throws REException {
725 this(pattern,cflags,syntax,0,0);
728 // internal constructor used for alternation
729 private RE(REToken first, REToken last,int subs, int subIndex, int minLength) {
734 minimumLength = minLength;
735 addToken(new RETokenEndSub(subIndex));
738 private RE(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
739 super(myIndex); // Subexpression index of this token.
740 initialize(patternObj, cflags, syntax, myIndex, nextSub);
743 // For use by subclasses
744 protected RE() { super(0); }
746 // The meat of construction
747 protected void initialize(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
749 if (patternObj instanceof String) {
750 pattern = ((String) patternObj).toCharArray();
751 } else if (patternObj instanceof char[]) {
752 pattern = (char[]) patternObj;
753 } else if (patternObj instanceof StringBuffer) {
754 pattern = new char [((StringBuffer) patternObj).length()];
755 ((StringBuffer) patternObj).getChars(0,pattern.length,pattern,0);
757 pattern = patternObj.toString().toCharArray();
760 int pLength = pattern.length;
762 numSubs = 0; // Number of subexpressions in this token.
763 Vector branches = null;
765 // linked list of tokens (sort of -- some closed loops can exist)
766 firstToken = lastToken = null;
768 // Precalculate these so we don't pay for the math every time we
769 // need to access them.
770 boolean insens = ((cflags & REG_ICASE) > 0);
772 // Parse pattern into tokens. Does anyone know if it's more efficient
773 // to use char[] than a String.charAt()? I'm assuming so.
775 // index tracks the position in the char array
778 // this will be the current parse character (pattern[index])
779 CharUnit unit = new CharUnit();
781 // This is used for {x,y} calculations
782 IntPair minMax = new IntPair();
784 // Buffer a token so we can create a TokenRepeated, etc.
785 REToken currentToken = null;
788 while (index < pLength) {
789 // read the next character unit (including backslash escapes)
790 index = getCharUnit(pattern,index,unit);
792 // ALTERNATION OPERATOR
793 // \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
794 // not available if RE_LIMITED_OPS is set
796 // TODO: the '\n' literal here should be a test against REToken.newline,
797 // which unfortunately may be more than a single character.
798 if ( ( (unit.ch == '|' && (syntax.get(RESyntax.RE_NO_BK_VBAR) ^ unit.bk))
799 || (syntax.get(RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n') && !unit.bk) )
800 && !syntax.get(RESyntax.RE_LIMITED_OPS)) {
801 // make everything up to here be a branch. create vector if nec.
802 addToken(currentToken);
803 RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength);
805 if (branches == null) {
806 branches = new Vector();
808 branches.addElement(theBranch);
809 firstToken = lastToken = currentToken = null;
812 // INTERVAL OPERATOR:
813 // {x} | {x,} | {x,y} (RE_INTERVALS && RE_NO_BK_BRACES)
814 // \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES)
817 // what is proper interpretation of '{' at start of string?
819 else if ((unit.ch == '{') && syntax.get(RESyntax.RE_INTERVALS) && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)) {
820 int newIndex = getMinMax(pattern,index,minMax,syntax);
821 if (newIndex > index) {
822 if (minMax.first > minMax.second)
823 throw new REException(getLocalizedMessage("interval.order"),REException.REG_BADRPT,newIndex);
824 if (currentToken == null)
825 throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,newIndex);
826 if (currentToken instanceof RETokenRepeated)
827 throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,newIndex);
828 if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
829 throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,newIndex);
830 if ((currentToken.getMinimumLength() == 0) && (minMax.second == Integer.MAX_VALUE))
831 throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,newIndex);
833 currentToken = setRepeated(currentToken,minMax.first,minMax.second,index);
836 addToken(currentToken);
837 currentToken = new RETokenChar(subIndex,unit.ch,insens);
844 else if ((unit.ch == '[') && !unit.bk) {
845 Vector options = new Vector();
846 boolean negative = false;
848 if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index);
850 // Check for initial caret, negation
851 if ((ch = pattern[index]) == '^') {
853 if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
857 // Check for leading right bracket literal
860 if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
863 while ((ch = pattern[index++]) != ']') {
864 if ((ch == '-') && (lastChar != 0)) {
865 if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
866 if ((ch = pattern[index]) == ']') {
867 options.addElement(new RETokenChar(subIndex,lastChar,insens));
870 options.addElement(new RETokenRange(subIndex,lastChar,ch,insens));
874 } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
875 if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
877 boolean negate = false;
879 if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
880 switch (pattern[index]) {
884 posixID = RETokenPOSIX.DIGIT;
889 posixID = RETokenPOSIX.SPACE;
894 posixID = RETokenPOSIX.ALNUM;
898 else if ("nrt".indexOf(pattern[index]) != -1) {
899 switch (pattern[index]) {
911 if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
914 options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate));
915 } else if (asciiEsc != 0) {
918 lastChar = pattern[index];
921 } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) {
922 StringBuffer posixSet = new StringBuffer();
923 index = getPosixSet(pattern,index+1,posixSet);
924 int posixId = RETokenPOSIX.intValue(posixSet.toString());
926 options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false));
928 if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
931 if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
933 // Out of list, index is one past ']'
935 if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
937 // Create a new RETokenOneOf
938 addToken(currentToken);
939 options.trimToSize();
940 currentToken = new RETokenOneOf(subIndex,options,negative);
944 // (...) | \(...\) depending on RE_NO_BK_PARENS
946 else if ((unit.ch == '(') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)) {
947 boolean pure = false;
948 boolean comment = false;
949 boolean lookAhead = false;
950 boolean negativelh = false;
951 if ((index+1 < pLength) && (pattern[index] == '?')) {
952 switch (pattern[index+1]) {
954 if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
962 if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
969 if (syntax.get(RESyntax.RE_PURE_GROUPING)) {
975 if (syntax.get(RESyntax.RE_COMMENTS)) {
980 throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index);
984 if (index >= pLength) {
985 throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index);
988 // find end of subexpression
989 int endIndex = index;
990 int nextIndex = index;
993 while ( ((nextIndex = getCharUnit(pattern,endIndex,unit)) > 0)
994 && !(nested == 0 && (unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)) )
995 if ((endIndex = nextIndex) >= pLength)
996 throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex);
997 else if (unit.ch == '(' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))
999 else if (unit.ch == ')' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))
1002 // endIndex is now position at a ')','\)'
1003 // nextIndex is end of string or position after ')' or '\)'
1005 if (comment) index = nextIndex;
1006 else { // not a comment
1007 // create RE subexpression as token.
1008 addToken(currentToken);
1013 int useIndex = (pure || lookAhead) ? 0 : nextSub + numSubs;
1014 currentToken = new RE(String.valueOf(pattern,index,endIndex-index).toCharArray(),cflags,syntax,useIndex,nextSub + numSubs);
1015 numSubs += ((RE) currentToken).getNumSubs();
1018 currentToken = new RETokenLookAhead(currentToken,negativelh);
1025 // UNMATCHED RIGHT PAREN
1026 // ) or \) throw exception if
1027 // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
1028 else if (!syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) && ((unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))) {
1029 throw new REException(getLocalizedMessage("unmatched.paren"),REException.REG_EPAREN,index);
1032 // START OF LINE OPERATOR
1035 else if ((unit.ch == '^') && !unit.bk) {
1036 addToken(currentToken);
1037 currentToken = null;
1038 addToken(new RETokenStart(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null));
1041 // END OF LINE OPERATOR
1044 else if ((unit.ch == '$') && !unit.bk) {
1045 addToken(currentToken);
1046 currentToken = null;
1047 addToken(new RETokenEnd(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null));
1050 // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null)
1053 else if ((unit.ch == '.') && !unit.bk) {
1054 addToken(currentToken);
1055 currentToken = new RETokenAny(subIndex,syntax.get(RESyntax.RE_DOT_NEWLINE) || ((cflags & REG_DOT_NEWLINE) > 0),syntax.get(RESyntax.RE_DOT_NOT_NULL));
1058 // ZERO-OR-MORE REPEAT OPERATOR
1061 else if ((unit.ch == '*') && !unit.bk) {
1062 if (currentToken == null)
1063 throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
1064 if (currentToken instanceof RETokenRepeated)
1065 throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
1066 if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
1067 throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
1068 if (currentToken.getMinimumLength() == 0)
1069 throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,index);
1070 currentToken = setRepeated(currentToken,0,Integer.MAX_VALUE,index);
1073 // ONE-OR-MORE REPEAT OPERATOR
1074 // + | \+ depending on RE_BK_PLUS_QM
1075 // not available if RE_LIMITED_OPS is set
1077 else if ((unit.ch == '+') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ unit.bk)) {
1078 if (currentToken == null)
1079 throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
1080 if (currentToken instanceof RETokenRepeated)
1081 throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
1082 if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
1083 throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
1084 if (currentToken.getMinimumLength() == 0)
1085 throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,index);
1086 currentToken = setRepeated(currentToken,1,Integer.MAX_VALUE,index);
1089 // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
1090 // ? | \? depending on RE_BK_PLUS_QM
1091 // not available if RE_LIMITED_OPS is set
1092 // stingy matching if RE_STINGY_OPS is set and it follows a quantifier
1094 else if ((unit.ch == '?') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ unit.bk)) {
1095 if (currentToken == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
1097 // Check for stingy matching on RETokenRepeated
1098 if (currentToken instanceof RETokenRepeated) {
1099 if (syntax.get(RESyntax.RE_STINGY_OPS) && !((RETokenRepeated)currentToken).isStingy())
1100 ((RETokenRepeated)currentToken).makeStingy();
1102 throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
1104 else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
1105 throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
1107 currentToken = setRepeated(currentToken,0,1,index);
1110 // BACKREFERENCE OPERATOR
1112 // not available if RE_NO_BK_REFS is set
1114 else if (unit.bk && Character.isDigit(unit.ch) && !syntax.get(RESyntax.RE_NO_BK_REFS)) {
1115 addToken(currentToken);
1116 currentToken = new RETokenBackRef(subIndex,Character.digit(unit.ch,10),insens);
1119 // START OF STRING OPERATOR
1120 // \A if RE_STRING_ANCHORS is set
1122 else if (unit.bk && (unit.ch == 'A') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
1123 addToken(currentToken);
1124 currentToken = new RETokenStart(subIndex,null);
1127 // WORD BREAK OPERATOR
1130 else if (unit.bk && (unit.ch == 'b') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
1131 addToken(currentToken);
1132 currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, false);
1135 // WORD BEGIN OPERATOR
1137 else if (unit.bk && (unit.ch == '<')) {
1138 addToken(currentToken);
1139 currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN, false);
1142 // WORD END OPERATOR
1144 else if (unit.bk && (unit.ch == '>')) {
1145 addToken(currentToken);
1146 currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.END, false);
1149 // NON-WORD BREAK OPERATOR
1152 else if (unit.bk && (unit.ch == 'B') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
1153 addToken(currentToken);
1154 currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, true);
1159 // \d if RE_CHAR_CLASS_ESCAPES is set
1161 else if (unit.bk && (unit.ch == 'd') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
1162 addToken(currentToken);
1163 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,false);
1166 // NON-DIGIT OPERATOR
1169 else if (unit.bk && (unit.ch == 'D') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
1170 addToken(currentToken);
1171 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,true);
1177 else if (unit.bk && (unit.ch == 'n')) {
1178 addToken(currentToken);
1179 currentToken = new RETokenChar(subIndex,'\n',false);
1185 else if (unit.bk && (unit.ch == 'r')) {
1186 addToken(currentToken);
1187 currentToken = new RETokenChar(subIndex,'\r',false);
1190 // WHITESPACE OPERATOR
1191 // \s if RE_CHAR_CLASS_ESCAPES is set
1193 else if (unit.bk && (unit.ch == 's') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
1194 addToken(currentToken);
1195 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,false);
1198 // NON-WHITESPACE OPERATOR
1201 else if (unit.bk && (unit.ch == 'S') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
1202 addToken(currentToken);
1203 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,true);
1209 else if (unit.bk && (unit.ch == 't')) {
1210 addToken(currentToken);
1211 currentToken = new RETokenChar(subIndex,'\t',false);
1214 // ALPHANUMERIC OPERATOR
1217 else if (unit.bk && (unit.ch == 'w') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
1218 addToken(currentToken);
1219 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,false);
1222 // NON-ALPHANUMERIC OPERATOR
1225 else if (unit.bk && (unit.ch == 'W') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
1226 addToken(currentToken);
1227 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,true);
1230 // END OF STRING OPERATOR
1233 else if (unit.bk && (unit.ch == 'Z') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
1234 addToken(currentToken);
1235 currentToken = new RETokenEnd(subIndex,null);
1238 // NON-SPECIAL CHARACTER (or escape to make literal)
1239 // c | \* for example
1241 else { // not a special character
1242 addToken(currentToken);
1243 currentToken = new RETokenChar(subIndex,unit.ch,insens);
1247 // Add final buffered token and an EndSub marker
1248 addToken(currentToken);
1250 if (branches != null) {
1251 branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength));
1252 branches.trimToSize(); // compact the Vector
1254 firstToken = lastToken = null;
1255 addToken(new RETokenOneOf(subIndex,branches,false));
1257 else addToken(new RETokenEndSub(subIndex));
1261 private static int getCharUnit(char[] input, int index, CharUnit unit) throws REException {
1262 unit.ch = input[index++];
1263 if (unit.bk = (unit.ch == '\\'))
1264 if (index < input.length)
1265 unit.ch = input[index++];
1266 else throw new REException(getLocalizedMessage("ends.with.backslash"),REException.REG_ESCAPE,index);
1271 * Checks if the regular expression matches the input in its entirety.
1273 * @param input The input text.
1275 public boolean isMatch(Object input) {
1276 return isMatch(input,0,0);
1280 * Checks if the input string, starting from index, is an exact match of
1281 * this regular expression.
1283 * @param input The input text.
1284 * @param index The offset index at which the search should be begin.
1286 public boolean isMatch(Object input,int index) {
1287 return isMatch(input,index,0);
1292 * Checks if the input, starting from index and using the specified
1293 * execution flags, is an exact match of this regular expression.
1295 * @param input The input text.
1296 * @param index The offset index at which the search should be begin.
1297 * @param eflags The logical OR of any execution flags above.
1299 public boolean isMatch(Object input,int index,int eflags) {
1300 return isMatchImpl(makeCharIndexed(input,index),index,eflags);
1303 private boolean isMatchImpl(CharIndexed input, int index, int eflags) {
1304 if (firstToken == null) // Trivial case
1305 return (input.charAt(0) == CharIndexed.OUT_OF_BOUNDS);
1306 REMatch m = new REMatch(numSubs, index, eflags);
1307 if (firstToken.match(input, m)) {
1309 if (input.charAt(m.index) == CharIndexed.OUT_OF_BOUNDS) {
1319 * Returns the maximum number of subexpressions in this regular expression.
1320 * If the expression contains branches, the value returned will be the
1321 * maximum subexpressions in any of the branches.
1323 public int getNumSubs() {
1327 // Overrides REToken.setUncle
1328 void setUncle(REToken uncle) {
1329 if (lastToken != null) {
1330 lastToken.setUncle(uncle);
1331 } else super.setUncle(uncle); // to deal with empty subexpressions
1334 // Overrides REToken.chain
1336 boolean chain(REToken next) {
1343 * Returns the minimum number of characters that could possibly
1344 * constitute a match of this regular expression.
1346 public int getMinimumLength() {
1347 return minimumLength;
1351 * Returns an array of all matches found in the input.
1353 * If the regular expression allows the empty string to match, it will
1354 * substitute matches at all positions except the end of the input.
1356 * @param input The input text.
1357 * @return a non-null (but possibly zero-length) array of matches
1359 public REMatch[] getAllMatches(Object input) {
1360 return getAllMatches(input,0,0);
1364 * Returns an array of all matches found in the input,
1365 * beginning at the specified index position.
1367 * If the regular expression allows the empty string to match, it will
1368 * substitute matches at all positions except the end of the input.
1370 * @param input The input text.
1371 * @param index The offset index at which the search should be begin.
1372 * @return a non-null (but possibly zero-length) array of matches
1374 public REMatch[] getAllMatches(Object input, int index) {
1375 return getAllMatches(input,index,0);
1379 * Returns an array of all matches found in the input string,
1380 * beginning at the specified index position and using the specified
1383 * If the regular expression allows the empty string to match, it will
1384 * substitute matches at all positions except the end of the input.
1386 * @param input The input text.
1387 * @param index The offset index at which the search should be begin.
1388 * @param eflags The logical OR of any execution flags above.
1389 * @return a non-null (but possibly zero-length) array of matches
1391 public REMatch[] getAllMatches(Object input, int index, int eflags) {
1392 return getAllMatchesImpl(makeCharIndexed(input,index),index,eflags);
1395 // this has been changed since 1.03 to be non-overlapping matches
1396 private REMatch[] getAllMatchesImpl(CharIndexed input, int index, int eflags) {
1397 Vector all = new Vector();
1399 while ((m = getMatchImpl(input,index,eflags,null)) != null) {
1401 index = m.getEndIndex();
1402 if (m.end[0] == 0) { // handle pathological case of zero-length match
1406 input.move(m.end[0]);
1408 if (!input.isValid()) break;
1410 REMatch[] mset = new REMatch[all.size()];
1415 /* Implements abstract method REToken.match() */
1416 boolean match(CharIndexed input, REMatch mymatch) {
1417 if (firstToken == null) return next(input, mymatch);
1419 // Note the start of this subexpression
1420 mymatch.start[subIndex] = mymatch.index;
1422 return firstToken.match(input, mymatch);
1426 * Returns the first match found in the input. If no match is found,
1429 * @param input The input text.
1430 * @return An REMatch instance referencing the match, or null if none.
1432 public REMatch getMatch(Object input) {
1433 return getMatch(input,0,0);
1437 * Returns the first match found in the input, beginning
1438 * the search at the specified index. If no match is found,
1441 * @param input The input text.
1442 * @param index The offset within the text to begin looking for a match.
1443 * @return An REMatch instance referencing the match, or null if none.
1445 public REMatch getMatch(Object input, int index) {
1446 return getMatch(input,index,0);
1450 * Returns the first match found in the input, beginning
1451 * the search at the specified index, and using the specified
1452 * execution flags. If no match is found, returns null.
1454 * @param input The input text.
1455 * @param index The offset index at which the search should be begin.
1456 * @param eflags The logical OR of any execution flags above.
1457 * @return An REMatch instance referencing the match, or null if none.
1459 public REMatch getMatch(Object input, int index, int eflags) {
1460 return getMatch(input,index,eflags,null);
1464 * Returns the first match found in the input, beginning the search
1465 * at the specified index, and using the specified execution flags.
1466 * If no match is found, returns null. If a StringBuffer is
1467 * provided and is non-null, the contents of the input text from the
1468 * index to the beginning of the match (or to the end of the input,
1469 * if there is no match) are appended to the StringBuffer.
1471 * @param input The input text.
1472 * @param index The offset index at which the search should be begin.
1473 * @param eflags The logical OR of any execution flags above.
1474 * @param buffer The StringBuffer to save pre-match text in.
1475 * @return An REMatch instance referencing the match, or null if none. */
1476 public REMatch getMatch(Object input, int index, int eflags, StringBuffer buffer) {
1477 return getMatchImpl(makeCharIndexed(input,index),index,eflags,buffer);
1480 REMatch getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer buffer) {
1481 // Create a new REMatch to hold results
1482 REMatch mymatch = new REMatch(numSubs, anchor, eflags);
1484 // Optimization: check if anchor + minimumLength > length
1485 if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) {
1486 if (match(input, mymatch)) {
1487 // Find longest match of them all to observe leftmost longest
1488 REMatch longest = mymatch;
1489 while ((mymatch = mymatch.next) != null) {
1490 if (mymatch.index > longest.index) {
1495 longest.end[0] = longest.index;
1496 longest.finish(input);
1500 mymatch.clear(++anchor);
1501 // Append character to buffer if needed
1502 if (buffer != null && input.charAt(0) != CharIndexed.OUT_OF_BOUNDS) {
1503 buffer.append(input.charAt(0));
1505 } while (input.move(1));
1511 * Returns an REMatchEnumeration that can be used to iterate over the
1512 * matches found in the input text.
1514 * @param input The input text.
1515 * @return A non-null REMatchEnumeration instance.
1517 public REMatchEnumeration getMatchEnumeration(Object input) {
1518 return getMatchEnumeration(input,0,0);
1523 * Returns an REMatchEnumeration that can be used to iterate over the
1524 * matches found in the input text.
1526 * @param input The input text.
1527 * @param index The offset index at which the search should be begin.
1528 * @return A non-null REMatchEnumeration instance, with its input cursor
1529 * set to the index position specified.
1531 public REMatchEnumeration getMatchEnumeration(Object input, int index) {
1532 return getMatchEnumeration(input,index,0);
1536 * Returns an REMatchEnumeration that can be used to iterate over the
1537 * matches found in the input text.
1539 * @param input The input text.
1540 * @param index The offset index at which the search should be begin.
1541 * @param eflags The logical OR of any execution flags above.
1542 * @return A non-null REMatchEnumeration instance, with its input cursor
1543 * set to the index position specified.
1545 public REMatchEnumeration getMatchEnumeration(Object input, int index, int eflags) {
1546 return new REMatchEnumeration(this,makeCharIndexed(input,index),index,eflags);
1551 * Substitutes the replacement text for the first match found in the input.
1553 * @param input The input text.
1554 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1555 * @return A String interpolating the substituted text.
1556 * @see REMatch#substituteInto
1558 public String substitute(Object input,String replace) {
1559 return substitute(input,replace,0,0);
1563 * Substitutes the replacement text for the first match found in the input
1564 * beginning at the specified index position. Specifying an index
1565 * effectively causes the regular expression engine to throw away the
1566 * specified number of characters.
1568 * @param input The input text.
1569 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1570 * @param index The offset index at which the search should be begin.
1571 * @return A String containing the substring of the input, starting
1572 * at the index position, and interpolating the substituted text.
1573 * @see REMatch#substituteInto
1575 public String substitute(Object input,String replace,int index) {
1576 return substitute(input,replace,index,0);
1580 * Substitutes the replacement text for the first match found in the input
1581 * string, beginning at the specified index position and using the
1582 * specified execution flags.
1584 * @param input The input text.
1585 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1586 * @param index The offset index at which the search should be begin.
1587 * @param eflags The logical OR of any execution flags above.
1588 * @return A String containing the substring of the input, starting
1589 * at the index position, and interpolating the substituted text.
1590 * @see REMatch#substituteInto
1592 public String substitute(Object input,String replace,int index,int eflags) {
1593 return substituteImpl(makeCharIndexed(input,index),replace,index,eflags);
1596 private String substituteImpl(CharIndexed input,String replace,int index,int eflags) {
1597 StringBuffer buffer = new StringBuffer();
1598 REMatch m = getMatchImpl(input,index,eflags,buffer);
1599 if (m==null) return buffer.toString();
1600 buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ?
1601 replace : m.substituteInto(replace) );
1602 if (input.move(m.end[0])) {
1604 buffer.append(input.charAt(0));
1605 } while (input.move(1));
1607 return buffer.toString();
1611 * Substitutes the replacement text for each non-overlapping match found
1612 * in the input text.
1614 * @param input The input text.
1615 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1616 * @return A String interpolating the substituted text.
1617 * @see REMatch#substituteInto
1619 public String substituteAll(Object input,String replace) {
1620 return substituteAll(input,replace,0,0);
1624 * Substitutes the replacement text for each non-overlapping match found
1625 * in the input text, starting at the specified index.
1627 * If the regular expression allows the empty string to match, it will
1628 * substitute matches at all positions except the end of the input.
1630 * @param input The input text.
1631 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1632 * @param index The offset index at which the search should be begin.
1633 * @return A String containing the substring of the input, starting
1634 * at the index position, and interpolating the substituted text.
1635 * @see REMatch#substituteInto
1637 public String substituteAll(Object input,String replace,int index) {
1638 return substituteAll(input,replace,index,0);
1642 * Substitutes the replacement text for each non-overlapping match found
1643 * in the input text, starting at the specified index and using the
1644 * specified execution flags.
1646 * @param input The input text.
1647 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1648 * @param index The offset index at which the search should be begin.
1649 * @param eflags The logical OR of any execution flags above.
1650 * @return A String containing the substring of the input, starting
1651 * at the index position, and interpolating the substituted text.
1652 * @see REMatch#substituteInto
1654 public String substituteAll(Object input,String replace,int index,int eflags) {
1655 return substituteAllImpl(makeCharIndexed(input,index),replace,index,eflags);
1658 private String substituteAllImpl(CharIndexed input,String replace,int index,int eflags) {
1659 StringBuffer buffer = new StringBuffer();
1661 while ((m = getMatchImpl(input,index,eflags,buffer)) != null) {
1662 buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ?
1663 replace : m.substituteInto(replace) );
1664 index = m.getEndIndex();
1665 if (m.end[0] == 0) {
1666 char ch = input.charAt(0);
1667 if (ch != CharIndexed.OUT_OF_BOUNDS)
1671 input.move(m.end[0]);
1674 if (!input.isValid()) break;
1676 return buffer.toString();
1679 /* Helper function for constructor */
1680 private void addToken(REToken next) {
1681 if (next == null) return;
1682 minimumLength += next.getMinimumLength();
1683 if (firstToken == null) {
1684 lastToken = firstToken = next;
1686 // if chain returns false, it "rejected" the token due to
1687 // an optimization, and next was combined with lastToken
1688 if (lastToken.chain(next)) {
1694 private static REToken setRepeated(REToken current, int min, int max, int index) throws REException {
1695 if (current == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
1696 return new RETokenRepeated(current.subIndex,current,min,max);
1699 private static int getPosixSet(char[] pattern,int index,StringBuffer buf) {
1700 // Precondition: pattern[index-1] == ':'
1701 // we will return pos of closing ']'.
1703 for (i=index; i<(pattern.length-1); i++) {
1704 if ((pattern[i] == ':') && (pattern[i+1] == ']'))
1706 buf.append(pattern[i]);
1708 return index; // didn't match up
1711 private int getMinMax(char[] input,int index,IntPair minMax,RESyntax syntax) throws REException {
1712 // Precondition: input[index-1] == '{', minMax != null
1714 boolean mustMatch = !syntax.get(RESyntax.RE_NO_BK_BRACES);
1715 int startIndex = index;
1716 if (index == input.length) {
1718 throw new REException(getLocalizedMessage("unmatched.brace"),REException.REG_EBRACE,index);
1724 CharUnit unit = new CharUnit();
1725 StringBuffer buf = new StringBuffer();
1727 // Read string of digits
1729 index = getCharUnit(input,index,unit);
1730 if (Character.isDigit(unit.ch))
1731 buf.append(unit.ch);
1732 } while ((index != input.length) && Character.isDigit(unit.ch));
1734 // Check for {} tomfoolery
1735 if (buf.length() == 0) {
1737 throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
1742 min = Integer.parseInt(buf.toString());
1744 if ((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk))
1746 else if (index == input.length)
1748 throw new REException(getLocalizedMessage("interval.no.end"),REException.REG_EBRACE,index);
1751 else if ((unit.ch == ',') && !unit.bk) {
1752 buf = new StringBuffer();
1753 // Read string of digits
1754 while (((index = getCharUnit(input,index,unit)) != input.length) && Character.isDigit(unit.ch))
1755 buf.append(unit.ch);
1757 if (!((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)))
1759 throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
1763 // This is the case of {x,}
1764 if (buf.length() == 0) max = Integer.MAX_VALUE;
1765 else max = Integer.parseInt(buf.toString());
1768 throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
1772 // We know min and max now, and they are valid.
1775 minMax.second = max;
1777 // return the index following the '}'
1782 * Return a human readable form of the compiled regular expression,
1783 * useful for debugging.
1785 public String toString() {
1786 StringBuffer sb = new StringBuffer();
1788 return sb.toString();
1791 void dump(StringBuffer os) {
1795 if (firstToken != null)
1796 firstToken.dumpAll(os);
1800 // Cast input appropriately or throw exception
1801 private static CharIndexed makeCharIndexed(Object input, int index) {
1802 // We could let a String fall through to final input, but since
1803 // it's the most likely input type, we check it first.
1804 if (input instanceof String)
1805 return new CharIndexedString((String) input,index);
1806 else if (input instanceof char[])
1807 return new CharIndexedCharArray((char[]) input,index);
1808 else if (input instanceof StringBuffer)
1809 return new CharIndexedStringBuffer((StringBuffer) input,index);
1810 else if (input instanceof InputStream)
1811 return new CharIndexedInputStream((InputStream) input,index);
1812 else if (input instanceof Reader)
1813 return new CharIndexedReader((Reader) input, index);
1814 else if (input instanceof CharIndexed)
1815 return (CharIndexed) input; // do we lose index info?
1817 return new CharIndexedString(input.toString(), index);
1821 * gnu/regexp/REException.java
1822 * Copyright (C) 1998-2001 Wes Biggs
1824 * This library is free software; you can redistribute it and/or modify
1825 * it under the terms of the GNU Lesser General Public License as published
1826 * by the Free Software Foundation; either version 2.1 of the License, or
1827 * (at your option) any later version.
1829 * This library is distributed in the hope that it will be useful,
1830 * but WITHOUT ANY WARRANTY; without even the implied warranty of
1831 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1832 * GNU Lesser General Public License for more details.
1834 * You should have received a copy of the GNU Lesser General Public License
1835 * along with this program; if not, write to the Free Software
1836 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
1842 * This is the regular expression exception class. An exception of this type
1843 * defines the three attributes:
1845 * <LI> A descriptive message of the error.
1846 * <LI> An integral type code equivalent to one of the statically
1847 * defined symbols listed below.
1848 * <LI> The approximate position in the input string where the error
1852 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
1855 public static class REException extends Exception {
1859 // Error conditions from GNU regcomp(3) manual
1863 * Invalid use of repetition operators such as using
1864 * `*' as the first character.
1866 public static final int REG_BADRPT = 1;
1870 * Invalid use of back reference operator.
1872 public static final int REG_BADBR = 2;
1876 * Un-matched brace interval operators.
1878 public static final int REG_EBRACE = 3;
1882 * Un-matched bracket list operators.
1884 public static final int REG_EBRACK = 4;
1888 * Invalid use of the range operator, eg. the ending
1889 * point of the range occurs prior to the starting
1892 public static final int REG_ERANGE = 5;
1896 * Unknown character class name. <B>Not implemented</B>.
1898 public static final int REG_ECTYPE = 6;
1902 * Un-matched parenthesis group operators.
1904 public static final int REG_EPAREN = 7;
1908 * Invalid back reference to a subexpression.
1910 public static final int REG_ESUBREG = 8;
1914 * Non specific error. <B>Not implemented</B>.
1916 public static final int REG_EEND = 9;
1920 * Invalid escape sequence. <B>Not implemented</B>.
1922 public static final int REG_ESCAPE = 10;
1926 * Invalid use of pattern operators such as group or list.
1928 public static final int REG_BADPAT = 11;
1932 * Compiled regular expression requires a pattern
1933 * buffer larger than 64Kb. <B>Not implemented</B>.
1935 public static final int REG_ESIZE = 12;
1939 * The regex routines ran out of memory. <B>Not implemented</B>.
1941 public static final int REG_ESPACE = 13;
1943 REException(String msg, int type, int position) {
1946 this.pos = position;
1950 * Returns the type of the exception, one of the constants listed above.
1953 public int getType() {
1958 * Returns the position, relative to the string or character array being
1959 * compiled, where the error occurred. This position is generally the point
1960 * where the error was detected, not necessarily the starting index of
1961 * a bad subexpression.
1963 public int getPosition() {
1968 * Reports the descriptive message associated with this exception
1969 * as well as its index position in the string or character array
1972 public String getMessage() {
1973 Object[] args = {new Integer(pos)};
1974 StringBuffer sb = new StringBuffer();
1975 String prefix = RE.getLocalizedMessage("error.prefix");
1976 sb.append(MessageFormat.format(prefix, args));
1978 sb.append(super.getMessage());
1979 return sb.toString();
1983 * gnu/regexp/REFilterInputStream.java
1984 * Copyright (C) 1998-2001 Wes Biggs
1986 * This library is free software; you can redistribute it and/or modify
1987 * it under the terms of the GNU Lesser General Public License as published
1988 * by the Free Software Foundation; either version 2.1 of the License, or
1989 * (at your option) any later version.
1991 * This library is distributed in the hope that it will be useful,
1992 * but WITHOUT ANY WARRANTY; without even the implied warranty of
1993 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1994 * GNU Lesser General Public License for more details.
1996 * You should have received a copy of the GNU Lesser General Public License
1997 * along with this program; if not, write to the Free Software
1998 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
2003 * Replaces instances of a given RE found within an InputStream
2004 * with replacement text. The replacements are interpolated into the
2005 * stream when a match is found.
2007 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
2008 * @deprecated This class cannot properly handle all character
2009 * encodings. For proper handling, use the REFilterReader
2013 public static class REFilterInputStream extends FilterInputStream {
2016 private String replace;
2017 private String buffer;
2020 private CharIndexedInputStream stream;
2023 * Creates an REFilterInputStream. When reading from this stream,
2024 * occurrences of patterns matching the supplied regular expression
2025 * will be replaced with the supplied replacement text (the
2026 * metacharacters $0 through $9 may be used to refer to the full
2027 * match or subexpression matches).
2029 * @param stream The InputStream to be filtered.
2030 * @param expr The regular expression to search for.
2031 * @param replace The text pattern to replace matches with.
2033 public REFilterInputStream(InputStream stream, RE expr, String replace) {
2035 this.stream = new CharIndexedInputStream(stream,0);
2037 this.replace = replace;
2041 * Reads the next byte from the stream per the general contract of
2042 * InputStream.read(). Returns -1 on error or end of stream.
2045 // If we have buffered replace data, use it.
2046 if ((buffer != null) && (bufpos < buffer.length())) {
2047 return (int) buffer.charAt(bufpos++);
2050 // check if input is at a valid position
2051 if (!stream.isValid()) return -1;
2053 REMatch mymatch = new REMatch(expr.getNumSubs(),offset,0);
2054 if (expr.match(stream, mymatch)) {
2055 mymatch.end[0] = mymatch.index;
2056 mymatch.finish(stream);
2057 stream.move(mymatch.toString().length());
2058 offset += mymatch.toString().length();
2059 buffer = mymatch.substituteInto(replace);
2062 // This is prone to infinite loops if replace string turns out empty.
2063 if (buffer.length() > 0) {
2064 return buffer.charAt(0);
2067 char ch = stream.charAt(0);
2068 if (ch == CharIndexed.OUT_OF_BOUNDS) return -1;
2075 * Returns false. REFilterInputStream does not support mark() and
2078 public boolean markSupported() {
2082 /** Reads from the stream into the provided array. */
2083 public int read(byte[] b, int off, int len) {
2088 if (i == -1) return (ok == 0) ? -1 : ok;
2089 b[off++] = (byte) i;
2095 /** Reads from the stream into the provided array. */
2096 public int read(byte[] b) {
2097 return read(b,0,b.length);
2101 * gnu/regexp/REFilterReader.java
2102 * Copyright (C) 2001 Lee Sau Dan
2103 * Based on gnu.regexp.REFilterInputStream by Wes Biggs
2105 * This library is free software; you can redistribute it and/or modify
2106 * it under the terms of the GNU Lesser General Public License as published
2107 * by the Free Software Foundation; either version 2.1 of the License, or
2108 * (at your option) any later version.
2110 * This library is distributed in the hope that it will be useful,
2111 * but WITHOUT ANY WARRANTY; without even the implied warranty of
2112 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2113 * GNU Lesser General Public License for more details.
2115 * You should have received a copy of the GNU Lesser General Public License
2116 * along with this program; if not, write to the Free Software
2117 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
2122 * Replaces instances of a given RE with replacement text.
2124 * @author <A HREF="http://www.csis.hku.hk/~sdlee/">Lee Sau Dan</A>
2125 * @since gnu.regexp 1.1.0
2128 public static class REFilterReader extends FilterReader {
2131 private String replace;
2132 private String buffer;
2135 private CharIndexedReader stream;
2138 * Creates an REFilterReader. When reading from this stream,
2139 * occurrences of patterns matching the supplied regular expression
2140 * will be replaced with the supplied replacement text (the
2141 * metacharacters $0 through $9 may be used to refer to the full
2142 * match or subexpression matches.
2144 * @param stream The Reader to be filtered.
2145 * @param expr The regular expression to search for.
2146 * @param replace The text pattern to replace matches with.
2148 public REFilterReader(Reader stream, RE expr, String replace) {
2150 this.stream = new CharIndexedReader(stream,0);
2152 this.replace = replace;
2156 * Reads the next character from the stream per the general contract of
2157 * Reader.read(). Returns -1 on error or end of stream.
2160 // If we have buffered replace data, use it.
2161 if ((buffer != null) && (bufpos < buffer.length())) {
2162 return (int) buffer.charAt(bufpos++);
2165 // check if input is at a valid position
2166 if (!stream.isValid()) return -1;
2168 REMatch mymatch = new REMatch(expr.getNumSubs(),offset,0);
2169 if (expr.match(stream,mymatch)) {
2170 mymatch.end[0] = mymatch.index;
2171 mymatch.finish(stream);
2172 stream.move(mymatch.toString().length());
2173 offset += mymatch.toString().length();
2174 buffer = mymatch.substituteInto(replace);
2177 if (buffer.length() > 0) {
2178 return buffer.charAt(0);
2181 char ch = stream.charAt(0);
2182 if (ch == CharIndexed.OUT_OF_BOUNDS) return -1;
2189 * Returns false. REFilterReader does not support mark() and
2192 public boolean markSupported() {
2196 /** Reads from the stream into the provided array. */
2197 public int read(char[] b, int off, int len) {
2202 if (i == -1) return (ok == 0) ? -1 : ok;
2203 b[off++] = (char) i;
2209 /** Reads from the stream into the provided array. */
2210 public int read(char[] b) {
2211 return read(b,0,b.length);
2215 * gnu/regexp/REMatch.java
2216 * Copyright (C) 1998-2001 Wes Biggs
2218 * This library is free software; you can redistribute it and/or modify
2219 * it under the terms of the GNU Lesser General Public License as published
2220 * by the Free Software Foundation; either version 2.1 of the License, or
2221 * (at your option) any later version.
2223 * This library is distributed in the hope that it will be useful,
2224 * but WITHOUT ANY WARRANTY; without even the implied warranty of
2225 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2226 * GNU Lesser General Public License for more details.
2228 * You should have received a copy of the GNU Lesser General Public License
2229 * along with this program; if not, write to the Free Software
2230 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
2235 * An instance of this class represents a match
2236 * completed by a gnu.regexp matching function. It can be used
2237 * to obtain relevant information about the location of a match
2240 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
2242 public static final class REMatch implements Serializable, Cloneable {
2243 private String matchedText;
2245 // These variables are package scope for fast access within the engine
2246 int eflags; // execution flags this match was made using
2248 // Offset in source text where match was tried. This is zero-based;
2249 // the actual position in the source text is given by (offset + anchor).
2252 // Anchor position refers to the index into the source input
2253 // at which the matching operation began.
2254 // This is also useful for the ANCHORINDEX option.
2257 // Package scope; used by RE.
2258 int index; // used while matching to mark current match position in input
2259 int[] start; // start positions (relative to offset) for each (sub)exp.
2260 int[] end; // end positions for the same
2261 REMatch next; // other possibility (to avoid having to use arrays)
2263 public Object clone() {
2265 REMatch copy = (REMatch) super.clone();
2268 copy.start = (int[]) start.clone();
2269 copy.end = (int[]) end.clone();
2272 } catch (CloneNotSupportedException e) {
2273 throw new Error(); // doesn't happen
2277 void assignFrom(REMatch other) {
2278 start = other.start;
2280 index = other.index;
2281 // need to deep clone?
2285 REMatch(int subs, int anchor, int eflags) {
2286 start = new int[subs+1];
2287 end = new int[subs+1];
2288 this.anchor = anchor;
2289 this.eflags = eflags;
2293 void finish(CharIndexed text) {
2295 StringBuffer sb = new StringBuffer();
2297 for (i = 0; i < end[0]; i++)
2298 sb.append(text.charAt(i));
2299 matchedText = sb.toString();
2300 for (i = 0; i < start.length; i++) {
2301 // If any subexpressions didn't terminate, they don't count
2302 // TODO check if this code ever gets hit
2303 if ((start[i] == -1) ^ (end[i] == -1)) {
2308 next = null; // cut off alternates
2311 /** Clears the current match and moves the offset to the new index. */
2312 void clear(int index) {
2315 for (int i = 0; i < start.length; i++) {
2316 start[i] = end[i] = -1;
2318 next = null; // cut off alternates
2322 * Returns the string matching the pattern. This makes it convenient
2323 * to write code like the following:
2326 * REMatch myMatch = myExpression.getMatch(myString);<br>
2327 * if (myMatch != null) System.out.println("Regexp found: "+myMatch);
2330 public String toString() {
2335 * Returns the index within the input text where the match in its entirety
2338 public int getStartIndex() {
2339 return offset + start[0];
2343 * Returns the index within the input string where the match in
2344 * its entirety ends. The return value is the next position after
2345 * the end of the string; therefore, a match created by the
2349 * <code>REMatch myMatch = myExpression.getMatch(myString);</code>
2351 * can be viewed (given that myMatch is not null) by creating
2353 * <code>String theMatch = myString.substring(myMatch.getStartIndex(),
2354 * myMatch.getEndIndex());</code>
2356 * But you can save yourself that work, since the <code>toString()</code>
2357 * method (above) does exactly that for you.
2359 public int getEndIndex() {
2360 return offset + end[0];
2364 * Returns the string matching the given subexpression. The subexpressions
2365 * are indexed starting with one, not zero. That is, the subexpression
2366 * identified by the first set of parentheses in a regular expression
2367 * could be retrieved from an REMatch by calling match.toString(1).
2369 * @param sub Index of the subexpression.
2371 public String toString(int sub) {
2372 if ((sub >= start.length) || (start[sub] == -1)) return "";
2373 return (matchedText.substring(start[sub],end[sub]));
2377 * Returns the index within the input string used to generate this match
2378 * where subexpression number <i>sub</i> begins, or <code>-1</code> if
2379 * the subexpression does not exist. The initial position is zero.
2381 * @param sub Subexpression index
2382 * @deprecated Use getStartIndex(int) instead.
2384 public int getSubStartIndex(int sub) {
2385 if (sub >= start.length) return -1;
2387 return (x == -1) ? x : offset + x;
2391 * Returns the index within the input string used to generate this match
2392 * where subexpression number <i>sub</i> begins, or <code>-1</code> if
2393 * the subexpression does not exist. The initial position is zero.
2395 * @param sub Subexpression index
2396 * @since gnu.regexp 1.1.0
2398 public int getStartIndex(int sub) {
2399 if (sub >= start.length) return -1;
2401 return (x == -1) ? x : offset + x;
2405 * Returns the index within the input string used to generate this match
2406 * where subexpression number <i>sub</i> ends, or <code>-1</code> if
2407 * the subexpression does not exist. The initial position is zero.
2409 * @param sub Subexpression index
2410 * @deprecated Use getEndIndex(int) instead
2412 public int getSubEndIndex(int sub) {
2413 if (sub >= start.length) return -1;
2415 return (x == -1) ? x : offset + x;
2419 * Returns the index within the input string used to generate this match
2420 * where subexpression number <i>sub</i> ends, or <code>-1</code> if
2421 * the subexpression does not exist. The initial position is zero.
2423 * @param sub Subexpression index
2425 public int getEndIndex(int sub) {
2426 if (sub >= start.length) return -1;
2428 return (x == -1) ? x : offset + x;
2432 * Substitute the results of this match to create a new string.
2433 * This is patterned after PERL, so the tokens to watch out for are
2434 * <code>$0</code> through <code>$9</code>. <code>$0</code> matches
2435 * the full substring matched; <code>$<i>n</i></code> matches
2436 * subexpression number <i>n</i>.
2438 * @param input A string consisting of literals and <code>$<i>n</i></code> tokens.
2440 public String substituteInto(String input) {
2441 // a la Perl, $0 is whole thing, $1 - $9 are subexpressions
2442 StringBuffer output = new StringBuffer();
2444 for (pos = 0; pos < input.length()-1; pos++) {
2445 if ((input.charAt(pos) == '$') && (Character.isDigit(input.charAt(pos+1)))) {
2446 int val = Character.digit(input.charAt(++pos),10);
2447 if (val < start.length) {
2448 output.append(toString(val));
2450 } else output.append(input.charAt(pos));
2452 if (pos < input.length()) output.append(input.charAt(pos));
2453 return output.toString();
2457 * gnu/regexp/REMatchEnumeration.java
2458 * Copyright (C) 1998-2001 Wes Biggs
2460 * This library is free software; you can redistribute it and/or modify
2461 * it under the terms of the GNU Lesser General Public License as published
2462 * by the Free Software Foundation; either version 2.1 of the License, or
2463 * (at your option) any later version.
2465 * This library is distributed in the hope that it will be useful,
2466 * but WITHOUT ANY WARRANTY; without even the implied warranty of
2467 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2468 * GNU Lesser General Public License for more details.
2470 * You should have received a copy of the GNU Lesser General Public License
2471 * along with this program; if not, write to the Free Software
2472 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
2476 * An REMatchEnumeration enumerates regular expression matches over a
2477 * given input text. You obtain a reference to an enumeration using
2478 * the <code>getMatchEnumeration()</code> methods on an instance of
2483 * REMatchEnumeration does lazy computation; that is, it will not
2484 * search for a match until it needs to. If you'd rather just get all
2485 * the matches at once in a big array, use the
2486 * <code>getAllMatches()</code> methods on RE. However, using an
2487 * enumeration can help speed performance when the entire text does
2488 * not need to be searched immediately.
2492 * The enumerated type is especially useful when searching on a Reader
2493 * or InputStream, because the InputStream read position cannot be
2494 * guaranteed after calling <code>getMatch()</code> (see the
2495 * description of that method for an explanation of why). Enumeration
2496 * also saves a lot of overhead required when calling
2497 * <code>getMatch()</code> multiple times.
2499 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
2501 public static class REMatchEnumeration implements Enumeration, Serializable {
2502 private static final int YES = 1;
2503 private static final int MAYBE = 0;
2504 private static final int NO = -1;
2507 private REMatch match;
2509 private CharIndexed input;
2513 // Package scope constructor is used by RE.getMatchEnumeration()
2514 REMatchEnumeration(RE expr, CharIndexed input, int index, int eflags) {
2519 this.eflags = eflags;
2522 /** Returns true if there are more matches in the input text. */
2523 public boolean hasMoreElements() {
2524 return hasMoreMatches(null);
2527 /** Returns true if there are more matches in the input text. */
2528 public boolean hasMoreMatches() {
2529 return hasMoreMatches(null);
2532 /** Returns true if there are more matches in the input text.
2533 * Saves the text leading up to the match (or to the end of the input)
2534 * in the specified buffer.
2536 public boolean hasMoreMatches(StringBuffer buffer) {
2537 if (more == MAYBE) {
2538 match = expr.getMatchImpl(input,index,eflags,buffer);
2539 if (match != null) {
2540 input.move((match.end[0] > 0) ? match.end[0] : 1);
2542 index = (match.end[0] > 0) ? match.end[0] + match.offset : index + 1;
2546 return (more == YES);
2549 /** Returns the next match in the input text. */
2550 public Object nextElement() throws NoSuchElementException {
2555 * Returns the next match in the input text. This method is provided
2556 * for convenience to avoid having to explicitly cast the return value
2559 public REMatch nextMatch() throws NoSuchElementException {
2560 if (hasMoreElements()) {
2561 more = (input.isValid()) ? MAYBE : NO;
2564 throw new NoSuchElementException();
2569 * gnu/regexp/RESyntax.java
2570 * Copyright (C) 1998-2001 Wes Biggs
2572 * This library is free software; you can redistribute it and/or modify
2573 * it under the terms of the GNU Lesser General Public License as published
2574 * by the Free Software Foundation; either version 2.1 of the License, or
2575 * (at your option) any later version.
2577 * This library is distributed in the hope that it will be useful,
2578 * but WITHOUT ANY WARRANTY; without even the implied warranty of
2579 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2580 * GNU Lesser General Public License for more details.
2582 * You should have received a copy of the GNU Lesser General Public License
2583 * along with this program; if not, write to the Free Software
2584 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
2589 * An RESyntax specifies the way a regular expression will be compiled.
2590 * This class provides a number of predefined useful constants for
2591 * emulating popular regular expression syntaxes. Additionally the
2592 * user may construct his or her own syntax, using any combination of the
2593 * syntax bit constants. The syntax is an optional argument to any of the
2594 * matching methods on class RE.
2596 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
2599 public static final class RESyntax implements Serializable {
2600 static final String DEFAULT_LINE_SEPARATOR = System.getProperty("line.separator");
2602 private static final String SYNTAX_IS_FINAL = RE.getLocalizedMessage("syntax.final");
2604 private BitSet bits;
2606 // true for the constant defined syntaxes
2607 private boolean isFinal = false;
2609 private String lineSeparator = DEFAULT_LINE_SEPARATOR;
2611 // Values for constants are bit indexes
2614 * Syntax bit. Backslash is an escape character in lists.
2616 public static final int RE_BACKSLASH_ESCAPE_IN_LISTS = 0;
2619 * Syntax bit. Use \? instead of ? and \+ instead of +.
2621 public static final int RE_BK_PLUS_QM = 1;
2624 * Syntax bit. POSIX character classes ([:...:]) in lists are allowed.
2626 public static final int RE_CHAR_CLASSES = 2;
2629 * Syntax bit. ^ and $ are special everywhere.
2630 * <B>Not implemented.</B>
2632 public static final int RE_CONTEXT_INDEP_ANCHORS = 3;
2635 * Syntax bit. Repetition operators are only special in valid positions.
2636 * <B>Not implemented.</B>
2638 public static final int RE_CONTEXT_INDEP_OPS = 4;
2641 * Syntax bit. Repetition and alternation operators are invalid
2642 * at start and end of pattern and other places.
2643 * <B>Not implemented</B>.
2645 public static final int RE_CONTEXT_INVALID_OPS = 5;
2648 * Syntax bit. Match-any-character operator (.) matches a newline.
2650 public static final int RE_DOT_NEWLINE = 6;
2653 * Syntax bit. Match-any-character operator (.) does not match a null.
2655 public static final int RE_DOT_NOT_NULL = 7;
2658 * Syntax bit. Intervals ({x}, {x,}, {x,y}) are allowed.
2660 public static final int RE_INTERVALS = 8;
2663 * Syntax bit. No alternation (|), match one-or-more (+), or
2664 * match zero-or-one (?) operators.
2666 public static final int RE_LIMITED_OPS = 9;
2669 * Syntax bit. Newline is an alternation operator.
2671 public static final int RE_NEWLINE_ALT = 10; // impl.
2674 * Syntax bit. Intervals use { } instead of \{ \}
2676 public static final int RE_NO_BK_BRACES = 11;
2679 * Syntax bit. Grouping uses ( ) instead of \( \).
2681 public static final int RE_NO_BK_PARENS = 12;
2684 * Syntax bit. Backreferences not allowed.
2686 public static final int RE_NO_BK_REFS = 13;
2689 * Syntax bit. Alternation uses | instead of \|
2691 public static final int RE_NO_BK_VBAR = 14;
2694 * Syntax bit. <B>Not implemented</B>.
2696 public static final int RE_NO_EMPTY_RANGES = 15;
2699 * Syntax bit. An unmatched right parenthesis (')' or '\)', depending
2700 * on RE_NO_BK_PARENS) will throw an exception when compiling.
2702 public static final int RE_UNMATCHED_RIGHT_PAREN_ORD = 16;
2705 * Syntax bit. <B>Not implemented.</B>
2707 public static final int RE_HAT_LISTS_NOT_NEWLINE = 17;
2710 * Syntax bit. Stingy matching is allowed (+?, *?, ??, {x,y}?).
2712 public static final int RE_STINGY_OPS = 18;
2715 * Syntax bit. Allow character class escapes (\d, \D, \s, \S, \w, \W).
2717 public static final int RE_CHAR_CLASS_ESCAPES = 19;
2720 * Syntax bit. Allow use of (?:xxx) grouping (subexpression is not saved).
2722 public static final int RE_PURE_GROUPING = 20;
2725 * Syntax bit. Allow use of (?=xxx) and (?!xxx) apply the subexpression
2726 * to the text following the current position without consuming that text.
2728 public static final int RE_LOOKAHEAD = 21;
2731 * Syntax bit. Allow beginning- and end-of-string anchors (\A, \Z).
2733 public static final int RE_STRING_ANCHORS = 22;
2736 * Syntax bit. Allow embedded comments, (?#comment), as in Perl5.
2738 public static final int RE_COMMENTS = 23;
2741 * Syntax bit. Allow character class escapes within lists, as in Perl5.
2743 public static final int RE_CHAR_CLASS_ESC_IN_LISTS = 24;
2745 private static final int BIT_TOTAL = 25;
2748 * Predefined syntax.
2749 * Emulates regular expression support in the awk utility.
2751 public static final RESyntax RE_SYNTAX_AWK;
2754 * Predefined syntax.
2755 * Emulates regular expression support in the ed utility.
2757 public static final RESyntax RE_SYNTAX_ED;
2760 * Predefined syntax.
2761 * Emulates regular expression support in the egrep utility.
2763 public static final RESyntax RE_SYNTAX_EGREP;
2766 * Predefined syntax.
2767 * Emulates regular expression support in the GNU Emacs editor.
2769 public static final RESyntax RE_SYNTAX_EMACS;
2772 * Predefined syntax.
2773 * Emulates regular expression support in the grep utility.
2775 public static final RESyntax RE_SYNTAX_GREP;
2778 * Predefined syntax.
2779 * Emulates regular expression support in the POSIX awk specification.
2781 public static final RESyntax RE_SYNTAX_POSIX_AWK;
2784 * Predefined syntax.
2785 * Emulates POSIX basic regular expression support.
2787 public static final RESyntax RE_SYNTAX_POSIX_BASIC;
2790 * Predefined syntax.
2791 * Emulates regular expression support in the POSIX egrep specification.
2793 public static final RESyntax RE_SYNTAX_POSIX_EGREP;
2796 * Predefined syntax.
2797 * Emulates POSIX extended regular expression support.
2799 public static final RESyntax RE_SYNTAX_POSIX_EXTENDED;
2802 * Predefined syntax.
2803 * Emulates POSIX basic minimal regular expressions.
2805 public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_BASIC;
2808 * Predefined syntax.
2809 * Emulates POSIX extended minimal regular expressions.
2811 public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_EXTENDED;
2814 * Predefined syntax.
2815 * Emulates regular expression support in the sed utility.
2817 public static final RESyntax RE_SYNTAX_SED;
2820 * Predefined syntax.
2821 * Emulates regular expression support in Larry Wall's perl, version 4,
2823 public static final RESyntax RE_SYNTAX_PERL4;
2826 * Predefined syntax.
2827 * Emulates regular expression support in Larry Wall's perl, version 4,
2828 * using single line mode (/s modifier).
2830 public static final RESyntax RE_SYNTAX_PERL4_S; // single line mode (/s)
2833 * Predefined syntax.
2834 * Emulates regular expression support in Larry Wall's perl, version 5.
2836 public static final RESyntax RE_SYNTAX_PERL5;
2839 * Predefined syntax.
2840 * Emulates regular expression support in Larry Wall's perl, version 5,
2841 * using single line mode (/s modifier).
2843 public static final RESyntax RE_SYNTAX_PERL5_S;
2848 RE_SYNTAX_EMACS = new RESyntax().makeFinal();
2850 RESyntax RE_SYNTAX_POSIX_COMMON = new RESyntax()
2851 .set(RE_CHAR_CLASSES)
2852 .set(RE_DOT_NEWLINE)
2853 .set(RE_DOT_NOT_NULL)
2855 .set(RE_NO_EMPTY_RANGES)
2858 RE_SYNTAX_POSIX_BASIC = new RESyntax(RE_SYNTAX_POSIX_COMMON)
2862 RE_SYNTAX_POSIX_EXTENDED = new RESyntax(RE_SYNTAX_POSIX_COMMON)
2863 .set(RE_CONTEXT_INDEP_ANCHORS)
2864 .set(RE_CONTEXT_INDEP_OPS)
2865 .set(RE_NO_BK_BRACES)
2866 .set(RE_NO_BK_PARENS)
2868 .set(RE_UNMATCHED_RIGHT_PAREN_ORD)
2871 RE_SYNTAX_AWK = new RESyntax()
2872 .set(RE_BACKSLASH_ESCAPE_IN_LISTS)
2873 .set(RE_DOT_NOT_NULL)
2874 .set(RE_NO_BK_PARENS)
2877 .set(RE_NO_EMPTY_RANGES)
2878 .set(RE_UNMATCHED_RIGHT_PAREN_ORD)
2881 RE_SYNTAX_POSIX_AWK = new RESyntax(RE_SYNTAX_POSIX_EXTENDED)
2882 .set(RE_BACKSLASH_ESCAPE_IN_LISTS)
2885 RE_SYNTAX_GREP = new RESyntax()
2887 .set(RE_CHAR_CLASSES)
2888 .set(RE_HAT_LISTS_NOT_NEWLINE)
2890 .set(RE_NEWLINE_ALT)
2893 RE_SYNTAX_EGREP = new RESyntax()
2894 .set(RE_CHAR_CLASSES)
2895 .set(RE_CONTEXT_INDEP_ANCHORS)
2896 .set(RE_CONTEXT_INDEP_OPS)
2897 .set(RE_HAT_LISTS_NOT_NEWLINE)
2898 .set(RE_NEWLINE_ALT)
2899 .set(RE_NO_BK_PARENS)
2903 RE_SYNTAX_POSIX_EGREP = new RESyntax(RE_SYNTAX_EGREP)
2905 .set(RE_NO_BK_BRACES)
2908 /* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */
2910 RE_SYNTAX_ED = new RESyntax(RE_SYNTAX_POSIX_BASIC)
2913 RE_SYNTAX_SED = new RESyntax(RE_SYNTAX_POSIX_BASIC)
2916 RE_SYNTAX_POSIX_MINIMAL_BASIC = new RESyntax(RE_SYNTAX_POSIX_COMMON)
2917 .set(RE_LIMITED_OPS)
2920 /* Differs from RE_SYNTAX_POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS
2921 replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */
2923 RE_SYNTAX_POSIX_MINIMAL_EXTENDED = new RESyntax(RE_SYNTAX_POSIX_COMMON)
2924 .set(RE_CONTEXT_INDEP_ANCHORS)
2925 .set(RE_CONTEXT_INVALID_OPS)
2926 .set(RE_NO_BK_BRACES)
2927 .set(RE_NO_BK_PARENS)
2930 .set(RE_UNMATCHED_RIGHT_PAREN_ORD)
2933 /* There is no official Perl spec, but here's a "best guess" */
2935 RE_SYNTAX_PERL4 = new RESyntax()
2936 .set(RE_BACKSLASH_ESCAPE_IN_LISTS)
2937 .set(RE_CONTEXT_INDEP_ANCHORS)
2938 .set(RE_CONTEXT_INDEP_OPS) // except for '{', apparently
2940 .set(RE_NO_BK_BRACES)
2941 .set(RE_NO_BK_PARENS)
2943 .set(RE_NO_EMPTY_RANGES)
2944 .set(RE_CHAR_CLASS_ESCAPES) // \d,\D,\w,\W,\s,\S
2947 RE_SYNTAX_PERL4_S = new RESyntax(RE_SYNTAX_PERL4)
2948 .set(RE_DOT_NEWLINE)
2951 RE_SYNTAX_PERL5 = new RESyntax(RE_SYNTAX_PERL4)
2952 .set(RE_PURE_GROUPING) // (?:)
2953 .set(RE_STINGY_OPS) // *?,??,+?,{}?
2954 .set(RE_LOOKAHEAD) // (?=)(?!)
2955 .set(RE_STRING_ANCHORS) // \A,\Z
2956 .set(RE_CHAR_CLASS_ESC_IN_LISTS)// \d,\D,\w,\W,\s,\S within []
2957 .set(RE_COMMENTS) // (?#)
2960 RE_SYNTAX_PERL5_S = new RESyntax(RE_SYNTAX_PERL5)
2961 .set(RE_DOT_NEWLINE)
2966 * Construct a new syntax object with all bits turned off.
2967 * This is equivalent to RE_SYNTAX_EMACS.
2970 bits = new BitSet(BIT_TOTAL);
2974 * Called internally when constructing predefined syntaxes
2975 * so their interpretation cannot vary. Conceivably useful
2976 * for your syntaxes as well. Causes IllegalAccessError to
2977 * be thrown if any attempt to modify the syntax is made.
2979 * @return this object for convenient chaining
2981 public RESyntax makeFinal() {
2987 * Construct a new syntax object with all bits set the same
2988 * as the other syntax.
2990 public RESyntax(RESyntax other) {
2991 bits = (BitSet) other.bits.clone();
2995 * Check if a given bit is set in this syntax.
2997 public boolean get(int index) {
2998 return bits.get(index);
3002 * Set a given bit in this syntax.
3004 * @param index the constant (RESyntax.RE_xxx) bit to set.
3005 * @return a reference to this object for easy chaining.
3007 public RESyntax set(int index) {
3008 if (isFinal) throw new IllegalAccessError(SYNTAX_IS_FINAL);
3014 * Clear a given bit in this syntax.
3016 * @param index the constant (RESyntax.RE_xxx) bit to clear.
3017 * @return a reference to this object for easy chaining.
3019 public RESyntax clear(int index) {
3020 if (isFinal) throw new IllegalAccessError(SYNTAX_IS_FINAL);
3026 * Changes the line separator string for regular expressions
3027 * created using this RESyntax. The default separator is the
3028 * value returned by the system property "line.separator", which
3029 * should be correct when reading platform-specific files from a
3030 * filesystem. However, many programs may collect input from
3031 * sources where the line separator is differently specified (for
3032 * example, in the applet environment, the text box widget
3033 * interprets line breaks as single-character newlines,
3034 * regardless of the host platform.
3036 * Note that setting the line separator to a character or
3037 * characters that have specific meaning within the current syntax
3038 * can cause unexpected chronosynclastic infundibula.
3040 * @return this object for convenient chaining
3042 public RESyntax setLineSeparator(String aSeparator) {
3043 if (isFinal) throw new IllegalAccessError(SYNTAX_IS_FINAL);
3044 lineSeparator = aSeparator;
3049 * Returns the currently active line separator string. The default
3050 * is the platform-dependent system property "line.separator".
3052 public String getLineSeparator() {
3053 return lineSeparator;
3057 * gnu/regexp/REToken.java
3058 * Copyright (C) 1998-2001 Wes Biggs
3060 * This library is free software; you can redistribute it and/or modify
3061 * it under the terms of the GNU Lesser General Public License as published
3062 * by the Free Software Foundation; either version 2.1 of the License, or
3063 * (at your option) any later version.
3065 * This library is distributed in the hope that it will be useful,
3066 * but WITHOUT ANY WARRANTY; without even the implied warranty of
3067 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
3068 * GNU Lesser General Public License for more details.
3070 * You should have received a copy of the GNU Lesser General Public License
3071 * along with this program; if not, write to the Free Software
3072 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
3076 abstract static class REToken implements Serializable {
3078 protected REToken next = null;
3079 protected REToken uncle = null;
3080 protected int subIndex;
3082 protected REToken(int subIndex) {
3083 this.subIndex = subIndex;
3086 int getMinimumLength() {
3090 void setUncle(REToken anUncle) {
3094 /** Returns true if the match succeeded, false if it failed. */
3095 abstract boolean match(CharIndexed input, REMatch mymatch);
3097 /** Returns true if the rest of the tokens match, false if they fail. */
3098 protected boolean next(CharIndexed input, REMatch mymatch) {
3100 if (uncle == null) {
3103 return uncle.match(input, mymatch);
3106 return next.match(input, mymatch);
3110 boolean chain(REToken token) {
3112 return true; // Token was accepted
3115 abstract void dump(StringBuffer os);
3117 void dumpAll(StringBuffer os) {
3119 if (next != null) next.dumpAll(os);
3123 * gnu/regexp/RETokenAny.java
3124 * Copyright (C) 1998-2001 Wes Biggs
3126 * This library is free software; you can redistribute it and/or modify
3127 * it under the terms of the GNU Lesser General Public License as published
3128 * by the Free Software Foundation; either version 2.1 of the License, or
3129 * (at your option) any later version.
3131 * This library is distributed in the hope that it will be useful,
3132 * but WITHOUT ANY WARRANTY; without even the implied warranty of
3133 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
3134 * GNU Lesser General Public License for more details.
3136 * You should have received a copy of the GNU Lesser General Public License
3137 * along with this program; if not, write to the Free Software
3138 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
3142 final static class RETokenAny extends REToken {
3143 /** True if '.' can match a newline (RE_DOT_NEWLINE) */
3144 private boolean newline;
3146 /** True if '.' can't match a null (RE_DOT_NOT_NULL) */
3147 private boolean matchNull;
3149 RETokenAny(int subIndex, boolean newline, boolean matchNull) {
3151 this.newline = newline;
3152 this.matchNull = matchNull;
3155 int getMinimumLength() {
3159 boolean match(CharIndexed input, REMatch mymatch) {
3160 char ch = input.charAt(mymatch.index);
3161 if ((ch == CharIndexed.OUT_OF_BOUNDS)
3162 || (!newline && (ch == '\n'))
3163 || (matchNull && (ch == 0))) {
3167 return next(input, mymatch);
3170 void dump(StringBuffer os) {
3176 * gnu/regexp/RETokenBackRef.java
3177 * Copyright (C) 1998-2001 Wes Biggs
3179 * This library is free software; you can redistribute it and/or modify
3180 * it under the terms of the GNU Lesser General Public License as published
3181 * by the Free Software Foundation; either version 2.1 of the License, or
3182 * (at your option) any later version.
3184 * This library is distributed in the hope that it will be useful,
3185 * but WITHOUT ANY WARRANTY; without even the implied warranty of
3186 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
3187 * GNU Lesser General Public License for more details.
3189 * You should have received a copy of the GNU Lesser General Public License
3190 * along with this program; if not, write to the Free Software
3191 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
3195 final static class RETokenBackRef extends REToken {
3197 private boolean insens;
3199 RETokenBackRef(int subIndex, int num, boolean insens) {
3202 this.insens = insens;
3205 // should implement getMinimumLength() -- any ideas?
3207 boolean match(CharIndexed input, REMatch mymatch) {
3209 b = mymatch.start[num];
3210 e = mymatch.end[num];
3211 if ((b==-1)||(e==-1)) return false; // this shouldn't happen, but...
3212 for (int i=b; i<e; i++) {
3213 if (input.charAt(mymatch.index+i-b) != input.charAt(i)) {
3217 mymatch.index += e-b;
3218 return next(input, mymatch);
3221 void dump(StringBuffer os) {
3222 os.append('\\').append(num);
3228 * gnu/regexp/RETokenChar.java
3229 * Copyright (C) 1998-2001 Wes Biggs
3231 * This library is free software; you can redistribute it and/or modify
3232 * it under the terms of the GNU Lesser General Public License as published
3233 * by the Free Software Foundation; either version 2.1 of the License, or
3234 * (at your option) any later version.
3236 * This library is distributed in the hope that it will be useful,
3237 * but WITHOUT ANY WARRANTY; without even the implied warranty of
3238 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
3239 * GNU Lesser General Public License for more details.
3241 * You should have received a copy of the GNU Lesser General Public License
3242 * along with this program; if not, write to the Free Software
3243 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
3247 final static class RETokenChar extends REToken {
3249 private boolean insens;
3251 RETokenChar(int subIndex, char c, boolean ins) {
3254 ch[0] = (insens = ins) ? Character.toLowerCase(c) : c;
3257 int getMinimumLength() {
3261 boolean match(CharIndexed input, REMatch mymatch) {
3264 for (int i=0; i<z; i++) {
3265 c = input.charAt(mymatch.index+i);
3266 if (( (insens) ? Character.toLowerCase(c) : c ) != ch[i]) {
3272 return next(input, mymatch);
3275 // Overrides REToken.chain() to optimize for strings
3276 boolean chain(REToken next) {
3277 if (next instanceof RETokenChar) {
3278 RETokenChar cnext = (RETokenChar) next;
3279 // assume for now that next can only be one character
3280 int newsize = ch.length + cnext.ch.length;
3282 char[] chTemp = new char [newsize];
3284 System.arraycopy(ch,0,chTemp,0,ch.length);
3285 System.arraycopy(cnext.ch,0,chTemp,ch.length,cnext.ch.length);
3289 } else return super.chain(next);
3292 void dump(StringBuffer os) {
3299 * gnu/regexp/RETokenEnd.java
3300 * Copyright (C) 1998-2001 Wes Biggs
3302 * This library is free software; you can redistribute it and/or modify
3303 * it under the terms of the GNU Lesser General Public License as published
3304 * by the Free Software Foundation; either version 2.1 of the License, or
3305 * (at your option) any later version.
3307 * This library is distributed in the hope that it will be useful,
3308 * but WITHOUT ANY WARRANTY; without even the implied warranty of
3309 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
3310 * GNU Lesser General Public License for more details.
3312 * You should have received a copy of the GNU Lesser General Public License
3313 * along with this program; if not, write to the Free Software
3314 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
3317 final static class RETokenEnd extends REToken {
3319 * Indicates whether this token should match on a line break.
3321 private String newline;
3323 RETokenEnd(int subIndex,String newline) {
3325 this.newline = newline;
3328 boolean match(CharIndexed input, REMatch mymatch) {
3329 char ch = input.charAt(mymatch.index);
3330 if (ch == CharIndexed.OUT_OF_BOUNDS)
3331 return ((mymatch.eflags & RE.REG_NOTEOL)>0) ?
3332 false : next(input, mymatch);
3333 if (newline != null) {
3335 int i = 0; // position in newline
3337 z = newline.charAt(i);
3338 if (ch != z) return false;
3340 ch = input.charAt(mymatch.index + i);
3341 } while (i < newline.length());
3343 return next(input, mymatch);
3348 void dump(StringBuffer os) {
3353 * gnu/regexp/RETokenEndSub.java
3354 * Copyright (C) 2001 Wes Biggs
3356 * This library is free software; you can redistribute it and/or modify
3357 * it under the terms of the GNU Lesser General Public License as published
3358 * by the Free Software Foundation; either version 2.1 of the License, or
3359 * (at your option) any later version.
3361 * This library is distributed in the hope that it will be useful,
3362 * but WITHOUT ANY WARRANTY; without even the implied warranty of
3363 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
3364 * GNU Lesser General Public License for more details.
3366 * You should have received a copy of the GNU Lesser General Public License
3367 * along with this program; if not, write to the Free Software
3368 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
3372 final static class RETokenEndSub extends REToken {
3373 RETokenEndSub(int subIndex) {
3377 boolean match(CharIndexed input, REMatch mymatch) {
3378 mymatch.end[subIndex] = mymatch.index;
3379 return next(input, mymatch);
3382 void dump(StringBuffer os) {
3387 * gnu/regexp/RETokenOneOf.java
3388 * Copyright (C) 1998-2001 Wes Biggs
3390 * This library is free software; you can redistribute it and/or modify
3391 * it under the terms of the GNU Lesser General Public License as published
3392 * by the Free Software Foundation; either version 2.1 of the License, or
3393 * (at your option) any later version.
3395 * This library is distributed in the hope that it will be useful,
3396 * but WITHOUT ANY WARRANTY; without even the implied warranty of
3397 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
3398 * GNU Lesser General Public License for more details.
3400 * You should have received a copy of the GNU Lesser General Public License
3401 * along with this program; if not, write to the Free Software
3402 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
3406 * @since gnu.regexp 1.1.3
3407 * @author Shashank Bapat
3409 final static class RETokenLookAhead extends REToken
3414 RETokenLookAhead(REToken re, boolean negative) throws REException {
3417 this.negative = negative;
3420 boolean match(CharIndexed input, REMatch mymatch)
3422 REMatch trymatch = (REMatch)mymatch.clone();
3423 REMatch trymatch1 = (REMatch)mymatch.clone();
3424 REMatch newMatch = null;
3425 if (re.match(input, trymatch)) {
3426 if (negative) return false;
3427 if (next(input, trymatch1))
3428 newMatch = trymatch1;
3431 if (newMatch != null) {
3432 if (negative) return false;
3434 mymatch.assignFrom(newMatch);
3439 return next(input, mymatch);
3445 void dump(StringBuffer os) {
3447 os.append(negative ? '!' : '=');
3454 * gnu/regexp/RETokenOneOf.java
3455 * Copyright (C) 1998-2001 Wes Biggs
3457 * This library is free software; you can redistribute it and/or modify
3458 * it under the terms of the GNU Lesser General Public License as published
3459 * by the Free Software Foundation; either version 2.1 of the License, or
3460 * (at your option) any later version.
3462 * This library is distributed in the hope that it will be useful,
3463 * but WITHOUT ANY WARRANTY; without even the implied warranty of
3464 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
3465 * GNU Lesser General Public License for more details.
3467 * You should have received a copy of the GNU Lesser General Public License
3468 * along with this program; if not, write to the Free Software
3469 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
3473 final static class RETokenOneOf extends REToken {
3474 private Vector options;
3475 private boolean negative;
3477 // This constructor is used for convenience when we know the set beforehand,
3478 // e.g. \d --> new RETokenOneOf("0123456789",false, ..)
3479 // \D --> new RETokenOneOf("0123456789",true, ..)
3481 RETokenOneOf(int subIndex, String optionsStr, boolean negative, boolean insens) {
3483 options = new Vector();
3484 this.negative = negative;
3485 for (int i = 0; i < optionsStr.length(); i++)
3486 options.addElement(new RETokenChar(subIndex,optionsStr.charAt(i),insens));
3489 RETokenOneOf(int subIndex, Vector options, boolean negative) {
3491 this.options = options;
3492 this.negative = negative;
3495 int getMinimumLength() {
3496 int min = Integer.MAX_VALUE;
3498 for (int i=0; i < options.size(); i++) {
3499 if ((x = ((REToken) options.elementAt(i)).getMinimumLength()) < min)
3505 boolean match(CharIndexed input, REMatch mymatch) {
3506 if (negative && (input.charAt(mymatch.index) == CharIndexed.OUT_OF_BOUNDS))
3509 REMatch newMatch = null;
3510 REMatch last = null;
3513 for (int i=0; i < options.size(); i++) {
3514 tk = (REToken) options.elementAt(i);
3515 REMatch tryMatch = (REMatch) mymatch.clone();
3516 if (tk.match(input, tryMatch)) { // match was successful
3517 if (negative) return false;
3519 if (next(input, tryMatch)) {
3520 // Add tryMatch to list of possibilities.
3522 newMatch = tryMatch;
3525 last.next = tryMatch;
3530 } // try next option
3532 if (newMatch != null) {
3536 // set contents of mymatch equal to newMatch
3538 // try each one that matched
3539 mymatch.assignFrom(newMatch);
3545 return next(input, mymatch);
3551 // index+1 works for [^abc] lists, not for generic lookahead (--> index)
3554 void dump(StringBuffer os) {
3555 os.append(negative ? "[^" : "(?:");
3556 for (int i = 0; i < options.size(); i++) {
3557 if (!negative && (i > 0)) os.append('|');
3558 ((REToken) options.elementAt(i)).dumpAll(os);
3560 os.append(negative ? ']' : ')');
3564 * gnu/regexp/RETokenPOSIX.java
3565 * Copyright (C) 1998-2001 Wes Biggs
3567 * This library is free software; you can redistribute it and/or modify
3568 * it under the terms of the GNU Lesser General Public License as published
3569 * by the Free Software Foundation; either version 2.1 of the License, or
3570 * (at your option) any later version.
3572 * This library is distributed in the hope that it will be useful,
3573 * but WITHOUT ANY WARRANTY; without even the implied warranty of
3574 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
3575 * GNU Lesser General Public License for more details.
3577 * You should have received a copy of the GNU Lesser General Public License
3578 * along with this program; if not, write to the Free Software
3579 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
3583 final static class RETokenPOSIX extends REToken {
3588 static final int ALNUM = 0;
3589 static final int ALPHA = 1;
3590 static final int BLANK = 2;
3591 static final int CNTRL = 3;
3592 static final int DIGIT = 4;
3593 static final int GRAPH = 5;
3594 static final int LOWER = 6;
3595 static final int PRINT = 7;
3596 static final int PUNCT = 8;
3597 static final int SPACE = 9;
3598 static final int UPPER = 10;
3599 static final int XDIGIT = 11;
3601 // Array indices correspond to constants defined above.
3602 static final String[] s_nameTable = {
3603 "alnum", "alpha", "blank", "cntrl", "digit", "graph", "lower",
3604 "print", "punct", "space", "upper", "xdigit"
3607 // The RE constructor uses this to look up the constant for a string
3608 static int intValue(String key) {
3609 for (int i = 0; i < s_nameTable.length; i++) {
3610 if (s_nameTable[i].equals(key)) return i;
3615 RETokenPOSIX(int subIndex, int type, boolean insens, boolean negated) {
3618 this.insens = insens;
3619 this.negated = negated;
3622 int getMinimumLength() {
3626 boolean match(CharIndexed input, REMatch mymatch) {
3627 char ch = input.charAt(mymatch.index);
3628 if (ch == CharIndexed.OUT_OF_BOUNDS)
3631 boolean retval = false;
3634 // Note that there is some debate over whether '_' should be included
3635 retval = Character.isLetterOrDigit(ch) || (ch == '_');
3638 retval = Character.isLetter(ch);
3641 retval = ((ch == ' ') || (ch == '\t'));
3644 retval = Character.isISOControl(ch);
3647 retval = Character.isDigit(ch);
3650 retval = (!(Character.isWhitespace(ch) || Character.isISOControl(ch)));
3653 retval = ((insens && Character.isLetter(ch)) || Character.isLowerCase(ch));
3656 retval = (!(Character.isWhitespace(ch) || Character.isISOControl(ch)))
3660 // This feels sloppy, especially for non-U.S. locales.
3661 retval = ("`~!@#$%^&*()-_=+[]{}\\|;:'\"/?,.<>".indexOf(ch)!=-1);
3664 retval = Character.isWhitespace(ch);
3667 retval = ((insens && Character.isLetter(ch)) || Character.isUpperCase(ch));
3670 retval = (Character.isDigit(ch) || ("abcdefABCDEF".indexOf(ch)!=-1));
3674 if (negated) retval = !retval;
3677 return next(input, mymatch);
3682 void dump(StringBuffer os) {
3683 if (negated) os.append('^');
3684 os.append("[:" + s_nameTable[type] + ":]");
3688 * gnu/regexp/RETokenRange.java
3689 * Copyright (C) 1998-2001 Wes Biggs
3691 * This library is free software; you can redistribute it and/or modify
3692 * it under the terms of the GNU Lesser General Public License as published
3693 * by the Free Software Foundation; either version 2.1 of the License, or
3694 * (at your option) any later version.
3696 * This library is distributed in the hope that it will be useful,
3697 * but WITHOUT ANY WARRANTY; without even the implied warranty of
3698 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
3699 * GNU Lesser General Public License for more details.
3701 * You should have received a copy of the GNU Lesser General Public License
3702 * along with this program; if not, write to the Free Software
3703 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
3707 final static class RETokenRange extends REToken {
3708 private char lo, hi;
3709 private boolean insens;
3711 RETokenRange(int subIndex, char lo, char hi, boolean ins) {
3713 this.lo = (insens = ins) ? Character.toLowerCase(lo) : lo;
3714 this.hi = ins ? Character.toLowerCase(hi) : hi;
3717 int getMinimumLength() {
3721 boolean match(CharIndexed input, REMatch mymatch) {
3722 char c = input.charAt(mymatch.index);
3723 if (c == CharIndexed.OUT_OF_BOUNDS) return false;
3724 if (insens) c = Character.toLowerCase(c);
3725 if ((c >= lo) && (c <= hi)) {
3727 return next(input, mymatch);
3732 void dump(StringBuffer os) {
3733 os.append(lo).append('-').append(hi);
3738 * gnu/regexp/RETokenRepeated.java
3739 * Copyright (C) 1998-2001 Wes Biggs
3741 * This library is free software; you can redistribute it and/or modify
3742 * it under the terms of the GNU Lesser General Public License as published
3743 * by the Free Software Foundation; either version 2.1 of the License, or
3744 * (at your option) any later version.
3746 * This library is distributed in the hope that it will be useful,
3747 * but WITHOUT ANY WARRANTY; without even the implied warranty of
3748 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
3749 * GNU Lesser General Public License for more details.
3751 * You should have received a copy of the GNU Lesser General Public License
3752 * along with this program; if not, write to the Free Software
3753 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
3757 final static class RETokenRepeated extends REToken {
3758 private REToken token;
3759 private int min,max;
3760 private boolean stingy;
3762 RETokenRepeated(int subIndex, REToken token, int min, int max) {
3769 /** Sets the minimal matching mode to true. */
3774 /** Queries if this token has minimal matching enabled. */
3775 boolean isStingy() {
3780 * The minimum length of a repeated token is the minimum length
3781 * of the token multiplied by the minimum number of times it must
3784 int getMinimumLength() {
3785 return (min * token.getMinimumLength());
3788 // We do need to save every possible point, but the number of clone()
3789 // invocations here is really a killer for performance on non-stingy
3790 // repeat operators. I'm open to suggestions...
3792 // Hypothetical question: can you have a RE that matches 1 times,
3793 // 3 times, 5 times, but not 2 times or 4 times? Does having
3794 // the subexpression back-reference operator allow that?
3796 boolean match(CharIndexed input, REMatch mymatch) {
3797 // number of times we've matched so far
3800 // Possible positions for the next repeat to match at
3801 REMatch newMatch = mymatch;
3802 REMatch last = null;
3805 // Add the '0-repeats' index
3806 // positions.elementAt(z) == position [] in input after <<z>> matches
3807 Vector positions = new Vector();
3808 positions.addElement(newMatch);
3810 // Declare variables used in loop
3812 REMatch doablesLast;
3816 // Check for stingy match for each possibility.
3817 if (stingy && (numRepeats >= min)) {
3818 REMatch result = matchRest(input, newMatch);
3819 if (result != null) {
3820 mymatch.assignFrom(result);
3828 // try next repeat at all possible positions
3829 for (current = newMatch; current != null; current = current.next) {
3830 recurrent = (REMatch) current.clone();
3831 if (token.match(input, recurrent)) {
3832 // add all items in current to doables array
3833 if (doables == null) {
3834 doables = recurrent;
3835 doablesLast = recurrent;
3837 // Order these from longest to shortest
3838 // Start by assuming longest (more repeats)
3839 doablesLast.next = recurrent;
3841 // Find new doablesLast
3842 while (doablesLast.next != null) {
3843 doablesLast = doablesLast.next;
3847 // if none of the possibilities worked out, break out of do/while
3848 if (doables == null) break;
3850 // reassign where the next repeat can match
3853 // increment how many repeats we've successfully found
3856 positions.addElement(newMatch);
3857 } while (numRepeats < max);
3859 // If there aren't enough repeats, then fail
3860 if (numRepeats < min) return false;
3862 // We're greedy, but ease off until a true match is found
3863 int posIndex = positions.size();
3865 // At this point we've either got too many or just the right amount.
3866 // See if this numRepeats works with the rest of the regexp.
3867 REMatch allResults = null;
3868 REMatch allResultsLast = null;
3870 REMatch results = null;
3871 while (--posIndex >= min) {
3872 newMatch = (REMatch) positions.elementAt(posIndex);
3873 results = matchRest(input, newMatch);
3874 if (results != null) {
3875 if (allResults == null) {
3876 allResults = results;
3877 allResultsLast = results;
3879 // Order these from longest to shortest
3880 // Start by assuming longest (more repeats)
3881 allResultsLast.next = results;
3883 // Find new doablesLast
3884 while (allResultsLast.next != null) {
3885 allResultsLast = allResultsLast.next;
3888 // else did not match rest of the tokens, try again on smaller sample
3890 if (allResults != null) {
3891 mymatch.assignFrom(allResults); // does this get all?
3894 // If we fall out, no matches.
3898 private REMatch matchRest(CharIndexed input, final REMatch newMatch) {
3899 REMatch current, single;
3900 REMatch doneIndex = null;
3901 REMatch doneIndexLast = null;
3902 // Test all possible matches for this number of repeats
3903 for (current = newMatch; current != null; current = current.next) {
3904 // clone() separates a single match from the chain
3905 single = (REMatch) current.clone();
3906 if (next(input, single)) {
3907 // chain results to doneIndex
3908 if (doneIndex == null) {
3910 doneIndexLast = single;
3912 doneIndexLast.next = single;
3914 // Find new doneIndexLast
3915 while (doneIndexLast.next != null) {
3916 doneIndexLast = doneIndexLast.next;
3923 void dump(StringBuffer os) {
3927 if ((max == Integer.MAX_VALUE) && (min <= 1))
3928 os.append( (min == 0) ? '*' : '+' );
3929 else if ((min == 0) && (max == 1))
3932 os.append('{').append(min);
3935 if (max != Integer.MAX_VALUE) os.append(max);
3939 if (stingy) os.append('?');
3943 * gnu/regexp/RETokenStart.java
3944 * Copyright (C) 1998-2001 Wes Biggs
3946 * This library is free software; you can redistribute it and/or modify
3947 * it under the terms of the GNU Lesser General Public License as published
3948 * by the Free Software Foundation; either version 2.1 of the License, or
3949 * (at your option) any later version.
3951 * This library is distributed in the hope that it will be useful,
3952 * but WITHOUT ANY WARRANTY; without even the implied warranty of
3953 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
3954 * GNU Lesser General Public License for more details.
3956 * You should have received a copy of the GNU Lesser General Public License
3957 * along with this program; if not, write to the Free Software
3958 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
3962 static class RETokenStart extends REToken {
3963 private String newline; // matches after a newline
3965 RETokenStart(int subIndex, String newline) {
3967 this.newline = newline;
3970 boolean match(CharIndexed input, REMatch mymatch) {
3971 // charAt(index-n) may be unknown on a Reader/InputStream.
3972 // Match after a newline if in multiline mode
3974 if (newline != null) {
3975 int len = newline.length();
3976 if (mymatch.offset >= len) {
3977 boolean found = true;
3979 int i = 0; // position in REToken.newline
3980 char ch = input.charAt(mymatch.index - len);
3982 z = newline.charAt(i);
3988 ch = input.charAt(mymatch.index - len + i);
3991 if (found) return next(input, mymatch);
3995 // Don't match at all if REG_NOTBOL is set.
3996 if ((mymatch.eflags & RE.REG_NOTBOL) > 0) return false;
3998 if ((mymatch.eflags & RE.REG_ANCHORINDEX) > 0)
3999 return (mymatch.anchor == mymatch.offset) ?
4000 next(input, mymatch) : false;
4002 return ((mymatch.index == 0) && (mymatch.offset == 0)) ?
4003 next(input, mymatch) : false;
4006 void dump(StringBuffer os) {
4011 * gnu/regexp/RETokenWordBoundary.java
4012 * Copyright (C) 2001 Wes Biggs
4014 * This library is free software; you can redistribute it and/or modify
4015 * it under the terms of the GNU Lesser General Public License as published
4016 * by the Free Software Foundation; either version 2.1 of the License, or
4017 * (at your option) any later version.
4019 * This library is distributed in the hope that it will be useful,
4020 * but WITHOUT ANY WARRANTY; without even the implied warranty of
4021 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
4022 * GNU Lesser General Public License for more details.
4024 * You should have received a copy of the GNU Lesser General Public License
4025 * along with this program; if not, write to the Free Software
4026 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
4031 * Represents a combination lookahead/lookbehind for POSIX [:alnum:].
4033 final static class RETokenWordBoundary extends REToken {
4034 private boolean negated;
4036 static final int BEGIN = 1;
4037 static final int END = 2;
4039 RETokenWordBoundary(int subIndex, int where, boolean negated) {
4042 this.negated = negated;
4045 boolean match(CharIndexed input, REMatch mymatch) {
4046 // Word boundary means input[index-1] was a word character
4047 // and input[index] is not, or input[index] is a word character
4048 // and input[index-1] was not
4049 // In the string "one two three", these positions match:
4050 // |o|n|e| |t|w|o| |t|h|r|e|e|
4052 boolean after = false; // is current character a letter or digit?
4053 boolean before = false; // is previous character a letter or digit?
4056 // TODO: Also check REG_ANCHORINDEX vs. anchor
4057 if (((mymatch.eflags & RE.REG_ANCHORINDEX) != RE.REG_ANCHORINDEX)
4058 || (mymatch.offset + mymatch.index > mymatch.anchor)) {
4059 if ((ch = input.charAt(mymatch.index - 1)) != CharIndexed.OUT_OF_BOUNDS) {
4060 before = Character.isLetterOrDigit(ch) || (ch == '_');
4064 if ((ch = input.charAt(mymatch.index)) != CharIndexed.OUT_OF_BOUNDS) {
4065 after = Character.isLetterOrDigit(ch) || (ch == '_');
4068 // if (before) and (!after), we're at end (\>)
4069 // if (after) and (!before), we're at beginning (\<)
4070 boolean doNext = false;
4072 if ((where & BEGIN) == BEGIN) {
4073 doNext = after && !before;
4075 if ((where & END) == END) {
4076 doNext ^= before && !after;
4079 if (negated) doNext = !doNext;
4081 return (doNext ? next(input, mymatch) : false);
4084 void dump(StringBuffer os) {
4085 if (where == (BEGIN | END)) {
4086 os.append( negated ? "\\B" : "\\b" );
4087 } else if (where == BEGIN) {
4095 * gnu/regexp/UncheckedRE.java
4096 * Copyright (C) 2001 Wes Biggs
4098 * This library is free software; you can redistribute it and/or modify
4099 * it under the terms of the GNU Lesser General Public License as published
4100 * by the Free Software Foundation; either version 2.1 of the License, or
4101 * (at your option) any later version.
4103 * This library is distributed in the hope that it will be useful,
4104 * but WITHOUT ANY WARRANTY; without even the implied warranty of
4105 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
4106 * GNU Lesser General Public License for more details.
4108 * You should have received a copy of the GNU Lesser General Public License
4109 * along with this program; if not, write to the Free Software
4110 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
4115 * UncheckedRE is a subclass of RE that allows programmers an easier means
4116 * of programmatically precompiling regular expressions. It is constructed
4117 * and used in exactly the same manner as an instance of the RE class; the
4118 * only difference is that its constructors do not throw REException.
4119 * Instead, if a syntax error is encountered during construction, a
4120 * RuntimeException will be thrown.
4122 * Note that this makes UncheckedRE dangerous if constructed with
4123 * dynamic data. Do not use UncheckedRE unless you are completely sure
4124 * that all input being passed to it contains valid, well-formed
4125 * regular expressions for the syntax specified.
4127 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
4128 * @see gnu.regexp.RE
4129 * @since gnu.regexp 1.1.4
4132 public final static class UncheckedRE extends RE {
4134 * Constructs a regular expression pattern buffer without any compilation
4135 * flags set, and using the default syntax (RESyntax.RE_SYNTAX_PERL5).
4137 * @param pattern A regular expression pattern, in the form of a String,
4138 * StringBuffer or char[]. Other input types will be converted to
4139 * strings using the toString() method.
4140 * @exception RuntimeException The input pattern could not be parsed.
4141 * @exception NullPointerException The pattern was null.
4143 public UncheckedRE(Object pattern) {
4144 this(pattern,0,RESyntax.RE_SYNTAX_PERL5);
4148 * Constructs a regular expression pattern buffer using the specified
4149 * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5).
4151 * @param pattern A regular expression pattern, in the form of a String,
4152 * StringBuffer, or char[]. Other input types will be converted to
4153 * strings using the toString() method.
4154 * @param cflags The logical OR of any combination of the compilation flags in the RE class.
4155 * @exception RuntimeException The input pattern could not be parsed.
4156 * @exception NullPointerException The pattern was null.
4158 public UncheckedRE(Object pattern, int cflags) {
4159 this(pattern,cflags,RESyntax.RE_SYNTAX_PERL5);
4163 * Constructs a regular expression pattern buffer using the specified
4164 * compilation flags and regular expression syntax.
4166 * @param pattern A regular expression pattern, in the form of a String,
4167 * StringBuffer, or char[]. Other input types will be converted to
4168 * strings using the toString() method.
4169 * @param cflags The logical OR of any combination of the compilation flags in the RE class.
4170 * @param syntax The type of regular expression syntax to use.
4171 * @exception RuntimeException The input pattern could not be parsed.
4172 * @exception NullPointerException The pattern was null.
4174 public UncheckedRE(Object pattern, int cflags, RESyntax syntax) {
4176 initialize(pattern,cflags,syntax,0,0);
4177 } catch (REException e) {
4178 throw new RuntimeException(e.getMessage());