src/gnu/regexp/RE.java

   1 /*
   2  *  gnu/regexp/RE.java
   3  *  Copyright (C) 1998-2001 Wes Biggs
   4  *
   5  *  This library is free software; you can redistribute it and/or modify
   6  *  it under the terms of the GNU Lesser General Public License as published
   7  *  by the Free Software Foundation; either version 2.1 of the License, or
   8  *  (at your option) any later version.
   9  *
  10  *  This library is distributed in the hope that it will be useful,
  11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  *  GNU Lesser General Public License for more details.
  14  *
  15  *  You should have received a copy of the GNU Lesser General Public License
  16  *  along with this program; if not, write to the Free Software
  17  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  18  */
  19
  20 package gnu.regexp;
  21 import java.io.InputStream;
  22 import java.io.Reader;
  23 import java.io.Serializable;
  24 import java.util.Vector;
  25
  26 class IntPair implements Serializable {
  27   public int first, second;
  28 }
  29
  30 class CharUnit implements Serializable {
  31   public char ch;
  32   public boolean bk;
  33 }
  34
  35 /**
  36  * RE provides the user interface for compiling and matching regular
  37  * expressions.
  38  * <P>
  39  * A regular expression object (class RE) is compiled by constructing it
  40  * from a String, StringBuffer or character array, with optional
  41  * compilation flags (below)
  42  * and an optional syntax specification (see RESyntax; if not specified,
  43  * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
  44  * <P>
  45  * Various methods attempt to match input text against a compiled
  46  * regular expression.  These methods are:
  47  * <LI><code>isMatch</code>: returns true if the input text in its entirety
  48  * matches the regular expression pattern.
  49  * <LI><code>getMatch</code>: returns the first match found in the input text,
  50  * or null if no match is found.
  51  * <LI><code>getAllMatches</code>: returns an array of all non-overlapping
  52  * matches found in the input text.  If no matches are found, the array is
  53  * zero-length.
  54  * <LI><code>substitute</code>: substitute the first occurence of the pattern
  55  * in the input text with a replacement string (which may include
  56  * metacharacters $0-$9, see REMatch.substituteInto).
  57  * <LI><code>substituteAll</code>: same as above, but repeat for each match
  58  * before returning.
  59  * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration object
  60  * that allows iteration over the matches (see REMatchEnumeration for some
  61  * reasons why you may want to do this instead of using <code>getAllMatches</code>.
  62  * <P>
  63  *
  64  * These methods all have similar argument lists.  The input can be a
  65  * String, a character array, a StringBuffer, a Reader or an
  66  * InputStream of some sort.  Note that when using a Reader or
  67  * InputStream, the stream read position cannot be guaranteed after
  68  * attempting a match (this is not a bug, but a consequence of the way
  69  * regular expressions work).  Using an REMatchEnumeration can
  70  * eliminate most positioning problems.
  71  *
  72  * <P>
  73  *
  74  * The optional index argument specifies the offset from the beginning
  75  * of the text at which the search should start (see the descriptions
  76  * of some of the execution flags for how this can affect positional
  77  * pattern operators).  For a Reader or InputStream, this means an
  78  * offset from the current read position, so subsequent calls with the
  79  * same index argument on a Reader or an InputStream will not
  80  * necessarily access the same position on the stream, whereas
  81  * repeated searches at a given index in a fixed string will return
  82  * consistent results.
  83  *
  84  * <P>
  85  * You can optionally affect the execution environment by using a
  86  * combination of execution flags (constants listed below).
  87  *
  88  * <P>
  89  * All operations on a regular expression are performed in a
  90  * thread-safe manner.
  91  *
  92  * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
  93  * @version 1.1.4-dev, to be released
  94  */
  95
  96 public class RE extends REToken {
  97   // This String will be returned by getVersion()
  98   private static final String VERSION = "1.1.4-dev";
  99
 100   // The localized strings are kept in a separate file
 101   // Ibex LOCAL: we can't bundle property lists within the ibex executable on all platforms
 102   // private static ResourceBundle messages = PropertyResourceBundle.getBundle("gnu/regexp/MessagesBundle", Locale.getDefault());
 103
 104   // These are, respectively, the first and last tokens in our linked list
 105   // If there is only one token, firstToken == lastToken
 106   private REToken firstToken, lastToken;
 107
 108   // This is the number of subexpressions in this regular expression,
 109   // with a minimum value of zero.  Returned by getNumSubs()
 110   private int numSubs;
 111
 112     /** Minimum length, in characters, of any possible match. */
 113     private int minimumLength;
 114
 115   /**
 116    * Compilation flag. Do  not  differentiate  case.   Subsequent
 117    * searches  using  this  RE will be case insensitive.
 118    */
 119   public static final int REG_ICASE = 2;
 120
 121   /**
 122    * Compilation flag. The match-any-character operator (dot)
 123    * will match a newline character.  When set this overrides the syntax
 124    * bit RE_DOT_NEWLINE (see RESyntax for details).  This is equivalent to
 125    * the "/s" operator in Perl.
 126    */
 127   public static final int REG_DOT_NEWLINE = 4;
 128
 129   /**
 130    * Compilation flag. Use multiline mode.  In this mode, the ^ and $
 131    * anchors will match based on newlines within the input. This is
 132    * equivalent to the "/m" operator in Perl.
 133    */
 134   public static final int REG_MULTILINE = 8;
 135
 136   /**
 137    * Execution flag.
 138    * The match-beginning operator (^) will not match at the beginning
 139    * of the input string. Useful for matching on a substring when you
 140    * know the context of the input is such that position zero of the
 141    * input to the match test is not actually position zero of the text.
 142    * <P>
 143    * This example demonstrates the results of various ways of matching on
 144    * a substring.
 145    * <P>
 146    * <CODE>
 147    * String s = "food bar fool";<BR>
 148    * RE exp = new RE("^foo.");<BR>
 149    * REMatch m0 = exp.getMatch(s);<BR>
 150    * REMatch m1 = exp.getMatch(s.substring(8));<BR>
 151    * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR>
 152    * REMatch m3 = exp.getMatch(s,8);                            <BR>
 153    * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX);         <BR>
 154    * <P>
 155    * // Results:<BR>
 156    * //  m0 = "food"<BR>
 157    * //  m1 = "fool"<BR>
 158    * //  m2 = null<BR>
 159    * //  m3 = null<BR>
 160    * //  m4 = "fool"<BR>
 161    * </CODE>
 162    */
 163   public static final int REG_NOTBOL = 16;
 164
 165   /**
 166    * Execution flag.
 167    * The match-end operator ($) does not match at the end
 168    * of the input string. Useful for matching on substrings.
 169    */
 170   public static final int REG_NOTEOL = 32;
 171
 172   /**
 173    * Execution flag.
 174    * When a match method is invoked that starts matching at a non-zero
 175    * index into the input, treat the input as if it begins at the index
 176    * given.  The effect of this flag is that the engine does not "see"
 177    * any text in the input before the given index.  This is useful so
 178    * that the match-beginning operator (^) matches not at position 0
 179    * in the input string, but at the position the search started at
 180    * (based on the index input given to the getMatch function).  See
 181    * the example under REG_NOTBOL.  It also affects the use of the \&lt;
 182    * and \b operators.
 183    */
 184   public static final int REG_ANCHORINDEX = 64;
 185
 186   /**
 187    * Execution flag.
 188    * The substitute and substituteAll methods will not attempt to
 189    * interpolate occurrences of $1-$9 in the replacement text with
 190    * the corresponding subexpressions.  For example, you may want to
 191    * replace all matches of "one dollar" with "$1".
 192    */
 193   public static final int REG_NO_INTERPOLATE = 128;
 194
 195   /** Returns a string representing the version of the gnu.regexp package. */
 196   public static final String version() {
 197     return VERSION;
 198   }
 199
 200   // Retrieves a message from the ResourceBundle
 201   // Ibex LOCAL: we can't bundle property lists within the ibex executable on all platforms
 202   //            for simplicity, just lookup the errors this way.
 203   static final String getLocalizedMessage(String key) {
 204     if(key.equals("error.prefix")) return "At position {0} in regular expression pattern:";
 205     else if(key.equals("repeat.assertion")) return "repeated token is zero-width assertion";
 206     else if(key.equals("repeat.chained")) return "attempted to repeat a token that is already repeated";
 207     else if(key.equals("repeat.no.token")) return "quantifier (?*+{}) without preceding token";
 208     else if(key.equals("repeat.empty.token")) return "repeated token may be empty";
 209     else if(key.equals("unmatched.brace")) return "unmatched brace";
 210     else if(key.equals("unmatched.bracket")) return "unmatched bracket";
 211     else if(key.equals("unmatched.paren")) return "unmatched parenthesis";
 212     else if(key.equals("interval.no.end")) return "expected end of interval";
 213     else if(key.equals("class.no.end")) return "expected end of character class";
 214     else if(key.equals("subexpr.no.end")) return "expected end of subexpression";
 215     else if(key.equals("interval.order")) return "interval minimum is greater than maximum";
 216     else if(key.equals("interval.error")) return "interval is empty or contains illegal chracters";
 217     else if(key.equals("ends.with.backslash")) return "backslash at end of pattern";
 218     else if(key.equals("syntax.final")) return "Syntax has been declared final and cannot be modified";
 219     return "Unknown regexp error";
 220     //return messages.getString(key);
 221   }
 222
 223   /**
 224    * Constructs a regular expression pattern buffer without any compilation
 225    * flags set, and using the default syntax (RESyntax.RE_SYNTAX_PERL5).
 226    *
 227    * @param pattern A regular expression pattern, in the form of a String,
 228    *   StringBuffer or char[].  Other input types will be converted to
 229    *   strings using the toString() method.
 230    * @exception REException The input pattern could not be parsed.
 231    * @exception NullPointerException The pattern was null.
 232    */
 233   public RE(Object pattern) throws REException {
 234     this(pattern,0,RESyntax.RE_SYNTAX_PERL5,0,0);
 235   }
 236
 237   /**
 238    * Constructs a regular expression pattern buffer using the specified
 239    * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5).
 240    *
 241    * @param pattern A regular expression pattern, in the form of a String,
 242    *   StringBuffer, or char[].  Other input types will be converted to
 243    *   strings using the toString() method.
 244    * @param cflags The logical OR of any combination of the compilation flags listed above.
 245    * @exception REException The input pattern could not be parsed.
 246    * @exception NullPointerException The pattern was null.
 247    */
 248   public RE(Object pattern, int cflags) throws REException {
 249     this(pattern,cflags,RESyntax.RE_SYNTAX_PERL5,0,0);
 250   }
 251
 252   /**
 253    * Constructs a regular expression pattern buffer using the specified
 254    * compilation flags and regular expression syntax.
 255    *
 256    * @param pattern A regular expression pattern, in the form of a String,
 257    *   StringBuffer, or char[].  Other input types will be converted to
 258    *   strings using the toString() method.
 259    * @param cflags The logical OR of any combination of the compilation flags listed above.
 260    * @param syntax The type of regular expression syntax to use.
 261    * @exception REException The input pattern could not be parsed.
 262    * @exception NullPointerException The pattern was null.
 263    */
 264   public RE(Object pattern, int cflags, RESyntax syntax) throws REException {
 265     this(pattern,cflags,syntax,0,0);
 266   }
 267
 268   // internal constructor used for alternation
 269   private RE(REToken first, REToken last,int subs, int subIndex, int minLength) {
 270     super(subIndex);
 271     firstToken = first;
 272     lastToken = last;
 273     numSubs = subs;
 274     minimumLength = minLength;
 275     addToken(new RETokenEndSub(subIndex));
 276   }
 277
 278   private RE(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
 279     super(myIndex); // Subexpression index of this token.
 280     initialize(patternObj, cflags, syntax, myIndex, nextSub);
 281   }
 282
 283     // For use by subclasses
 284     protected RE() { super(0); }
 285
 286     // The meat of construction
 287   protected void initialize(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
 288       char[] pattern;
 289     if (patternObj instanceof String) {
 290       pattern = ((String) patternObj).toCharArray();
 291     } else if (patternObj instanceof char[]) {
 292       pattern = (char[]) patternObj;
 293     } else if (patternObj instanceof StringBuffer) {
 294       pattern = new char [((StringBuffer) patternObj).length()];
 295       ((StringBuffer) patternObj).getChars(0,pattern.length,pattern,0);
 296     } else {
 297         pattern = patternObj.toString().toCharArray();
 298     }
 299
 300     int pLength = pattern.length;
 301
 302     numSubs = 0; // Number of subexpressions in this token.
 303     Vector branches = null;
 304
 305     // linked list of tokens (sort of -- some closed loops can exist)
 306     firstToken = lastToken = null;
 307
 308     // Precalculate these so we don't pay for the math every time we
 309     // need to access them.
 310     boolean insens = ((cflags & REG_ICASE) > 0);
 311
 312     // Parse pattern into tokens.  Does anyone know if it's more efficient
 313     // to use char[] than a String.charAt()?  I'm assuming so.
 314
 315     // index tracks the position in the char array
 316     int index = 0;
 317
 318     // this will be the current parse character (pattern[index])
 319     CharUnit unit = new CharUnit();
 320
 321     // This is used for {x,y} calculations
 322     IntPair minMax = new IntPair();
 323
 324     // Buffer a token so we can create a TokenRepeated, etc.
 325     REToken currentToken = null;
 326     char ch;
 327
 328     while (index < pLength) {
 329       // read the next character unit (including backslash escapes)
 330       index = getCharUnit(pattern,index,unit);
 331
 332       // ALTERNATION OPERATOR
 333       //  \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
 334       //  not available if RE_LIMITED_OPS is set
 335
 336       // TODO: the '\n' literal here should be a test against REToken.newline,
 337       // which unfortunately may be more than a single character.
 338       if ( ( (unit.ch == '|' && (syntax.get(RESyntax.RE_NO_BK_VBAR) ^ unit.bk))
 339              || (syntax.get(RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n') && !unit.bk) )
 340            && !syntax.get(RESyntax.RE_LIMITED_OPS)) {
 341         // make everything up to here be a branch. create vector if nec.
 342         addToken(currentToken);
 343         RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength);
 344         minimumLength = 0;
 345         if (branches == null) {
 346             branches = new Vector();
 347         }
 348         branches.addElement(theBranch);
 349         firstToken = lastToken = currentToken = null;
 350       }
 351
 352       // INTERVAL OPERATOR:
 353       //  {x} | {x,} | {x,y}  (RE_INTERVALS && RE_NO_BK_BRACES)
 354       //  \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES)
 355       //
 356       // OPEN QUESTION:
 357       //  what is proper interpretation of '{' at start of string?
 358
 359       else if ((unit.ch == '{') && syntax.get(RESyntax.RE_INTERVALS) && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)) {
 360         int newIndex = getMinMax(pattern,index,minMax,syntax);
 361         if (newIndex > index) {
 362           if (minMax.first > minMax.second)
 363             throw new REException(getLocalizedMessage("interval.order"),REException.REG_BADRPT,newIndex);
 364           if (currentToken == null)
 365             throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,newIndex);
 366           if (currentToken instanceof RETokenRepeated)
 367             throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,newIndex);
 368           if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 369             throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,newIndex);
 370           if ((currentToken.getMinimumLength() == 0) && (minMax.second == Integer.MAX_VALUE))
 371             throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,newIndex);
 372           index = newIndex;
 373           currentToken = setRepeated(currentToken,minMax.first,minMax.second,index);
 374         }
 375         else {
 376           addToken(currentToken);
 377           currentToken = new RETokenChar(subIndex,unit.ch,insens);
 378         }
 379       }
 380
 381       // LIST OPERATOR:
 382       //  [...] | [^...]
 383
 384       else if ((unit.ch == '[') && !unit.bk) {
 385         Vector options = new Vector();
 386         boolean negative = false;
 387         char lastChar = 0;
 388         if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index);
 389
 390         // Check for initial caret, negation
 391         if ((ch = pattern[index]) == '^') {
 392           negative = true;
 393           if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
 394           ch = pattern[index];
 395         }
 396
 397         // Check for leading right bracket literal
 398         if (ch == ']') {
 399           lastChar = ch;
 400           if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
 401         }
 402
 403         while ((ch = pattern[index++]) != ']') {
 404           if ((ch == '-') && (lastChar != 0)) {
 405             if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
 406             if ((ch = pattern[index]) == ']') {
 407               options.addElement(new RETokenChar(subIndex,lastChar,insens));
 408               lastChar = '-';
 409             } else {
 410               options.addElement(new RETokenRange(subIndex,lastChar,ch,insens));
 411               lastChar = 0;
 412               index++;
 413             }
 414           } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
 415             if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
 416             int posixID = -1;
 417             boolean negate = false;
 418             char asciiEsc = 0;
 419             if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
 420               switch (pattern[index]) {
 421               case 'D':
 422                 negate = true;
 423               case 'd':
 424                 posixID = RETokenPOSIX.DIGIT;
 425                 break;
 426               case 'S':
 427                 negate = true;
 428               case 's':
 429                 posixID = RETokenPOSIX.SPACE;
 430                 break;
 431               case 'W':
 432                 negate = true;
 433               case 'w':
 434                 posixID = RETokenPOSIX.ALNUM;
 435                 break;
 436               }
 437             }
 438             else if ("nrt".indexOf(pattern[index]) != -1) {
 439               switch (pattern[index]) {
 440                 case 'n':
 441                   asciiEsc = '\n';
 442                   break;
 443                 case 't':
 444                   asciiEsc = '\t';
 445                   break;
 446                 case 'r':
 447                   asciiEsc = '\r';
 448                   break;
 449               }
 450             }
 451             if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
 452
 453             if (posixID != -1) {
 454               options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate));
 455             } else if (asciiEsc != 0) {
 456               lastChar = asciiEsc;
 457             } else {
 458               lastChar = pattern[index];
 459             }
 460             ++index;
 461           } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) {
 462             StringBuffer posixSet = new StringBuffer();
 463             index = getPosixSet(pattern,index+1,posixSet);
 464             int posixId = RETokenPOSIX.intValue(posixSet.toString());
 465             if (posixId != -1)
 466               options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false));
 467           } else {
 468             if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
 469             lastChar = ch;
 470           }
 471           if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
 472         } // while in list
 473         // Out of list, index is one past ']'
 474
 475         if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
 476
 477         // Create a new RETokenOneOf
 478         addToken(currentToken);
 479         options.trimToSize();
 480         currentToken = new RETokenOneOf(subIndex,options,negative);
 481       }
 482
 483       // SUBEXPRESSIONS
 484       //  (...) | \(...\) depending on RE_NO_BK_PARENS
 485
 486       else if ((unit.ch == '(') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)) {
 487         boolean pure = false;
 488         boolean comment = false;
 489         boolean lookAhead = false;
 490         boolean negativelh = false;
 491         if ((index+1 < pLength) && (pattern[index] == '?')) {
 492           switch (pattern[index+1]) {
 493           case '!':
 494             if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
 495               pure = true;
 496               negativelh = true;
 497               lookAhead = true;
 498               index += 2;
 499             }
 500             break;
 501           case '=':
 502             if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
 503               pure = true;
 504               lookAhead = true;
 505               index += 2;
 506             }
 507             break;
 508           case ':':
 509             if (syntax.get(RESyntax.RE_PURE_GROUPING)) {
 510               pure = true;
 511               index += 2;
 512             }
 513             break;
 514           case '#':
 515             if (syntax.get(RESyntax.RE_COMMENTS)) {
 516               comment = true;
 517             }
 518             break;
 519           default:
 520             throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index);
 521           }
 522         }
 523
 524         if (index >= pLength) {
 525             throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index);
 526         }
 527
 528         // find end of subexpression
 529         int endIndex = index;
 530         int nextIndex = index;
 531         int nested = 0;
 532
 533         while ( ((nextIndex = getCharUnit(pattern,endIndex,unit)) > 0)
 534                 && !(nested == 0 && (unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)) )
 535           if ((endIndex = nextIndex) >= pLength)
 536             throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex);
 537           else if (unit.ch == '(' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))
 538             nested++;
 539           else if (unit.ch == ')' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))
 540             nested--;
 541
 542         // endIndex is now position at a ')','\)'
 543         // nextIndex is end of string or position after ')' or '\)'
 544
 545         if (comment) index = nextIndex;
 546         else { // not a comment
 547           // create RE subexpression as token.
 548           addToken(currentToken);
 549           if (!pure) {
 550             numSubs++;
 551           }
 552
 553           int useIndex = (pure || lookAhead) ? 0 : nextSub + numSubs;
 554           currentToken = new RE(String.valueOf(pattern,index,endIndex-index).toCharArray(),cflags,syntax,useIndex,nextSub + numSubs);
 555           numSubs += ((RE) currentToken).getNumSubs();
 556
 557           if (lookAhead) {
 558               currentToken = new RETokenLookAhead(currentToken,negativelh);
 559           }
 560
 561           index = nextIndex;
 562         } // not a comment
 563       } // subexpression
 564
 565       // UNMATCHED RIGHT PAREN
 566       // ) or \) throw exception if
 567       // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
 568       else if (!syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) && ((unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))) {
 569         throw new REException(getLocalizedMessage("unmatched.paren"),REException.REG_EPAREN,index);
 570       }
 571
 572       // START OF LINE OPERATOR
 573       //  ^
 574
 575       else if ((unit.ch == '^') && !unit.bk) {
 576         addToken(currentToken);
 577         currentToken = null;
 578         addToken(new RETokenStart(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null));
 579       }
 580
 581       // END OF LINE OPERATOR
 582       //  $
 583
 584       else if ((unit.ch == '$') && !unit.bk) {
 585         addToken(currentToken);
 586         currentToken = null;
 587         addToken(new RETokenEnd(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null));
 588       }
 589
 590       // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null)
 591       //  .
 592
 593       else if ((unit.ch == '.') && !unit.bk) {
 594         addToken(currentToken);
 595         currentToken = new RETokenAny(subIndex,syntax.get(RESyntax.RE_DOT_NEWLINE) || ((cflags & REG_DOT_NEWLINE) > 0),syntax.get(RESyntax.RE_DOT_NOT_NULL));
 596       }
 597
 598       // ZERO-OR-MORE REPEAT OPERATOR
 599       //  *
 600
 601       else if ((unit.ch == '*') && !unit.bk) {
 602         if (currentToken == null)
 603           throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
 604         if (currentToken instanceof RETokenRepeated)
 605           throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
 606         if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 607           throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
 608         if (currentToken.getMinimumLength() == 0)
 609           throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,index);
 610         currentToken = setRepeated(currentToken,0,Integer.MAX_VALUE,index);
 611       }
 612
 613       // ONE-OR-MORE REPEAT OPERATOR
 614       //  + | \+ depending on RE_BK_PLUS_QM
 615       //  not available if RE_LIMITED_OPS is set
 616
 617       else if ((unit.ch == '+') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ unit.bk)) {
 618         if (currentToken == null)
 619           throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
 620         if (currentToken instanceof RETokenRepeated)
 621           throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
 622         if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 623           throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
 624         if (currentToken.getMinimumLength() == 0)
 625           throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,index);
 626         currentToken = setRepeated(currentToken,1,Integer.MAX_VALUE,index);
 627       }
 628
 629       // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
 630       //  ? | \? depending on RE_BK_PLUS_QM
 631       //  not available if RE_LIMITED_OPS is set
 632       //  stingy matching if RE_STINGY_OPS is set and it follows a quantifier
 633
 634       else if ((unit.ch == '?') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ unit.bk)) {
 635         if (currentToken == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
 636
 637         // Check for stingy matching on RETokenRepeated
 638         if (currentToken instanceof RETokenRepeated) {
 639           if (syntax.get(RESyntax.RE_STINGY_OPS) && !((RETokenRepeated)currentToken).isStingy())
 640             ((RETokenRepeated)currentToken).makeStingy();
 641           else
 642             throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
 643         }
 644         else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 645           throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
 646         else
 647           currentToken = setRepeated(currentToken,0,1,index);
 648       }
 649
 650       // BACKREFERENCE OPERATOR
 651       //  \1 \2 ... \9
 652       // not available if RE_NO_BK_REFS is set
 653
 654       else if (unit.bk && Character.isDigit(unit.ch) && !syntax.get(RESyntax.RE_NO_BK_REFS)) {
 655         addToken(currentToken);
 656         currentToken = new RETokenBackRef(subIndex,Character.digit(unit.ch,10),insens);
 657       }
 658
 659       // START OF STRING OPERATOR
 660       //  \A if RE_STRING_ANCHORS is set
 661
 662       else if (unit.bk && (unit.ch == 'A') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 663         addToken(currentToken);
 664         currentToken = new RETokenStart(subIndex,null);
 665       }
 666
 667       // WORD BREAK OPERATOR
 668       //  \b if ????
 669
 670       else if (unit.bk && (unit.ch == 'b') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 671           addToken(currentToken);
 672           currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, false);
 673       }
 674
 675       // WORD BEGIN OPERATOR
 676       //  \< if ????
 677       else if (unit.bk && (unit.ch == '<')) {
 678           addToken(currentToken);
 679           currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN, false);
 680       }
 681
 682       // WORD END OPERATOR
 683       //  \> if ????
 684       else if (unit.bk && (unit.ch == '>')) {
 685           addToken(currentToken);
 686           currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.END, false);
 687       }
 688
 689       // NON-WORD BREAK OPERATOR
 690       // \B if ????
 691
 692       else if (unit.bk && (unit.ch == 'B') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 693           addToken(currentToken);
 694           currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, true);
 695       }
 696
 697
 698       // DIGIT OPERATOR
 699       //  \d if RE_CHAR_CLASS_ESCAPES is set
 700
 701       else if (unit.bk && (unit.ch == 'd') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 702         addToken(currentToken);
 703         currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,false);
 704       }
 705
 706       // NON-DIGIT OPERATOR
 707       //  \D
 708
 709         else if (unit.bk && (unit.ch == 'D') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 710           addToken(currentToken);
 711           currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,true);
 712         }
 713
 714         // NEWLINE ESCAPE
 715         //  \n
 716
 717         else if (unit.bk && (unit.ch == 'n')) {
 718           addToken(currentToken);
 719           currentToken = new RETokenChar(subIndex,'\n',false);
 720         }
 721
 722         // RETURN ESCAPE
 723         //  \r
 724
 725         else if (unit.bk && (unit.ch == 'r')) {
 726           addToken(currentToken);
 727           currentToken = new RETokenChar(subIndex,'\r',false);
 728         }
 729
 730         // WHITESPACE OPERATOR
 731         //  \s if RE_CHAR_CLASS_ESCAPES is set
 732
 733         else if (unit.bk && (unit.ch == 's') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 734           addToken(currentToken);
 735           currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,false);
 736         }
 737
 738         // NON-WHITESPACE OPERATOR
 739         //  \S
 740
 741         else if (unit.bk && (unit.ch == 'S') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 742           addToken(currentToken);
 743           currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,true);
 744         }
 745
 746         // TAB ESCAPE
 747         //  \t
 748
 749         else if (unit.bk && (unit.ch == 't')) {
 750           addToken(currentToken);
 751           currentToken = new RETokenChar(subIndex,'\t',false);
 752         }
 753
 754         // ALPHANUMERIC OPERATOR
 755         //  \w
 756
 757         else if (unit.bk && (unit.ch == 'w') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 758           addToken(currentToken);
 759           currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,false);
 760         }
 761
 762         // NON-ALPHANUMERIC OPERATOR
 763         //  \W
 764
 765         else if (unit.bk && (unit.ch == 'W') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 766           addToken(currentToken);
 767           currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,true);
 768         }
 769
 770         // END OF STRING OPERATOR
 771         //  \Z
 772
 773         else if (unit.bk && (unit.ch == 'Z') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 774           addToken(currentToken);
 775           currentToken = new RETokenEnd(subIndex,null);
 776         }
 777
 778         // NON-SPECIAL CHARACTER (or escape to make literal)
 779         //  c | \* for example
 780
 781         else {  // not a special character
 782           addToken(currentToken);
 783           currentToken = new RETokenChar(subIndex,unit.ch,insens);
 784         }
 785       } // end while
 786
 787     // Add final buffered token and an EndSub marker
 788     addToken(currentToken);
 789
 790     if (branches != null) {
 791         branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength));
 792         branches.trimToSize(); // compact the Vector
 793         minimumLength = 0;
 794         firstToken = lastToken = null;
 795         addToken(new RETokenOneOf(subIndex,branches,false));
 796     }
 797     else addToken(new RETokenEndSub(subIndex));
 798
 799   }
 800
 801   private static int getCharUnit(char[] input, int index, CharUnit unit) throws REException {
 802     unit.ch = input[index++];
 803     if (unit.bk = (unit.ch == '\\'))
 804       if (index < input.length)
 805         unit.ch = input[index++];
 806       else throw new REException(getLocalizedMessage("ends.with.backslash"),REException.REG_ESCAPE,index);
 807     return index;
 808   }
 809
 810   /**
 811    * Checks if the regular expression matches the input in its entirety.
 812    *
 813    * @param input The input text.
 814    */
 815   public boolean isMatch(Object input) {
 816     return isMatch(input,0,0);
 817   }
 818
 819   /**
 820    * Checks if the input string, starting from index, is an exact match of
 821    * this regular expression.
 822    *
 823    * @param input The input text.
 824    * @param index The offset index at which the search should be begin.
 825    */
 826   public boolean isMatch(Object input,int index) {
 827     return isMatch(input,index,0);
 828   }
 829
 830
 831   /**
 832    * Checks if the input, starting from index and using the specified
 833    * execution flags, is an exact match of this regular expression.
 834    *
 835    * @param input The input text.
 836    * @param index The offset index at which the search should be begin.
 837    * @param eflags The logical OR of any execution flags above.
 838    */
 839   public boolean isMatch(Object input,int index,int eflags) {
 840     return isMatchImpl(makeCharIndexed(input,index),index,eflags);
 841   }
 842
 843   private boolean isMatchImpl(CharIndexed input, int index, int eflags) {
 844     if (firstToken == null)  // Trivial case
 845       return (input.charAt(0) == CharIndexed.OUT_OF_BOUNDS);
 846     REMatch m = new REMatch(numSubs, index, eflags);
 847     if (firstToken.match(input, m)) {
 848         while (m != null) {
 849             if (input.charAt(m.index) == CharIndexed.OUT_OF_BOUNDS) {
 850                 return true;
 851             }
 852             m = m.next;
 853         }
 854     }
 855     return false;
 856   }
 857
 858   /**
 859    * Returns the maximum number of subexpressions in this regular expression.
 860    * If the expression contains branches, the value returned will be the
 861    * maximum subexpressions in any of the branches.
 862    */
 863   public int getNumSubs() {
 864     return numSubs;
 865   }
 866
 867   // Overrides REToken.setUncle
 868   void setUncle(REToken uncle) {
 869       if (lastToken != null) {
 870           lastToken.setUncle(uncle);
 871       } else super.setUncle(uncle); // to deal with empty subexpressions
 872   }
 873
 874   // Overrides REToken.chain
 875
 876   boolean chain(REToken next) {
 877     super.chain(next);
 878     setUncle(next);
 879     return true;
 880   }
 881
 882   /**
 883    * Returns the minimum number of characters that could possibly
 884    * constitute a match of this regular expression.
 885    */
 886   public int getMinimumLength() {
 887       return minimumLength;
 888   }
 889
 890   /**
 891    * Returns an array of all matches found in the input.
 892    *
 893    * If the regular expression allows the empty string to match, it will
 894    * substitute matches at all positions except the end of the input.
 895    *
 896    * @param input The input text.
 897    * @return a non-null (but possibly zero-length) array of matches
 898    */
 899   public REMatch[] getAllMatches(Object input) {
 900     return getAllMatches(input,0,0);
 901   }
 902
 903   /**
 904    * Returns an array of all matches found in the input,
 905    * beginning at the specified index position.
 906    *
 907    * If the regular expression allows the empty string to match, it will
 908    * substitute matches at all positions except the end of the input.
 909    *
 910    * @param input The input text.
 911    * @param index The offset index at which the search should be begin.
 912    * @return a non-null (but possibly zero-length) array of matches
 913    */
 914   public REMatch[] getAllMatches(Object input, int index) {
 915     return getAllMatches(input,index,0);
 916   }
 917
 918   /**
 919    * Returns an array of all matches found in the input string,
 920    * beginning at the specified index position and using the specified
 921    * execution flags.
 922    *
 923    * If the regular expression allows the empty string to match, it will
 924    * substitute matches at all positions except the end of the input.
 925    *
 926    * @param input The input text.
 927    * @param index The offset index at which the search should be begin.
 928    * @param eflags The logical OR of any execution flags above.
 929    * @return a non-null (but possibly zero-length) array of matches
 930    */
 931   public REMatch[] getAllMatches(Object input, int index, int eflags) {
 932     return getAllMatchesImpl(makeCharIndexed(input,index),index,eflags);
 933   }
 934
 935   // this has been changed since 1.03 to be non-overlapping matches
 936   private REMatch[] getAllMatchesImpl(CharIndexed input, int index, int eflags) {
 937     Vector all = new Vector();
 938     REMatch m = null;
 939     while ((m = getMatchImpl(input,index,eflags,null)) != null) {
 940       all.addElement(m);
 941       index = m.getEndIndex();
 942       if (m.end[0] == 0) {   // handle pathological case of zero-length match
 943         index++;
 944         input.move(1);
 945       } else {
 946         input.move(m.end[0]);
 947       }
 948       if (!input.isValid()) break;
 949     }
 950     REMatch[] mset = new REMatch[all.size()];
 951     all.copyInto(mset);
 952     return mset;
 953   }
 954
 955     /* Implements abstract method REToken.match() */
 956     boolean match(CharIndexed input, REMatch mymatch) {
 957         if (firstToken == null) return next(input, mymatch);
 958
 959         // Note the start of this subexpression
 960         mymatch.start[subIndex] = mymatch.index;
 961
 962         return firstToken.match(input, mymatch);
 963     }
 964
 965   /**
 966    * Returns the first match found in the input.  If no match is found,
 967    * null is returned.
 968    *
 969    * @param input The input text.
 970    * @return An REMatch instance referencing the match, or null if none.
 971    */
 972   public REMatch getMatch(Object input) {
 973     return getMatch(input,0,0);
 974   }
 975
 976   /**
 977    * Returns the first match found in the input, beginning
 978    * the search at the specified index.  If no match is found,
 979    * returns null.
 980    *
 981    * @param input The input text.
 982    * @param index The offset within the text to begin looking for a match.
 983    * @return An REMatch instance referencing the match, or null if none.
 984    */
 985   public REMatch getMatch(Object input, int index) {
 986     return getMatch(input,index,0);
 987   }
 988
 989   /**
 990    * Returns the first match found in the input, beginning
 991    * the search at the specified index, and using the specified
 992    * execution flags.  If no match is found, returns null.
 993    *
 994    * @param input The input text.
 995    * @param index The offset index at which the search should be begin.
 996    * @param eflags The logical OR of any execution flags above.
 997    * @return An REMatch instance referencing the match, or null if none.
 998    */
 999   public REMatch getMatch(Object input, int index, int eflags) {
1000     return getMatch(input,index,eflags,null);
1001   }
1002
1003   /**
1004    * Returns the first match found in the input, beginning the search
1005    * at the specified index, and using the specified execution flags.
1006    * If no match is found, returns null.  If a StringBuffer is
1007    * provided and is non-null, the contents of the input text from the
1008    * index to the beginning of the match (or to the end of the input,
1009    * if there is no match) are appended to the StringBuffer.
1010    *
1011    * @param input The input text.
1012    * @param index The offset index at which the search should be begin.
1013    * @param eflags The logical OR of any execution flags above.
1014    * @param buffer The StringBuffer to save pre-match text in.
1015    * @return An REMatch instance referencing the match, or null if none.  */
1016   public REMatch getMatch(Object input, int index, int eflags, StringBuffer buffer) {
1017     return getMatchImpl(makeCharIndexed(input,index),index,eflags,buffer);
1018   }
1019
1020   REMatch getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer buffer) {
1021       // Create a new REMatch to hold results
1022       REMatch mymatch = new REMatch(numSubs, anchor, eflags);
1023       do {
1024           // Optimization: check if anchor + minimumLength > length
1025           if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) {
1026               if (match(input, mymatch)) {
1027                   // Find longest match of them all to observe leftmost longest
1028                   REMatch longest = mymatch;
1029                   while ((mymatch = mymatch.next) != null) {
1030                       if (mymatch.index > longest.index) {
1031                           longest = mymatch;
1032                       }
1033                   }
1034
1035                   longest.end[0] = longest.index;
1036                   longest.finish(input);
1037                   return longest;
1038               }
1039           }
1040           mymatch.clear(++anchor);
1041           // Append character to buffer if needed
1042           if (buffer != null && input.charAt(0) != CharIndexed.OUT_OF_BOUNDS) {
1043               buffer.append(input.charAt(0));
1044           }
1045       } while (input.move(1));
1046
1047       return null;
1048   }
1049
1050   /**
1051    * Returns an REMatchEnumeration that can be used to iterate over the
1052    * matches found in the input text.
1053    *
1054    * @param input The input text.
1055    * @return A non-null REMatchEnumeration instance.
1056    */
1057   public REMatchEnumeration getMatchEnumeration(Object input) {
1058     return getMatchEnumeration(input,0,0);
1059   }
1060
1061
1062   /**
1063    * Returns an REMatchEnumeration that can be used to iterate over the
1064    * matches found in the input text.
1065    *
1066    * @param input The input text.
1067    * @param index The offset index at which the search should be begin.
1068    * @return A non-null REMatchEnumeration instance, with its input cursor
1069    *  set to the index position specified.
1070    */
1071   public REMatchEnumeration getMatchEnumeration(Object input, int index) {
1072     return getMatchEnumeration(input,index,0);
1073   }
1074
1075   /**
1076    * Returns an REMatchEnumeration that can be used to iterate over the
1077    * matches found in the input text.
1078    *
1079    * @param input The input text.
1080    * @param index The offset index at which the search should be begin.
1081    * @param eflags The logical OR of any execution flags above.
1082    * @return A non-null REMatchEnumeration instance, with its input cursor
1083    *  set to the index position specified.
1084    */
1085   public REMatchEnumeration getMatchEnumeration(Object input, int index, int eflags) {
1086     return new REMatchEnumeration(this,makeCharIndexed(input,index),index,eflags);
1087   }
1088
1089
1090   /**
1091    * Substitutes the replacement text for the first match found in the input.
1092    *
1093    * @param input The input text.
1094    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1095    * @return A String interpolating the substituted text.
1096    * @see REMatch#substituteInto
1097    */
1098   public String substitute(Object input,String replace) {
1099     return substitute(input,replace,0,0);
1100   }
1101
1102   /**
1103    * Substitutes the replacement text for the first match found in the input
1104    * beginning at the specified index position.  Specifying an index
1105    * effectively causes the regular expression engine to throw away the
1106    * specified number of characters.
1107    *
1108    * @param input The input text.
1109    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1110    * @param index The offset index at which the search should be begin.
1111    * @return A String containing the substring of the input, starting
1112    *   at the index position, and interpolating the substituted text.
1113    * @see REMatch#substituteInto
1114    */
1115   public String substitute(Object input,String replace,int index) {
1116     return substitute(input,replace,index,0);
1117   }
1118
1119   /**
1120    * Substitutes the replacement text for the first match found in the input
1121    * string, beginning at the specified index position and using the
1122    * specified execution flags.
1123    *
1124    * @param input The input text.
1125    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1126    * @param index The offset index at which the search should be begin.
1127    * @param eflags The logical OR of any execution flags above.
1128    * @return A String containing the substring of the input, starting
1129    *   at the index position, and interpolating the substituted text.
1130    * @see REMatch#substituteInto
1131    */
1132   public String substitute(Object input,String replace,int index,int eflags) {
1133     return substituteImpl(makeCharIndexed(input,index),replace,index,eflags);
1134   }
1135
1136   private String substituteImpl(CharIndexed input,String replace,int index,int eflags) {
1137     StringBuffer buffer = new StringBuffer();
1138     REMatch m = getMatchImpl(input,index,eflags,buffer);
1139     if (m==null) return buffer.toString();
1140     buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ?
1141                    replace : m.substituteInto(replace) );
1142     if (input.move(m.end[0])) {
1143       do {
1144         buffer.append(input.charAt(0));
1145       } while (input.move(1));
1146     }
1147     return buffer.toString();
1148   }
1149
1150   /**
1151    * Substitutes the replacement text for each non-overlapping match found
1152    * in the input text.
1153    *
1154    * @param input The input text.
1155    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1156    * @return A String interpolating the substituted text.
1157    * @see REMatch#substituteInto
1158    */
1159   public String substituteAll(Object input,String replace) {
1160     return substituteAll(input,replace,0,0);
1161   }
1162
1163   /**
1164    * Substitutes the replacement text for each non-overlapping match found
1165    * in the input text, starting at the specified index.
1166    *
1167    * If the regular expression allows the empty string to match, it will
1168    * substitute matches at all positions except the end of the input.
1169    *
1170    * @param input The input text.
1171    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1172    * @param index The offset index at which the search should be begin.
1173    * @return A String containing the substring of the input, starting
1174    *   at the index position, and interpolating the substituted text.
1175    * @see REMatch#substituteInto
1176    */
1177   public String substituteAll(Object input,String replace,int index) {
1178     return substituteAll(input,replace,index,0);
1179   }
1180
1181   /**
1182    * Substitutes the replacement text for each non-overlapping match found
1183    * in the input text, starting at the specified index and using the
1184    * specified execution flags.
1185    *
1186    * @param input The input text.
1187    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1188    * @param index The offset index at which the search should be begin.
1189    * @param eflags The logical OR of any execution flags above.
1190    * @return A String containing the substring of the input, starting
1191    *   at the index position, and interpolating the substituted text.
1192    * @see REMatch#substituteInto
1193    */
1194   public String substituteAll(Object input,String replace,int index,int eflags) {
1195     return substituteAllImpl(makeCharIndexed(input,index),replace,index,eflags);
1196   }
1197
1198   private String substituteAllImpl(CharIndexed input,String replace,int index,int eflags) {
1199     StringBuffer buffer = new StringBuffer();
1200     REMatch m;
1201     while ((m = getMatchImpl(input,index,eflags,buffer)) != null) {
1202         buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ?
1203                        replace : m.substituteInto(replace) );
1204       index = m.getEndIndex();
1205       if (m.end[0] == 0) {
1206         char ch = input.charAt(0);
1207         if (ch != CharIndexed.OUT_OF_BOUNDS)
1208             buffer.append(ch);
1209         input.move(1);
1210       } else {
1211           input.move(m.end[0]);
1212       }
1213
1214       if (!input.isValid()) break;
1215     }
1216     return buffer.toString();
1217   }
1218
1219   /* Helper function for constructor */
1220   private void addToken(REToken next) {
1221     if (next == null) return;
1222     minimumLength += next.getMinimumLength();
1223     if (firstToken == null) {
1224         lastToken = firstToken = next;
1225     } else {
1226       // if chain returns false, it "rejected" the token due to
1227       // an optimization, and next was combined with lastToken
1228       if (lastToken.chain(next)) {
1229           lastToken = next;
1230       }
1231     }
1232   }
1233
1234   private static REToken setRepeated(REToken current, int min, int max, int index) throws REException {
1235     if (current == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
1236     return new RETokenRepeated(current.subIndex,current,min,max);
1237   }
1238
1239   private static int getPosixSet(char[] pattern,int index,StringBuffer buf) {
1240     // Precondition: pattern[index-1] == ':'
1241     // we will return pos of closing ']'.
1242     int i;
1243     for (i=index; i<(pattern.length-1); i++) {
1244       if ((pattern[i] == ':') && (pattern[i+1] == ']'))
1245         return i+2;
1246       buf.append(pattern[i]);
1247     }
1248     return index; // didn't match up
1249   }
1250
1251   private int getMinMax(char[] input,int index,IntPair minMax,RESyntax syntax) throws REException {
1252     // Precondition: input[index-1] == '{', minMax != null
1253
1254     boolean mustMatch = !syntax.get(RESyntax.RE_NO_BK_BRACES);
1255     int startIndex = index;
1256     if (index == input.length) {
1257       if (mustMatch)
1258         throw new REException(getLocalizedMessage("unmatched.brace"),REException.REG_EBRACE,index);
1259       else
1260         return startIndex;
1261     }
1262
1263     int min,max=0;
1264     CharUnit unit = new CharUnit();
1265     StringBuffer buf = new StringBuffer();
1266
1267     // Read string of digits
1268     do {
1269       index = getCharUnit(input,index,unit);
1270       if (Character.isDigit(unit.ch))
1271         buf.append(unit.ch);
1272     } while ((index != input.length) && Character.isDigit(unit.ch));
1273
1274     // Check for {} tomfoolery
1275     if (buf.length() == 0) {
1276       if (mustMatch)
1277         throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
1278       else
1279         return startIndex;
1280     }
1281
1282     min = Integer.parseInt(buf.toString());
1283
1284     if ((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk))
1285       max = min;
1286     else if (index == input.length)
1287       if (mustMatch)
1288         throw new REException(getLocalizedMessage("interval.no.end"),REException.REG_EBRACE,index);
1289       else
1290         return startIndex;
1291     else if ((unit.ch == ',') && !unit.bk) {
1292       buf = new StringBuffer();
1293       // Read string of digits
1294       while (((index = getCharUnit(input,index,unit)) != input.length) && Character.isDigit(unit.ch))
1295         buf.append(unit.ch);
1296
1297       if (!((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)))
1298         if (mustMatch)
1299           throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
1300         else
1301           return startIndex;
1302
1303       // This is the case of {x,}
1304       if (buf.length() == 0) max = Integer.MAX_VALUE;
1305       else max = Integer.parseInt(buf.toString());
1306     } else
1307       if (mustMatch)
1308         throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
1309       else
1310         return startIndex;
1311
1312     // We know min and max now, and they are valid.
1313
1314     minMax.first = min;
1315     minMax.second = max;
1316
1317     // return the index following the '}'
1318     return index;
1319   }
1320
1321    /**
1322     * Return a human readable form of the compiled regular expression,
1323     * useful for debugging.
1324     */
1325    public String toString() {
1326      StringBuffer sb = new StringBuffer();
1327      dump(sb);
1328      return sb.toString();
1329    }
1330
1331   void dump(StringBuffer os) {
1332     os.append('(');
1333     if (subIndex == 0)
1334       os.append("?:");
1335     if (firstToken != null)
1336       firstToken.dumpAll(os);
1337     os.append(')');
1338   }
1339
1340   // Cast input appropriately or throw exception
1341   private static CharIndexed makeCharIndexed(Object input, int index) {
1342       // We could let a String fall through to final input, but since
1343       // it's the most likely input type, we check it first.
1344     if (input instanceof String)
1345       return new CharIndexedString((String) input,index);
1346     else if (input instanceof char[])
1347       return new CharIndexedCharArray((char[]) input,index);
1348     else if (input instanceof StringBuffer)
1349       return new CharIndexedStringBuffer((StringBuffer) input,index);
1350     else if (input instanceof InputStream)
1351       return new CharIndexedInputStream((InputStream) input,index);
1352     else if (input instanceof Reader)
1353         return new CharIndexedReader((Reader) input, index);
1354     else if (input instanceof CharIndexed)
1355         return (CharIndexed) input; // do we lose index info?
1356     else
1357         return new CharIndexedString(input.toString(), index);
1358   }
1359 }