src/gnu/regexp/RE.java

   1 /*
   2  *  gnu/regexp/RE.java
   3  *  Copyright (C) 1998-2001 Wes Biggs
   4  *
   5  *  This library is free software; you can redistribute it and/or modify
   6  *  it under the terms of the GNU Lesser General Public License as published
   7  *  by the Free Software Foundation; either version 2.1 of the License, or
   8  *  (at your option) any later version.
   9  *
  10  *  This library is distributed in the hope that it will be useful,
  11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  *  GNU Lesser General Public License for more details.
  14  *
  15  *  You should have received a copy of the GNU Lesser General Public License
  16  *  along with this program; if not, write to the Free Software
  17  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  18  */
  19
  20 package gnu.regexp;
  21 import java.io.InputStream;
  22 import java.io.Reader;
  23 import java.io.Serializable;
  24 import java.util.Locale;
  25 import java.util.PropertyResourceBundle;
  26 import java.util.ResourceBundle;
  27 import java.util.Vector;
  28
  29 class IntPair implements Serializable {
  30   public int first, second;
  31 }
  32
  33 class CharUnit implements Serializable {
  34   public char ch;
  35   public boolean bk;
  36 }
  37
  38 /**
  39  * RE provides the user interface for compiling and matching regular
  40  * expressions.
  41  * <P>
  42  * A regular expression object (class RE) is compiled by constructing it
  43  * from a String, StringBuffer or character array, with optional
  44  * compilation flags (below)
  45  * and an optional syntax specification (see RESyntax; if not specified,
  46  * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
  47  * <P>
  48  * Various methods attempt to match input text against a compiled
  49  * regular expression.  These methods are:
  50  * <LI><code>isMatch</code>: returns true if the input text in its entirety
  51  * matches the regular expression pattern.
  52  * <LI><code>getMatch</code>: returns the first match found in the input text,
  53  * or null if no match is found.
  54  * <LI><code>getAllMatches</code>: returns an array of all non-overlapping
  55  * matches found in the input text.  If no matches are found, the array is
  56  * zero-length.
  57  * <LI><code>substitute</code>: substitute the first occurence of the pattern
  58  * in the input text with a replacement string (which may include
  59  * metacharacters $0-$9, see REMatch.substituteInto).
  60  * <LI><code>substituteAll</code>: same as above, but repeat for each match
  61  * before returning.
  62  * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration object
  63  * that allows iteration over the matches (see REMatchEnumeration for some
  64  * reasons why you may want to do this instead of using <code>getAllMatches</code>.
  65  * <P>
  66  *
  67  * These methods all have similar argument lists.  The input can be a
  68  * String, a character array, a StringBuffer, a Reader or an
  69  * InputStream of some sort.  Note that when using a Reader or
  70  * InputStream, the stream read position cannot be guaranteed after
  71  * attempting a match (this is not a bug, but a consequence of the way
  72  * regular expressions work).  Using an REMatchEnumeration can
  73  * eliminate most positioning problems.
  74  *
  75  * <P>
  76  *
  77  * The optional index argument specifies the offset from the beginning
  78  * of the text at which the search should start (see the descriptions
  79  * of some of the execution flags for how this can affect positional
  80  * pattern operators).  For a Reader or InputStream, this means an
  81  * offset from the current read position, so subsequent calls with the
  82  * same index argument on a Reader or an InputStream will not
  83  * necessarily access the same position on the stream, whereas
  84  * repeated searches at a given index in a fixed string will return
  85  * consistent results.
  86  *
  87  * <P>
  88  * You can optionally affect the execution environment by using a
  89  * combination of execution flags (constants listed below).
  90  *
  91  * <P>
  92  * All operations on a regular expression are performed in a
  93  * thread-safe manner.
  94  *
  95  * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
  96  * @version 1.1.4-dev, to be released
  97  */
  98
  99 public class RE extends REToken {
 100   // This String will be returned by getVersion()
 101   private static final String VERSION = "1.1.4-dev";
 102
 103   // The localized strings are kept in a separate file
 104   // XWT LOCAL: we can't bundle property lists within the xwt executable on all platforms
 105   // private static ResourceBundle messages = PropertyResourceBundle.getBundle("gnu/regexp/MessagesBundle", Locale.getDefault());
 106
 107   // These are, respectively, the first and last tokens in our linked list
 108   // If there is only one token, firstToken == lastToken
 109   private REToken firstToken, lastToken;
 110
 111   // This is the number of subexpressions in this regular expression,
 112   // with a minimum value of zero.  Returned by getNumSubs()
 113   private int numSubs;
 114
 115     /** Minimum length, in characters, of any possible match. */
 116     private int minimumLength;
 117
 118   /**
 119    * Compilation flag. Do  not  differentiate  case.   Subsequent
 120    * searches  using  this  RE will be case insensitive.
 121    */
 122   public static final int REG_ICASE = 2;
 123
 124   /**
 125    * Compilation flag. The match-any-character operator (dot)
 126    * will match a newline character.  When set this overrides the syntax
 127    * bit RE_DOT_NEWLINE (see RESyntax for details).  This is equivalent to
 128    * the "/s" operator in Perl.
 129    */
 130   public static final int REG_DOT_NEWLINE = 4;
 131
 132   /**
 133    * Compilation flag. Use multiline mode.  In this mode, the ^ and $
 134    * anchors will match based on newlines within the input. This is
 135    * equivalent to the "/m" operator in Perl.
 136    */
 137   public static final int REG_MULTILINE = 8;
 138
 139   /**
 140    * Execution flag.
 141    * The match-beginning operator (^) will not match at the beginning
 142    * of the input string. Useful for matching on a substring when you
 143    * know the context of the input is such that position zero of the
 144    * input to the match test is not actually position zero of the text.
 145    * <P>
 146    * This example demonstrates the results of various ways of matching on
 147    * a substring.
 148    * <P>
 149    * <CODE>
 150    * String s = "food bar fool";<BR>
 151    * RE exp = new RE("^foo.");<BR>
 152    * REMatch m0 = exp.getMatch(s);<BR>
 153    * REMatch m1 = exp.getMatch(s.substring(8));<BR>
 154    * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR>
 155    * REMatch m3 = exp.getMatch(s,8);                            <BR>
 156    * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX);         <BR>
 157    * <P>
 158    * // Results:<BR>
 159    * //  m0 = "food"<BR>
 160    * //  m1 = "fool"<BR>
 161    * //  m2 = null<BR>
 162    * //  m3 = null<BR>
 163    * //  m4 = "fool"<BR>
 164    * </CODE>
 165    */
 166   public static final int REG_NOTBOL = 16;
 167
 168   /**
 169    * Execution flag.
 170    * The match-end operator ($) does not match at the end
 171    * of the input string. Useful for matching on substrings.
 172    */
 173   public static final int REG_NOTEOL = 32;
 174
 175   /**
 176    * Execution flag.
 177    * When a match method is invoked that starts matching at a non-zero
 178    * index into the input, treat the input as if it begins at the index
 179    * given.  The effect of this flag is that the engine does not "see"
 180    * any text in the input before the given index.  This is useful so
 181    * that the match-beginning operator (^) matches not at position 0
 182    * in the input string, but at the position the search started at
 183    * (based on the index input given to the getMatch function).  See
 184    * the example under REG_NOTBOL.  It also affects the use of the \&lt;
 185    * and \b operators.
 186    */
 187   public static final int REG_ANCHORINDEX = 64;
 188
 189   /**
 190    * Execution flag.
 191    * The substitute and substituteAll methods will not attempt to
 192    * interpolate occurrences of $1-$9 in the replacement text with
 193    * the corresponding subexpressions.  For example, you may want to
 194    * replace all matches of "one dollar" with "$1".
 195    */
 196   public static final int REG_NO_INTERPOLATE = 128;
 197
 198   /** Returns a string representing the version of the gnu.regexp package. */
 199   public static final String version() {
 200     return VERSION;
 201   }
 202
 203   // Retrieves a message from the ResourceBundle
 204   // XWT LOCAL: we can't bundle property lists within the xwt executable on all platforms
 205   //            for simplicity, just lookup the errors this way.
 206   static final String getLocalizedMessage(String key) {
 207     if(key.equals("error.prefix")) return "At position {0} in regular expression pattern:";
 208     else if(key.equals("repeat.assertion")) return "repeated token is zero-width assertion";
 209     else if(key.equals("repeat.chained")) return "attempted to repeat a token that is already repeated";
 210     else if(key.equals("repeat.no.token")) return "quantifier (?*+{}) without preceding token";
 211     else if(key.equals("repeat.empty.token")) return "repeated token may be empty";
 212     else if(key.equals("unmatched.brace")) return "unmatched brace";
 213     else if(key.equals("unmatched.bracket")) return "unmatched bracket";
 214     else if(key.equals("unmatched.paren")) return "unmatched parenthesis";
 215     else if(key.equals("interval.no.end")) return "expected end of interval";
 216     else if(key.equals("class.no.end")) return "expected end of character class";
 217     else if(key.equals("subexpr.no.end")) return "expected end of subexpression";
 218     else if(key.equals("interval.order")) return "interval minimum is greater than maximum";
 219     else if(key.equals("interval.error")) return "interval is empty or contains illegal chracters";
 220     else if(key.equals("ends.with.backslash")) return "backslash at end of pattern";
 221     else if(key.equals("syntax.final")) return "Syntax has been declared final and cannot be modified";
 222     return "Unknown regexp error";
 223     //return messages.getString(key);
 224   }
 225
 226   /**
 227    * Constructs a regular expression pattern buffer without any compilation
 228    * flags set, and using the default syntax (RESyntax.RE_SYNTAX_PERL5).
 229    *
 230    * @param pattern A regular expression pattern, in the form of a String,
 231    *   StringBuffer or char[].  Other input types will be converted to
 232    *   strings using the toString() method.
 233    * @exception REException The input pattern could not be parsed.
 234    * @exception NullPointerException The pattern was null.
 235    */
 236   public RE(Object pattern) throws REException {
 237     this(pattern,0,RESyntax.RE_SYNTAX_PERL5,0,0);
 238   }
 239
 240   /**
 241    * Constructs a regular expression pattern buffer using the specified
 242    * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5).
 243    *
 244    * @param pattern A regular expression pattern, in the form of a String,
 245    *   StringBuffer, or char[].  Other input types will be converted to
 246    *   strings using the toString() method.
 247    * @param cflags The logical OR of any combination of the compilation flags listed above.
 248    * @exception REException The input pattern could not be parsed.
 249    * @exception NullPointerException The pattern was null.
 250    */
 251   public RE(Object pattern, int cflags) throws REException {
 252     this(pattern,cflags,RESyntax.RE_SYNTAX_PERL5,0,0);
 253   }
 254
 255   /**
 256    * Constructs a regular expression pattern buffer using the specified
 257    * compilation flags and regular expression syntax.
 258    *
 259    * @param pattern A regular expression pattern, in the form of a String,
 260    *   StringBuffer, or char[].  Other input types will be converted to
 261    *   strings using the toString() method.
 262    * @param cflags The logical OR of any combination of the compilation flags listed above.
 263    * @param syntax The type of regular expression syntax to use.
 264    * @exception REException The input pattern could not be parsed.
 265    * @exception NullPointerException The pattern was null.
 266    */
 267   public RE(Object pattern, int cflags, RESyntax syntax) throws REException {
 268     this(pattern,cflags,syntax,0,0);
 269   }
 270
 271   // internal constructor used for alternation
 272   private RE(REToken first, REToken last,int subs, int subIndex, int minLength) {
 273     super(subIndex);
 274     firstToken = first;
 275     lastToken = last;
 276     numSubs = subs;
 277     minimumLength = minLength;
 278     addToken(new RETokenEndSub(subIndex));
 279   }
 280
 281   private RE(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
 282     super(myIndex); // Subexpression index of this token.
 283     initialize(patternObj, cflags, syntax, myIndex, nextSub);
 284   }
 285
 286     // For use by subclasses
 287     protected RE() { super(0); }
 288
 289     // The meat of construction
 290   protected void initialize(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
 291       char[] pattern;
 292     if (patternObj instanceof String) {
 293       pattern = ((String) patternObj).toCharArray();
 294     } else if (patternObj instanceof char[]) {
 295       pattern = (char[]) patternObj;
 296     } else if (patternObj instanceof StringBuffer) {
 297       pattern = new char [((StringBuffer) patternObj).length()];
 298       ((StringBuffer) patternObj).getChars(0,pattern.length,pattern,0);
 299     } else {
 300         pattern = patternObj.toString().toCharArray();
 301     }
 302
 303     int pLength = pattern.length;
 304
 305     numSubs = 0; // Number of subexpressions in this token.
 306     Vector branches = null;
 307
 308     // linked list of tokens (sort of -- some closed loops can exist)
 309     firstToken = lastToken = null;
 310
 311     // Precalculate these so we don't pay for the math every time we
 312     // need to access them.
 313     boolean insens = ((cflags & REG_ICASE) > 0);
 314
 315     // Parse pattern into tokens.  Does anyone know if it's more efficient
 316     // to use char[] than a String.charAt()?  I'm assuming so.
 317
 318     // index tracks the position in the char array
 319     int index = 0;
 320
 321     // this will be the current parse character (pattern[index])
 322     CharUnit unit = new CharUnit();
 323
 324     // This is used for {x,y} calculations
 325     IntPair minMax = new IntPair();
 326
 327     // Buffer a token so we can create a TokenRepeated, etc.
 328     REToken currentToken = null;
 329     char ch;
 330
 331     while (index < pLength) {
 332       // read the next character unit (including backslash escapes)
 333       index = getCharUnit(pattern,index,unit);
 334
 335       // ALTERNATION OPERATOR
 336       //  \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
 337       //  not available if RE_LIMITED_OPS is set
 338
 339       // TODO: the '\n' literal here should be a test against REToken.newline,
 340       // which unfortunately may be more than a single character.
 341       if ( ( (unit.ch == '|' && (syntax.get(RESyntax.RE_NO_BK_VBAR) ^ unit.bk))
 342              || (syntax.get(RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n') && !unit.bk) )
 343            && !syntax.get(RESyntax.RE_LIMITED_OPS)) {
 344         // make everything up to here be a branch. create vector if nec.
 345         addToken(currentToken);
 346         RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength);
 347         minimumLength = 0;
 348         if (branches == null) {
 349             branches = new Vector();
 350         }
 351         branches.addElement(theBranch);
 352         firstToken = lastToken = currentToken = null;
 353       }
 354
 355       // INTERVAL OPERATOR:
 356       //  {x} | {x,} | {x,y}  (RE_INTERVALS && RE_NO_BK_BRACES)
 357       //  \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES)
 358       //
 359       // OPEN QUESTION:
 360       //  what is proper interpretation of '{' at start of string?
 361
 362       else if ((unit.ch == '{') && syntax.get(RESyntax.RE_INTERVALS) && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)) {
 363         int newIndex = getMinMax(pattern,index,minMax,syntax);
 364         if (newIndex > index) {
 365           if (minMax.first > minMax.second)
 366             throw new REException(getLocalizedMessage("interval.order"),REException.REG_BADRPT,newIndex);
 367           if (currentToken == null)
 368             throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,newIndex);
 369           if (currentToken instanceof RETokenRepeated)
 370             throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,newIndex);
 371           if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 372             throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,newIndex);
 373           if ((currentToken.getMinimumLength() == 0) && (minMax.second == Integer.MAX_VALUE))
 374             throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,newIndex);
 375           index = newIndex;
 376           currentToken = setRepeated(currentToken,minMax.first,minMax.second,index);
 377         }
 378         else {
 379           addToken(currentToken);
 380           currentToken = new RETokenChar(subIndex,unit.ch,insens);
 381         }
 382       }
 383
 384       // LIST OPERATOR:
 385       //  [...] | [^...]
 386
 387       else if ((unit.ch == '[') && !unit.bk) {
 388         Vector options = new Vector();
 389         boolean negative = false;
 390         char lastChar = 0;
 391         if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index);
 392
 393         // Check for initial caret, negation
 394         if ((ch = pattern[index]) == '^') {
 395           negative = true;
 396           if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
 397           ch = pattern[index];
 398         }
 399
 400         // Check for leading right bracket literal
 401         if (ch == ']') {
 402           lastChar = ch;
 403           if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
 404         }
 405
 406         while ((ch = pattern[index++]) != ']') {
 407           if ((ch == '-') && (lastChar != 0)) {
 408             if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
 409             if ((ch = pattern[index]) == ']') {
 410               options.addElement(new RETokenChar(subIndex,lastChar,insens));
 411               lastChar = '-';
 412             } else {
 413               options.addElement(new RETokenRange(subIndex,lastChar,ch,insens));
 414               lastChar = 0;
 415               index++;
 416             }
 417           } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
 418             if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
 419             int posixID = -1;
 420             boolean negate = false;
 421             char asciiEsc = 0;
 422             if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
 423               switch (pattern[index]) {
 424               case 'D':
 425                 negate = true;
 426               case 'd':
 427                 posixID = RETokenPOSIX.DIGIT;
 428                 break;
 429               case 'S':
 430                 negate = true;
 431               case 's':
 432                 posixID = RETokenPOSIX.SPACE;
 433                 break;
 434               case 'W':
 435                 negate = true;
 436               case 'w':
 437                 posixID = RETokenPOSIX.ALNUM;
 438                 break;
 439               }
 440             }
 441             else if ("nrt".indexOf(pattern[index]) != -1) {
 442               switch (pattern[index]) {
 443                 case 'n':
 444                   asciiEsc = '\n';
 445                   break;
 446                 case 't':
 447                   asciiEsc = '\t';
 448                   break;
 449                 case 'r':
 450                   asciiEsc = '\r';
 451                   break;
 452               }
 453             }
 454             if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
 455
 456             if (posixID != -1) {
 457               options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate));
 458             } else if (asciiEsc != 0) {
 459               lastChar = asciiEsc;
 460             } else {
 461               lastChar = pattern[index];
 462             }
 463             ++index;
 464           } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) {
 465             StringBuffer posixSet = new StringBuffer();
 466             index = getPosixSet(pattern,index+1,posixSet);
 467             int posixId = RETokenPOSIX.intValue(posixSet.toString());
 468             if (posixId != -1)
 469               options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false));
 470           } else {
 471             if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
 472             lastChar = ch;
 473           }
 474           if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
 475         } // while in list
 476         // Out of list, index is one past ']'
 477
 478         if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
 479
 480         // Create a new RETokenOneOf
 481         addToken(currentToken);
 482         options.trimToSize();
 483         currentToken = new RETokenOneOf(subIndex,options,negative);
 484       }
 485
 486       // SUBEXPRESSIONS
 487       //  (...) | \(...\) depending on RE_NO_BK_PARENS
 488
 489       else if ((unit.ch == '(') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)) {
 490         boolean pure = false;
 491         boolean comment = false;
 492         boolean lookAhead = false;
 493         boolean negativelh = false;
 494         if ((index+1 < pLength) && (pattern[index] == '?')) {
 495           switch (pattern[index+1]) {
 496           case '!':
 497             if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
 498               pure = true;
 499               negativelh = true;
 500               lookAhead = true;
 501               index += 2;
 502             }
 503             break;
 504           case '=':
 505             if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
 506               pure = true;
 507               lookAhead = true;
 508               index += 2;
 509             }
 510             break;
 511           case ':':
 512             if (syntax.get(RESyntax.RE_PURE_GROUPING)) {
 513               pure = true;
 514               index += 2;
 515             }
 516             break;
 517           case '#':
 518             if (syntax.get(RESyntax.RE_COMMENTS)) {
 519               comment = true;
 520             }
 521             break;
 522           default:
 523             throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index);
 524           }
 525         }
 526
 527         if (index >= pLength) {
 528             throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index);
 529         }
 530
 531         // find end of subexpression
 532         int endIndex = index;
 533         int nextIndex = index;
 534         int nested = 0;
 535
 536         while ( ((nextIndex = getCharUnit(pattern,endIndex,unit)) > 0)
 537                 && !(nested == 0 && (unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)) )
 538           if ((endIndex = nextIndex) >= pLength)
 539             throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex);
 540           else if (unit.ch == '(' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))
 541             nested++;
 542           else if (unit.ch == ')' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))
 543             nested--;
 544
 545         // endIndex is now position at a ')','\)'
 546         // nextIndex is end of string or position after ')' or '\)'
 547
 548         if (comment) index = nextIndex;
 549         else { // not a comment
 550           // create RE subexpression as token.
 551           addToken(currentToken);
 552           if (!pure) {
 553             numSubs++;
 554           }
 555
 556           int useIndex = (pure || lookAhead) ? 0 : nextSub + numSubs;
 557           currentToken = new RE(String.valueOf(pattern,index,endIndex-index).toCharArray(),cflags,syntax,useIndex,nextSub + numSubs);
 558           numSubs += ((RE) currentToken).getNumSubs();
 559
 560           if (lookAhead) {
 561               currentToken = new RETokenLookAhead(currentToken,negativelh);
 562           }
 563
 564           index = nextIndex;
 565         } // not a comment
 566       } // subexpression
 567
 568       // UNMATCHED RIGHT PAREN
 569       // ) or \) throw exception if
 570       // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
 571       else if (!syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) && ((unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))) {
 572         throw new REException(getLocalizedMessage("unmatched.paren"),REException.REG_EPAREN,index);
 573       }
 574
 575       // START OF LINE OPERATOR
 576       //  ^
 577
 578       else if ((unit.ch == '^') && !unit.bk) {
 579         addToken(currentToken);
 580         currentToken = null;
 581         addToken(new RETokenStart(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null));
 582       }
 583
 584       // END OF LINE OPERATOR
 585       //  $
 586
 587       else if ((unit.ch == '$') && !unit.bk) {
 588         addToken(currentToken);
 589         currentToken = null;
 590         addToken(new RETokenEnd(subIndex,((cflags & REG_MULTILINE) > 0) ? syntax.getLineSeparator() : null));
 591       }
 592
 593       // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null)
 594       //  .
 595
 596       else if ((unit.ch == '.') && !unit.bk) {
 597         addToken(currentToken);
 598         currentToken = new RETokenAny(subIndex,syntax.get(RESyntax.RE_DOT_NEWLINE) || ((cflags & REG_DOT_NEWLINE) > 0),syntax.get(RESyntax.RE_DOT_NOT_NULL));
 599       }
 600
 601       // ZERO-OR-MORE REPEAT OPERATOR
 602       //  *
 603
 604       else if ((unit.ch == '*') && !unit.bk) {
 605         if (currentToken == null)
 606           throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
 607         if (currentToken instanceof RETokenRepeated)
 608           throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
 609         if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 610           throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
 611         if (currentToken.getMinimumLength() == 0)
 612           throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,index);
 613         currentToken = setRepeated(currentToken,0,Integer.MAX_VALUE,index);
 614       }
 615
 616       // ONE-OR-MORE REPEAT OPERATOR
 617       //  + | \+ depending on RE_BK_PLUS_QM
 618       //  not available if RE_LIMITED_OPS is set
 619
 620       else if ((unit.ch == '+') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ unit.bk)) {
 621         if (currentToken == null)
 622           throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
 623         if (currentToken instanceof RETokenRepeated)
 624           throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
 625         if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 626           throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
 627         if (currentToken.getMinimumLength() == 0)
 628           throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,index);
 629         currentToken = setRepeated(currentToken,1,Integer.MAX_VALUE,index);
 630       }
 631
 632       // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
 633       //  ? | \? depending on RE_BK_PLUS_QM
 634       //  not available if RE_LIMITED_OPS is set
 635       //  stingy matching if RE_STINGY_OPS is set and it follows a quantifier
 636
 637       else if ((unit.ch == '?') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ unit.bk)) {
 638         if (currentToken == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
 639
 640         // Check for stingy matching on RETokenRepeated
 641         if (currentToken instanceof RETokenRepeated) {
 642           if (syntax.get(RESyntax.RE_STINGY_OPS) && !((RETokenRepeated)currentToken).isStingy())
 643             ((RETokenRepeated)currentToken).makeStingy();
 644           else
 645             throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
 646         }
 647         else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 648           throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
 649         else
 650           currentToken = setRepeated(currentToken,0,1,index);
 651       }
 652
 653       // BACKREFERENCE OPERATOR
 654       //  \1 \2 ... \9
 655       // not available if RE_NO_BK_REFS is set
 656
 657       else if (unit.bk && Character.isDigit(unit.ch) && !syntax.get(RESyntax.RE_NO_BK_REFS)) {
 658         addToken(currentToken);
 659         currentToken = new RETokenBackRef(subIndex,Character.digit(unit.ch,10),insens);
 660       }
 661
 662       // START OF STRING OPERATOR
 663       //  \A if RE_STRING_ANCHORS is set
 664
 665       else if (unit.bk && (unit.ch == 'A') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 666         addToken(currentToken);
 667         currentToken = new RETokenStart(subIndex,null);
 668       }
 669
 670       // WORD BREAK OPERATOR
 671       //  \b if ????
 672
 673       else if (unit.bk && (unit.ch == 'b') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 674           addToken(currentToken);
 675           currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, false);
 676       }
 677
 678       // WORD BEGIN OPERATOR
 679       //  \< if ????
 680       else if (unit.bk && (unit.ch == '<')) {
 681           addToken(currentToken);
 682           currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN, false);
 683       }
 684
 685       // WORD END OPERATOR
 686       //  \> if ????
 687       else if (unit.bk && (unit.ch == '>')) {
 688           addToken(currentToken);
 689           currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.END, false);
 690       }
 691
 692       // NON-WORD BREAK OPERATOR
 693       // \B if ????
 694
 695       else if (unit.bk && (unit.ch == 'B') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 696           addToken(currentToken);
 697           currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, true);
 698       }
 699
 700
 701       // DIGIT OPERATOR
 702       //  \d if RE_CHAR_CLASS_ESCAPES is set
 703
 704       else if (unit.bk && (unit.ch == 'd') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 705         addToken(currentToken);
 706         currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,false);
 707       }
 708
 709       // NON-DIGIT OPERATOR
 710       //  \D
 711
 712         else if (unit.bk && (unit.ch == 'D') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 713           addToken(currentToken);
 714           currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,true);
 715         }
 716
 717         // NEWLINE ESCAPE
 718         //  \n
 719
 720         else if (unit.bk && (unit.ch == 'n')) {
 721           addToken(currentToken);
 722           currentToken = new RETokenChar(subIndex,'\n',false);
 723         }
 724
 725         // RETURN ESCAPE
 726         //  \r
 727
 728         else if (unit.bk && (unit.ch == 'r')) {
 729           addToken(currentToken);
 730           currentToken = new RETokenChar(subIndex,'\r',false);
 731         }
 732
 733         // WHITESPACE OPERATOR
 734         //  \s if RE_CHAR_CLASS_ESCAPES is set
 735
 736         else if (unit.bk && (unit.ch == 's') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 737           addToken(currentToken);
 738           currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,false);
 739         }
 740
 741         // NON-WHITESPACE OPERATOR
 742         //  \S
 743
 744         else if (unit.bk && (unit.ch == 'S') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 745           addToken(currentToken);
 746           currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,true);
 747         }
 748
 749         // TAB ESCAPE
 750         //  \t
 751
 752         else if (unit.bk && (unit.ch == 't')) {
 753           addToken(currentToken);
 754           currentToken = new RETokenChar(subIndex,'\t',false);
 755         }
 756
 757         // ALPHANUMERIC OPERATOR
 758         //  \w
 759
 760         else if (unit.bk && (unit.ch == 'w') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 761           addToken(currentToken);
 762           currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,false);
 763         }
 764
 765         // NON-ALPHANUMERIC OPERATOR
 766         //  \W
 767
 768         else if (unit.bk && (unit.ch == 'W') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 769           addToken(currentToken);
 770           currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,true);
 771         }
 772
 773         // END OF STRING OPERATOR
 774         //  \Z
 775
 776         else if (unit.bk && (unit.ch == 'Z') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 777           addToken(currentToken);
 778           currentToken = new RETokenEnd(subIndex,null);
 779         }
 780
 781         // NON-SPECIAL CHARACTER (or escape to make literal)
 782         //  c | \* for example
 783
 784         else {  // not a special character
 785           addToken(currentToken);
 786           currentToken = new RETokenChar(subIndex,unit.ch,insens);
 787         }
 788       } // end while
 789
 790     // Add final buffered token and an EndSub marker
 791     addToken(currentToken);
 792
 793     if (branches != null) {
 794         branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength));
 795         branches.trimToSize(); // compact the Vector
 796         minimumLength = 0;
 797         firstToken = lastToken = null;
 798         addToken(new RETokenOneOf(subIndex,branches,false));
 799     }
 800     else addToken(new RETokenEndSub(subIndex));
 801
 802   }
 803
 804   private static int getCharUnit(char[] input, int index, CharUnit unit) throws REException {
 805     unit.ch = input[index++];
 806     if (unit.bk = (unit.ch == '\\'))
 807       if (index < input.length)
 808         unit.ch = input[index++];
 809       else throw new REException(getLocalizedMessage("ends.with.backslash"),REException.REG_ESCAPE,index);
 810     return index;
 811   }
 812
 813   /**
 814    * Checks if the regular expression matches the input in its entirety.
 815    *
 816    * @param input The input text.
 817    */
 818   public boolean isMatch(Object input) {
 819     return isMatch(input,0,0);
 820   }
 821
 822   /**
 823    * Checks if the input string, starting from index, is an exact match of
 824    * this regular expression.
 825    *
 826    * @param input The input text.
 827    * @param index The offset index at which the search should be begin.
 828    */
 829   public boolean isMatch(Object input,int index) {
 830     return isMatch(input,index,0);
 831   }
 832
 833
 834   /**
 835    * Checks if the input, starting from index and using the specified
 836    * execution flags, is an exact match of this regular expression.
 837    *
 838    * @param input The input text.
 839    * @param index The offset index at which the search should be begin.
 840    * @param eflags The logical OR of any execution flags above.
 841    */
 842   public boolean isMatch(Object input,int index,int eflags) {
 843     return isMatchImpl(makeCharIndexed(input,index),index,eflags);
 844   }
 845
 846   private boolean isMatchImpl(CharIndexed input, int index, int eflags) {
 847     if (firstToken == null)  // Trivial case
 848       return (input.charAt(0) == CharIndexed.OUT_OF_BOUNDS);
 849     REMatch m = new REMatch(numSubs, index, eflags);
 850     if (firstToken.match(input, m)) {
 851         while (m != null) {
 852             if (input.charAt(m.index) == CharIndexed.OUT_OF_BOUNDS) {
 853                 return true;
 854             }
 855             m = m.next;
 856         }
 857     }
 858     return false;
 859   }
 860
 861   /**
 862    * Returns the maximum number of subexpressions in this regular expression.
 863    * If the expression contains branches, the value returned will be the
 864    * maximum subexpressions in any of the branches.
 865    */
 866   public int getNumSubs() {
 867     return numSubs;
 868   }
 869
 870   // Overrides REToken.setUncle
 871   void setUncle(REToken uncle) {
 872       if (lastToken != null) {
 873           lastToken.setUncle(uncle);
 874       } else super.setUncle(uncle); // to deal with empty subexpressions
 875   }
 876
 877   // Overrides REToken.chain
 878
 879   boolean chain(REToken next) {
 880     super.chain(next);
 881     setUncle(next);
 882     return true;
 883   }
 884
 885   /**
 886    * Returns the minimum number of characters that could possibly
 887    * constitute a match of this regular expression.
 888    */
 889   public int getMinimumLength() {
 890       return minimumLength;
 891   }
 892
 893   /**
 894    * Returns an array of all matches found in the input.
 895    *
 896    * If the regular expression allows the empty string to match, it will
 897    * substitute matches at all positions except the end of the input.
 898    *
 899    * @param input The input text.
 900    * @return a non-null (but possibly zero-length) array of matches
 901    */
 902   public REMatch[] getAllMatches(Object input) {
 903     return getAllMatches(input,0,0);
 904   }
 905
 906   /**
 907    * Returns an array of all matches found in the input,
 908    * beginning at the specified index position.
 909    *
 910    * If the regular expression allows the empty string to match, it will
 911    * substitute matches at all positions except the end of the input.
 912    *
 913    * @param input The input text.
 914    * @param index The offset index at which the search should be begin.
 915    * @return a non-null (but possibly zero-length) array of matches
 916    */
 917   public REMatch[] getAllMatches(Object input, int index) {
 918     return getAllMatches(input,index,0);
 919   }
 920
 921   /**
 922    * Returns an array of all matches found in the input string,
 923    * beginning at the specified index position and using the specified
 924    * execution flags.
 925    *
 926    * If the regular expression allows the empty string to match, it will
 927    * substitute matches at all positions except the end of the input.
 928    *
 929    * @param input The input text.
 930    * @param index The offset index at which the search should be begin.
 931    * @param eflags The logical OR of any execution flags above.
 932    * @return a non-null (but possibly zero-length) array of matches
 933    */
 934   public REMatch[] getAllMatches(Object input, int index, int eflags) {
 935     return getAllMatchesImpl(makeCharIndexed(input,index),index,eflags);
 936   }
 937
 938   // this has been changed since 1.03 to be non-overlapping matches
 939   private REMatch[] getAllMatchesImpl(CharIndexed input, int index, int eflags) {
 940     Vector all = new Vector();
 941     REMatch m = null;
 942     while ((m = getMatchImpl(input,index,eflags,null)) != null) {
 943       all.addElement(m);
 944       index = m.getEndIndex();
 945       if (m.end[0] == 0) {   // handle pathological case of zero-length match
 946         index++;
 947         input.move(1);
 948       } else {
 949         input.move(m.end[0]);
 950       }
 951       if (!input.isValid()) break;
 952     }
 953     REMatch[] mset = new REMatch[all.size()];
 954     all.copyInto(mset);
 955     return mset;
 956   }
 957
 958     /* Implements abstract method REToken.match() */
 959     boolean match(CharIndexed input, REMatch mymatch) {
 960         if (firstToken == null) return next(input, mymatch);
 961
 962         // Note the start of this subexpression
 963         mymatch.start[subIndex] = mymatch.index;
 964
 965         return firstToken.match(input, mymatch);
 966     }
 967
 968   /**
 969    * Returns the first match found in the input.  If no match is found,
 970    * null is returned.
 971    *
 972    * @param input The input text.
 973    * @return An REMatch instance referencing the match, or null if none.
 974    */
 975   public REMatch getMatch(Object input) {
 976     return getMatch(input,0,0);
 977   }
 978
 979   /**
 980    * Returns the first match found in the input, beginning
 981    * the search at the specified index.  If no match is found,
 982    * returns null.
 983    *
 984    * @param input The input text.
 985    * @param index The offset within the text to begin looking for a match.
 986    * @return An REMatch instance referencing the match, or null if none.
 987    */
 988   public REMatch getMatch(Object input, int index) {
 989     return getMatch(input,index,0);
 990   }
 991
 992   /**
 993    * Returns the first match found in the input, beginning
 994    * the search at the specified index, and using the specified
 995    * execution flags.  If no match is found, returns null.
 996    *
 997    * @param input The input text.
 998    * @param index The offset index at which the search should be begin.
 999    * @param eflags The logical OR of any execution flags above.
1000    * @return An REMatch instance referencing the match, or null if none.
1001    */
1002   public REMatch getMatch(Object input, int index, int eflags) {
1003     return getMatch(input,index,eflags,null);
1004   }
1005
1006   /**
1007    * Returns the first match found in the input, beginning the search
1008    * at the specified index, and using the specified execution flags.
1009    * If no match is found, returns null.  If a StringBuffer is
1010    * provided and is non-null, the contents of the input text from the
1011    * index to the beginning of the match (or to the end of the input,
1012    * if there is no match) are appended to the StringBuffer.
1013    *
1014    * @param input The input text.
1015    * @param index The offset index at which the search should be begin.
1016    * @param eflags The logical OR of any execution flags above.
1017    * @param buffer The StringBuffer to save pre-match text in.
1018    * @return An REMatch instance referencing the match, or null if none.  */
1019   public REMatch getMatch(Object input, int index, int eflags, StringBuffer buffer) {
1020     return getMatchImpl(makeCharIndexed(input,index),index,eflags,buffer);
1021   }
1022
1023   REMatch getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer buffer) {
1024       // Create a new REMatch to hold results
1025       REMatch mymatch = new REMatch(numSubs, anchor, eflags);
1026       do {
1027           // Optimization: check if anchor + minimumLength > length
1028           if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) {
1029               if (match(input, mymatch)) {
1030                   // Find longest match of them all to observe leftmost longest
1031                   REMatch longest = mymatch;
1032                   while ((mymatch = mymatch.next) != null) {
1033                       if (mymatch.index > longest.index) {
1034                           longest = mymatch;
1035                       }
1036                   }
1037
1038                   longest.end[0] = longest.index;
1039                   longest.finish(input);
1040                   return longest;
1041               }
1042           }
1043           mymatch.clear(++anchor);
1044           // Append character to buffer if needed
1045           if (buffer != null && input.charAt(0) != CharIndexed.OUT_OF_BOUNDS) {
1046               buffer.append(input.charAt(0));
1047           }
1048       } while (input.move(1));
1049
1050       return null;
1051   }
1052
1053   /**
1054    * Returns an REMatchEnumeration that can be used to iterate over the
1055    * matches found in the input text.
1056    *
1057    * @param input The input text.
1058    * @return A non-null REMatchEnumeration instance.
1059    */
1060   public REMatchEnumeration getMatchEnumeration(Object input) {
1061     return getMatchEnumeration(input,0,0);
1062   }
1063
1064
1065   /**
1066    * Returns an REMatchEnumeration that can be used to iterate over the
1067    * matches found in the input text.
1068    *
1069    * @param input The input text.
1070    * @param index The offset index at which the search should be begin.
1071    * @return A non-null REMatchEnumeration instance, with its input cursor
1072    *  set to the index position specified.
1073    */
1074   public REMatchEnumeration getMatchEnumeration(Object input, int index) {
1075     return getMatchEnumeration(input,index,0);
1076   }
1077
1078   /**
1079    * Returns an REMatchEnumeration that can be used to iterate over the
1080    * matches found in the input text.
1081    *
1082    * @param input The input text.
1083    * @param index The offset index at which the search should be begin.
1084    * @param eflags The logical OR of any execution flags above.
1085    * @return A non-null REMatchEnumeration instance, with its input cursor
1086    *  set to the index position specified.
1087    */
1088   public REMatchEnumeration getMatchEnumeration(Object input, int index, int eflags) {
1089     return new REMatchEnumeration(this,makeCharIndexed(input,index),index,eflags);
1090   }
1091
1092
1093   /**
1094    * Substitutes the replacement text for the first match found in the input.
1095    *
1096    * @param input The input text.
1097    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1098    * @return A String interpolating the substituted text.
1099    * @see REMatch#substituteInto
1100    */
1101   public String substitute(Object input,String replace) {
1102     return substitute(input,replace,0,0);
1103   }
1104
1105   /**
1106    * Substitutes the replacement text for the first match found in the input
1107    * beginning at the specified index position.  Specifying an index
1108    * effectively causes the regular expression engine to throw away the
1109    * specified number of characters.
1110    *
1111    * @param input The input text.
1112    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1113    * @param index The offset index at which the search should be begin.
1114    * @return A String containing the substring of the input, starting
1115    *   at the index position, and interpolating the substituted text.
1116    * @see REMatch#substituteInto
1117    */
1118   public String substitute(Object input,String replace,int index) {
1119     return substitute(input,replace,index,0);
1120   }
1121
1122   /**
1123    * Substitutes the replacement text for the first match found in the input
1124    * string, beginning at the specified index position and using the
1125    * specified execution flags.
1126    *
1127    * @param input The input text.
1128    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1129    * @param index The offset index at which the search should be begin.
1130    * @param eflags The logical OR of any execution flags above.
1131    * @return A String containing the substring of the input, starting
1132    *   at the index position, and interpolating the substituted text.
1133    * @see REMatch#substituteInto
1134    */
1135   public String substitute(Object input,String replace,int index,int eflags) {
1136     return substituteImpl(makeCharIndexed(input,index),replace,index,eflags);
1137   }
1138
1139   private String substituteImpl(CharIndexed input,String replace,int index,int eflags) {
1140     StringBuffer buffer = new StringBuffer();
1141     REMatch m = getMatchImpl(input,index,eflags,buffer);
1142     if (m==null) return buffer.toString();
1143     buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ?
1144                    replace : m.substituteInto(replace) );
1145     if (input.move(m.end[0])) {
1146       do {
1147         buffer.append(input.charAt(0));
1148       } while (input.move(1));
1149     }
1150     return buffer.toString();
1151   }
1152
1153   /**
1154    * Substitutes the replacement text for each non-overlapping match found
1155    * in the input text.
1156    *
1157    * @param input The input text.
1158    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1159    * @return A String interpolating the substituted text.
1160    * @see REMatch#substituteInto
1161    */
1162   public String substituteAll(Object input,String replace) {
1163     return substituteAll(input,replace,0,0);
1164   }
1165
1166   /**
1167    * Substitutes the replacement text for each non-overlapping match found
1168    * in the input text, starting at the specified index.
1169    *
1170    * If the regular expression allows the empty string to match, it will
1171    * substitute matches at all positions except the end of the input.
1172    *
1173    * @param input The input text.
1174    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1175    * @param index The offset index at which the search should be begin.
1176    * @return A String containing the substring of the input, starting
1177    *   at the index position, and interpolating the substituted text.
1178    * @see REMatch#substituteInto
1179    */
1180   public String substituteAll(Object input,String replace,int index) {
1181     return substituteAll(input,replace,index,0);
1182   }
1183
1184   /**
1185    * Substitutes the replacement text for each non-overlapping match found
1186    * in the input text, starting at the specified index and using the
1187    * specified execution flags.
1188    *
1189    * @param input The input text.
1190    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1191    * @param index The offset index at which the search should be begin.
1192    * @param eflags The logical OR of any execution flags above.
1193    * @return A String containing the substring of the input, starting
1194    *   at the index position, and interpolating the substituted text.
1195    * @see REMatch#substituteInto
1196    */
1197   public String substituteAll(Object input,String replace,int index,int eflags) {
1198     return substituteAllImpl(makeCharIndexed(input,index),replace,index,eflags);
1199   }
1200
1201   private String substituteAllImpl(CharIndexed input,String replace,int index,int eflags) {
1202     StringBuffer buffer = new StringBuffer();
1203     REMatch m;
1204     while ((m = getMatchImpl(input,index,eflags,buffer)) != null) {
1205         buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ?
1206                        replace : m.substituteInto(replace) );
1207       index = m.getEndIndex();
1208       if (m.end[0] == 0) {
1209         char ch = input.charAt(0);
1210         if (ch != CharIndexed.OUT_OF_BOUNDS)
1211             buffer.append(ch);
1212         input.move(1);
1213       } else {
1214           input.move(m.end[0]);
1215       }
1216
1217       if (!input.isValid()) break;
1218     }
1219     return buffer.toString();
1220   }
1221
1222   /* Helper function for constructor */
1223   private void addToken(REToken next) {
1224     if (next == null) return;
1225     minimumLength += next.getMinimumLength();
1226     if (firstToken == null) {
1227         lastToken = firstToken = next;
1228     } else {
1229       // if chain returns false, it "rejected" the token due to
1230       // an optimization, and next was combined with lastToken
1231       if (lastToken.chain(next)) {
1232           lastToken = next;
1233       }
1234     }
1235   }
1236
1237   private static REToken setRepeated(REToken current, int min, int max, int index) throws REException {
1238     if (current == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
1239     return new RETokenRepeated(current.subIndex,current,min,max);
1240   }
1241
1242   private static int getPosixSet(char[] pattern,int index,StringBuffer buf) {
1243     // Precondition: pattern[index-1] == ':'
1244     // we will return pos of closing ']'.
1245     int i;
1246     for (i=index; i<(pattern.length-1); i++) {
1247       if ((pattern[i] == ':') && (pattern[i+1] == ']'))
1248         return i+2;
1249       buf.append(pattern[i]);
1250     }
1251     return index; // didn't match up
1252   }
1253
1254   private int getMinMax(char[] input,int index,IntPair minMax,RESyntax syntax) throws REException {
1255     // Precondition: input[index-1] == '{', minMax != null
1256
1257     boolean mustMatch = !syntax.get(RESyntax.RE_NO_BK_BRACES);
1258     int startIndex = index;
1259     if (index == input.length) {
1260       if (mustMatch)
1261         throw new REException(getLocalizedMessage("unmatched.brace"),REException.REG_EBRACE,index);
1262       else
1263         return startIndex;
1264     }
1265
1266     int min,max=0;
1267     CharUnit unit = new CharUnit();
1268     StringBuffer buf = new StringBuffer();
1269
1270     // Read string of digits
1271     do {
1272       index = getCharUnit(input,index,unit);
1273       if (Character.isDigit(unit.ch))
1274         buf.append(unit.ch);
1275     } while ((index != input.length) && Character.isDigit(unit.ch));
1276
1277     // Check for {} tomfoolery
1278     if (buf.length() == 0) {
1279       if (mustMatch)
1280         throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
1281       else
1282         return startIndex;
1283     }
1284
1285     min = Integer.parseInt(buf.toString());
1286
1287     if ((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk))
1288       max = min;
1289     else if (index == input.length)
1290       if (mustMatch)
1291         throw new REException(getLocalizedMessage("interval.no.end"),REException.REG_EBRACE,index);
1292       else
1293         return startIndex;
1294     else if ((unit.ch == ',') && !unit.bk) {
1295       buf = new StringBuffer();
1296       // Read string of digits
1297       while (((index = getCharUnit(input,index,unit)) != input.length) && Character.isDigit(unit.ch))
1298         buf.append(unit.ch);
1299
1300       if (!((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)))
1301         if (mustMatch)
1302           throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
1303         else
1304           return startIndex;
1305
1306       // This is the case of {x,}
1307       if (buf.length() == 0) max = Integer.MAX_VALUE;
1308       else max = Integer.parseInt(buf.toString());
1309     } else
1310       if (mustMatch)
1311         throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
1312       else
1313         return startIndex;
1314
1315     // We know min and max now, and they are valid.
1316
1317     minMax.first = min;
1318     minMax.second = max;
1319
1320     // return the index following the '}'
1321     return index;
1322   }
1323
1324    /**
1325     * Return a human readable form of the compiled regular expression,
1326     * useful for debugging.
1327     */
1328    public String toString() {
1329      StringBuffer sb = new StringBuffer();
1330      dump(sb);
1331      return sb.toString();
1332    }
1333
1334   void dump(StringBuffer os) {
1335     os.append('(');
1336     if (subIndex == 0)
1337       os.append("?:");
1338     if (firstToken != null)
1339       firstToken.dumpAll(os);
1340     os.append(')');
1341   }
1342
1343   // Cast input appropriately or throw exception
1344   private static CharIndexed makeCharIndexed(Object input, int index) {
1345       // We could let a String fall through to final input, but since
1346       // it's the most likely input type, we check it first.
1347     if (input instanceof String)
1348       return new CharIndexedString((String) input,index);
1349     else if (input instanceof char[])
1350       return new CharIndexedCharArray((char[]) input,index);
1351     else if (input instanceof StringBuffer)
1352       return new CharIndexedStringBuffer((StringBuffer) input,index);
1353     else if (input instanceof InputStream)
1354       return new CharIndexedInputStream((InputStream) input,index);
1355     else if (input instanceof Reader)
1356         return new CharIndexedReader((Reader) input, index);
1357     else if (input instanceof CharIndexed)
1358         return (CharIndexed) input; // do we lose index info?
1359     else
1360         return new CharIndexedString(input.toString(), index);
1361   }
1362 }