2 * gnu/regexp/RESyntax.java
3 * Copyright (C) 1998-2001 Wes Biggs
5 * This library is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU Lesser General Public License as published
7 * by the Free Software Foundation; either version 2.1 of the License, or
8 * (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 import java.io.Serializable;
22 import java.util.BitSet;
25 * An RESyntax specifies the way a regular expression will be compiled.
26 * This class provides a number of predefined useful constants for
27 * emulating popular regular expression syntaxes. Additionally the
28 * user may construct his or her own syntax, using any combination of the
29 * syntax bit constants. The syntax is an optional argument to any of the
30 * matching methods on class RE.
32 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
35 public final class RESyntax implements Serializable {
36 static final String DEFAULT_LINE_SEPARATOR = System.getProperty("line.separator");
38 private static final String SYNTAX_IS_FINAL = RE.getLocalizedMessage("syntax.final");
42 // true for the predefined constant syntaxes, which reject any further modification
43 private boolean isFinal = false;
45 private String lineSeparator = DEFAULT_LINE_SEPARATOR;
47 // Values for constants are bit indexes
50 * Syntax bit. Backslash is an escape character in lists.
52 public static final int RE_BACKSLASH_ESCAPE_IN_LISTS = 0;
55 * Syntax bit. Use \? instead of ? and \+ instead of +.
57 public static final int RE_BK_PLUS_QM = 1;
60 * Syntax bit. POSIX character classes ([:...:]) in lists are allowed.
62 public static final int RE_CHAR_CLASSES = 2;
65 * Syntax bit. ^ and $ are special everywhere.
66 * <B>Not implemented.</B>
68 public static final int RE_CONTEXT_INDEP_ANCHORS = 3;
71 * Syntax bit. Repetition operators are only special in valid positions.
72 * <B>Not implemented.</B>
74 public static final int RE_CONTEXT_INDEP_OPS = 4;
77 * Syntax bit. Repetition and alternation operators are invalid
78 * at start and end of pattern and other places.
79 * <B>Not implemented</B>.
81 public static final int RE_CONTEXT_INVALID_OPS = 5;
84 * Syntax bit. Match-any-character operator (.) matches a newline.
86 public static final int RE_DOT_NEWLINE = 6;
89 * Syntax bit. Match-any-character operator (.) does not match a null.
91 public static final int RE_DOT_NOT_NULL = 7;
94 * Syntax bit. Intervals ({x}, {x,}, {x,y}) are allowed.
96 public static final int RE_INTERVALS = 8;
99 * Syntax bit. No alternation (|), match one-or-more (+), or
100 * match zero-or-one (?) operators.
102 public static final int RE_LIMITED_OPS = 9;
105 * Syntax bit. Newline is an alternation operator.
107 public static final int RE_NEWLINE_ALT = 10; // impl.
110 * Syntax bit. Intervals use { } instead of \{ \}
112 public static final int RE_NO_BK_BRACES = 11;
115 * Syntax bit. Grouping uses ( ) instead of \( \).
117 public static final int RE_NO_BK_PARENS = 12;
120 * Syntax bit. Backreferences not allowed.
122 public static final int RE_NO_BK_REFS = 13;
125 * Syntax bit. Alternation uses | instead of \|
127 public static final int RE_NO_BK_VBAR = 14;
130 * Syntax bit. <B>Not implemented</B>.
132 public static final int RE_NO_EMPTY_RANGES = 15;
135 * Syntax bit. An unmatched right parenthesis (')' or '\)', depending
136 * on RE_NO_BK_PARENS) will throw an exception when compiling.
138 public static final int RE_UNMATCHED_RIGHT_PAREN_ORD = 16;
141 * Syntax bit. <B>Not implemented.</B>
143 public static final int RE_HAT_LISTS_NOT_NEWLINE = 17;
146 * Syntax bit. Stingy matching is allowed (+?, *?, ??, {x,y}?).
148 public static final int RE_STINGY_OPS = 18;
151 * Syntax bit. Allow character class escapes (\d, \D, \s, \S, \w, \W).
153 public static final int RE_CHAR_CLASS_ESCAPES = 19;
156 * Syntax bit. Allow use of (?:xxx) grouping (subexpression is not saved).
158 public static final int RE_PURE_GROUPING = 20;
161 * Syntax bit. Allow use of (?=xxx) and (?!xxx), which apply the subexpression
162 * to the text following the current position without consuming that text.
164 public static final int RE_LOOKAHEAD = 21;
167 * Syntax bit. Allow beginning- and end-of-string anchors (\A, \Z).
169 public static final int RE_STRING_ANCHORS = 22;
172 * Syntax bit. Allow embedded comments, (?#comment), as in Perl5.
174 public static final int RE_COMMENTS = 23;
177 * Syntax bit. Allow character class escapes within lists, as in Perl5.
179 public static final int RE_CHAR_CLASS_ESC_IN_LISTS = 24;
181 private static final int BIT_TOTAL = 25;
185 * Emulates regular expression support in the awk utility.
187 public static final RESyntax RE_SYNTAX_AWK;
191 * Emulates regular expression support in the ed utility.
193 public static final RESyntax RE_SYNTAX_ED;
197 * Emulates regular expression support in the egrep utility.
199 public static final RESyntax RE_SYNTAX_EGREP;
203 * Emulates regular expression support in the GNU Emacs editor.
205 public static final RESyntax RE_SYNTAX_EMACS;
209 * Emulates regular expression support in the grep utility.
211 public static final RESyntax RE_SYNTAX_GREP;
215 * Emulates regular expression support in the POSIX awk specification.
217 public static final RESyntax RE_SYNTAX_POSIX_AWK;
221 * Emulates POSIX basic regular expression support.
223 public static final RESyntax RE_SYNTAX_POSIX_BASIC;
227 * Emulates regular expression support in the POSIX egrep specification.
229 public static final RESyntax RE_SYNTAX_POSIX_EGREP;
233 * Emulates POSIX extended regular expression support.
235 public static final RESyntax RE_SYNTAX_POSIX_EXTENDED;
239 * Emulates POSIX basic minimal regular expressions.
241 public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_BASIC;
245 * Emulates POSIX extended minimal regular expressions.
247 public static final RESyntax RE_SYNTAX_POSIX_MINIMAL_EXTENDED;
251 * Emulates regular expression support in the sed utility.
253 public static final RESyntax RE_SYNTAX_SED;
257 * Emulates regular expression support in Larry Wall's perl, version 4.
259 public static final RESyntax RE_SYNTAX_PERL4;
263 * Emulates regular expression support in Larry Wall's perl, version 4,
264 * using single line mode (/s modifier).
266 public static final RESyntax RE_SYNTAX_PERL4_S; // single line mode (/s)
270 * Emulates regular expression support in Larry Wall's perl, version 5.
272 public static final RESyntax RE_SYNTAX_PERL5;
276 * Emulates regular expression support in Larry Wall's perl, version 5,
277 * using single line mode (/s modifier).
279 public static final RESyntax RE_SYNTAX_PERL5_S;
284 RE_SYNTAX_EMACS = new RESyntax().makeFinal();
286 RESyntax RE_SYNTAX_POSIX_COMMON = new RESyntax()
287 .set(RE_CHAR_CLASSES)
289 .set(RE_DOT_NOT_NULL)
291 .set(RE_NO_EMPTY_RANGES)
294 RE_SYNTAX_POSIX_BASIC = new RESyntax(RE_SYNTAX_POSIX_COMMON)
298 RE_SYNTAX_POSIX_EXTENDED = new RESyntax(RE_SYNTAX_POSIX_COMMON)
299 .set(RE_CONTEXT_INDEP_ANCHORS)
300 .set(RE_CONTEXT_INDEP_OPS)
301 .set(RE_NO_BK_BRACES)
302 .set(RE_NO_BK_PARENS)
304 .set(RE_UNMATCHED_RIGHT_PAREN_ORD)
307 RE_SYNTAX_AWK = new RESyntax()
308 .set(RE_BACKSLASH_ESCAPE_IN_LISTS)
309 .set(RE_DOT_NOT_NULL)
310 .set(RE_NO_BK_PARENS)
313 .set(RE_NO_EMPTY_RANGES)
314 .set(RE_UNMATCHED_RIGHT_PAREN_ORD)
317 RE_SYNTAX_POSIX_AWK = new RESyntax(RE_SYNTAX_POSIX_EXTENDED)
318 .set(RE_BACKSLASH_ESCAPE_IN_LISTS)
321 RE_SYNTAX_GREP = new RESyntax()
323 .set(RE_CHAR_CLASSES)
324 .set(RE_HAT_LISTS_NOT_NEWLINE)
329 RE_SYNTAX_EGREP = new RESyntax()
330 .set(RE_CHAR_CLASSES)
331 .set(RE_CONTEXT_INDEP_ANCHORS)
332 .set(RE_CONTEXT_INDEP_OPS)
333 .set(RE_HAT_LISTS_NOT_NEWLINE)
335 .set(RE_NO_BK_PARENS)
339 RE_SYNTAX_POSIX_EGREP = new RESyntax(RE_SYNTAX_EGREP)
341 .set(RE_NO_BK_BRACES)
344 /* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */
346 RE_SYNTAX_ED = new RESyntax(RE_SYNTAX_POSIX_BASIC)
349 RE_SYNTAX_SED = new RESyntax(RE_SYNTAX_POSIX_BASIC)
352 RE_SYNTAX_POSIX_MINIMAL_BASIC = new RESyntax(RE_SYNTAX_POSIX_COMMON)
356 /* Differs from RE_SYNTAX_POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS
357 replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */
359 RE_SYNTAX_POSIX_MINIMAL_EXTENDED = new RESyntax(RE_SYNTAX_POSIX_COMMON)
360 .set(RE_CONTEXT_INDEP_ANCHORS)
361 .set(RE_CONTEXT_INVALID_OPS)
362 .set(RE_NO_BK_BRACES)
363 .set(RE_NO_BK_PARENS)
366 .set(RE_UNMATCHED_RIGHT_PAREN_ORD)
369 /* There is no official Perl spec, but here's a "best guess" */
371 RE_SYNTAX_PERL4 = new RESyntax()
372 .set(RE_BACKSLASH_ESCAPE_IN_LISTS)
373 .set(RE_CONTEXT_INDEP_ANCHORS)
374 .set(RE_CONTEXT_INDEP_OPS) // except for '{', apparently
376 .set(RE_NO_BK_BRACES)
377 .set(RE_NO_BK_PARENS)
379 .set(RE_NO_EMPTY_RANGES)
380 .set(RE_CHAR_CLASS_ESCAPES) // \d,\D,\w,\W,\s,\S
383 RE_SYNTAX_PERL4_S = new RESyntax(RE_SYNTAX_PERL4)
387 RE_SYNTAX_PERL5 = new RESyntax(RE_SYNTAX_PERL4)
388 .set(RE_PURE_GROUPING) // (?:)
389 .set(RE_STINGY_OPS) // *?,??,+?,{}?
390 .set(RE_LOOKAHEAD) // (?=)(?!)
391 .set(RE_STRING_ANCHORS) // \A,\Z
392 .set(RE_CHAR_CLASS_ESC_IN_LISTS)// \d,\D,\w,\W,\s,\S within []
393 .set(RE_COMMENTS) // (?#)
396 RE_SYNTAX_PERL5_S = new RESyntax(RE_SYNTAX_PERL5)
402 * Construct a new syntax object with all bits turned off.
403 * This is equivalent to RE_SYNTAX_EMACS.
406 bits = new BitSet(BIT_TOTAL);
410 * Called internally when constructing predefined syntaxes
411 * so their interpretation cannot vary. Conceivably useful
412 * for your syntaxes as well. Causes IllegalAccessError to
413 * be thrown if any attempt to modify the syntax is made.
415 * @return this object for convenient chaining
417 public RESyntax makeFinal() {
423 * Construct a new syntax object with all bits set the same
424 * as the other syntax.
426 public RESyntax(RESyntax other) {
427 bits = (BitSet) other.bits.clone();
431 * Check if a given bit is set in this syntax.
433 public boolean get(int index) {
434 return bits.get(index);
438 * Set a given bit in this syntax.
440 * @param index the constant (RESyntax.RE_xxx) bit to set.
441 * @return a reference to this object for easy chaining.
443 public RESyntax set(int index) {
444 if (isFinal) throw new IllegalAccessError(SYNTAX_IS_FINAL);
450 * Clear a given bit in this syntax.
452 * @param index the constant (RESyntax.RE_xxx) bit to clear.
453 * @return a reference to this object for easy chaining.
455 public RESyntax clear(int index) {
456 if (isFinal) throw new IllegalAccessError(SYNTAX_IS_FINAL);
462 * Changes the line separator string for regular expressions
463 * created using this RESyntax. The default separator is the
464 * value returned by the system property "line.separator", which
465 * should be correct when reading platform-specific files from a
466 * filesystem. However, many programs may collect input from
467 * sources where the line separator is differently specified (for
468 * example, in the applet environment, the text box widget
469 * interprets line breaks as single-character newlines,
470 * regardless of the host platform).
472 * Note that setting the line separator to a character or
473 * characters that have specific meaning within the current syntax
474 * can cause unexpected chronosynclastic infundibula.
476 * @return this object for convenient chaining
478 public RESyntax setLineSeparator(String aSeparator) {
479 if (isFinal) throw new IllegalAccessError(SYNTAX_IS_FINAL);
480 lineSeparator = aSeparator;
485 * Returns the currently active line separator string. The default
486 * is the platform-dependent system property "line.separator".
488 public String getLineSeparator() {
489 return lineSeparator;