1 /* -*- Mode: java; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
\r
3 * The contents of this file are subject to the Netscape Public
\r
4 * License Version 1.1 (the "License"); you may not use this file
\r
5 * except in compliance with the License. You may obtain a copy of
\r
6 * the License at http://www.mozilla.org/NPL/
\r
8 * Software distributed under the License is distributed on an "AS
\r
9 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express oqr
\r
10 * implied. See the License for the specific language governing
\r
11 * rights and limitations under the License.
\r
13 * The Original Code is Rhino code, released
\r
16 * The Initial Developer of the Original Code is Netscape
\r
17 * Communications Corporation. Portions created by Netscape are
\r
18 * Copyright (C) 1997-1999 Netscape Communications Corporation. All
\r
24 * Matthias Radestock
\r
26 * Alternatively, the contents of this file may be used under the
\r
27 * terms of the GNU Public License (the "GPL"), in which case the
\r
28 * provisions of the GPL are applicable instead of those above.
\r
29 * If you wish to allow use of your version of this file only
\r
30 * under the terms of the GPL and not to allow others to use your
\r
31 * version of this file under the NPL, indicate your decision by
\r
32 * deleting the provisions above and replace them with the notice
\r
33 * and other provisions required by the GPL. If you do not delete
\r
34 * the provisions above, a recipient may use your version of this
\r
35 * file under either the NPL or the GPL.
\r
38 package org.mozilla.javascript.regexp;
\r
40 import org.mozilla.javascript.*;
\r
41 import java.lang.reflect.Method;
\r
44 * This class implements the RegExp native object.
\r
47 * Implementation in C by Brendan Eich
\r
48 * Initial port to Java by Norris Boyd from jsregexp.c version 1.36
\r
49 * Merged up to version 1.38, which included Unicode support.
\r
50 * Merged bug fixes in version 1.39.
\r
51 * Merged JSFUN13_BRANCH changes up to 1.32.2.13
\r
53 * @author Brendan Eich
\r
54 * @author Norris Boyd
\r
56 public class NativeRegExp extends IdScriptable implements Function {
\r
58 public static final int GLOB = 0x1; // 'g' flag: global
\r
59 public static final int FOLD = 0x2; // 'i' flag: fold
\r
60 public static final int MULTILINE = 0x4; // 'm' flag: multiline
\r
62 //type of match to perform
\r
63 public static final int TEST = 0;
\r
64 public static final int MATCH = 1;
\r
65 public static final int PREFIX = 2;
\r
67 private static final boolean debug = false;
\r
69 public static void init(Context cx, Scriptable scope, boolean sealed) {
\r
71 NativeRegExp proto = new NativeRegExp();
\r
72 proto.prototypeFlag = true;
\r
73 proto.activateIdMap(MAX_PROTOTYPE_ID);
\r
74 proto.setSealFunctionsFlag(sealed);
\r
75 proto.setFunctionParametrs(cx);
\r
76 proto.setParentScope(scope);
\r
77 proto.setPrototype(getObjectPrototype(scope));
\r
80 NativeRegExpCtor ctor = new NativeRegExpCtor();
\r
82 ctor.setPrototype(getClassPrototype(scope, "Function"));
\r
83 ctor.setParentScope(scope);
\r
85 ctor.setImmunePrototypeProperty(proto);
\r
92 defineProperty(scope, "RegExp", ctor, ScriptableObject.DONTENUM);
\r
95 public NativeRegExp(Context cx, Scriptable scope, String source,
\r
96 String global, boolean flat) {
\r
97 init(cx, scope, source, global, flat);
\r
100 public void init(Context cx, Scriptable scope, String source,
\r
101 String global, boolean flat) {
\r
102 this.source = source;
\r
104 if (global != null) {
\r
105 for (int i=0; i < global.length(); i++) {
\r
106 char c = global.charAt(i);
\r
109 } else if (c == 'i') {
\r
111 } else if (c == 'm') {
\r
112 flags |= MULTILINE;
\r
114 Object[] errArgs = { new Character(c) };
\r
115 throw NativeGlobal.constructError(cx, "SyntaxError",
\r
116 ScriptRuntime.getMessage(
\r
117 "msg.invalid.re.flag", errArgs),
\r
123 CompilerState state = new CompilerState(source, flags, cx, scope);
\r
126 int sourceLen = source.length();
\r
128 while (sourceLen > 0) {
\r
129 int len = sourceLen;
\r
130 if (len > REOP_FLATLEN_MAX) {
\r
131 len = REOP_FLATLEN_MAX;
\r
133 RENode ren2 = new RENode(state, len == 1 ? REOP_FLAT1 : REOP_FLAT,
\r
134 new Integer(index));
\r
135 ren2.flags = RENode.NONEMPTY;
\r
137 ren2.kid2 = index + len;
\r
139 ren2.flags |= RENode.SINGLE;
\r
140 ren2.chr = state.source[index];
\r
147 setNext(state, ren, ren2);
\r
151 this.ren = parseRegExp(state);
\r
152 if (ren == null) return;
\r
153 RENode end = new RENode(state, REOP_END, null);
\r
154 setNext(state, ren, end);
\r
156 dumpRegExp(state, ren);
\r
157 this.lastIndex = 0;
\r
158 this.parenCount = state.parenCount;
\r
159 this.flags = flags;
\r
161 scope = org.xwt.util.JSObject.defaultObjects;
\r
162 setPrototype(getClassPrototype(scope, "RegExp"));
\r
163 setParentScope(scope);
\r
166 public String getClassName() {
\r
170 public Object call(Context cx, Scriptable scope, Scriptable thisObj,
\r
172 return execSub(cx, scope, args, MATCH);
\r
175 public Scriptable construct(Context cx, Scriptable scope, Object[] args) {
\r
176 return (Scriptable) call(cx, scope, null, args);
\r
179 Scriptable compile(Context cx, Scriptable scope, Object[] args) {
\r
180 if (args.length > 0 && args[0] instanceof NativeRegExp) {
\r
181 if (args.length > 1 && args[1] != Undefined.instance) {
\r
183 throw NativeGlobal.constructError(
\r
185 "only one argument may be specified " +
\r
186 "if the first argument is a RegExp object",
\r
189 NativeRegExp thatObj = (NativeRegExp) args[0];
\r
190 source = thatObj.source;
\r
191 lastIndex = thatObj.lastIndex;
\r
192 parenCount = thatObj.parenCount;
\r
193 flags = thatObj.flags;
\r
194 program = thatObj.program;
\r
198 String s = args.length == 0 ? "" : ScriptRuntime.toString(args[0]);
\r
199 String global = args.length > 1 && args[1] != Undefined.instance
\r
200 ? ScriptRuntime.toString(args[1])
\r
202 init(cx, scope, s, global, false);
\r
206 public String toString() {
\r
207 StringBuffer buf = new StringBuffer();
\r
209 buf.append(source);
\r
211 if ((flags & GLOB) != 0)
\r
213 if ((flags & FOLD) != 0)
\r
215 if ((flags & MULTILINE) != 0)
\r
217 return buf.toString();
\r
220 public NativeRegExp() {
\r
223 private static RegExpImpl getImpl(Context cx) {
\r
224 return (RegExpImpl) ScriptRuntime.getRegExpProxy(cx);
\r
227 private Object execSub(Context cx, Scriptable scopeObj,
\r
228 Object[] args, int matchType)
\r
230 RegExpImpl reImpl = getImpl(cx);
\r
232 if (args.length == 0) {
\r
233 str = reImpl.input;
\r
235 Object[] errArgs = { toString() };
\r
236 throw NativeGlobal.constructError(
\r
238 ScriptRuntime.getMessage
\r
239 ("msg.no.re.input.for", errArgs),
\r
243 str = ScriptRuntime.toString(args[0]);
\r
245 int i = ((flags & GLOB) != 0) ? lastIndex : 0;
\r
246 int indexp[] = { i };
\r
247 Object rval = executeRegExp(cx, scopeObj,
\r
248 reImpl, str, indexp, matchType);
\r
249 if ((flags & GLOB) != 0) {
\r
250 lastIndex = (rval == null || rval == Undefined.instance)
\r
256 private Object exec(Context cx, Scriptable scopeObj, Object[] args) {
\r
257 return execSub(cx, scopeObj, args, MATCH);
\r
260 private Object test(Context cx, Scriptable scopeObj, Object[] args) {
\r
261 Object rval = execSub(cx, scopeObj, args, TEST);
\r
262 if (rval == null || !rval.equals(Boolean.TRUE))
\r
263 rval = Boolean.FALSE;
\r
267 private Object prefix(Context cx, Scriptable scopeObj, Object[] args) {
\r
268 return execSub(cx, scopeObj, args, PREFIX);
\r
271 static final int JS_BITS_PER_BYTE = 8;
\r
273 private static final byte REOP_EMPTY = 0; /* match rest of input against rest of r.e. */
\r
274 private static final byte REOP_ALT = 1; /* alternative subexpressions in kid and next */
\r
275 private static final byte REOP_BOL = 2; /* beginning of input (or line if multiline) */
\r
276 private static final byte REOP_EOL = 3; /* end of input (or line if multiline) */
\r
277 private static final byte REOP_WBDRY = 4; /* match "" at word boundary */
\r
278 private static final byte REOP_WNONBDRY = 5; /* match "" at word non-boundary */
\r
279 private static final byte REOP_QUANT = 6; /* quantified atom: atom{1,2} */
\r
280 private static final byte REOP_STAR = 7; /* zero or more occurrences of kid */
\r
281 private static final byte REOP_PLUS = 8; /* one or more occurrences of kid */
\r
282 private static final byte REOP_OPT = 9; /* optional subexpression in kid */
\r
283 private static final byte REOP_LPAREN = 10; /* left paren bytecode: kid is u.num'th sub-regexp */
\r
284 private static final byte REOP_RPAREN = 11; /* right paren bytecode */
\r
285 private static final byte REOP_DOT = 12; /* stands for any character */
\r
286 private static final byte REOP_CCLASS = 13; /* character class: [a-f] */
\r
287 private static final byte REOP_DIGIT = 14; /* match a digit char: [0-9] */
\r
288 private static final byte REOP_NONDIGIT = 15; /* match a non-digit char: [^0-9] */
\r
289 private static final byte REOP_ALNUM = 16; /* match an alphanumeric char: [0-9a-z_A-Z] */
\r
290 private static final byte REOP_NONALNUM = 17; /* match a non-alphanumeric char: [^0-9a-z_A-Z] */
\r
291 private static final byte REOP_SPACE = 18; /* match a whitespace char */
\r
292 private static final byte REOP_NONSPACE = 19; /* match a non-whitespace char */
\r
293 private static final byte REOP_BACKREF = 20; /* back-reference (e.g., \1) to a parenthetical */
\r
294 private static final byte REOP_FLAT = 21; /* match a flat string */
\r
295 private static final byte REOP_FLAT1 = 22; /* match a single char */
\r
296 private static final byte REOP_JUMP = 23; /* for deoptimized closure loops */
\r
297 private static final byte REOP_DOTSTAR = 24; /* optimize .* to use a single opcode */
\r
298 private static final byte REOP_ANCHOR = 25; /* like .* but skips left context to unanchored r.e. */
\r
299 private static final byte REOP_EOLONLY = 26; /* $ not preceded by any pattern */
\r
300 private static final byte REOP_UCFLAT = 27; /* flat Unicode string; len immediate counts chars */
\r
301 private static final byte REOP_UCFLAT1 = 28; /* single Unicode char */
\r
302 private static final byte REOP_UCCLASS = 29; /* Unicode character class, vector of chars to match */
\r
303 private static final byte REOP_NUCCLASS = 30; /* negated Unicode character class */
\r
304 private static final byte REOP_BACKREFi = 31; /* case-independent REOP_BACKREF */
\r
305 private static final byte REOP_FLATi = 32; /* case-independent REOP_FLAT */
\r
306 private static final byte REOP_FLAT1i = 33; /* case-independent REOP_FLAT1 */
\r
307 private static final byte REOP_UCFLATi = 34; /* case-independent REOP_UCFLAT */
\r
308 private static final byte REOP_UCFLAT1i = 35; /* case-independent REOP_UCFLAT1 */
\r
309 private static final byte REOP_ANCHOR1 = 36; /* first-char discriminating REOP_ANCHOR */
\r
310 private static final byte REOP_NCCLASS = 37; /* negated 8-bit character class */
\r
311 private static final byte REOP_DOTSTARMIN = 38; /* ungreedy version of REOP_DOTSTAR */
\r
312 private static final byte REOP_LPARENNON = 39; /* non-capturing version of REOP_LPAREN */
\r
313 private static final byte REOP_RPARENNON = 40; /* non-capturing version of REOP_RPAREN */
\r
314 private static final byte REOP_ASSERT = 41; /* zero width positive lookahead assertion */
\r
315 private static final byte REOP_ASSERT_NOT = 42; /* zero width negative lookahead assertion */
\r
316 private static final byte REOP_END = 43;
\r
318 /* maximum length of FLAT string */
\r
319 private static final int REOP_FLATLEN_MAX = 255;
\r
321 /* not thread safe, used only for debugging */
\r
322 private static int level;
\r
324 private static String[] reopname = null;
\r
375 private String getPrintableString(String str) {
\r
377 StringBuffer buf = new StringBuffer(str.length());
\r
378 for (int i = 0; i < str.length(); i++) {
\r
379 int c = str.charAt(i);
\r
380 if ((c < 0x20) || (c > 0x7F)) {
\r
384 buf.append("\\u" + Integer.toHexString(c));
\r
387 buf.append((char)c);
\r
389 return buf.toString();
\r
395 private void dumpRegExp(CompilerState state, RENode ren) {
\r
398 System.out.print("level offset flags description\n");
\r
401 char[] source = ren.s != null ? ren.s : state.source;
\r
402 System.out.print(level);
\r
403 System.out.print(" ");
\r
404 System.out.print(ren.offset);
\r
405 System.out.print(" " +
\r
406 ((ren.flags & RENode.ANCHORED) != 0 ? "A" : "-") +
\r
407 ((ren.flags & RENode.SINGLE) != 0 ? "S" : "-") +
\r
408 ((ren.flags & RENode.NONEMPTY) != 0 ? "F" : "-") + // F for full
\r
409 ((ren.flags & RENode.ISNEXT) != 0 ? "N" : "-") + // N for next
\r
410 ((ren.flags & RENode.GOODNEXT) != 0 ? "G" : "-") +
\r
411 ((ren.flags & RENode.ISJOIN) != 0 ? "J" : "-") +
\r
412 ((ren.flags & RENode.MINIMAL) != 0 ? "M" : "-") +
\r
418 System.out.print(" ");
\r
419 System.out.println(ren.next.offset);
\r
420 dumpRegExp(state, (RENode) ren.kid);
\r
428 case REOP_ASSERT_NOT:
\r
429 System.out.println();
\r
430 dumpRegExp(state, (RENode) ren.kid);
\r
434 System.out.print(" next ");
\r
435 System.out.print(ren.next.offset);
\r
436 System.out.print(" min ");
\r
437 System.out.print(ren.min);
\r
438 System.out.print(" max ");
\r
439 System.out.println(ren.max);
\r
440 dumpRegExp(state, (RENode) ren.kid);
\r
444 System.out.print(" num ");
\r
445 System.out.println(ren.num);
\r
446 dumpRegExp(state, (RENode) ren.kid);
\r
449 case REOP_LPARENNON:
\r
450 System.out.println();
\r
451 dumpRegExp(state, (RENode) ren.kid);
\r
456 System.out.print(" num ");
\r
457 System.out.println(ren.num);
\r
460 case REOP_CCLASS: {
\r
461 int index = ((Integer) ren.kid).intValue();
\r
462 int index2 = ren.kid2;
\r
463 int len = index2 - index;
\r
464 System.out.print(" [");
\r
465 System.out.print(getPrintableString(new String(source, index, len)));
\r
466 System.out.println("]");
\r
471 int index = ((Integer) ren.kid).intValue();
\r
472 int index2 = ren.kid2;
\r
473 int len = index2 - index;
\r
474 System.out.print(" ");
\r
475 System.out.print(getPrintableString(new String(source, index, len)));
\r
476 System.out.print(" (");
\r
477 System.out.print(len);
\r
478 System.out.println(")");
\r
483 System.out.print(" ");
\r
484 System.out.print(ren.chr);
\r
485 System.out.print(" ('\\");
\r
486 System.out.print(Integer.toString(ren.chr, 8));
\r
487 System.out.println("')");
\r
491 System.out.print(" ");
\r
492 System.out.println(ren.next.offset);
\r
495 case REOP_UCFLAT: {
\r
496 int index = ((Integer) ren.kid).intValue();
\r
497 int len = ren.kid2 - index;
\r
498 for (int i = 0; i < len; i++)
\r
499 System.out.print("\\u" +
\r
500 Integer.toHexString(source[index+i]));
\r
501 System.out.println();
\r
506 System.out.print("\\u" +
\r
507 Integer.toHexString(ren.chr));
\r
508 System.out.println();
\r
511 case REOP_UCCLASS: {
\r
512 int index = ((Integer) ren.kid).intValue();
\r
513 int len = ren.kid2 - index;
\r
514 System.out.print(" [");
\r
515 for (int i = 0; i < len; i++)
\r
516 System.out.print("\\u" +
\r
517 Integer.toHexString(source[index+i]));
\r
518 System.out.println("]");
\r
523 System.out.println();
\r
527 if ((ren.flags & RENode.GOODNEXT) == 0)
\r
529 } while ((ren = ren.next) != null);
\r
534 private void fixNext(CompilerState state, RENode ren1, RENode ren2,
\r
537 RENode next, kid, ren;
\r
539 goodnext = ren2 != null && (ren2.flags & RENode.ISNEXT) == 0;
\r
542 * Find the final node in a list of alternatives, or concatenations, or
\r
543 * even a concatenation of alternatives followed by non-alternatives (e.g.
\r
544 * ((x|y)z)w where ((x|y)z) is ren1 and w is ren2).
\r
546 for (; (next = ren1.next) != null && next != oldnext; ren1 = next) {
\r
547 if (ren1.op == REOP_ALT) {
\r
548 /* Find the end of this alternative's operand list. */
\r
549 kid = (RENode) ren1.kid;
\r
550 if (kid.op == REOP_JUMP)
\r
552 for (ren = kid; ren.next != null; ren = ren.next) {
\r
553 if (ren.op == REOP_ALT)
\r
554 throw new RuntimeException("REOP_ALT not expected");
\r
557 /* Append a jump node to all but the last alternative. */
\r
558 ren.next = new RENode(state, REOP_JUMP, null);
\r
559 ren.next.flags |= RENode.ISNEXT;
\r
560 ren.flags |= RENode.GOODNEXT;
\r
562 /* Recur to fix all descendent nested alternatives. */
\r
563 fixNext(state, kid, ren2, oldnext);
\r
568 * Now ren1 points to the last alternative, or to the final node on a
\r
569 * concatenation list. Set its next link to ren2, flagging a join point
\r
572 if (ren2 != null) {
\r
573 if ((ren2.flags & RENode.ISNEXT) == 0)
\r
574 ren2.flags |= RENode.ISNEXT;
\r
576 ren2.flags |= RENode.ISJOIN;
\r
580 ren1.flags |= RENode.GOODNEXT;
\r
583 * The following ops have a kid subtree through which to recur. Here is
\r
584 * where we fix the next links under the final ALT node's kid.
\r
593 case REOP_LPARENNON:
\r
595 case REOP_ASSERT_NOT:
\r
596 fixNext(state, (RENode) ren1.kid, ren2, oldnext);
\r
602 private void setNext(CompilerState state, RENode ren1, RENode ren2) {
\r
603 fixNext(state, ren1, ren2, null);
\r
607 * Top-down regular expression grammar, based closely on Perl 4.
\r
609 * regexp: altern A regular expression is one or more
\r
610 * altern '|' regexp alternatives separated by vertical bar.
\r
612 private RENode parseRegExp(CompilerState state) {
\r
613 RENode ren = parseAltern(state);
\r
616 char[] source = state.source;
\r
617 int index = state.index;
\r
618 if (index < source.length && source[index] == '|') {
\r
620 ren = new RENode(state, REOP_ALT, kid);
\r
623 ren.flags = (byte) (kid.flags & (RENode.ANCHORED | RENode.NONEMPTY));
\r
627 state.index = ++index;
\r
628 if (index < source.length && (source[index] == '|' ||
\r
629 source[index] == ')'))
\r
631 kid = new RENode(state, REOP_EMPTY, null);
\r
633 kid = parseAltern(state);
\r
634 index = state.index;
\r
638 RENode ren2 = new RENode(state, REOP_ALT, kid);
\r
642 ren1.flags |= RENode.GOODNEXT;
\r
643 ren2.flags = (byte) ((kid.flags & (RENode.ANCHORED |
\r
647 } while (index < source.length && source[index] == '|');
\r
653 * altern: item An alternative is one or more items,
\r
654 * item altern concatenated together.
\r
656 private RENode parseAltern(CompilerState state) {
\r
657 RENode ren = parseItem(state);
\r
662 char[] source = state.source;
\r
663 int index = state.index;
\r
666 while (index != source.length && (c = source[index]) != '|' &&
\r
669 RENode ren2 = parseItem(state);
\r
672 setNext(state, ren1, ren2);
\r
673 flags |= ren2.flags;
\r
675 index = state.index;
\r
679 * Propagate NONEMPTY to the front of a concatenation list, so that the
\r
680 * first alternative in (^a|b) is considered non-empty. The first node
\r
681 * in a list may match the empty string (as ^ does), but if the list is
\r
682 * non-empty, then the first node's NONEMPTY flag must be set.
\r
684 ren.flags |= flags & RENode.NONEMPTY;
\r
689 * item: assertion An item is either an assertion or
\r
690 * quantatom a quantified atom.
\r
692 * assertion: '^' Assertions match beginning of string
\r
693 * (or line if the class static property
\r
694 * RegExp.multiline is true).
\r
695 * '$' End of string (or line if the class
\r
696 * static property RegExp.multiline is
\r
698 * '\b' Word boundary (between \w and \W).
\r
699 * '\B' Word non-boundary.
\r
701 RENode parseItem(CompilerState state) {
\r
705 char[] source = state.source;
\r
706 int index = state.index;
\r
707 switch (index < source.length ? source[index] : '\0') {
\r
709 state.index = index + 1;
\r
710 ren = new RENode(state, REOP_BOL, null);
\r
711 ren.flags |= RENode.ANCHORED;
\r
715 state.index = index + 1;
\r
716 return new RENode(state,
\r
717 (index == state.indexBegin ||
\r
718 ((source[index-1] == '(' ||
\r
719 source[index-1] == '|') && /*balance)*/
\r
720 (index - 1 == state.indexBegin ||
\r
721 source[index-2] != '\\')))
\r
727 switch (++index < source.length ? source[index] : '\0') {
\r
732 op = REOP_WNONBDRY;
\r
735 return parseQuantAtom(state);
\r
739 * Word boundaries and non-boundaries are flagged as non-empty
\r
740 * so they will be prefixed by an anchoring node.
\r
742 state.index = index + 1;
\r
743 ren = new RENode(state, op, null);
\r
744 ren.flags |= RENode.NONEMPTY;
\r
749 return parseQuantAtom(state);
\r
753 * quantatom: atom An unquantified atom.
\r
754 * quantatom '{' n ',' m '}'
\r
755 * Atom must occur between n and m times.
\r
756 * quantatom '{' n ',' '}' Atom must occur at least n times.
\r
757 * quantatom '{' n '}' Atom must occur exactly n times.
\r
758 * quantatom '*' Zero or more times (same as {0,}).
\r
759 * quantatom '+' One or more times (same as {1,}).
\r
760 * quantatom '?' Zero or one time (same as {0,1}).
\r
762 * any of which can be optionally followed by '?' for ungreedy
\r
764 RENode parseQuantAtom(CompilerState state) {
\r
765 RENode ren = parseAtom(state);
\r
773 char[] source = state.source;
\r
774 int index = state.index;
\r
776 while (index < source.length) {
\r
777 switch (source[index]) {
\r
779 if (++index == source.length || !isDigit(c = source[index])) {
\r
780 reportError("msg.bad.quant",
\r
781 String.valueOf(source[state.index]), state);
\r
785 while (++index < source.length && isDigit(c = source[index])) {
\r
786 min = 10 * min + unDigit(c);
\r
787 if ((min >> 16) != 0) {
\r
788 reportError("msg.overlarge.max", tail(source, index),
\r
793 if (source[index] == ',') {
\r
795 if (isDigit(source[index])) {
\r
796 max = unDigit(source[index]);
\r
797 while (isDigit(c = source[++index])) {
\r
798 max = 10 * max + unDigit(c);
\r
799 if ((max >> 16) != 0) {
\r
800 reportError("msg.overlarge.max",
\r
801 String.valueOf(source[up]), state);
\r
806 reportError("msg.zero.quant",
\r
807 tail(source, state.index), state);
\r
811 reportError("msg.max.lt.min", tail(source, up), state);
\r
815 /* 0 means no upper bound. */
\r
819 /* Exactly n times. */
\r
821 reportError("msg.zero.quant",
\r
822 tail(source, state.index), state);
\r
827 if (source[index] != '}') {
\r
828 reportError("msg.unterm.quant",
\r
829 String.valueOf(source[state.index]), state);
\r
834 ren2 = new RENode(state, REOP_QUANT, ren);
\r
835 if (min > 0 && (ren.flags & RENode.NONEMPTY) != 0)
\r
836 ren2.flags |= RENode.NONEMPTY;
\r
837 ren2.min = (short) min;
\r
838 ren2.max = (short) max;
\r
844 ren = new RENode(state, REOP_STAR, ren);
\r
849 ren2 = new RENode(state, REOP_PLUS, ren);
\r
850 if ((ren.flags & RENode.NONEMPTY) != 0)
\r
851 ren2.flags |= RENode.NONEMPTY;
\r
857 ren = new RENode(state, REOP_OPT, ren);
\r
863 if ((index < source.length) && (source[index] == '?')) {
\r
864 ren.flags |= RENode.MINIMAL;
\r
869 state.index = index;
\r
874 * atom: '(' regexp ')' A parenthesized regexp (what matched
\r
875 * can be addressed using a backreference,
\r
876 * see '\' n below).
\r
877 * '.' Matches any char except '\n'.
\r
878 * '[' classlist ']' A character class.
\r
879 * '[' '^' classlist ']' A negated character class.
\r
881 * '\n' Newline (Line Feed).
\r
882 * '\r' Carriage Return.
\r
883 * '\t' Horizontal Tab.
\r
884 * '\v' Vertical Tab.
\r
885 * '\d' A digit (same as [0-9]).
\r
886 * '\D' A non-digit.
\r
887 * '\w' A word character, [0-9a-z_A-Z].
\r
888 * '\W' A non-word character.
\r
889 * '\s' A whitespace character, [ \b\f\n\r\t\v].
\r
890 * '\S' A non-whitespace character.
\r
891 * '\' n A backreference to the nth (n decimal
\r
892 * and positive) parenthesized expression.
\r
893 * '\' octal An octal escape sequence (octal must be
\r
894 * two or three digits long, unless it is
\r
895 * 0 for the null character).
\r
896 * '\x' hex A hex escape (hex must be two digits).
\r
897 * '\c' ctrl A control character, ctrl is a letter.
\r
898 * '\' literalatomchar Any character except one of the above
\r
899 * that follow '\' in an atom.
\r
900 * otheratomchar Any character not first among the other
\r
901 * atom right-hand sides.
\r
903 static final String metachars = "|^${*+?().[\\";
\r
904 static final String closurechars = "{*+?";
\r
906 RENode parseAtom(CompilerState state) {
\r
913 boolean skipCommon = false;
\r
914 boolean doFlat = false;
\r
916 char[] source = state.source;
\r
917 int index = state.index;
\r
919 if (index == source.length) {
\r
920 state.index = index;
\r
921 return new RENode(state, REOP_EMPTY, null);
\r
923 switch (source[index]) {
\r
924 /* handle /|a/ by returning an empty node for the leftside */
\r
926 return new RENode(state, REOP_EMPTY, null);
\r
930 if (source[index + 1] == '?') {
\r
931 switch (source[index + 2]) {
\r
933 op = REOP_LPARENNON;
\r
939 op = REOP_ASSERT_NOT;
\r
943 if (op == REOP_END) {
\r
945 num = state.parenCount++; /* \1 is numbered 0, etc. */
\r
950 state.index = index;
\r
951 /* Handle empty paren */
\r
952 if (source[index] == ')') {
\r
953 ren2 = new RENode(state, REOP_EMPTY, null);
\r
956 ren2 = parseRegExp(state);
\r
959 index = state.index;
\r
960 if (index >= source.length || source[index] != ')') {
\r
961 reportError("msg.unterm.paren", tail(source, ocp), state);
\r
966 ren = new RENode(state, op, ren2);
\r
967 ren.flags = (byte) (ren2.flags & (RENode.ANCHORED |
\r
970 if ((op == REOP_LPAREN) || (op == REOP_LPARENNON)) {
\r
971 /* Assume RPAREN ops immediately succeed LPAREN ops */
\r
972 ren2 = new RENode(state, (byte)(op + 1), null);
\r
973 setNext(state, ren, ren2);
\r
981 if ((index < source.length) && (source[index] == '*')) {
\r
984 if ((index < source.length) && (source[index] == '?')) {
\r
986 op = REOP_DOTSTARMIN;
\r
989 ren = new RENode(state, op, null);
\r
990 if (ren.op == REOP_DOT)
\r
991 ren.flags = RENode.SINGLE | RENode.NONEMPTY;
\r
995 /* A char class must have at least one char in it. */
\r
996 if (++index == source.length) {
\r
997 reportError("msg.unterm.class", tail(source, ocp), state);
\r
1000 c = source[index];
\r
1001 ren = new RENode(state, REOP_CCLASS, new Integer(index));
\r
1003 /* A negated class must have at least one char in it after the ^. */
\r
1004 if (c == '^' && ++index == source.length) {
\r
1005 reportError("msg.unterm.class", tail(source, ocp), state);
\r
1010 if (++index == source.length) {
\r
1011 reportError("msg.unterm.paren", tail(source, ocp), state);
\r
1014 c = source[index];
\r
1017 if (c == '\\' && index+1 != source.length)
\r
1020 ren.kid2 = index++;
\r
1022 /* Since we rule out [] and [^], we can set the non-empty flag. */
\r
1023 ren.flags = RENode.SINGLE | RENode.NONEMPTY;
\r
1027 if (++index == source.length) {
\r
1028 Context.reportError(ScriptRuntime.getMessage("msg.trail.backslash",
\r
1032 c = source[index];
\r
1040 ren = new RENode(state, REOP_FLAT1, null);
\r
1043 ren = new RENode(state, REOP_DIGIT, null);
\r
1046 ren = new RENode(state, REOP_NONDIGIT, null);
\r
1049 ren = new RENode(state, REOP_ALNUM, null);
\r
1052 ren = new RENode(state, REOP_NONALNUM, null);
\r
1055 ren = new RENode(state, REOP_SPACE, null);
\r
1058 ren = new RENode(state, REOP_NONSPACE, null);
\r
1072 Yuk. Keeping the old style \n interpretation for 1.2
\r
1075 if ((state.cx.getLanguageVersion() != Context.VERSION_DEFAULT)
\r
1076 && (state.cx.getLanguageVersion() <= Context.VERSION_1_4)) {
\r
1079 state.index = index;
\r
1080 num = doOctal(state);
\r
1081 index = state.index;
\r
1082 ren = new RENode(state, REOP_FLAT1, null);
\r
1097 while (++index < source.length
\r
1098 && isDigit(c = source[index])) {
\r
1099 num = 10 * num + unDigit(c);
\r
1102 /* n in [8-9] and > count of parenetheses, then revert to
\r
1103 '8' or '9', ignoring the '\' */
\r
1104 if (((num == 8) || (num == 9)) && (num > state.parenCount)) {
\r
1105 ocp = --index; /* skip beyond the '\' */
\r
1107 skipCommon = true;
\r
1110 /* more than 1 digit, or a number greater than
\r
1111 the count of parentheses => it's an octal */
\r
1112 if ((len > 1) || (num > state.parenCount)) {
\r
1113 state.index = ocp;
\r
1114 num = doOctal(state);
\r
1115 index = state.index;
\r
1116 ren = new RENode(state, REOP_FLAT1, null);
\r
1121 ren = new RENode(state, REOP_BACKREF, null);
\r
1122 ren.num = num - 1; /* \1 is numbered 0, etc. */
\r
1124 /* Avoid common chr- and flags-setting
\r
1125 code after switch. */
\r
1126 ren.flags = RENode.NONEMPTY;
\r
1127 skipCommon = true;
\r
1133 ren = new RENode(state, REOP_FLAT1, null);
\r
1139 while (++index < source.length
\r
1140 && isDigit(c = source[index])) {
\r
1141 num = 10 * num + unDigit(c);
\r
1145 ren = new RENode(state, REOP_BACKREF, null);
\r
1146 ren.num = num - 1; /* \1 is numbered 0, etc. */
\r
1147 /* Avoid common chr- and flags-setting
\r
1148 code after switch. */
\r
1149 ren.flags = RENode.NONEMPTY;
\r
1150 skipCommon = true;
\r
1157 if (++index < source.length && isHex(c = source[index])) {
\r
1159 if (++index < source.length && isHex(c = source[index])) {
\r
1163 if ((state.cx.getLanguageVersion()
\r
1164 != Context.VERSION_DEFAULT)
\r
1165 && (state.cx.getLanguageVersion()
\r
1166 <= Context.VERSION_1_4))
\r
1167 index--; /* back up so index points to last hex char */
\r
1168 else { /* ecma 2 requires pairs of hex digits. */
\r
1174 index = ocp; /* \xZZ is xZZ (Perl does \0ZZ!) */
\r
1177 ren = new RENode(state, REOP_FLAT1, null);
\r
1182 c = source[++index];
\r
1183 if (!('A' <= c && c <= 'Z') && !('a' <= c && c <= 'z')) {
\r
1187 skipCommon = true;
\r
1190 c = Character.toUpperCase(c);
\r
1191 c = (char) (c ^ 64); // JS_TOCTRL
\r
1192 ren = new RENode(state, REOP_FLAT1, null);
\r
1196 if (index+4 < source.length &&
\r
1197 isHex(source[index+1]) && isHex(source[index+2]) &&
\r
1198 isHex(source[index+3]) && isHex(source[index+4]))
\r
1200 num = (((((unHex(source[index+1]) << 4) +
\r
1201 unHex(source[index+2])) << 4) +
\r
1202 unHex(source[index+3])) << 4) +
\r
1203 unHex(source[index+4]);
\r
1206 ren = new RENode(state, REOP_FLAT1, null);
\r
1210 /* Unlike Perl \\xZZ, we take \\uZZZ to be literal-u then ZZZ. */
\r
1213 skipCommon = true;
\r
1219 skipCommon = true;
\r
1223 /* Common chr- and flags-setting code for escape opcodes. */
\r
1224 if (ren != null && !skipCommon) {
\r
1226 ren.flags = RENode.SINGLE | RENode.NONEMPTY;
\r
1228 skipCommon = false;
\r
1231 /* Skip to next unparsed char. */
\r
1236 /* fall through since doFlat was true */
\r
1240 while (++index != source.length &&
\r
1241 metachars.indexOf(source[index]) == -1)
\r
1243 len = (int)(index - ocp);
\r
1244 if (index != source.length && len > 1 &&
\r
1245 closurechars.indexOf(source[index]) != -1)
\r
1250 if (len > REOP_FLATLEN_MAX) {
\r
1251 len = REOP_FLATLEN_MAX;
\r
1252 index = ocp + len;
\r
1254 ren = new RENode(state, len == 1 ? REOP_FLAT1 : REOP_FLAT,
\r
1255 new Integer(ocp));
\r
1256 ren.flags = RENode.NONEMPTY;
\r
1260 ren.flags |= RENode.SINGLE;
\r
1261 ren.chr = source[ocp];
\r
1266 state.index = index;
\r
1270 private int doOctal(CompilerState state) {
\r
1271 char[] source = state.source;
\r
1272 int index = state.index;
\r
1275 while (++index < source.length && '0' <= (c = source[index]) &&
\r
1278 tmp = 8 * num + (int)(c - '0');
\r
1285 state.index = index;
\r
1289 static char getEscape(char c) {
\r
1302 return (char) 11; // '\v' is not vtab in Java
\r
1304 throw new RuntimeException();
\r
1307 static public boolean isDigit(char c) {
\r
1308 return '0' <= c && c <= '9';
\r
1311 static int unDigit(char c) {
\r
1315 static boolean isHex(char c) {
\r
1316 return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') ||
\r
1317 ('A' <= c && c <= 'F');
\r
1320 static int unHex(char c) {
\r
1321 if ('0' <= c && c <= '9')
\r
1323 return 10 + Character.toLowerCase(c) - 'a';
\r
1326 static boolean isWord(char c) {
\r
1327 return Character.isLetter(c) || isDigit(c) || c == '_';
\r
1330 private String tail(char[] a, int i) {
\r
1331 return new String(a, i, a.length - i);
\r
1334 private static boolean matchChar(int flags, char c, char c2) {
\r
1338 if ((flags & FOLD) != 0) {
\r
1339 c = Character.toUpperCase(c);
\r
1340 c2 = Character.toUpperCase(c2);
\r
1342 Character.toLowerCase(c) == Character.toLowerCase(c2);
\r
1349 int greedyRecurse(GreedyState grState, int index, int previousKid) {
\r
1355 * when the kid match fails, we reset the parencount and run any
\r
1356 * previously succesful kid in order to restablish it's paren
\r
1360 num = grState.state.parenCount;
\r
1361 kidMatch = matchRENodes(grState.state, grState.kid, grState.next, index);
\r
1362 if (kidMatch == -1) {
\r
1363 match = matchRENodes(grState.state, grState.next, grState.stop, index);
\r
1364 if (match != -1) {
\r
1365 grState.state.parenCount = num;
\r
1366 if (previousKid != -1)
\r
1367 matchRENodes(grState.state, grState.kid, grState.next, previousKid);
\r
1374 if (kidMatch == index) {
\r
1375 if (previousKid != -1)
\r
1376 matchRENodes(grState.state, grState.kid, grState.next, previousKid);
\r
1377 return kidMatch; /* no point pursuing an empty match forever */
\r
1379 if ((grState.maxKid == 0) || (++grState.kidCount < grState.maxKid)) {
\r
1380 match = greedyRecurse(grState, kidMatch, index);
\r
1381 if (match != -1) return match;
\r
1382 if (grState.maxKid != 0) --grState.kidCount;
\r
1384 grState.state.parenCount = num;
\r
1385 match = matchRENodes(grState.state, grState.next, grState.stop, kidMatch);
\r
1386 if (match != -1) {
\r
1387 matchRENodes(grState.state, grState.kid, grState.next, index);
\r
1395 int matchGreedyKid(MatchState state, RENode ren, RENode stop,
\r
1396 int kidCount, int index, int previousKid) {
\r
1397 GreedyState grState = new GreedyState();
\r
1398 grState.state = state;
\r
1399 grState.kid = (RENode)ren.kid;
\r
1400 grState.next = ren.next;
\r
1401 grState.maxKid = (ren.op == REOP_QUANT) ? ren.max : 0;
\r
1403 * We try to match the sub-tree to completion first, and if that
\r
1404 * doesn't work, match only up to the given end of the sub-tree.
\r
1406 grState.stop = null;
\r
1407 grState.kidCount = kidCount;
\r
1408 int match = greedyRecurse(grState, index, previousKid);
\r
1409 if (match != -1 || stop == null) return match;
\r
1410 grState.kidCount = kidCount;
\r
1411 grState.stop = stop;
\r
1412 return greedyRecurse(grState, index, previousKid);
\r
1415 int matchNonGreedyKid(MatchState state, RENode ren,
\r
1416 int kidCount, int maxKid,
\r
1421 match = matchRENodes(state, ren.next, null, index);
\r
1422 if (match != -1) return index;
\r
1423 kidMatch = matchRENodes(state, (RENode)ren.kid, ren.next, index);
\r
1424 if (kidMatch == -1) return -1;
\r
1425 if (kidMatch == index) return kidMatch; /* no point pursuing an empty match forever */
\r
1426 return matchNonGreedyKid(state, ren, kidCount, maxKid, kidMatch);
\r
1429 int matchRENodes(MatchState state, RENode ren, RENode stop, int index) {
\r
1431 char[] input = state.input;
\r
1432 while ((ren != stop) && (ren != null)) {
\r
1437 if (ren.next.op != REOP_ALT) {
\r
1438 ren = (RENode)ren.kid;
\r
1442 num = state.parenCount;
\r
1443 int kidMatch = matchRENodes(state, (RENode)ren.kid,
\r
1445 if (kidMatch != -1) return kidMatch;
\r
1446 for (int i = num; i < state.parenCount; i++)
\r
1447 state.parens[i] = null;
\r
1448 state.parenCount = num;
\r
1452 case REOP_QUANT: {
\r
1454 for (num = 0; num < ren.min; num++) {
\r
1455 int kidMatch = matchRENodes(state, (RENode)ren.kid,
\r
1457 if (kidMatch == -1)
\r
1464 if (num == ren.max)
\r
1465 // Have matched the exact count required,
\r
1466 // need to match the rest of the regexp.
\r
1468 if ((ren.flags & RENode.MINIMAL) == 0) {
\r
1469 int kidMatch = matchGreedyKid(state, ren, stop, num,
\r
1471 if (kidMatch == -1)
\r
1472 index = matchRENodes(state, (RENode)ren.kid,
\r
1478 index = matchNonGreedyKid(state, ren, num,
\r
1481 if (index == -1) return -1;
\r
1485 int kidMatch = matchRENodes(state, (RENode)ren.kid,
\r
1487 if (kidMatch == -1)
\r
1489 if ((ren.flags & RENode.MINIMAL) == 0) {
\r
1490 kidMatch = matchGreedyKid(state, ren, stop, 1,
\r
1492 if (kidMatch == -1)
\r
1493 index = matchRENodes(state,(RENode)ren.kid,
\r
1499 index = matchNonGreedyKid(state, ren, 1, 0, kidMatch);
\r
1500 if (index == -1) return -1;
\r
1504 if ((ren.flags & RENode.MINIMAL) == 0) {
\r
1505 int kidMatch = matchGreedyKid(state, ren, stop, 0, index, -1);
\r
1506 if (kidMatch != -1)
\r
1510 index = matchNonGreedyKid(state, ren, 0, 0, index);
\r
1511 if (index == -1) return -1;
\r
1515 int saveNum = state.parenCount;
\r
1516 if (((ren.flags & RENode.MINIMAL) != 0)) {
\r
1517 int restMatch = matchRENodes(state, ren.next,
\r
1519 if (restMatch != -1) return restMatch;
\r
1521 int kidMatch = matchRENodes(state, (RENode)ren.kid,
\r
1523 if (kidMatch == -1) {
\r
1524 state.parenCount = saveNum;
\r
1528 int restMatch = matchRENodes(state, ren.next,
\r
1530 if (restMatch == -1) {
\r
1531 // need to undo the result of running the kid
\r
1532 state.parenCount = saveNum;
\r
1539 case REOP_LPARENNON:
\r
1540 ren = (RENode)ren.kid;
\r
1542 case REOP_RPARENNON:
\r
1544 case REOP_LPAREN: {
\r
1546 ren = (RENode)ren.kid;
\r
1547 SubString parsub = state.parens[num];
\r
1548 if (parsub == null) {
\r
1549 parsub = state.parens[num] = new SubString();
\r
1550 parsub.charArray = input;
\r
1552 parsub.index = index;
\r
1553 parsub.length = 0;
\r
1554 if (num >= state.parenCount)
\r
1555 state.parenCount = num + 1;
\r
1558 case REOP_RPAREN: {
\r
1560 SubString parsub = state.parens[num];
\r
1561 if (parsub == null)
\r
1562 throw new RuntimeException("Paren problem");
\r
1563 parsub.length = index - parsub.index;
\r
1566 case REOP_ASSERT: {
\r
1567 int kidMatch = matchRENodes(state, (RENode)ren.kid,
\r
1569 if (kidMatch == -1) return -1;
\r
1572 case REOP_ASSERT_NOT: {
\r
1573 int kidMatch = matchRENodes(state, (RENode)ren.kid,
\r
1575 if (kidMatch != -1) return -1;
\r
1578 case REOP_BACKREF: {
\r
1580 if (num >= state.parens.length) {
\r
1581 Context.reportError(
\r
1582 ScriptRuntime.getMessage(
\r
1583 "msg.bad.backref", null));
\r
1586 SubString parsub = state.parens[num];
\r
1587 if (parsub == null)
\r
1588 parsub = state.parens[num] = new SubString();
\r
1589 int length = parsub.length;
\r
1590 for (int i = 0; i < length; i++, index++) {
\r
1591 if (index >= input.length) {
\r
1592 return state.noMoreInput();
\r
1594 if (!matchChar(state.flags, input[index],
\r
1595 parsub.charArray[parsub.index + i]))
\r
1601 if (index >= input.length) {
\r
1602 return state.noMoreInput();
\r
1604 if (ren.bitmap == null) {
\r
1605 char[] source = (ren.s != null)
\r
1607 : this.source.toCharArray();
\r
1608 ren.buildBitmap(state, source, ((state.flags & FOLD) != 0));
\r
1610 char c = input[index];
\r
1611 int b = (c >>> 3);
\r
1612 if (b >= ren.bmsize) {
\r
1613 if (ren.kid2 == -1) // a ^ class
\r
1620 if ((ren.bitmap[b] & bit) != 0)
\r
1627 if (index >= input.length) {
\r
1628 return state.noMoreInput();
\r
1630 if (input[index] != '\n')
\r
1635 case REOP_DOTSTARMIN: {
\r
1637 for (cp2 = index; cp2 < input.length; cp2++) {
\r
1638 int cp3 = matchRENodes(state, ren.next, stop, cp2);
\r
1639 if (cp3 != -1) return cp3;
\r
1640 if (input[cp2] == '\n')
\r
1643 return state.noMoreInput();
\r
1645 case REOP_DOTSTAR: {
\r
1647 for (cp2 = index; cp2 < input.length; cp2++)
\r
1648 if (input[cp2] == '\n')
\r
1650 while (cp2 >= index) {
\r
1651 int cp3 = matchRENodes(state, ren.next, stop, cp2);
\r
1661 if (index == 0 || !isWord(input[index-1])) {
\r
1662 if (index >= input.length)
\r
1663 return state.noMoreInput();
\r
1664 if (!isWord(input[index]))
\r
1668 if (index < input.length && isWord(input[index]))
\r
1672 case REOP_WNONBDRY:
\r
1673 if (index == 0 || !isWord(input[index-1])) {
\r
1674 if (index < input.length && isWord(input[index]))
\r
1678 if (index >= input.length)
\r
1679 return state.noMoreInput();
\r
1680 if (!isWord(input[index]))
\r
1684 case REOP_EOLONLY:
\r
1686 if (index == input.length)
\r
1689 Context cx = Context.getCurrentContext();
\r
1690 RegExpImpl reImpl = getImpl(cx);
\r
1691 if ((reImpl.multiline)
\r
1692 || ((state.flags & MULTILINE) != 0))
\r
1693 if (input[index] == '\n')
\r
1703 Context cx = Context.getCurrentContext();
\r
1704 RegExpImpl reImpl = getImpl(cx);
\r
1706 if (reImpl.multiline ||
\r
1707 ((state.flags & MULTILINE) != 0)) {
\r
1708 if (index >= input.length) {
\r
1709 return state.noMoreInput();
\r
1711 if (input[index - 1] == '\n') {
\r
1721 if (index >= input.length) {
\r
1722 return state.noMoreInput();
\r
1724 if (!isDigit(input[index])) return -1;
\r
1727 case REOP_NONDIGIT:
\r
1728 if (index >= input.length) {
\r
1729 return state.noMoreInput();
\r
1731 if (isDigit(input[index])) return -1;
\r
1735 if (index >= input.length) {
\r
1736 return state.noMoreInput();
\r
1738 if (!isWord(input[index])) return -1;
\r
1741 case REOP_NONALNUM:
\r
1742 if (index >= input.length) {
\r
1743 return state.noMoreInput();
\r
1745 if (isWord(input[index])) return -1;
\r
1749 if (index >= input.length) {
\r
1750 return state.noMoreInput();
\r
1752 if (!(TokenStream.isJSSpace(input[index]) ||
\r
1753 TokenStream.isJSLineTerminator(input[index])))
\r
1757 case REOP_NONSPACE:
\r
1758 if (index >= input.length) {
\r
1759 return state.noMoreInput();
\r
1761 if (TokenStream.isJSSpace(input[index]) ||
\r
1762 TokenStream.isJSLineTerminator(input[index]))
\r
1767 if (index >= input.length) {
\r
1768 return state.noMoreInput();
\r
1770 if (!matchChar(state.flags, ren.chr, input[index]))
\r
1775 char[] source = (ren.s != null)
\r
1777 : this.source.toCharArray();
\r
1778 int start = ((Integer)ren.kid).intValue();
\r
1779 int length = ren.kid2 - start;
\r
1780 for (int i = 0; i < length; i++, index++) {
\r
1781 if (index >= input.length) {
\r
1782 return state.noMoreInput();
\r
1784 if (!matchChar(state.flags, input[index],
\r
1785 source[start + i]))
\r
1795 throw new RuntimeException("Unsupported by node matcher");
\r
1802 int matchRegExp(MatchState state, RENode ren, int index) {
\r
1803 // have to include the position beyond the last character
\r
1804 // in order to detect end-of-input/line condition
\r
1805 for (int i = index; i <= state.input.length; i++) {
\r
1806 state.skipped = i - index;
\r
1807 state.parenCount = 0;
\r
1808 int result = matchRENodes(state, ren, null, i);
\r
1816 * indexp is assumed to be an array of length 1
\r
1818 Object executeRegExp(Context cx, Scriptable scopeObj, RegExpImpl res,
\r
1819 String str, int indexp[], int matchType)
\r
1821 NativeRegExp re = this;
\r
1823 * Initialize a CompilerState to minimize recursive argument traffic.
\r
1825 MatchState state = new MatchState();
\r
1826 state.inputExhausted = false;
\r
1827 state.anchoring = false;
\r
1828 state.flags = re.flags;
\r
1829 state.scope = scopeObj;
\r
1831 char[] charArray = str.toCharArray();
\r
1832 int start = indexp[0];
\r
1833 if (start > charArray.length)
\r
1834 start = charArray.length;
\r
1835 int index = start;
\r
1836 state.cpbegin = 0;
\r
1837 state.cpend = charArray.length;
\r
1838 state.start = start;
\r
1839 state.skipped = 0;
\r
1840 state.input = charArray;
\r
1842 state.parenCount = 0;
\r
1843 state.maybeParens = new SubString[re.parenCount];
\r
1844 state.parens = new SubString[re.parenCount];
\r
1845 // We allocate the elements of "parens" and "maybeParens" lazily in
\r
1846 // the Java port since we don't have arenas.
\r
1849 * Call the recursive matcher to do the real work. Return null on mismatch
\r
1850 * whether testing or not. On match, return an extended Array object.
\r
1852 index = matchRegExp(state, ren, index);
\r
1853 if (index == -1) {
\r
1854 if (matchType != PREFIX || !state.inputExhausted) return null;
\r
1855 return Undefined.instance;
\r
1857 int i = index - state.cpbegin;
\r
1859 int matchlen = i - (start + state.skipped);
\r
1861 index -= matchlen;
\r
1865 if (matchType == TEST) {
\r
1867 * Testing for a match and updating cx.regExpImpl: don't allocate
\r
1868 * an array object, do return true.
\r
1870 result = Boolean.TRUE;
\r
1875 * The array returned on match has element 0 bound to the matched
\r
1876 * string, elements 1 through state.parenCount bound to the paren
\r
1877 * matches, an index property telling the length of the left context,
\r
1878 * and an input property referring to the input string.
\r
1880 Scriptable scope = getTopLevelScope(scopeObj);
\r
1881 result = ScriptRuntime.newObject(cx, scope, "Array", null);
\r
1882 obj = (Scriptable) result;
\r
1884 String matchstr = new String(charArray, index, matchlen);
\r
1885 obj.put(0, obj, matchstr);
\r
1888 if (state.parenCount > re.parenCount)
\r
1889 throw new RuntimeException();
\r
1890 if (state.parenCount == 0) {
\r
1891 res.parens.setSize(0);
\r
1892 res.lastParen = SubString.emptySubString;
\r
1894 SubString parsub = null;
\r
1896 res.parens.setSize(state.parenCount);
\r
1897 for (num = 0; num < state.parenCount; num++) {
\r
1898 parsub = state.parens[num];
\r
1899 res.parens.setElementAt(parsub, num);
\r
1900 if (matchType == TEST) continue;
\r
1901 String parstr = parsub == null ? "": parsub.toString();
\r
1902 obj.put(num+1, obj, parstr);
\r
1904 res.lastParen = parsub;
\r
1907 if (! (matchType == TEST)) {
\r
1909 * Define the index and input properties last for better for/in loop
\r
1910 * order (so they come after the elements).
\r
1912 obj.put("index", obj, new Integer(start + state.skipped));
\r
1913 obj.put("input", obj, str);
\r
1916 if (res.lastMatch == null) {
\r
1917 res.lastMatch = new SubString();
\r
1918 res.leftContext = new SubString();
\r
1919 res.rightContext = new SubString();
\r
1921 res.lastMatch.charArray = charArray;
\r
1922 res.lastMatch.index = index;
\r
1923 res.lastMatch.length = matchlen;
\r
1925 res.leftContext.charArray = charArray;
\r
1926 if (cx.getLanguageVersion() == Context.VERSION_1_2) {
\r
1928 * JS1.2 emulated Perl4.0.1.8 (patch level 36) for global regexps used
\r
1929 * in scalar contexts, and unintentionally for the string.match "list"
\r
1930 * psuedo-context. On "hi there bye", the following would result:
\r
1932 * Language while(/ /g){print("$`");} s/ /$`/g
\r
1933 * perl4.036 "hi", "there" "hihitherehi therebye"
\r
1934 * perl5 "hi", "hi there" "hihitherehi therebye"
\r
1935 * js1.2 "hi", "there" "hihitheretherebye"
\r
1937 * Insofar as JS1.2 always defined $` as "left context from the last
\r
1938 * match" for global regexps, it was more consistent than perl4.
\r
1940 res.leftContext.index = start;
\r
1941 res.leftContext.length = state.skipped;
\r
1944 * For JS1.3 and ECMAv2, emulate Perl5 exactly:
\r
1946 * js1.3 "hi", "hi there" "hihitherehi therebye"
\r
1948 res.leftContext.index = 0;
\r
1949 res.leftContext.length = start + state.skipped;
\r
1952 res.rightContext.charArray = charArray;
\r
1953 res.rightContext.index = ep;
\r
1954 res.rightContext.length = state.cpend - ep;
\r
1959 public byte getFlags() {
\r
1963 private void reportError(String msg, String arg, CompilerState state) {
\r
1964 Object[] args = { arg };
\r
1965 throw NativeGlobal.constructError(
\r
1966 state.cx, "SyntaxError",
\r
1967 ScriptRuntime.getMessage(msg, args),
\r
1971 protected int getIdDefaultAttributes(int id) {
\r
1973 case Id_lastIndex:
\r
1974 return ScriptableObject.PERMANENT;
\r
1977 case Id_ignoreCase:
\r
1978 case Id_multiline:
\r
1979 return ScriptableObject.PERMANENT | ScriptableObject.READONLY;
\r
1981 return super.getIdDefaultAttributes(id);
\r
1984 protected Object getIdValue(int id) {
\r
1986 case Id_lastIndex: return wrap_long(0xffffffffL & lastIndex);
\r
1987 case Id_source: return source;
\r
1988 case Id_global: return wrap_boolean((flags & GLOB) != 0);
\r
1989 case Id_ignoreCase: return wrap_boolean((flags & FOLD) != 0);
\r
1990 case Id_multiline: return wrap_boolean((flags & MULTILINE) != 0);
\r
1992 return super.getIdValue(id);
\r
1995 protected void setIdValue(int id, Object value) {
\r
1996 if (id == Id_lastIndex) {
\r
1997 setLastIndex(ScriptRuntime.toInt32(value));
\r
2000 super.setIdValue(id, value);
\r
2003 void setLastIndex(int value) {
\r
2004 lastIndex = value;
\r
2007 public int methodArity(int methodId) {
\r
2008 if (prototypeFlag) {
\r
2009 switch (methodId) {
\r
2010 case Id_compile: return 1;
\r
2011 case Id_toString: return 0;
\r
2012 case Id_exec: return 1;
\r
2013 case Id_test: return 1;
\r
2014 case Id_prefix: return 1;
\r
2017 return super.methodArity(methodId);
\r
2020 public Object execMethod(int methodId, IdFunction f, Context cx,
\r
2021 Scriptable scope, Scriptable thisObj,
\r
2023 throws JavaScriptException
\r
2025 if (prototypeFlag) {
\r
2026 switch (methodId) {
\r
2028 return realThis(thisObj, f, false).compile(cx, scope, args);
\r
2031 return realThis(thisObj, f, true).toString();
\r
2034 return realThis(thisObj, f, false).exec(cx, scope, args);
\r
2037 return realThis(thisObj, f, false).test(cx, scope, args);
\r
2040 return realThis(thisObj, f, false).prefix(cx, scope, args);
\r
2043 return super.execMethod(methodId, f, cx, scope, thisObj, args);
\r
2046 private NativeRegExp realThis(Scriptable thisObj, IdFunction f,
\r
2049 while (!(thisObj instanceof NativeRegExp)) {
\r
2050 thisObj = nextInstanceCheck(thisObj, f, readOnly);
\r
2052 return (NativeRegExp)thisObj;
\r
2055 protected String getIdName(int id) {
\r
2057 case Id_lastIndex: return "lastIndex";
\r
2058 case Id_source: return "source";
\r
2059 case Id_global: return "global";
\r
2060 case Id_ignoreCase: return "ignoreCase";
\r
2061 case Id_multiline: return "multiline";
\r
2064 if (prototypeFlag) {
\r
2066 case Id_compile: return "compile";
\r
2067 case Id_toString: return "toString";
\r
2068 case Id_exec: return "exec";
\r
2069 case Id_test: return "test";
\r
2070 case Id_prefix: return "prefix";
\r
2076 protected int maxInstanceId() { return MAX_INSTANCE_ID; }
\r
2078 // #string_id_map#
\r
2080 private static final int
\r
2084 Id_ignoreCase = 4,
\r
2087 MAX_INSTANCE_ID = 5;
\r
2089 protected int mapNameToId(String s) {
\r
2091 // #generated# Last update: 2001-05-24 12:01:22 GMT+02:00
\r
2092 L0: { id = 0; String X = null; int c;
\r
2093 int s_length = s.length();
\r
2094 if (s_length==6) {
\r
2096 if (c=='g') { X="global";id=Id_global; }
\r
2097 else if (c=='s') { X="source";id=Id_source; }
\r
2099 else if (s_length==9) {
\r
2101 if (c=='l') { X="lastIndex";id=Id_lastIndex; }
\r
2102 else if (c=='m') { X="multiline";id=Id_multiline; }
\r
2104 else if (s_length==10) { X="ignoreCase";id=Id_ignoreCase; }
\r
2105 if (X!=null && X!=s && !X.equals(s)) id = 0;
\r
2108 // #/string_id_map#
\r
2110 if (id != 0 || !prototypeFlag) { return id; }
\r
2112 // #string_id_map#
\r
2113 // #generated# Last update: 2001-05-24 12:01:22 GMT+02:00
\r
2114 L0: { id = 0; String X = null; int c;
\r
2115 L: switch (s.length()) {
\r
2116 case 4: c=s.charAt(0);
\r
2117 if (c=='e') { X="exec";id=Id_exec; }
\r
2118 else if (c=='t') { X="test";id=Id_test; }
\r
2120 case 6: X="prefix";id=Id_prefix; break L;
\r
2121 case 7: X="compile";id=Id_compile; break L;
\r
2122 case 8: X="toString";id=Id_toString; break L;
\r
2124 if (X!=null && X!=s && !X.equals(s)) id = 0;
\r
2130 private static final int
\r
2131 Id_compile = MAX_INSTANCE_ID + 1,
\r
2132 Id_toString = MAX_INSTANCE_ID + 2,
\r
2133 Id_exec = MAX_INSTANCE_ID + 3,
\r
2134 Id_test = MAX_INSTANCE_ID + 4,
\r
2135 Id_prefix = MAX_INSTANCE_ID + 5,
\r
2137 MAX_PROTOTYPE_ID = MAX_INSTANCE_ID + 5;
\r
2139 // #/string_id_map#
\r
2140 private boolean prototypeFlag;
\r
2142 private String source; /* locked source string, sans // */
\r
2143 private int lastIndex; /* index after last match, for //g iterator */
\r
2144 private int parenCount; /* number of parenthesized submatches */
\r
2145 private byte flags; /* flags */
\r
2146 private byte[] program; /* regular expression bytecode */
\r
2151 class CompilerState {
\r
2152 CompilerState(String source, int flags, Context cx, Scriptable scope) {
\r
2153 this.source = source.toCharArray();
\r
2154 this.scope = scope;
\r
2155 this.flags = flags;
\r
2171 public static final int ANCHORED = 0x01; /* anchored at the front */
\r
2172 public static final int SINGLE = 0x02; /* matches a single char */
\r
2173 public static final int NONEMPTY = 0x04; /* does not match empty string */
\r
2174 public static final int ISNEXT = 0x08; /* ren is next after at least one node */
\r
2175 public static final int GOODNEXT = 0x10; /* ren.next is a tree-like edge in the graph */
\r
2176 public static final int ISJOIN = 0x20; /* ren is a join point in the graph */
\r
2177 public static final int REALLOK = 0x40; /* REOP_FLAT owns tempPool space to realloc */
\r
2178 public static final int MINIMAL = 0x80; /* un-greedy matching for ? * + {} */
\r
2180 RENode(CompilerState state, byte op, Object kid) {
\r
2185 private void calcBMSize(char[] s, int index, int cp2, boolean fold) {
\r
2187 while (index < cp2) {
\r
2188 char c = s[index++];
\r
2190 if (index + 5 <= cp2 && s[index] == 'u'
\r
2191 && NativeRegExp.isHex(s[index+1])
\r
2192 && NativeRegExp.isHex(s[index+2])
\r
2193 && NativeRegExp.isHex(s[index+3])
\r
2194 && NativeRegExp.isHex(s[index+4]))
\r
2196 int x = (((((NativeRegExp.unHex(s[index+0]) << 4) +
\r
2197 NativeRegExp.unHex(s[index+1])) << 4) +
\r
2198 NativeRegExp.unHex(s[index+2])) << 4) +
\r
2199 NativeRegExp.unHex(s[index+3]);
\r
2204 * Octal and hex escapes can't be > 255. Skip this
\r
2205 * backslash and let the loop pass over the remaining
\r
2206 * escape sequence as if it were text to match.
\r
2208 if (maxc < 255) maxc = 255;
\r
2214 * Don't assume that lowercase are above uppercase, or
\r
2215 * that c is either even when c has upper and lowercase
\r
2219 if ((c2 = Character.toUpperCase(c)) > maxc)
\r
2221 if ((c2 = Character.toLowerCase(c2)) > maxc)
\r
2227 bmsize = (short)((maxc + NativeRegExp.JS_BITS_PER_BYTE)
\r
2228 / NativeRegExp.JS_BITS_PER_BYTE);
\r
2231 private void matchBit(char c, int fill) {
\r
2233 byte b = (byte) (c & 7);
\r
2234 b = (byte) (1 << b);
\r
2241 private void checkRange(char lastc, int fill) {
\r
2242 matchBit(lastc, fill);
\r
2243 matchBit('-', fill);
\r
2246 void buildBitmap(MatchState state, char[] s, boolean fold) {
\r
2247 int index = ((Integer) kid).intValue();
\r
2252 boolean not = false;
\r
2254 if (s[index] == '^') {
\r
2260 calcBMSize(s, index, end, fold);
\r
2261 bitmap = new byte[bmsize];
\r
2263 fill = (byte)0xff;
\r
2264 for (i = 0; i < bmsize; i++)
\r
2265 bitmap[i] = (byte)0xff;
\r
2266 bitmap[0] = (byte)0xfe;
\r
2268 int nchars = bmsize * NativeRegExp.JS_BITS_PER_BYTE;
\r
2269 char lastc = (char)nchars;
\r
2270 boolean inrange = false;
\r
2272 while (index < end) {
\r
2273 char c = s[index++];
\r
2283 c = NativeRegExp.getEscape(c);
\r
2288 checkRange(lastc, fill);
\r
2289 lastc = (char) nchars;
\r
2290 for (c = '0'; c <= '9'; c++)
\r
2291 matchBit(c, fill);
\r
2296 checkRange(lastc, fill);
\r
2297 lastc = (char) nchars;
\r
2298 for (c = 0; c < '0'; c++)
\r
2299 matchBit(c, fill);
\r
2300 for (c = '9' + 1; c < nchars; c++)
\r
2301 matchBit(c, fill);
\r
2306 checkRange(lastc, fill);
\r
2307 lastc = (char) nchars;
\r
2308 for (c = 0; c < nchars; c++)
\r
2309 if (NativeRegExp.isWord(c))
\r
2310 matchBit(c, fill);
\r
2315 checkRange(lastc, fill);
\r
2316 lastc = (char) nchars;
\r
2317 for (c = 0; c < nchars; c++)
\r
2318 if (!NativeRegExp.isWord(c))
\r
2319 matchBit(c, fill);
\r
2324 checkRange(lastc, fill);
\r
2325 lastc = (char) nchars;
\r
2326 for (c = 0; c < nchars; c++)
\r
2327 if (Character.isWhitespace(c))
\r
2328 matchBit(c, fill);
\r
2333 checkRange(lastc, fill);
\r
2334 lastc = (char) nchars;
\r
2335 for (c = 0; c < nchars; c++)
\r
2336 if (!Character.isWhitespace(c))
\r
2337 matchBit(c, fill);
\r
2348 n = NativeRegExp.unDigit(c);
\r
2351 if ('0' <= c && c <= '7') {
\r
2353 n = 8 * n + NativeRegExp.unDigit(c);
\r
2356 if ('0' <= c && c <= '7') {
\r
2358 i = 8 * n + NativeRegExp.unDigit(c);
\r
2370 if (index < s.length &&
\r
2371 NativeRegExp.isHex(c = s[index++]))
\r
2373 n = NativeRegExp.unHex(c);
\r
2374 if (index < s.length &&
\r
2375 NativeRegExp.isHex(c = s[index++]))
\r
2378 n += NativeRegExp.unHex(c);
\r
2381 index = ocp; /* \xZZ is xZZ (Perl does \0ZZ!) */
\r
2388 if (s.length > index+3
\r
2389 && NativeRegExp.isHex(s[index+0])
\r
2390 && NativeRegExp.isHex(s[index+1])
\r
2391 && NativeRegExp.isHex(s[index+2])
\r
2392 && NativeRegExp.isHex(s[index+3])) {
\r
2393 n = (((((NativeRegExp.unHex(s[index+0]) << 4) +
\r
2394 NativeRegExp.unHex(s[index+1])) << 4) +
\r
2395 NativeRegExp.unHex(s[index+2])) << 4) +
\r
2396 NativeRegExp.unHex(s[index+3]);
\r
2404 c = Character.toUpperCase(c);
\r
2405 c = (char) (c ^ 64); // JS_TOCTRL
\r
2412 throw NativeGlobal.constructError(
\r
2413 Context.getCurrentContext(), "RangeError",
\r
2414 ScriptRuntime.getMessage(
\r
2415 "msg.bad.range", null),
\r
2420 // Set lastc so we match just c's bit in the for loop.
\r
2424 if (index + 1 < end && s[index] == '-' &&
\r
2425 s[index+1] != ']')
\r
2433 // Match characters in the range [lastc, c].
\r
2434 for (; lastc <= c; lastc++) {
\r
2435 matchBit(lastc, fill);
\r
2438 * Must do both upper and lower for Turkish dotless i,
\r
2441 char foldc = Character.toUpperCase(lastc);
\r
2442 matchBit(foldc, fill);
\r
2443 foldc = Character.toLowerCase(foldc);
\r
2444 matchBit(foldc, fill);
\r
2450 byte op; /* packed r.e. op bytecode */
\r
2451 byte flags; /* flags, see below */
\r
2452 short offset; /* bytecode offset */
\r
2453 RENode next; /* next in concatenation order */
\r
2454 Object kid; /* first operand */
\r
2455 int kid2; /* second operand */
\r
2456 int num; /* could be a number */
\r
2457 char chr; /* or a char */
\r
2458 short min,max; /* or a range */
\r
2459 short kidlen; /* length of string at kid, in chars */
\r
2460 short bmsize; /* bitmap size, based on max char code */
\r
2461 char[] s; /* if null, use state.source */
\r
2462 byte[] bitmap; /* cclass bitmap */
\r
2466 class MatchState {
\r
2467 boolean inputExhausted; /* did we run out of input chars ? */
\r
2468 boolean anchoring; /* true if multiline anchoring ^/$ */
\r
2469 int pcend; /* pc limit (fencepost) */
\r
2470 int cpbegin, cpend; /* cp base address and limit */
\r
2471 int start; /* offset from cpbegin to start at */
\r
2472 int skipped; /* chars skipped anchoring this r.e. */
\r
2473 byte flags; /* pennants */
\r
2474 int parenCount; /* number of paren substring matches */
\r
2475 SubString[] maybeParens; /* possible paren substring pointers */
\r
2476 SubString[] parens; /* certain paren substring matches */
\r
2480 public int noMoreInput() {
\r
2481 inputExhausted = true;
\r
2484 throw new Exception();
\r
2486 catch (Exception e) {
\r
2487 e.printStackTrace();
\r
2494 class GreedyState {
\r