X-Git-Url: http://git.megacz.com/?p=sbp.git;a=blobdiff_plain;f=src%2Fedu%2Fberkeley%2Fsbp%2FParser.java;h=e5cd67cbeccd21cf65ef61ae9356a49486cb0dd9;hp=34d7da98b59eccf9d185b3901816f69a5ca1b25c;hb=24219bdf084b45273e869cd19382d1640b396566;hpb=2c05c84a714f54b3bc026f51416492ddb13f33b1 diff --git a/src/edu/berkeley/sbp/Parser.java b/src/edu/berkeley/sbp/Parser.java index 34d7da9..e5cd67c 100644 --- a/src/edu/berkeley/sbp/Parser.java +++ b/src/edu/berkeley/sbp/Parser.java @@ -1,209 +1,370 @@ -// Copyright 2006 all rights reserved; see LICENSE file for BSD-style license +// Copyright 2006-2007 all rights reserved; see LICENSE file for BSD-style license package edu.berkeley.sbp; -import edu.berkeley.sbp.*; import edu.berkeley.sbp.util.*; -import edu.berkeley.sbp.Sequence.Position; +import edu.berkeley.sbp.Sequence.Pos; +import edu.berkeley.sbp.Sequence.Pos; import java.io.*; import java.util.*; /** a parser which translates an Input<Token> into a Forest<NodeType> */ -public abstract class Parser { +public abstract class Parser implements Serializable { - protected final Table pt; + final Table pt; /** create a parser to parse the grammar with start symbol u */ - public Parser(Union u, Topology top) { this.pt = new Table(u, top); } - Parser(Table pt) { this.pt = pt; } + public Parser(Union u) { this.pt = new Table(u); } /** implement this method to create the output forest corresponding to a lone shifted input token */ - public abstract Forest shiftToken(Token t, Input.Location newloc); + public abstract Forest shiftToken(Token t, Input.Region region); - boolean helpgc = true; + public abstract Topology emptyTopology(); public String toString() { return pt.toString(); } /** parse input, and return the shared packed parse forest (or throw an exception) */ public Forest parse(Input input) throws IOException, ParseFailed { - GSS gss = new GSS(input); - Input.Location loc = input.getLocation(); - Token tok = input.next(); - GSS.Phase current = gss.new Phase(null, this, null, tok, loc, input.getLocation(), null); - current.newNode(null, Forest.create(loc.createRegion(loc), null, null, false), pt.start, true); - int count = 1; - for(int idx=0;;idx++) { - Input.Location oldloc = loc; - current.reduce(); - Forest forest = current.token==null ? null : shiftToken((Token)current.token, loc); - loc = input.getLocation(); - Token nextToken = input.next(); - GSS.Phase next = gss.new Phase(current, this, current, nextToken, loc, input.getLocation(), forest); - if (!helpgc) { - FileOutputStream fos = new FileOutputStream("out-"+idx+".dot"); - PrintWriter p = new PrintWriter(new OutputStreamWriter(fos)); - GraphViz gv = new GraphViz(); - for(Object n : next) - ((GSS.Phase.Node)n).toGraphViz(gv); - gv.dump(p); - p.flush(); - p.close(); + long start = System.currentTimeMillis(); + verbose = System.getProperty("sbp.verbose", null) != null; + spinpos = 0; + GSS gss = new GSS(input, this); + int idmax = 0; + int[][] count = new int[1024*1024][]; + HashMap ids = new HashMap(); + try { + for(GSS.Phase current = gss.new Phase(pt.start); ;) { + if (verbose) debug(current.token, gss, input); + if (current.isDone()) return (Forest)current.finalResult; + Input.Region region = current.getLocation().createRegion(current.getNextLocation()); + Forest forest = shiftToken((Token)current.token, region); + /* + int maxid = 0; + for(Reduction r : gss.finishedReductions) + if (ids.get(r.reduction())==null) + ids.put(r.reduction(), idmax++); + count[current.pos] = new int[idmax]; + for(Reduction r : gss.finishedReductions) + count[current.pos][ids.get(r.reduction())]++; + */ + current = gss.new Phase(current, forest); } - count = next.size(); - if (current.isDone()) return (Forest)gss.finalResult; - current = next; + } finally { + if (verbose) { + long time = System.currentTimeMillis() - start; + System.err.println("\r parse time: " + time +"ms "+ ANSI.clreol()); + debug(null, gss, input); + } + /* + PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream("out.plot"))); + boolean[] use = new boolean[idmax]; + for(int i=0; i20) + use[j] = true; + for(int i=0; i=count[i].length ? 0 : count[i][j])); + } + pw.println(); + } + pw.close(); + pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream("test.plot"))); + pw.println("set terminal postscript enhanced color"); + pw.println("set output \"out.ps\""); + pw.println("set pm3d map"); + pw.println("set autoscale"); + pw.println("set view 0,0"); + pw.println("set ytics (\\"); + int q = -1; + for(int j=0; j extends Walk.Cache { - - public String toString() { - StringBuffer sb = new StringBuffer(); - sb.append("parse table"); - for(State state : all_states.values()) { - sb.append(" " + state + "\n"); - for(Topology t : state.shifts) { - sb.append(" shift \""+ - new edu.berkeley.sbp.chr.CharTopology((IntegerTopology)t)+"\" => "); - for(State st : state.shifts.getAll(t)) - sb.append(st.idx+" "); - sb.append("\n"); + private int _last = -1; + private String buf = ""; + private void debug(Object t, GSS gss, Input input) { + //FIXME + int c = t==null ? -1 : ((t+"").charAt(0)); + int last = _last; + _last = c; + switch(c) { + case edu.berkeley.sbp.chr.CharAtom.left: + buf += "\033[31m>\033[0m"; + break; + case edu.berkeley.sbp.chr.CharAtom.right: + buf += "\033[31m<\033[0m"; + break; + case -1: // FIXME + case '\n': + if (verbose) { + if (last==' ') buf += ANSI.blue("\\n"); + System.err.println("\r"+ANSI.clreol()+"\r"+buf); + buf = ""; } - for(Topology t : state.reductions) - sb.append(" reduce \""+ - new edu.berkeley.sbp.chr.CharTopology((IntegerTopology)t)+"\" => " + - state.reductions.getAll(t) + "\n"); - } - return sb.toString(); + break; + default: + buf += ANSI.cyan(""+((char)c)); + break; } + if (t==null) return; + + // FIXME: clean this up + String s; + s = " " + spin[spinpos++ % (spin.length)]+" parsing "; + s += input.getName(); + s += " "+input.getLocation(); + while(s.indexOf(':') != -1 && s.indexOf(':') < 8) s = " " + s; + String y = "@"+gss.viewPos+" "; + while(y.length() < 9) y = " " + y; + s += y; + s += " nodes="+gss.numOldNodes; + while(s.length() < 50) s = s + " "; + s += " shifted="+gss.numNewNodes; + while(s.length() < 60) s = s + " "; + s += " reductions="+gss.numReductions; + while(s.length() < 78) s = s + " "; + System.err.print("\r"+ANSI.invert(s+ANSI.clreol())+"\r"); + } - public final Walk.Cache cache = this; + // Table ////////////////////////////////////////////////////////////////////////////// - private void walk(Element e, HashSet hs) { - if (e==null) return; - if (hs.contains(e)) return; - hs.add(e); - if (e instanceof Atom) return; - for(Sequence s : (Union)e) - walk(s, hs); - } - private void walk(Sequence s, HashSet hs) { - hs.add(s); - for(Position p = s.firstp(); p != null; p = p.next()) - walk(p.element(), hs); - for(Sequence ss : s.needs()) walk(ss, hs); - for(Sequence ss : s.hates()) walk(ss, hs); - } + /** an SLR(1) parse table which may contain conflicts */ + class Table implements Serializable { /** the start state */ - public final State start; + final State start; - /** the state from which no reductions can be done */ + /** a dummy state from which no reductions can be performed */ private final State dead_state; /** used to generate unique values for State.idx */ private int master_state_idx = 0; - HashMap,State> all_states = new HashMap,State>(); + + /** all the states for this table */ + private transient HashSet> all_states = new HashSet>(); + + /** all the doomed states in this table */ + private transient HashMap,State> doomed_states = new HashMap,State>(); + + /** all the non-doomed states in this table */ + private transient HashMap,State> normal_states = new HashMap,State>(); /** construct a parse table for the given grammar */ - public Table(Topology top) { this("s", top); } - public Table(String startSymbol, Topology top) { this(new Union(startSymbol), top); } - public Table(Union ux, Topology top) { - Union start0 = new Union("0"); - start0.add(new Sequence.Singleton(ux)); - - for(Sequence s : start0) cache.eof.put(s, true); - cache.eof.put(start0, true); - - // construct the set of states - HashSet all_elements = new HashSet(); - walk(start0, all_elements); - for(SequenceOrElement e : all_elements) - cache.ys.addAll(e, new Walk.YieldSet(e, cache).walk()); - HashSet hp = new HashSet(); - reachable(start0, hp); - - this.dead_state = new State(new HashSet(), all_states, all_elements); - this.start = new State(hp, all_states, all_elements); + Table(Union ux) { + Union rootUnion = new Union("0", Sequence.create(ux), true); + Grammar grammar = new Grammar(rootUnion) { + public Topology emptyTopology() { return Parser.this.emptyTopology(); } + }; + // create the "dead state" + this.dead_state = new State(new HashSet(), true, grammar); + + // construct the start state; this will recursively create *all* the states + this.start = new State(reachable(rootUnion), false, grammar); + + buildReductions(grammar); + sortReductions(grammar); + } + + /** fill in the reductions table */ + private void buildReductions(Grammar grammar) { // for each state, fill in the corresponding "row" of the parse table - for(State state : all_states.values()) - for(Position p : state.hs) { - - // the Grammar's designated "last position" is the only accepting state - if (start0.contains(p.owner()) && p.next()==null) - state.accept = true; - - if (isRightNullable(p)) { - Walk.Follow wf = new Walk.Follow(top.empty(), p.owner(), all_elements, cache); - Topology follow = wf.walk(p.owner()); - for(Position p2 = p; p2 != null && p2.element() != null; p2 = p2.next()) - follow = follow.intersect(new Walk.Follow(top.empty(), p2.element(), all_elements, cache).walk(p2.element())); - state.reductions.put(follow, p); - if (wf.includesEof()) state.eofReductions.add(p); - } + for(State state : all_states) + for(Pos p : state.hs) { // if the element following this position is an atom, copy the corresponding // set of rows out of the "master" goto table and into this state's shift table if (p.element() != null && p.element() instanceof Atom) state.shifts.addAll(state.gotoSetTerminals.subset(((Atom)p.element()).getTokenTopology())); + + // RNGLR: we can potentially reduce from any "right-nullable" position -- that is, + // any position for which all Elements after it in the Sequence are capable of + // matching the empty string. + if (!grammar.isRightNullable(p)) continue; + Topology follow = grammar.follow(p.owner()); + for(Pos p2 = p; p2 != null && p2.element() != null; p2 = p2.next()) { + if (!(p2.element() instanceof Union)) + throw new Error("impossible -- only Unions can be nullable"); + + // interesting RNGLR-followRestriction interaction: we must intersect + // not just the follow-set of the last non-nullable element, but the + // follow-sets of the nulled elements as well. + for(Sequence s : ((Union)p2.element())) + follow = follow.intersect(grammar.follow(s)); + Topology set = grammar.epsilonFollowSet((Union)p2.element()); + if (set != null) follow = follow.intersect(set); + } + + // indicate that when the next token is in the set "follow", nodes in this + // state should reduce according to Pos "p" + state.reductions.put(follow, p); + if (grammar.followEof.contains(p.owner())) state.eofReductions.add(p); } - if (top instanceof IntegerTopology) - for(State state : all_states.values()) { - state.oreductions = state.reductions.optimize(((IntegerTopology)top).functor()); - state.oshifts = state.shifts.optimize(((IntegerTopology)top).functor()); + + // optimize the reductions table + if (emptyTopology() instanceof IntegerTopology) + for(State state : all_states) { + // FIXME: this is pretty ugly + state.oreductions = state.reductions.optimize(((IntegerTopology)emptyTopology()).functor()); + state.oshifts = state.shifts.optimize(((IntegerTopology)emptyTopology()).functor()); } } - private boolean isRightNullable(Position p) { - if (p.isLast()) return true; - if (!possiblyEpsilon(p.element())) return false; - return isRightNullable(p.next()); - } + // FIXME: this method needs to be cleaned up and documented + private void sortReductions(Grammar grammar) { + // crude algorithm to assing an ordinal ordering to every position + // al will be sorted in DECREASING order (al[0] >= al[1]) + ArrayList al = new ArrayList(); + for(State s : all_states) { + for(Object po : s.positions()) { + Sequence.Pos p = (Sequence.Pos)po; + if (al.contains(p)) continue; + int i=0; + for(; i 0) { + Sequence.Pos p = al.remove(j); + al.add(i, p); + continue OUTER; + } + break; + } - /** a single state in the LR table and the transitions possible from it */ + int j = 1; + int pk = 0; + for(int i=0; i 0) + { inc = true; break; } + } + inc = true; + if (inc) { + j++; + pk = i; + } + al.get(i).ord = j; + } + } - class State implements Comparable>, IntegerMappable, Iterable { + /** + * A single state in the LR table and the transitions + * possible from it + * + * A state corresponds to a set of Sequence.Pos's. Each + * StateNode in the GSS has a State; the StateNode represents a set of + * possible parses, one for each Pos in the State. + * + * Every state is either "doomed" or "normal". If a Pos + * is part of a Sequence which is a conjunct (that is, it was + * passed to Sequence.{and(),andnot()}), then that Pos + * will appear only in doomed States. Furthermore, any set + * of Positions reachable from a doomed State also forms a + * doomed State. Note that in this latter case, a doomed + * state might have exactly the same set of Positions as a + * non-doomed state. + * + * Nodes with non-doomed states represent nodes which + * contribute to actual valid parses. Nodes with doomed + * States exist for no other purpose than to enable/disable + * some future reduction from a non-doomed StateNode. Because of + * this, we "garbage-collect" Nodes with doomed states if + * there are no more non-doomed Nodes which they could + * affect (see Result, Reduction, and StateNode for details). + * + * Without this optimization, many seemingly-innocuous uses + * of positive and negative conjuncts can trigger O(n^2) + * space+time complexity in otherwise simple grammars. There + * is an example of this in the regression suite. + */ + class State implements IntegerMappable, Serializable { public final int idx = master_state_idx++; - private final HashSet hs; + private final transient HashSet hs; + public HashSet> conjunctStates = new HashSet>(); - public transient HashMap> gotoSetNonTerminals = new HashMap>(); - private transient TopologicalBag> gotoSetTerminals = new TopologicalBag>(); + HashMap> gotoSetNonTerminals = new HashMap>(); + private transient TopologicalBag> gotoSetTerminals = new TopologicalBag>(); - private TopologicalBag reductions = new TopologicalBag(); - private HashSet eofReductions = new HashSet(); - private TopologicalBag> shifts = new TopologicalBag>(); - private boolean accept = false; + TopologicalBag reductions = new TopologicalBag(); + HashSet eofReductions = new HashSet(); + private TopologicalBag> shifts = new TopologicalBag>(); + private boolean accept = false; - private VisitableMap> oshifts = null; - private VisitableMap oreductions = null; + private VisitableMap> oshifts = null; + private VisitableMap oreductions = null; + public final boolean doomed; // Interface Methods ////////////////////////////////////////////////////////////////////////////// - boolean isAccepting() { return accept; } - public Iterator iterator() { return hs.iterator(); } + public boolean doomed() { return doomed; } + boolean isAccepting() { return accept; } - boolean canShift(Token t) { return oshifts!=null && oshifts.contains(t); } - void invokeShifts(Token t, Invokable,B,C> irbc, B b, C c) { - oshifts.invoke(t, irbc, b, c); - } + Iterable positions() { return hs; } - boolean canReduce(Token t) { return oreductions != null && (t==null ? eofReductions.size()>0 : oreductions.contains(t)); } - void invokeReductions(Token t, Invokable irbc, B b, C c) { - if (t==null) for(Position r : eofReductions) irbc.invoke(r, b, c); - else oreductions.invoke(t, irbc, b, c); + boolean canShift(Token t) { return oshifts!=null && oshifts.contains(t); } + void invokeShifts(Token t, GSS.Phase phase, StateNode pred, Forest f) { oshifts.invoke(t, phase, pred, f); } + boolean canReduce(Token t) { + return oreductions != null && (t==null ? eofReductions.size()>0 : oreductions.contains(t)); } + void invokeEpsilonReductions(Token t, StateNode node) { + if (t==null) for(Pos r : eofReductions) node.invoke(r, null, null); + else oreductions.invoke(t, node, null, null); + } + void invokeReductions(Token t, StateNode node, Result b) { + if (t==null) for(Pos r : eofReductions) node.invoke(r, b, null); + else oreductions.invoke(t, node, b, null); } // Constructor ////////////////////////////////////////////////////////////////////////////// /** - * create a new state consisting of all the Positions in hs - * @param hs the set of Positions comprising this State - * @param all_states the set of states already constructed (to avoid recreating states) - * @param all_elements the set of all elements (Atom instances need not be included) + * create a new state consisting of all the Poss in hs + * @param hs the set of Poss comprising this State + * @param all the set of all elements (Atom instances need not be included) * * In principle these two steps could be merged, but they * are written separately to highlight these two facts: @@ -221,114 +382,130 @@ public abstract class Parser { * for non-Atom Elements. * */ - public State(HashSet hs, - HashMap,State> all_states, - HashSet all_elements) { + public State(HashSet hs, boolean doomed, Grammar grammar) { this.hs = hs; + this.doomed = doomed; + + // register ourselves so that no two states are ever + // created with an identical position set (termination depends on this) + ((HashMap)(doomed ? doomed_states : normal_states)).put(hs, this); + ((HashSet)all_states).add(this); + + for(Pos p : hs) { + // Step 1a: take note if we are an accepting state + // (last position of the root Union's sequence) + if (p.next()==null && !doomed && grammar.rootUnion.contains(p.owner())) + accept = true; + + // Step 1b: If any Pos in the set is the first position of its sequence, then this + // state is responsible for spawning the "doomed" states for each of the + // Sequence's conjuncts. This obligation is recorded by adding the to-be-spawned + // states to conjunctStates. + if (!p.isFirst()) continue; + for(Sequence s : p.owner().needs()) + if (!hs.contains(s.firstp())) + conjunctStates.add(mkstate(reachable(s.firstp()), true, grammar)); + for(Sequence s : p.owner().hates()) + if (!hs.contains(s.firstp())) + conjunctStates.add(mkstate(reachable(s.firstp()), true, grammar)); + } - // register ourselves in the all_states hash so that no - // two states are ever created with an identical position set - all_states.put(hs, this); - - // Step 1a: examine all Position's in this state and compute the mappings from + // Step 2a: examine all Pos's in this state and compute the mappings from // sets of follow tokens (tokens which could follow this position) to sets // of _new_ positions (positions after shifting). These mappings are // collectively known as the _closure_ - TopologicalBag bag0 = new TopologicalBag(); - for(Position position : hs) { + TopologicalBag bag0 = new TopologicalBag(); + for(Pos position : hs) { if (position.isLast() || !(position.element() instanceof Atom)) continue; Atom a = (Atom)position.element(); - HashSet hp = new HashSet(); + HashSet hp = new HashSet(); reachable(position.next(), hp); bag0.addAll(a.getTokenTopology(), hp); } - // Step 1b: for each _minimal, contiguous_ set of characters having an identical next-position + // Step 2b: for each _minimal, contiguous_ set of characters having an identical next-position // set, add that character set to the goto table (with the State corresponding to the // computed next-position set). for(Topology r : bag0) { - HashSet h = new HashSet(); - for(Position p : bag0.getAll(r)) h.add(p); - gotoSetTerminals.put(r, all_states.get(h) == null ? new State(h, all_states, all_elements) : all_states.get(h)); + HashSet h = new HashSet(); + for(Pos p : bag0.getAll(r)) h.add(p); + ((TopologicalBag)gotoSetTerminals).put(r, mkstate(h, doomed, grammar)); } - // Step 2: for every non-Atom element (ie every Element which has a corresponding reduction), - // compute the closure over every position in this set which is followed by a symbol - // which could yield the Element in question. + // Step 3: for every Sequence, compute the closure over every position in this set which + // is followed by a symbol which could yield the Sequence. // // "yields" [in one or more step] is used instead of "produces" [in exactly one step] // to avoid having to iteratively construct our set of States as shown in most // expositions of the algorithm (ie "keep doing XYZ until things stop changing"). - HashMapBag move = new HashMapBag(); - for(Position p : hs) { - Element e = p.element(); - if (e==null) continue; - for(SequenceOrElement y : cache.ys.getAll(e)) { - HashSet hp = new HashSet(); - reachable(p.next(), hp); - move.addAll(y, hp); - } - } - OUTER: for(SequenceOrElement y : move) { - HashSet h = move.getAll(y); - State s = all_states.get(h) == null ? new State(h, all_states, all_elements) : all_states.get(h); - // if a reduction is "lame", it should wind up in the dead_state after reducing - if (y instanceof Sequence) { - for(Position p : hs) { - if (p.element() != null && (p.element() instanceof Union)) { - Union u = (Union)p.element(); - for(Sequence seq : u) - if (seq.needs.contains((Sequence)y) || seq.hates.contains((Sequence)y)) { - // FIXME: what if there are two "routes" to get to the sequence? - ((HashMap)gotoSetNonTerminals).put((Sequence)y, dead_state); - continue OUTER; - } - } + HashMapBag move = new HashMapBag(); + for(Pos p : hs) + if (!p.isLast() && p.element() instanceof Union) + for(Sequence s : ((Union)p.element())) { + HashSet hp = new HashSet(); + reachable(p.next(), hp); + move.addAll(s, hp); } - gotoSetNonTerminals.put((Sequence)y, s); - } + OUTER: for(Sequence y : move) { + // if a reduction is "lame", it should wind up in the dead_state after reducing + HashSet h = move.getAll(y); + State s = mkstate(h, doomed, grammar); + for(Pos p : hs) + if (p.element() != null && (p.element() instanceof Union)) + for(Sequence seq : ((Union)p.element())) + if (seq.needs.contains(y) || seq.hates.contains(y)) { + // FIXME: assumption that no sequence is ever both usefully (non-lamely) matched + // and also directly lamely matched + for(Pos pp = y.firstp(); pp != null; pp = pp.next()) + ((HashMap)gotoSetNonTerminals).put(pp, dead_state); + continue OUTER; + } + for(Pos pp = y.firstp(); pp != null; pp = pp.next()) + gotoSetNonTerminals.put(pp, s); } } - public String toStringx() { - StringBuffer st = new StringBuffer(); - for(Position p : this) { - if (st.length() > 0) st.append("\n"); - st.append(p); - } - return st.toString(); + private State mkstate(HashSet h, boolean b, Grammar grammar) { + State ret = (b?doomed_states:normal_states).get(h); + if (ret==null) ret = new State(h,b, grammar); + return ret; } + + public int toInt() { return idx; } public String toString() { StringBuffer ret = new StringBuffer(); - ret.append("state["+idx+"]: "); - for(Position p : this) ret.append("{"+p+"} "); + for(Pos p : hs) + ret.append(p+"\n"); return ret.toString(); } - - public int compareTo(State s) { return idx==s.idx ? 0 : idx < s.idx ? -1 : 1; } - public int toInt() { return idx; } } + } // Helpers ////////////////////////////////////////////////////////////////////////////// - private static void reachable(Sequence s, HashSet h) { - reachable(s.firstp(), h); - for(Sequence ss : s.needs()) reachable(ss, h); - for(Sequence ss : s.hates()) reachable(ss, h); + private static HashSet reachable(Element e) { + HashSet h = new HashSet(); + reachable(e, h); + return h; } - private static void reachable(Element e, HashSet h) { + private static void reachable(Element e, HashSet h) { if (e instanceof Atom) return; for(Sequence s : ((Union)e)) - reachable(s, h); + reachable(s.firstp(), h); } - private static void reachable(Position p, HashSet h) { + private static void reachable(Pos p, HashSet h) { if (h.contains(p)) return; h.add(p); if (p.element() != null) reachable(p.element(), h); } + private static HashSet reachable(Pos p) { + HashSet ret = new HashSet(); + reachable(p, ret); + return ret; + } }