src/edu/berkeley/sbp/Parser.java

   1 // Copyright 2006 all rights reserved; see LICENSE file for BSD-style license
   2
   3 package edu.berkeley.sbp;
   4 import edu.berkeley.sbp.util.*;
   5 import edu.berkeley.sbp.Sequence.Position;
   6 import java.io.*;
   7 import java.util.*;
   8
   9 // FEATURE: try harder to "fuse" states together along two dimensions:
  10 //   - identical (equivalent) states, or states that subsume each other
  11 //   - unnecessary intermediate states ("short cut" GLR)
  12
  13 /** a parser which translates an Input&lt;Token&gt; into a Forest&lt;NodeType&gt; */
  14 public abstract class Parser<Token, NodeType> {
  15
  16     final Table pt;
  17
  18     /** create a parser to parse the grammar with start symbol <tt>u</tt> */
  19     public Parser(Union u)  { this.pt = new Table(u); }
  20
  21     /** implement this method to create the output forest corresponding to a lone shifted input token */
  22     public abstract Forest<NodeType> shiftToken(Token t, Input.Region region);
  23
  24     public abstract Topology<Token> emptyTopology();
  25
  26     public String toString() { return pt.toString(); }
  27     Cache cache() { return pt; }
  28
  29     /** parse <tt>input</tt>, and return the shared packed parse forest (or throw an exception) */
  30     public Forest<NodeType> parse(Input<Token> input) throws IOException, ParseFailed {
  31         verbose = System.getProperty("sbp.verbose", null) != null;
  32         spinpos = 0;
  33         try {
  34             GSS gss = new GSS(input, this);
  35             for(GSS.Phase current = gss.new Phase<Token>(pt.start); ;) {
  36
  37                 if (verbose) {
  38                     // FIXME: clean this up
  39                     String s;
  40                     s = "  " + spin[spinpos++ % (spin.length)]+" parsing ";
  41                     s += input.getName();
  42                     s += " "+input.getLocation();
  43                     while(s.indexOf(':') != -1 && s.indexOf(':') < 8) s = " " + s;
  44                     String y = "@"+gss.viewPos+" ";
  45                     while(y.length() < 9) y = " " + y;
  46                     s += y;
  47                     s += "   nodes="+gss.numOldNodes;
  48                     while(s.length() < 50) s = s + " ";
  49                     s += " shifted="+gss.numNewNodes;
  50                     while(s.length() < 60) s = s + " ";
  51                     s += " reductions="+gss.numReductions;
  52                     System.err.print("\r"+s+ANSI.clreol()+"\r");
  53                 }
  54
  55                 if (current.isDone()) return (Forest<NodeType>)current.finalResult;
  56                 Forest forest = shiftToken((Token)current.token, current.getRegion());
  57                 current = gss.new Phase<Token>(current, forest);
  58             }
  59         } finally { if (verbose) System.err.print("\r"+ANSI.clreol()); }
  60     }
  61
  62     // Spinner //////////////////////////////////////////////////////////////////////////////
  63
  64     private boolean verbose = false;
  65     private static final char[] spin = new char[] { '-', '\\', '|', '/' };
  66     private int spinpos = 0;
  67     private long last = 0;
  68     void spin() {
  69         if (!verbose) return;
  70         long now = System.currentTimeMillis();
  71         if (now-last < 70) return;
  72         last = now;
  73         System.err.print("\r  " + spin[spinpos++ % (spin.length)]+"\r");
  74     }
  75
  76     // Table //////////////////////////////////////////////////////////////////////////////
  77
  78     /** an SLR(1) parse table which may contain conflicts */
  79     class Table extends Cache<Token> {
  80
  81         /** the start state */
  82         final State<Token>   start;
  83
  84         /** a dummy state from which no reductions can be performed */
  85         private final State<Token>   dead_state;
  86
  87         /** used to generate unique values for State.idx */
  88         private int master_state_idx = 0;
  89
  90         /** all the states for this table */
  91         HashSet<State<Token>>                     all_states       = new HashSet<State<Token>>();
  92
  93         /** all the doomed states in this table */
  94         HashMap<HashSet<Position>,State<Token>>   doomed_states    = new HashMap<HashSet<Position>,State<Token>>();
  95
  96         /** all the non-doomed states in this table */
  97         HashMap<HashSet<Position>,State<Token>>   normal_states    = new HashMap<HashSet<Position>,State<Token>>();
  98
  99         Topology<Token> emptyTopology() { return Parser.this.emptyTopology(); }
 100
 101         /** construct a parse table for the given grammar */
 102         Table(Union ux) {
 103             super(new Union("0", Sequence.create(ux), true));
 104
 105             // create the "dead state"
 106             this.dead_state = new State<Token>(new HashSet<Position>(), true);
 107
 108             // construct the start state; this will recursively create *all* the states
 109             this.start = new State<Token>(reachable(rootUnion), false);
 110
 111             buildReductions();
 112             sortReductions();
 113         }
 114
 115         /** fill in the reductions table */
 116         private void buildReductions() {
 117             // for each state, fill in the corresponding "row" of the parse table
 118             for(State<Token> state : all_states)
 119                 for(Position p : state.hs) {
 120
 121                     // if the element following this position is an atom, copy the corresponding
 122                     // set of rows out of the "master" goto table and into this state's shift table
 123                     if (p.element() != null && p.element() instanceof Atom)
 124                         state.shifts.addAll(state.gotoSetTerminals.subset(((Atom)p.element()).getTokenTopology()));
 125
 126                     // RNGLR: we can potentially reduce from any "right-nullable" position -- that is,
 127                     // any position for which all Elements after it in the Sequence are capable of
 128                     // matching the empty string.
 129                     if (!isRightNullable(p)) continue;
 130                     Topology<Token> follow = follow(p.owner());
 131                     for(Position p2 = p; p2 != null && p2.element() != null; p2 = p2.next()) {
 132                         if (!(p2.element() instanceof Union))
 133                             throw new Error("impossible -- only Unions can be nullable");
 134
 135                         // interesting RNGLR-followRestriction interaction: we must intersect
 136                         // not just the follow-set of the last non-nullable element, but the
 137                         // follow-sets of the nulled elements as well.
 138                         for(Sequence s : ((Union)p2.element()))
 139                             follow = follow.intersect(follow(s));
 140                         Topology<Token> set = epsilonFollowSet((Union)p2.element());
 141                         if (set != null) follow = follow.intersect(set);
 142                     }
 143
 144                     // indicate that when the next token is in the set "follow", nodes in this
 145                     // state should reduce according to Position "p"
 146                     state.reductions.put(follow, p);
 147                     if (followEof.contains(p.owner())) state.eofReductions.add(p);
 148                 }
 149
 150             // optimize the reductions table
 151             if (emptyTopology() instanceof IntegerTopology)
 152                 for(State<Token> state : all_states) {
 153                     // FIXME: this is pretty ugly
 154                     state.oreductions = state.reductions.optimize(((IntegerTopology)emptyTopology()).functor());
 155                     state.oshifts     = state.shifts.optimize(((IntegerTopology)emptyTopology()).functor());
 156                 }
 157         }
 158
 159         // FIXME: this method needs to be cleaned up and documented
 160         private void sortReductions() {
 161             // crude algorithm to assing an ordinal ordering to every position
 162             // al will be sorted in DECREASING order (al[0] >= al[1])
 163             ArrayList<Sequence.Position> al = new ArrayList<Sequence.Position>();
 164             for(State s : all_states) {
 165                 for(Object po : s) {
 166                     Sequence.Position p = (Sequence.Position)po;
 167                     if (al.contains(p)) continue;
 168                     int i=0;
 169                     for(; i<al.size(); i++) {
 170                         if (comparePositions(p, al.get(i)) < 0)
 171                             break;
 172                     }
 173                     al.add(i, p);
 174                 }
 175             }
 176             // FIXME: this actually pollutes the "pure" objects (the ones that should not be modified by the Parser)
 177             // sort in increasing order...
 178             OUTER: while(true) {
 179                 for(int i=0; i<al.size(); i++)
 180                     for(int j=i+1; j<al.size(); j++)
 181                         if (comparePositions(al.get(i), al.get(j)) > 0) {
 182                             Sequence.Position p = al.remove(j);
 183                             al.add(i, p);
 184                             continue OUTER;
 185                         }
 186                 break;
 187             }
 188
 189             int j = 1;
 190             int pk = 0;
 191             for(int i=0; i<al.size(); i++) {
 192                 boolean inc = false;
 193                 for(int k=pk; k<i; k++) {
 194                     if (comparePositions(al.get(k), al.get(i)) > 0)
 195                         { inc = true; break; }
 196                 }
 197                 inc = true;
 198                 if (inc) {
 199                     j++;
 200                     pk = i;
 201                 }
 202                 al.get(i).ord = j;
 203             }
 204         }
 205
 206         /**
 207          *  A single state in the LR table and the transitions
 208          *  possible from it
 209          *
 210          *  A state corresponds to a set of Sequence.Position's.  Each
 211          *  Node in the GSS has a State; the Node represents a set of
 212          *  possible parses, one for each Position in the State.
 213          *
 214          *  Every state is either "doomed" or "normal".  If a Position
 215          *  is part of a Sequence which is a conjunct (that is, it was
 216          *  passed to Sequence.{and(),andnot()}), then that Position
 217          *  will appear only in doomed States.  Furthermore, any set
 218          *  of Positions reachable from a doomed State also forms a
 219          *  doomed State.  Note that in this latter case, a doomed
 220          *  state might have exactly the same set of Positions as a
 221          *  non-doomed state.
 222          *
 223          *  Nodes with non-doomed states represent nodes which
 224          *  contribute to actual valid parses.  Nodes with doomed
 225          *  States exist for no other purpose than to enable/disable
 226          *  some future reduction from a non-doomed Node.  Because of
 227          *  this, we "garbage-collect" Nodes with doomed states if
 228          *  there are no more non-doomed Nodes which they could
 229          *  affect (see Result, Reduction, and Node for details).
 230          *
 231          *  Without this optimization, many seemingly-innocuous uses
 232          *  of positive and negative conjuncts can trigger O(n^2)
 233          *  space+time complexity in otherwise simple grammars.  There
 234          *  is an example of this in the regression suite.
 235          */
 236         class State<Token> implements IntegerMappable, Iterable<Position> {
 237
 238             public  final     int               idx    = master_state_idx++;
 239             private final     HashSet<Position> hs;
 240             public HashSet<State<Token>> conjunctStates = new HashSet<State<Token>>();
 241
 242             HashMap<Sequence,State<Token>>      gotoSetNonTerminals = new HashMap<Sequence,State<Token>>();
 243             private transient TopologicalBag<Token,State<Token>>  gotoSetTerminals    = new TopologicalBag<Token,State<Token>>();
 244
 245             private           TopologicalBag<Token,Position>      reductions          = new TopologicalBag<Token,Position>();
 246             private           HashSet<Position>                   eofReductions       = new HashSet<Position>();
 247             private           TopologicalBag<Token,State<Token>>  shifts              = new TopologicalBag<Token,State<Token>>();
 248             private           boolean                             accept              = false;
 249
 250             private VisitableMap<Token,State<Token>> oshifts     = null;
 251             private VisitableMap<Token,Position>     oreductions = null;
 252             public  final boolean doomed;
 253
 254             // Interface Methods //////////////////////////////////////////////////////////////////////////////
 255
 256             boolean                    isAccepting()           { return accept; }
 257             public Iterator<Position>  iterator()              { return hs.iterator(); }
 258             boolean                    canShift(Token t)       { return oshifts!=null && oshifts.contains(t); }
 259             void                       invokeShifts(Token t, GSS.Phase phase, Result r) { oshifts.invoke(t, phase, r); }
 260             boolean                    canReduce(Token t)        {
 261                 return oreductions != null && (t==null ? eofReductions.size()>0 : oreductions.contains(t)); }
 262             void          invokeEpsilonReductions(Token t, Node node) {
 263                 if (t==null) for(Position r : eofReductions) node.invoke(r, null);
 264                 else         oreductions.invoke(t, node, null);
 265             }
 266             void          invokeReductions(Token t, Node node, Result b) {
 267                 if (t==null) for(Position r : eofReductions) node.invoke(r, b);
 268                 else         oreductions.invoke(t, node, b);
 269             }
 270
 271             // Constructor //////////////////////////////////////////////////////////////////////////////
 272
 273             /**
 274              *  create a new state consisting of all the <tt>Position</tt>s in <tt>hs</tt>
 275              *  @param hs           the set of <tt>Position</tt>s comprising this <tt>State</tt>
 276              *  @param all the set of all elements (Atom instances need not be included)
 277              *
 278              *   In principle these two steps could be merged, but they
 279              *   are written separately to highlight these two facts:
 280              * <ul>
 281              * <li> Non-atom elements either match all-or-nothing, and do not overlap
 282              *      with each other (at least not in the sense of which element corresponds
 283              *      to the last reduction performed).  Therefore, in order to make sure we
 284              *      wind up with the smallest number of states and shifts, we wait until
 285              *      we've figured out all the token-to-position multimappings before creating
 286              *      any new states
 287              *
 288              * <li> In order to be able to run the state-construction algorithm in a single
 289              *      shot (rather than repeating until no new items appear in any state set),
 290              *      we need to use the "yields" semantics rather than the "produces" semantics
 291              *      for non-Atom Elements.
 292              *  </ul>
 293              */
 294             public State(HashSet<Position> hs, boolean doomed) {
 295                 this.hs = hs;
 296                 this.doomed = doomed;
 297
 298                 // register ourselves so that no two states are ever
 299                 // created with an identical position set (termination depends on this)
 300                 ((HashMap)(doomed ? doomed_states : normal_states)).put(hs, this);
 301                 ((HashSet)all_states).add(this);
 302
 303                 for(Position p : hs) {
 304                     // Step 1a: take note if we are an accepting state
 305                     //          (last position of the root Union's sequence)
 306                     if (p.next()==null && !doomed && rootUnion.contains(p.owner()))
 307                         accept = true;
 308
 309                     // Step 1b: If any Position in the set is the first position of its sequence, then this
 310                     //          state is responsible for spawning the "doomed" states for each of the
 311                     //          Sequence's conjuncts.  This obligation is recorded by adding the to-be-spawned
 312                     //          states to conjunctStates.
 313                     if (!p.isFirst()) continue;
 314                     for(Sequence s : p.owner().needs())
 315                         if (!hs.contains(s.firstp()))
 316                             conjunctStates.add(mkstate(reachable(s.firstp()), true));
 317                     for(Sequence s : p.owner().hates())
 318                         if (!hs.contains(s.firstp()))
 319                             conjunctStates.add(mkstate(reachable(s.firstp()), true));
 320                 }
 321
 322                 // Step 2a: examine all Position's in this state and compute the mappings from
 323                 //          sets of follow tokens (tokens which could follow this position) to sets
 324                 //          of _new_ positions (positions after shifting).  These mappings are
 325                 //          collectively known as the _closure_
 326
 327                 TopologicalBag<Token,Position> bag0 = new TopologicalBag<Token,Position>();
 328                 for(Position position : hs) {
 329                     if (position.isLast() || !(position.element() instanceof Atom)) continue;
 330                     Atom a = (Atom)position.element();
 331                     HashSet<Position> hp = new HashSet<Position>();
 332                     reachable(position.next(), hp);
 333                     bag0.addAll(a.getTokenTopology(), hp);
 334                 }
 335
 336                 // Step 2b: for each _minimal, contiguous_ set of characters having an identical next-position
 337                 //          set, add that character set to the goto table (with the State corresponding to the
 338                 //          computed next-position set).
 339
 340                 for(Topology<Token> r : bag0) {
 341                     HashSet<Position> h = new HashSet<Position>();
 342                     for(Position p : bag0.getAll(r)) h.add(p);
 343                     ((TopologicalBag)gotoSetTerminals).put(r, mkstate(h, doomed));
 344                 }
 345
 346                 // Step 3: for every Sequence, compute the closure over every position in this set which
 347                 //         is followed by a symbol which could yield the Sequence.
 348                 //
 349                 //         "yields" [in one or more step] is used instead of "produces" [in exactly one step]
 350                 //         to avoid having to iteratively construct our set of States as shown in most
 351                 //         expositions of the algorithm (ie "keep doing XYZ until things stop changing").
 352
 353                 HashMapBag<Sequence,Position> move = new HashMapBag<Sequence,Position>();
 354                 for(Position p : hs)
 355                     if (!p.isLast() && p.element() instanceof Union)
 356                         for(Sequence s : ((Union)p.element())) {
 357                             HashSet<Position> hp = new HashSet<Position>();
 358                             reachable(p.next(), hp);
 359                             move.addAll(s, hp);
 360                         }
 361                 OUTER: for(Sequence y : move) {
 362                     // if a reduction is "lame", it should wind up in the dead_state after reducing
 363                     HashSet<Position> h = move.getAll(y);
 364                     State<Token> s = mkstate(h, doomed);
 365                     for(Position p : hs)
 366                         if (p.element() != null && (p.element() instanceof Union))
 367                             for(Sequence seq : ((Union)p.element()))
 368                                 if (seq.needs.contains(y) || seq.hates.contains(y)) {
 369                                     // FIXME: assumption that no sequence is ever both usefully (non-lamely) matched
 370                                     //        and also directly lamely matched
 371                                     ((HashMap)gotoSetNonTerminals).put(y, dead_state);
 372                                     continue OUTER;
 373                                 }
 374                     gotoSetNonTerminals.put(y, s);
 375                 }
 376             }
 377
 378             private State<Token> mkstate(HashSet<Position> h, boolean b) {
 379                 State ret = (b?doomed_states:normal_states).get(h);
 380                 if (ret==null) ret = new State<Token>(h,b);
 381                 return ret;
 382             }
 383
 384             public int toInt() { return idx; }
 385         }
 386
 387     }
 388
 389     // Helpers //////////////////////////////////////////////////////////////////////////////
 390
 391     private static HashSet<Position> reachable(Element e) {
 392         HashSet<Position> h = new HashSet<Position>();
 393         reachable(e, h);
 394         return h;
 395     }
 396     private static void reachable(Element e, HashSet<Position> h) {
 397         if (e instanceof Atom) return;
 398         for(Sequence s : ((Union)e))
 399             reachable(s.firstp(), h);
 400     }
 401     private static void reachable(Position p, HashSet<Position> h) {
 402         if (h.contains(p)) return;
 403         h.add(p);
 404         if (p.element() != null) reachable(p.element(), h);
 405     }
 406     private static HashSet<Position> reachable(Position p) {
 407         HashSet<Position> ret = new HashSet<Position>();
 408         reachable(p, ret);
 409         return ret;
 410     }
 411
 412 }