slow down spinner
[sbp.git] / src / edu / berkeley / sbp / Parser.java
1 // Copyright 2006 all rights reserved; see LICENSE file for BSD-style license
2
3 package edu.berkeley.sbp;
4 import edu.berkeley.sbp.*;
5 import edu.berkeley.sbp.util.*;
6 import edu.berkeley.sbp.Sequence.Position;
7 import java.io.*;
8 import java.util.*;
9
10 /** a parser which translates an Input<Token> into a Forest<NodeType> */
11 public abstract class Parser<Token, NodeType> {
12
13     protected final Table<Token> pt;
14
15     /** create a parser to parse the grammar with start symbol <tt>u</tt> */
16     public Parser(Union u, Topology<Token> top)  { this.pt = new Table<Token>(u, top); }
17
18     /** implement this method to create the output forest corresponding to a lone shifted input token */
19     public abstract Forest<NodeType> shiftToken(Token t, Input.Location newloc);
20
21     public String toString() { return pt.toString(); }
22
23     private boolean verbose = false;;
24     private static final char[] spin = new char[] { '-', '\\', '|', '/' };
25     private int spinpos = 0;
26     private long last = 0;
27     void spin() {
28         if (verbose) {
29             long now = System.currentTimeMillis();
30             if (now-last < 70) return;
31             last = now;
32             System.err.print("\r  " + spin[spinpos++ % (spin.length)]+ANSI.clreol()+"\r");
33         }
34     }
35
36     /** parse <tt>input</tt>, and return the shared packed parse forest (or throw an exception) */
37     public Forest<NodeType> parse(Input<Token> input) throws IOException, ParseFailed {
38         verbose = System.getProperty("sbp.verbose", null) != null;
39         spinpos = 0;
40         try {
41             GSS gss = new GSS(input, this);
42             for(GSS.Phase current = gss.new Phase<Token>(pt.start); ;) {
43                 
44                 if (verbose) {
45                     String s;
46                     s = "  " + spin[spinpos++ % (spin.length)]+" parsing ";
47                     s += input.getName();
48                     s += " "+input.getLocation();
49                     while(s.indexOf(':') != -1 && s.indexOf(':') < 8) s = " " + s;
50                     String y = "@"+gss.viewPos+" ";
51                     while(y.length() < 9) y = " " + y;
52                     s += y;
53                     //s += "   doom="+Node.doomedNodes;
54                     //while(s.length() < 40) s = s + " ";
55                     s += "   nodes="+gss.numOldNodes;
56                     while(s.length() < 50) s = s + " ";
57                     s += " shifted="+gss.numNewNodes;
58                     while(s.length() < 60) s = s + " ";
59                     s += " reductions="+gss.numReductions;
60                     System.err.print("\r"+s+ANSI.clreol()+"\r");
61                 }
62                 
63                 // FIXME: make sure all the locations line up properly in here
64                 if (current.isDone()) return (Forest<NodeType>)current.finalResult;
65                 Forest forest = shiftToken((Token)current.token, input.getLocation());
66                 current = gss.new Phase<Token>(current, forest);
67             }
68         } finally {
69             if (verbose)
70                 System.err.print("\r                                                                                \r");
71         }
72     }
73
74
75     // Table //////////////////////////////////////////////////////////////////////////////
76
77     /** an SLR(1) parse table which may contain conflicts */
78     static class Table<Token> extends Cache {
79
80         /** the start state */
81         public  final State<Token>   start;
82
83         /** the state from which no reductions can be done */
84         private final State<Token>   dead_state;
85
86         /** used to generate unique values for State.idx */
87         private int master_state_idx = 0;
88         HashSet<State<Token>>   all_states    = new HashSet<State<Token>>();
89         HashMap<HashSet<Position>,State<Token>>   doomed_states    = new HashMap<HashSet<Position>,State<Token>>();
90         HashMap<HashSet<Position>,State<Token>>   normal_states    = new HashMap<HashSet<Position>,State<Token>>();
91
92         /** construct a parse table for the given grammar */
93         public Table(Topology top) { this("s", top); }
94         public Table(String startSymbol, Topology top) { this(new Union(startSymbol), top); }
95         public Table(Union ux, Topology top) {
96             super(ux, top);
97             Union start0 = new Union("0");
98             Sequence seq0 = new Sequence.Singleton(ux);
99             start0.add(seq0);
100             buildFollowSet(seq0, top, true);
101
102             // construct the set of states
103             HashSet<Position> hp = new HashSet<Position>();
104             reachable(start0, hp);
105
106             this.dead_state = new State<Token>(new HashSet<Position>(), true);
107             this.start = new State<Token>(hp);
108
109             // for each state, fill in the corresponding "row" of the parse table
110             for(State<Token> state : all_states)
111                 for(Position p : state.hs) {
112
113                     // the Grammar's designated "last position" is the only accepting state
114                     if (start0.contains(p.owner()) && p.next()==null && !state.doomed)
115                         state.accept = true;
116
117                     if (isRightNullable(p)) {
118                         Topology<Token> follow = (Topology<Token>)follow(p.owner());
119                         for(Position p2 = p; p2 != null && p2.element() != null; p2 = p2.next()) {
120                             if (!(p2.element() instanceof Union)) throw new Error("impossible");
121                             Union u = (Union)p2.element();
122                             Atom set = new edu.berkeley.sbp.chr.CharAtom(new edu.berkeley.sbp.chr.CharTopology((Topology<Character>)epsilonFollowSet(u)));
123                             Element p2e = p2.element();
124                             if (p2e instanceof Union)
125                                 for(Sequence p2es : ((Union)p2e))
126                                     follow = follow.intersect(follow(p2es));
127                             if (set != null) follow = follow.intersect(set.getTokenTopology());
128                         }
129                         state.reductions.put(follow, p);
130                         if (followEof.contains(p.owner())) state.eofReductions.add(p);
131                     }
132
133                     // if the element following this position is an atom, copy the corresponding
134                     // set of rows out of the "master" goto table and into this state's shift table
135                     if (p.element() != null && p.element() instanceof Atom)
136                         state.shifts.addAll(state.gotoSetTerminals.subset(((Atom)p.element()).getTokenTopology()));
137                 }
138
139             if (top instanceof IntegerTopology)
140                 for(State<Token> state : all_states) {
141                     state.oreductions = state.reductions.optimize(((IntegerTopology)top).functor());
142                     state.oshifts = state.shifts.optimize(((IntegerTopology)top).functor());
143                 }
144
145             // crude algorithm to assing an ordinal ordering to every position
146             // al will be sorted in DECREASING order (al[0] >= al[1])
147             ArrayList<Sequence.Position> al = new ArrayList<Sequence.Position>();
148             for(State s : all_states) {
149                 for(Object po : s) {
150                     Sequence.Position p = (Sequence.Position)po;
151                     if (al.contains(p)) continue;
152                     int i=0;
153                     for(; i<al.size(); i++) {
154                         if (comparePositions(p, al.get(i)) < 0)
155                             break;
156                     }
157                     al.add(i, p);
158                 }
159             }
160             // FIXME: this actually pollutes the "pure" objects (the ones that should not be modified by the Parser)
161             // sort in increasing order...
162             OUTER: while(true) {
163                 for(int i=0; i<al.size(); i++)
164                     for(int j=i+1; j<al.size(); j++)
165                         if (comparePositions(al.get(i), al.get(j)) > 0) {
166                             Sequence.Position p = al.remove(j);
167                             al.add(i, p);
168                             continue OUTER;
169                         }
170                 break;
171             }
172
173             int j = 1;
174             int pk = 0;
175             for(int i=0; i<al.size(); i++) {
176                 boolean inc = false;
177                 for(int k=pk; k<i; k++) {
178                     if (comparePositions(al.get(k), al.get(i)) > 0)
179                         { inc = true; break; }
180                 }
181                 inc = true;
182                 if (inc) {
183                     j++;
184                     pk = i;
185                 }
186                 al.get(i).ord = j;
187             }
188
189             /*
190             for(int i=0; i<al.size(); i++)
191                 if (isRightNullable(al.get(i)))
192                     System.out.println(al.get(i).ord + "   " + al.get(i));
193             */
194             //mastercache = this;
195         }
196
197         /** a single state in the LR table and the transitions possible from it */
198         class State<Token> implements IntegerMappable, Iterable<Position> {
199         
200             public  final     int               idx    = master_state_idx++;
201             private final     HashSet<Position> hs;
202             public HashSet<State<Token>> also = new HashSet<State<Token>>();
203
204             public  transient HashMap<Sequence,State<Token>>         gotoSetNonTerminals = new HashMap<Sequence,State<Token>>();
205             private transient TopologicalBag<Token,State<Token>>     gotoSetTerminals    = new TopologicalBag<Token,State<Token>>();
206
207             private           TopologicalBag<Token,Position>      reductions          = new TopologicalBag<Token,Position>();
208             private           HashSet<Position>                   eofReductions       = new HashSet<Position>();
209             private           TopologicalBag<Token,State<Token>>  shifts              = new TopologicalBag<Token,State<Token>>();
210             private           boolean                             accept              = false;
211
212             private VisitableMap<Token,State<Token>> oshifts     = null;
213             private VisitableMap<Token,Position>     oreductions = null;
214
215             // Interface Methods //////////////////////////////////////////////////////////////////////////////
216
217             boolean                    isAccepting()           { return accept; }
218             public Iterator<Position>  iterator()              { return hs.iterator(); }
219             boolean                    canShift(Token t)       { return oshifts!=null && oshifts.contains(t); }
220             void                       invokeShifts(Token t, GSS.Phase phase, Result r) { oshifts.invoke(t, phase, r); }
221             boolean                    canReduce(Token t)        {
222                 return oreductions != null && (t==null ? eofReductions.size()>0 : oreductions.contains(t)); }
223             void          invokeEpsilonReductions(Token t, Node node) {
224                 if (t==null) for(Position r : eofReductions) node.invoke(r, null);
225                 else         oreductions.invoke(t, node, null);
226             }
227             void          invokeReductions(Token t, Node node, Result b) {
228                 //System.err.println("\rinvokage:  " + this);
229                 if (t==null) for(Position r : eofReductions) node.invoke(r, b);
230                 else         oreductions.invoke(t, node, b);
231             }
232
233             // Constructor //////////////////////////////////////////////////////////////////////////////
234
235             /**
236              *  create a new state consisting of all the <tt>Position</tt>s in <tt>hs</tt>
237              *  @param hs           the set of <tt>Position</tt>s comprising this <tt>State</tt>
238              *  @param all the set of all elements (Atom instances need not be included)
239              *  
240              *   In principle these two steps could be merged, but they
241              *   are written separately to highlight these two facts:
242              * <ul>
243              * <li> Non-atom elements either match all-or-nothing, and do not overlap
244              *      with each other (at least not in the sense of which element corresponds
245              *      to the last reduction performed).  Therefore, in order to make sure we
246              *      wind up with the smallest number of states and shifts, we wait until
247              *      we've figured out all the token-to-position multimappings before creating
248              *      any new states
249              *  
250              * <li> In order to be able to run the state-construction algorithm in a single
251              *      shot (rather than repeating until no new items appear in any state set),
252              *      we need to use the "yields" semantics rather than the "produces" semantics
253              *      for non-Atom Elements.
254              *  </ul>
255              */
256             public State(HashSet<Position> hs) { this(hs, false); }
257             public boolean doomed;
258             public State(HashSet<Position> hs, boolean doomed) {
259                 this.hs = hs;
260                 this.doomed = doomed;
261
262                 // register ourselves in the all_states hash so that no
263                 // two states are ever created with an identical position set
264                 ((HashMap)(doomed ? doomed_states : normal_states)).put(hs, this);
265                 ((HashSet)all_states).add(this);
266                 
267                 for(Position p : hs) {
268                     if (!p.isFirst()) continue;
269                     for(Sequence s : p.owner().needs()) {
270                         if (hs.contains(s.firstp())) continue;
271                         HashSet<Position> h2 = new HashSet<Position>();
272                         reachable(s, h2);
273                         also.add(mkstate(h2, true));
274                     }
275                     for(Sequence s : p.owner().hates()) {
276                         if (hs.contains(s.firstp())) continue;
277                         HashSet<Position> h2 = new HashSet<Position>();
278                         reachable(s, h2);
279                         also.add(mkstate(h2, true));
280                     }
281                 }
282
283                 // Step 1a: examine all Position's in this state and compute the mappings from
284                 //          sets of follow tokens (tokens which could follow this position) to sets
285                 //          of _new_ positions (positions after shifting).  These mappings are
286                 //          collectively known as the _closure_
287
288                 TopologicalBag<Token,Position> bag0 = new TopologicalBag<Token,Position>();
289                 for(Position position : hs) {
290                     if (position.isLast() || !(position.element() instanceof Atom)) continue;
291                     Atom a = (Atom)position.element();
292                     HashSet<Position> hp = new HashSet<Position>();
293                     reachable(position.next(), hp);
294                     bag0.addAll(a.getTokenTopology(), hp);
295                 }
296
297                 // Step 1b: for each _minimal, contiguous_ set of characters having an identical next-position
298                 //          set, add that character set to the goto table (with the State corresponding to the
299                 //          computed next-position set).
300
301                 for(Topology<Token> r : bag0) {
302                     HashSet<Position> h = new HashSet<Position>();
303                     for(Position p : bag0.getAll(r)) h.add(p);
304                     ((TopologicalBag)gotoSetTerminals).put(r, mkstate(h, doomed));
305                 }
306
307                 // Step 2: for every Sequence, compute the closure over every position in this set which
308                 //         is followed by a symbol which could yield the Sequence.
309                 //
310                 //         "yields" [in one or more step] is used instead of "produces" [in exactly one step]
311                 //         to avoid having to iteratively construct our set of States as shown in most
312                 //         expositions of the algorithm (ie "keep doing XYZ until things stop changing").
313
314                 HashMapBag<Sequence,Position> move = new HashMapBag<Sequence,Position>();
315                 for(Position p : hs)
316                     if (!p.isLast() && p.element() instanceof Union)
317                         for(Sequence s : ((Union)p.element())) {
318                             HashSet<Position> hp = new HashSet<Position>();
319                             reachable(p.next(), hp);
320                             move.addAll(s, hp);
321                         }
322                 OUTER: for(Sequence y : move) {
323                     // if a reduction is "lame", it should wind up in the dead_state after reducing
324                     HashSet<Position> h = move.getAll(y);
325                     State<Token> s = mkstate(h, doomed);
326                     for(Position p : hs)
327                         if (p.element() != null && (p.element() instanceof Union))
328                             for(Sequence seq : ((Union)p.element()))
329                                 if (seq.needs.contains(y) || seq.hates.contains(y)) {
330                                     // FIXME: assumption that no sequence is ever both usefully (non-lamely) matched
331                                     //        and also directly lamely matched
332                                     ((HashMap)gotoSetNonTerminals).put(y, dead_state);
333                                     continue OUTER;
334                                 }
335                     gotoSetNonTerminals.put(y, s);
336                 }
337             }
338
339             private State<Token> mkstate(HashSet<Position> h, boolean b) {
340                 if (b) return doomed_states.get(h) == null ? (State)new State<Token>(h,b) : (State)doomed_states.get(h);
341                 else   return normal_states.get(h) == null ? (State)new State<Token>(h,b) : (State)normal_states.get(h);
342             }
343
344             public String toStringx() {
345                 StringBuffer st = new StringBuffer();
346                 for(Position p : this) {
347                     if (st.length() > 0) st.append("\n");
348                     st.append(p);
349                 }
350                 return st.toString();
351             }
352
353             public String toString() {
354                 StringBuffer ret = new StringBuffer();
355                 ret.append("state["+idx+"]: ");
356                 for(Position p : this) ret.append("{"+p+"}  ");
357                 return ret.toString();
358             }
359
360             public int toInt() { return idx; }
361         }
362
363         public String toString() {
364             StringBuffer sb = new StringBuffer();
365             sb.append("parse table");
366             for(State<Token> state : all_states) {
367                 sb.append("  " + state + "\n");
368                 for(Topology<Token> t : state.shifts) {
369                     sb.append("      shift  \""+
370                               new edu.berkeley.sbp.chr.CharTopology((IntegerTopology<Character>)t)+"\" => ");
371                     for(State st : state.shifts.getAll(t))
372                         sb.append(st.idx+"  ");
373                     sb.append("\n");
374                 }
375                 for(Topology<Token> t : state.reductions)
376                     sb.append("      reduce \""+
377                               new edu.berkeley.sbp.chr.CharTopology((IntegerTopology<Character>)t)+"\" => " +
378                               state.reductions.getAll(t) + "\n");
379                 for(Sequence s : state.gotoSetNonTerminals.keySet())
380                     sb.append("      goto   "+state.gotoSetNonTerminals.get(s)+" from " + s + "\n");
381             }
382             return sb.toString();
383         }
384     }
385
386     // Helpers //////////////////////////////////////////////////////////////////////////////
387     
388     private static void reachable(Sequence s, HashSet<Position> h) {
389         reachable(s.firstp(), h);
390         //for(Sequence ss : s.needs()) reachable(ss, h);
391         //for(Sequence ss : s.hates()) reachable(ss, h);
392     }
393     private static void reachable(Element e, HashSet<Position> h) {
394         if (e instanceof Atom) return;
395         for(Sequence s : ((Union)e))
396             reachable(s, h);
397     }
398     private static void reachable(Position p, HashSet<Position> h) {
399         if (h.contains(p)) return;
400         h.add(p);
401         if (p.element() != null) reachable(p.element(), h);
402     }
403     //public static Cache mastercache = null;
404 }