X-Git-Url: http://git.megacz.com/?a=blobdiff_plain;f=src%2Forg%2Fibex%2Futil%2FXML.java;h=8470e8dcf15293baffcbbbf5c1f336444876ae7f;hb=d821a37fdc4f7c3fbe54216202108994a5b5bd20;hp=29e798daea833c890532ad954728284f2276626d;hpb=1f80106afa0f0eb6f12544c75304f2084aca6499;p=org.ibex.util.git diff --git a/src/org/ibex/util/XML.java b/src/org/ibex/util/XML.java index 29e798d..8470e8d 100644 --- a/src/org/ibex/util/XML.java +++ b/src/org/ibex/util/XML.java @@ -4,23 +4,32 @@ package org.ibex.util; -import java.io.Reader; -import java.io.IOException; import java.io.EOFException; +import java.io.IOException; +import java.io.OutputStream; +import java.io.Reader; +import java.io.Writer; +import java.io.Serializable; /** - * An Event-Driving, Non-Validating XML Parser with Namespace support. + * An non-validating XML Parser with Namespace support. + * + *
Subclass XML and implement the four abstract functions. Call + * parse() to begin synchronously processing reader input. + * Any number of documents may be parse()ed.
+ * + *Instantiate XML.Document and call parse(). The + * root of the document tree can be accessed by calling getRoot(). + * See the public interface XML.Element for tree traversal.
+ * + *Only one document may be parse()ed per XML.Document + * instance.
* *As the parser traverses into an element, it adds it to the linked list - * called elements. However, elements has been pre-filled - * with instances of the Element inner class. So in the vast majority of - * cases, the pointer current is moved along one, and the values for the - * new element are filled into the current object.
* *This parser supports all the unicode ranges required by the XML * Specification. However, it is optimised for well-formed ASCII documents. @@ -40,20 +49,22 @@ import java.io.EOFException; * Violation and is not recoverable * * - * @author David Crawshaw + * @author crawshaw@ibex.org * @see XML Specification * @see XML Namespaces */ public abstract class XML { - ///////////////////////////////////////////////////////////////////////////////////////////// - // XML Parser - ///////////////////////////////////////////////////////////////////////////////////////////// + // XML Parser ///////////////////////////////////////////////////////////// + + /** Default initial buffer size. */ + public static final int BUFFER_SIZE = 256; - public static final int BUFFER_SIZE = 255; + private static final int DEFAULT_ATTR_COUNT = 2; + private static final int DEFAULT_PFX_COUNT = 2; - /** static pool of XML.Element instances shared by all XML Parsers. */ - private static final Queue elements = new Queue(30); + /** static pool of XML.Elem instances shared by all XML Parsers. */ + private static final Basket.List elements = new Basket.Array(); private static final char[] single_amp = new char[] { '&' }; private static final char[] single_apos = new char[] { '\'' }; @@ -61,6 +72,8 @@ public abstract class XML private static final char[] single_lt = new char[] { '<' }; private static final char[] single_quot = new char[] { '"' }; + private final boolean poolElements; + private int line; private int col; @@ -70,19 +83,25 @@ public abstract class XML private int base; // base+off == distance into the stream private int len; - private Element current; + private Elem current; // used in readEntity() to process a single character without creating a new array private char[] singlechar = new char[1]; - public XML() { this(BUFFER_SIZE); } + /** Creates a new XML parser with that has a default initial + * buffer size and reuses its signal objects. 
*/ + protected XML() { this(BUFFER_SIZE, true); } - public XML(int bSize) { + /** Creates a new XML parser. + * @param bSize initial buffer size. + * @param poolElements if true the objects passed to the signal functions are reused. + */ + protected XML(int bSize, boolean poolElements) { buf = new char[bSize]; + this.poolElements = poolElements; - current = (Element)elements.remove(false); - if (current == null) current = new Element(); + current = element(); } /** Returns the line number at the beginning of the last process call. */ @@ -94,44 +113,102 @@ public abstract class XML /** Returns the global file offset at the beginning of the last process call. */ public int getGlobalOffset() { return base + off; } - /** - * Parse given input and call the abstract event functions. - * - * Careful with threading, as this function is not synchronized. - */ + /** Set the reader used as a data source by the XML parser. */ + public void setReader(Reader reader) { in = reader; } + + /** Parse given input and call the abstract event functions. + * Equivalent to calling setReader(reader); parse();. */ public final void parse(Reader reader) throws IOException, Exn { - in = reader; + setReader(reader); parse(); + } + + /** Parse given input and call the abstract event functions. + * + *
This function is synchronous with event functions, meaning it + * will only return after it has finished calling all signal + * functions.
+ */ + public final void parse() throws IOException, Exn { off = len = 0; line = col = 1; - - clear(); // clean up possible mid-way linked-list element + current = null; try { // process the stream while (true) { if (!buffer(1)) { - if (current.qName == null) break; - throw new Exn("reached eof without closing <"+current.qName+"> element", Exn.WFC, getLine(), getCol()); + if (current == null) break; + throw new Exn("reached eof without closing <"+current.qName+"> element", + Exn.WFC, getLine(), getCol()); } - if (buf[off] == '<') readTag(); - readChars(current.qName != null); + if (buf[off] == '<') { + if (current == null) current = element(); + readTag(); + } + readChars(current != null); } } finally { clear(); } // clean up elements } - /** remove any leftover elements from the linked list and queue them */ - private final void clear() { - for (Element last = current; current.parent != null; ) { - current = current.parent; - last.clear(); - elements.append(last); + /** Parses the next tag or block of character data, calling the + * abstract event functions to process the data. + * + * @return True if successfully processed a block of data. + */ + public boolean parseNext() throws IOException, Exn { + if (!buffer(1)) { + if (current == null) return false; + throw new Exn("reached eof without closing <"+current.qName+"> element", + Exn.WFC, getLine(), getCol()); + } + + // move through meaningless data + //if (current != null) readChars(false); + + if (buf[off] == '<') { + // proecess and return a tag + if (current == null) current = element(); + readTag(); + } else { + // processes a block of character data + readChars(true); + } + + return true; + } + + /** Returns the current Tree.Element, or null if outside the root node. */ + public Tree.Element current() { return current; } + + /** Empty the linked list. 
*/ + private void clear() { + while (current != null) { + Elem l = current; + current = (Elem)current.parent; + element(l); } - current.clear(); } - /** reads in a tag. expects buf[off] == '<' */ - private final void readTag() throws IOException, Exn { + /** Provides a fresh element. */ + private Elem element() { + Elem e = null; + if (poolElements) synchronized (elements) { + if (elements.size() > 0) e = (Elem)elements.remove(elements.size() - 1); + } + if (e == null) e = new Elem(); + return e; + } + + /** Frees a used element. */ + private void element(Elem e) { + if (e == null || !poolElements) return; + e.clear(); synchronized (elements) { elements.add(e); } + } + + + /** Reads in a tag. Expects buf[off] == '<'. */ + private void readTag() throws IOException, Exn { // Start Tag '<' Name (S Attribute)* S? '>' boolean starttag = true; @@ -293,9 +370,7 @@ public abstract class XML // create the in-memory element representation of this beast // if current.qName == null then this is the root element we're dealing with if (current.qName != null) { - Element next = (Element)elements.remove(false); - if (next == null) next = new Element(); - //next.clear(); // TODO: remove as elements now checked as they're added to the queue + Elem next = element(); next.parent = current; current = next; } @@ -320,9 +395,10 @@ public abstract class XML } // work out the uri of this element - current.uri = current.getUri(current.getPrefix()); - if (current.getUri().equals("") && current.getPrefix() != null) - current.addError(new Exn("undefined prefix '"+current.getPrefix()+"'", Exn.NC, getLine(), getCol())); + String p = current.getPrefix(); + String uri = current.uri(p); + if (uri == null && p != null && !p.equals("")) error(new Exn("undefined prefix '"+current.getPrefix()+"'", Exn.NC, getLine(), getCol())); + else current.uri = uri; } else { // this is an end-of-element tag @@ -352,16 +428,9 @@ public abstract class XML if (endtag) { endElement(current); - // we just closed an 
element, so remove it from the element 'stack' - if (current.getParent() == null) { - // we just finished the root element - current.clear(); - } else { - Element last = current; - current = current.parent; - last.clear(); - elements.append(last); - } + Elem l = current; + current = (Elem)current.parent; + element(l); } } } @@ -435,24 +504,24 @@ public abstract class XML // process attribute if (p != null && p.equals("xmlns")) { - current.addUri(n, v); + current.addPrefix(n, v); } else if (n.equals("xmlns")) { - if (current.getUri().equals("")) { - current.addUri("", v); + if (current.getUri() == null || current.getUri().equals("")) { + current.addPrefix("", v); } else { - current.addError(new Exn("default namespace definition repeated", Exn.NC, getLine(), getCol())); + error(new Exn("default namespace definition repeated", Exn.NC, getLine(), getCol())); } } else { // find attribute uri - u = current.getUri(p); - if (p != null && u.equals("")) current.addError(new Exn("undefined attribute prefix '"+p+"'", Exn.NC, getLine(), getCol())); + u = current.uri(p); + if (u == null && p != null) error(new Exn("undefined attribute prefix '"+p+"'", Exn.NC, getLine(), getCol())); // check to see if attribute is a repeat - for (int i=0; current.len > i; i++) if (n.equals(current.getAttrKey(i)) && u.equals(current.getAttrUri(i))) throw new Exn( + for (int i=0; current.attrSize() > i; i++) if (n.equals(current.getKey(i)) && u.equals(current.getUri(i))) throw new Exn( "attribute name '"+n+"' may not appear more than once in the same element tag", Exn.WFC, getLine(), getCol() ); - current.addAttr(n, v, u); + current.addAttr(n, v, u, p); } } @@ -587,18 +656,24 @@ public abstract class XML } /** - * reads until a < symbol is encountered + * Reads until a < symbol is encountered. 
* @param p If true call the characters(char[],int,int) funciton for the processed characters */ private final void readChars(boolean p) throws IOException, Exn { + boolean lastWhite = false; int ref; for (boolean more = true; more;) { if (!buffer(1)) return; + boolean readWhite = false; buf: for (ref = 0; ref < len; ref++) { + switch (buf[off+ref]) { case '\r': // windows or macos9 newline + if (lastWhite) { readWhite = true; break buf; } + lastWhite = true; + // normalise and process buf[off+ref] = '\n'; ref++; if (p) characters(buf, off, ref); @@ -611,28 +686,49 @@ public abstract class XML break; case '\n': // unix newline + if (lastWhite) { readWhite = true; break buf; } + lastWhite = true; + ref++; if (p) characters(buf, off, ref); off += ref; len -= ref; ref = -1; line++; col = 1; + if (buffer(1) && S(buf[off])) { + readWhite = true; break buf; + } + break; + + case ' ': + case '\t': + if (lastWhite) { readWhite = true; break buf; } + lastWhite = true; break; + case '<': // end of chars section + more = false; + break buf; + case '&': // entity if (p) { if (ref > 0) characters(buf, off, ref); off += ref; len -= ref; ref = -1; readEntity(); } - break; - - case '<': // end of chars section - more = false; - break buf; + default: + lastWhite = false; } } - if (p && ref > 0) characters(buf, off, ref); - off += ref; len -= ref; col += ref; + if (ref > 0) { + if (p) characters(buf, off, ref); + off += ref; len -= ref; col += ref; + } + + if (readWhite) { + readWhitespace(); + more = buffer(1) && !(buf[off] == '<'); + readWhite = false; + } } } @@ -653,8 +749,7 @@ public abstract class XML line++; col = 1; // windows double-char newline; skip the next char - if (!buffer(1)) return; - if (buf[off] == '\n') { off++; len--; } + if (buffer(1) && buf[off] == '\n') { off++; len--; } break; case '\n': // unix newline @@ -673,12 +768,15 @@ public abstract class XML } } - off += ref; len -= ref; col += ref; + if (ref > 0) { + whitespace(buf, off, ref); + off += ref; len -= 
ref; col += ref; + } } } /** - * attempt to fill the buffer. + * Attempt to fill the buffer. * * @param min Minimum number of characters to read (even if we have to block to do it). * @return return false if min can't be reached. @@ -712,20 +810,17 @@ public abstract class XML } - ///////////////////////////////////////////////////////////////////////////////////////////// - // Abstract SAX-Like Interface - ///////////////////////////////////////////////////////////////////////////////////////////// + // SAX-like Interface ///////////////////////////////////////////////////// - /** - * Called when the start of an element is processed. + /** Called when the start of an element is processed. * - *DO NOT store a reference to the Element object, as - * they are reused by XML Parser.
+ *If poolElements == true (default), DO NOT store a + * reference to the Element object, as they are reused by + * XML Parser.
*/ - public abstract void startElement(Element e) throws Exn; + public abstract void startElement(Tree.Element e) throws Exn; - /** - * Represents up to a line of character data. + /** Called when up to a line of character data is processed. * *Newlines are all normalised to the Unix \n as per the XML Spec, * and a newline will only appear as the last character in the passed @@ -737,168 +832,350 @@ public abstract class XML */ public abstract void characters(char[] ch, int start, int length) throws Exn, IOException; - /** Represents up to a line of ignorable whitespace. */ - public abstract void whitespace(char[] ch, int start, int length) throws Exn, IOException; + /** Called when the end of an Tree.Element is processed. */ + public abstract void endElement(Tree.Element e) throws Exn, IOException; - /** Represents the end of an Element. */ - public abstract void endElement(Element e) throws Exn, IOException; + /** Optional callback; called when when up to a line of ignorable whitespace is processed. */ + public void whitespace(char[] ch, int start, int length) throws Exn, IOException {} + /** Optonal callback; called when a recoverable parsing error has been encountered. */ + public void error(Exn e) throws Exn, IOException {} - ///////////////////////////////////////////////////////////////////////////////////////////// - // Inner Classes for Parser Support - ///////////////////////////////////////////////////////////////////////////////////////////// + // DOM-like Interface ///////////////////////////////////////////////////// - /** - * Represents an element in an XML document. Stores a reference to its - * parent, forming a one-way linked list. + /** A Document Object Model extension to the XML Parser. * - * Element objects are reused, so client code making use of them must - * drop their references after the specific element process function - * has returned. + *
To use, instantiate XML.Document and call parse(Reader). The + * full Block tree can then be accessed starting from the root + * element by calling getRoot().
*/ - public static final class Element { + public static class Document { + private final DXML xml; + private Tree.Element root = null; - private static final int DEFAULT_ATTR_SIZE = 10; + /** Creates a new XML.Document. Default initial buffer size is used. */ + public Document() { this(BUFFER_SIZE); } - protected Element parent = null; + /** Creates a new XML.Document with a sepcified initial buffer size. */ + public Document(int bSize) { xml = new DXML(bSize, false); } - protected String uri = null; - protected String localName = null; - protected String qName = null; - protected String prefix = null; + /** Returns the root Tree.Element of the parsed xml document. */ + public Tree.Element getRoot() { return root; } - protected Hash urimap = new Hash(3,3); + /** Sets the root element of this document. */ + public void setRoot(Tree.Element e) { root = e; } - protected String[] keys = new String[DEFAULT_ATTR_SIZE]; - protected String[] vals = new String[DEFAULT_ATTR_SIZE]; - protected String[] uris = new String[DEFAULT_ATTR_SIZE]; - protected int len = 0; + /** Parse given input create the document model. */ + public void parse(Reader r) throws IOException, Exn { xml.parse(r); } - protected Exn[] errors = new Exn[] {}; + /** Returns a character representation of this document. */ + /*public String toXML() throws IOException { FIXME + StringWriter w = new StringWriter(); toXML(w); return w.toString(); + }*/ + + /** Writes the character representation of this document to + * the given writer. Calls root.toXML(Writer).*/ + //public void toXML(Writer w) throws IOException { if (root == null) return; root.toXML(w); } - /** Parent of current element. */ - public Element getParent() { return parent; } + /** Used to hide implementation from public interface. */ + private final class DXML extends XML { + private StringBuffer chars = null; - /** Qualified Name of current element. 
XML Namespace Spec 14-Jan-1999 [6] */ - public String getQName() { return qName; } - - /** LocalPart of current element. XML Namespace Spec 14-Jan-1999 [8] */ - public String getLocalName() { return localName; } + private DXML(int b, boolean r) { super(b, r); } - /** Prefix of current element. Substring of qName. XML Namespace Spec 14-Jan-1999 [7] */ - public String getPrefix() { return prefix; } - - // HACK - public Hash getUriMap() { - Hash map = new Hash(); - for (Element e = this; e != null; e = e.getParent()) { - java.util.Enumeration en = e.urimap.keys(); - while(en.hasMoreElements()) { - String key = (String)en.nextElement(); - String val = getUri(key); - map.put(key, val); + public void startElement(Tree.Element e) { + if (root == null) root = e; + else { + if (chars != null) addText((Tree.Element)e.getParent()); + e.getParent().getChildren().add(e); } } - return map; - } - - /** URI of current tag. XML Namespace Spec 14-Jan-1999 section 1 */ - public String getUri() { return getUri(prefix); } + public void characters(char[] ch, int s, int l) { + if (chars == null) chars = new StringBuffer(); + chars.append(ch, s, l); + } + public void endElement(Tree.Element e) { if (chars != null) addText(e); } - /** URI of a given prefix. Never returns null, instead gives "". */ - public String getUri(String p) { - String ret = null; - for (Element e = this; e != null && ret == null; e = e.getParent()) { - ret = (String)e.urimap.get(p == null ? "" : p); + private void addText(Tree.Element e) { + e.getChildren().add(new Text(e, chars.toString())); chars = null; } - return ret == null ? "" : ret; } + } - /** An array of attribute names. */ - public String getAttrKey(int pos) { return len > pos ? keys[pos] : null; } + // Pull Interface ///////////////////////////////////////////////////////// - /** An array of attribute values. */ - public String getAttrVal(int pos) { return len > pos ? 
vals[pos] : null; } + public static class Stream implements Tree.Stream { + private final SXML xml; + private int depth = -1; - /** An array of attribute uris. */ - public String getAttrUri(int pos) { return len > pos ? uris[pos] : null; } + /** Creates a new XML.Stream. Default initial buffer size is used. */ + public Stream() { this(null, BUFFER_SIZE); } - /** Current number of attributes in the element. */ - public int getAttrLen() { return len; } + public Stream(Reader in) { this(in, BUFFER_SIZE); } - /** Poor performance, but easier to use when speed is not a concern */ - public Hash getAttrHash() { - Hash ret = new Hash(getAttrLen() * 2, 3); - for(int i=0; i