+// Copyright 2004 Adam Megacz, see the COPYING file for licensing [GPL]
+package org.ibex.translators;
+
+import java.util.*;
+import java.net.*;
+import java.io.*;
+import org.ibex.js.*;
+import org.ibex.util.*;
+
+/*
+ * While entities are limited to a subset of Unicode characters,
+ * numeric character references can specify any character. Numeric
+ * character references may be given in decimal or hexadecimal, though
+ * browser support is stronger for decimal references. Decimal
+ * references are of the form &#number; while hexadecimal references
+ * take the case-insensitive form &#xnumber;. Examples of numeric
+ * character references include &#169; or &#xA9; for the copyright
+ * symbol, &#913; or &#x391; for the Greek capital letter alpha, and
+ * &#1575; or &#x627; for the Arabic letter ALEF.
+ *
+ * http://www.htmlhelp.com/reference/html40/entities/special.html
+ * http://www.htmlhelp.com/reference/html40/entities/symbols.html
+ * http://www.htmlhelp.com/reference/html40/entities/latin1.html
+ */
+
+/**
+ * This class parses an InputStream containing HTML and returns it
+ * as an XWT DOM tree. Each HTML Element is returned as a struct,
+ * with the following members:
+ *
+ * Since HTML may have multiple top level elements (unlike XML),
+ * this class will search all top level elements for one with a tag
+ * name 'html'. If such a node is found, only it is returned. If no
+ * top-level element has the tag name 'html', such a node is
+ * fabricated, and all top level elements become the children of
+ * that node, which is then returned.
+ */
+public class HTML {
+
+ private final static String[] noEndTag =
+ new String[] { "area", "base", "basefont", "br", "col", "frame", "hr", "img",
+ "input", "isindex", "link", "meta", "param" };
+
+ /** we keep a char[] around for use by removeRedundantWhitespace() */
+ private static char[] cbuf = null;
+
+ /** we keep a StringBuffer around for use by removeRedundantWhitespace() */
+ private static StringBuffer sbuf = null;
+
+ /** true iff we have encountered an LI more recently than the last OL/UL */
+ private static boolean withinLI = false;
+
+ public static synchronized JS parseReader(Reader r) throws IOException, JSExn {
+ CharStream cs = new CharStream(r);
+ JS h = new JS();
+
+ withinLI = false;
+ h.put("$name", "html");
+
+ try {
+ while (true) parseBody(cs, h, null);
+ } catch (EOFException e) {
+ // continue until we get an EOFException
+ }
+
+ /* FIXME
+ Object[] ids = h.keys();
+ for(int i=0; i<ids.length; i++) {
+ Object el = h.get((String)ids[i]);
+ if (el instanceof JS && "html".equals(((JS)el).get("$name")))
+ return (JS)el;
+ }
+ */
+ return h;
+ }
+
+ /**
+ * Parses a single element and stores it in <tt>h</tt>. The
+ * CharStream should be positioned immediately <i>after</i> the
+ * open bracket.
+ *
+ * If a close tag not matching this open tag is found, the
+ * tagname on the close tag will be returned in order to
+ * facilitate correcting broken HTML. Otherwise, this returns
+ * null.
+ */
+ private static String parseElement(CharStream cs, JS h) throws IOException, JSExn {
+ // scan element name
+ while(Character.isSpace(cs.peek())) cs.get();
+ String elementName = parseElementName(cs);
+
+ boolean saveWithinLI = withinLI;
+ if (elementName.equals("li")) {
+ if (withinLI) {
+ cs.unread(new char[] { '<', 'l', 'i', ' ' });
+ return "li";
+ } else {
+ withinLI = true;
+ }
+ } else if (elementName.equals("ol") || elementName.equals("ul")) {
+ withinLI = false;
+ }
+
+ h.put("$name", elementName);
+ if (elementName.equals("!--")) {
+ h.put("0", parseComment(cs));
+ h.put("$numchildren", new Integer(0));
+ return null;
+ }
+
+ // scan attributes
+ while (cs.peek() != '>') {
+ String name = parseAttributeName(cs);
+ if (name.equals("")) break;
+ String value = expandEntities(parseAttributeValue(cs));
+ h.put(name, value);
+ }
+
+ // eat the close-angle bracket
+ cs.get();
+
+ // bodyless tags return here
+ for(int i=0; i<noEndTag.length; i++)
+ if (noEndTag[i].equals(elementName))
+ return null;
+
+ // scan body
+ String ret = parseBody(cs, h, elementName);
+ withinLI = saveWithinLI;
+ return ret;
+ }
+
+ /**
+ * Parses the body of an element. The CharStream should be
+ * positioned at the character immediately after the right
+ * bracket closing the start-tag
+ */
+ private static String parseBody(CharStream cs, JS h, String elementName) throws IOException, JSExn {
+ String cdata = "";
+ int length = h.get("$numchildren") == null ? 0 : Integer.parseInt(h.get("$numchildren").toString());
+ while(true) {
+ String closetag = null;
+
+ try {
+ char c = cs.get();
+ if (c != '<') { cdata += c; continue; }
+ String expanded = removeRedundantWhitespace(expandEntities(cdata));
+ if (expanded.length() > 0) {
+ h.put(String.valueOf(length), expanded);
+ h.put("$numchildren", new Integer(++length));
+ }
+ cdata = "";
+
+ } catch (EOFException e) {
+ String expanded = removeRedundantWhitespace(expandEntities(cdata));
+ if (expanded.length() > 0) {
+ h.put(String.valueOf(length), expanded);
+ h.put("$numchildren", new Integer(++length));
+ }
+ throw e;
+ }
+
+ try {
+ // scan subelement
+ if (cs.peek() != '/') {
+ JS kid = new JS();
+ closetag = parseElement(cs, kid);
+ h.put(String.valueOf(length), kid);
+ h.put("$numchildren", new Integer(++length));
+
+ // scan close-tag
+ } else {
+ cs.get(); // drop the slash
+ closetag = parseElementName(cs);
+ while(cs.get() != '>');
+ }
+ } catch (EOFException e) {
+ throw e;
+
+ }
+
+ if (closetag != null)
+ return closetag.equals(elementName) ? null : closetag;
+ }
+ }
+
+ /** Parses an element name and returns it. The CharStream should
+ * be positioned at the first character of the name.
+ */
+ private static String parseElementName(CharStream cs) throws IOException, JSExn {
+ String ret = "";
+ while (cs.peek() != '>' && !Character.isSpace(cs.peek())) ret += cs.get();
+ return ret.toLowerCase();
+ }
+
+ /** Parses an attribute name and returns it. The CharStream should
+ * be positioned at the first character of the name, possibly
+ * with intervening whitespace.
+ */
+ private static String parseAttributeName(CharStream cs) throws IOException, JSExn {
+ while(Character.isSpace(cs.peek())) cs.get();
+ String ret = "";
+ while(!Character.isSpace(cs.peek()) && cs.peek() != '=' && cs.peek() != '>') ret += cs.get();
+ return ret.toLowerCase();
+ }
+
+ /** Parses an attribute value and returns it. The CharStream
+ * should be positioned at the equals sign, possibly with
+ * intervening whitespace.
+ */
+ private static String parseAttributeValue(CharStream cs) throws IOException, JSExn {
+
+ // eat whitespace and equals sign
+ while(Character.isSpace(cs.peek())) cs.get();
+ if (cs.peek() != '=') return "";
+ cs.get();
+ while(Character.isSpace(cs.peek())) cs.get();
+
+ boolean doublequoted = false;
+ boolean singlequoted = false;
+ String ret = "";
+
+ if (cs.peek() == '\"') { doublequoted = true; cs.get(); }
+ else if (cs.peek() == '\'') { singlequoted = true; cs.get(); }
+
+ while(true) {
+ char c = cs.peek();
+ if (!doublequoted && !singlequoted && (Character.isSpace(c) || c == '>')) break;
+ if (singlequoted && c == '\'') { cs.get(); break; }
+ if (doublequoted && c == '\"') { cs.get(); break; }
+ ret += cs.get();
+ }
+ return ret;
+ }
+
+ /** Parses a comment and returns its body. The CharStream should
+ * be positioned immediately after the <!--
+ */
+ private static String parseComment(CharStream cs) throws IOException, JSExn {
+ int dashes = 0;
+ String ret = "";
+ while(true) {
+ char c = cs.get();
+ if (c == '>' && dashes == 2) return ret.substring(0, ret.length() - 2);
+ if (c == '-') dashes++;
+ else dashes = 0;
+ ret += c;
+ }
+ }
+
+ /** Expands all SGML entities in string <tt>s</tt> */
+ public static String expandEntities(String s) throws IOException, JSExn {
+ if (s.indexOf('&') == -1) return s;
+ StringBuffer sb = new StringBuffer();
+ int i=0;
+ int nextamp = 0;
+ while(nextamp != -1) {
+ nextamp = s.indexOf('&', i);
+ sb.append(nextamp == -1 ? s.substring(i) : s.substring(i, nextamp));
+ if (nextamp == -1) break;
+ if (s.regionMatches(nextamp, "&", 0, 5)) {
+ sb.append("&");
+ i = nextamp + 5;
+ } else if (s.regionMatches(nextamp, ">", 0, 4)) {
+ sb.append(">");
+ i = nextamp + 4;
+ } else if (s.regionMatches(nextamp, "<", 0, 4)) {
+ sb.append("<");
+ i = nextamp + 4;
+ } else if (s.regionMatches(nextamp, """, 0, 6)) {
+ sb.append("\"");
+ i = nextamp + 6;
+ } else if (s.regionMatches(nextamp, " ", 0, 6)) {
+ // FEATURE: perhaps we should distinguish this somehow
+ sb.append(" ");
+ i = nextamp + 6;
+ } else {
+ sb.append("&");
+ i = nextamp + 1;
+ }
+ }
+ return sb.toString();
+ }
+
+ /** removes all redundant whitespace */
+ private static String removeRedundantWhitespace(String s) throws JSExn {
+
+ if (s.indexOf(' ') == -1 && s.indexOf('\n') == -1 && s.indexOf('\t') == -1 && s.indexOf('\r') == -1) return s;
+
+ int len = s.length();
+ if (cbuf == null || cbuf.length < len) {
+ cbuf = new char[len * 2];
+ sbuf = new StringBuffer(len * 2);
+ }
+ sbuf.setLength(0);
+ s.getChars(0, len, cbuf, 0);
+
+ int last = 0;
+ boolean lastWasWhitespace = false;
+ for(int i=0; i<len; i++) {
+ boolean lastlast = lastWasWhitespace;
+ switch(cbuf[i]) {
+ case '\n': case '\r': case '\t':
+ cbuf[i] = ' ';
+ case ' ':
+ lastWasWhitespace = true;
+ break;
+ default:
+ lastWasWhitespace = false;
+ break;
+ }
+ if (lastWasWhitespace && lastlast) {
+ if (last != i) sbuf.append(cbuf, last, i - last);
+ last = i+1;
+ }
+ }
+
+ if (last != len) sbuf.append(cbuf, last, len - last);
+ return sbuf.toString().trim();
+ }
+
+ // CharStream /////////////////////////////////////////////////////////////////////
+
+ private static class CharStream extends PushbackReader {
+ public CharStream(Reader r) { super(r, 1024); }
+
+ public char peek() throws IOException {
+ char c = get();
+ unread(c);
+ return c;
+ }
+
+ public char get() throws IOException {
+ int i = read();
+ if (i == -1) throw new EOFException();
+ return (char)i;
+ }
+ }
+
+}
+