-// Copyright 2004 Adam Megacz, see the COPYING file for licensing [GPL]
-package org.xwt.translators;
-
-import java.io.*;
-import org.ibex.js.*;
-
-/*
- * While entities are limited to a subset of Unicode characters,
- * numeric character references can specify any character. Numeric
- * character references may be given in decimal or hexadecimal, though
- * browser support is stronger for decimal references. Decimal
- * references are of the form &#number; while hexadecimal references
- * take the case-insensitive form &#xnumber;. Examples of numeric
- * character references include &#169; or &#xA9; for the copyright
- * symbol, &#913; or &#x391; for the Greek capital letter alpha, and
- * &#1575; or &#x627; for the Arabic letter ALEF.
- *
- * http://www.htmlhelp.com/reference/html40/entities/special.html
- * http://www.htmlhelp.com/reference/html40/entities/symbols.html
- * http://www.htmlhelp.com/reference/html40/entities/latin1.html
- */
-
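-/**
- * This class parses HTML read from a Reader and returns it
- * as an Ibex DOM tree. Each HTML Element is returned as a struct,
- * with the following members:
- *
- *   $name         the lower-cased tag name ("!--" for a comment)
- *   $numchildren  the number of children (not set on an element
- *                 that has no children; 0 on a comment)
- *   0, 1, 2, ...  the children in document order: text as a String,
- *                 elements as nested structs (on a comment, member
- *                 0 holds the comment text)
- *   any other key an attribute, keyed by its lower-cased name, with
- *                 entities in the value expanded
- *
- * Since HTML may have multiple top level elements (unlike XML),
- * this class will search all top level elements for one with a tag
- * name 'html'. If such a node is found, only it is returned. If no
- * top-level element has the tag name 'html', such a node is
- * fabricated, and all top level elements become the children of
- * that node, which is then returned.
- *
- * Note: the search for an existing top-level 'html' element is
- * currently disabled (see the FIXME in parseReader), so the
- * fabricated node is always the one returned.
- */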
-public class HTML {
-
- private final static String[] noEndTag =
- new String[] { "area", "base", "basefont", "br", "col", "frame", "hr", "img",
- "input", "isindex", "link", "meta", "param" };
-
- /** we keep a char[] around for use by removeRedundantWhitespace() */
- private static char[] cbuf = null;
-
- /** we keep a StringBuffer around for use by removeRedundantWhitespace() */
- private static StringBuffer sbuf = null;
-
- /** true iff we have encountered an LI more recently than the last OL/UL */
- private static boolean withinLI = false;
-
- /**
-  * Parses the HTML supplied by <tt>r</tt> into a tree of JS objects.
-  * A root node whose "$name" is "html" is created and everything read
-  * from <tt>r</tt> is parsed into it as children.
-  *
-  * @param r the source of the HTML text
-  * @return the root node
-  */
- public static synchronized JS parseReader(Reader r) throws IOException, JSExn {
-     CharStream stream = new CharStream(r);
-     JS root = new JS();
-
-     // start every document outside of any list item
-     withinLI = false;
-     root.put("$name", "html");
-
-     try {
-         // parseBody() returns each time it meets a close tag, so it is simply
-         // called again; the loop is left only through the EOFException below
-         for (;;) {
-             parseBody(stream, root, null);
-         }
-     } catch (EOFException endOfInput) {
-         // expected: the input has been consumed completely
-     }
-
-     /* FIXME: return an existing top-level 'html' element instead of the fabricated one
-     Object[] ids = root.keys();
-     for(int i=0; i<ids.length; i++) {
-         Object el = root.get((String)ids[i]);
-         if (el instanceof JS && "html".equals(((JS)el).get("$name")))
-             return (JS)el;
-     }
-     */
-     return root;
- }
-
- /**
- * Parses a single element and stores it in <tt>h</tt>. The
- * CharStream should be positioned immediately <i>after</i> the
- * open bracket.
- *
- * If a close tag not matching this open tag is found, the
- * tagname on the close tag will be returned in order to
- * facilitate correcting broken HTML. Otherwise, this returns
- * null.
- */
- private static String parseElement(CharStream cs, JS h) throws IOException, JSExn {
- // scan element name
- while(Character.isSpace(cs.peek())) cs.get();
- String elementName = parseElementName(cs);
-
- boolean saveWithinLI = withinLI;
- if (elementName.equals("li")) {
- if (withinLI) {
- cs.unread(new char[] { '<', 'l', 'i', ' ' });
- return "li";
- } else {
- withinLI = true;
- }
- } else if (elementName.equals("ol") || elementName.equals("ul")) {
- withinLI = false;
- }
-
- h.put("$name", elementName);
- if (elementName.equals("!--")) {
- h.put("0", parseComment(cs));
- h.put("$numchildren", new Integer(0));
- return null;
- }
-
- // scan attributes
- while (cs.peek() != '>') {
- String name = parseAttributeName(cs);
- if (name.equals("")) break;
- String value = expandEntities(parseAttributeValue(cs));
- h.put(name, value);
- }
-
- // eat the close-angle bracket
- cs.get();
-
- // bodyless tags return here
- for(int i=0; i<noEndTag.length; i++)
- if (noEndTag[i].equals(elementName))
- return null;
-
- // scan body
- String ret = parseBody(cs, h, elementName);
- withinLI = saveWithinLI;
- return ret;
- }
-
- /**
-  * Parses the body of an element into <tt>h</tt>. The CharStream
-  * should be positioned at the character immediately after the right
-  * bracket closing the start-tag.
-  *
-  * Children are stored in <tt>h</tt> under "0", "1", ... continuing
-  * from the current value of "$numchildren", which is kept up to date.
-  * Runs of text have their entities expanded and redundant whitespace
-  * removed; runs that end up empty are not stored.
-  *
-  * @param elementName the name of the element whose body this is; may
-  *        be null, in which case no close tag matches
-  * @return null if the body was ended by the close tag for
-  *         <tt>elementName</tt>, otherwise the name of the close tag
-  *         that ended it
-  * @throws EOFException when the input ends; text and elements parsed
-  *         up to that point have already been stored in <tt>h</tt>
-  */
- private static String parseBody(CharStream cs, JS h, String elementName) throws IOException, JSExn {
-     StringBuffer cdata = new StringBuffer();
-     Object numChildren = h.get("$numchildren");
-     int length = numChildren == null ? 0 : Integer.parseInt(numChildren.toString());
-
-     while (true) {
-         // gather character data up to the next open bracket
-         try {
-             for (char c = cs.get(); c != '<'; c = cs.get()) cdata.append(c);
-         } catch (EOFException e) {
-             // end of input: keep the text gathered so far, then let the EOF unwind the parser
-             appendText(h, length, cdata.toString());
-             throw e;
-         }
-         length = appendText(h, length, cdata.toString());
-         cdata.setLength(0);
-
-         String closetag;
-         if (cs.peek() == '/') {
-             // close tag: read its name and skip the rest of the tag
-             cs.get();
-             closetag = parseElementName(cs);
-             while (cs.get() != '>') { }
-         } else {
-             // subelement
-             JS kid = new JS();
-             try {
-                 closetag = parseElement(cs, kid);
-             } catch (EOFException e) {
-                 // an element still open at end of input must not be lost
-                 appendElement(h, length, kid);
-                 throw e;
-             }
-             length = appendElement(h, length, kid);
-         }
-
-         if (closetag != null)
-             return closetag.equals(elementName) ? null : closetag;
-     }
- }
-
- /** Stores <tt>cdata</tt>, cleaned up, as child number <tt>length</tt> of <tt>h</tt> unless it is empty; returns the new child count. */
- private static int appendText(JS h, int length, String cdata) throws IOException, JSExn {
-     String expanded = removeRedundantWhitespace(expandEntities(cdata));
-     if (expanded.length() == 0) return length;
-     h.put(String.valueOf(length), expanded);
-     h.put("$numchildren", new Integer(length + 1));
-     return length + 1;
- }
-
- /** Stores <tt>kid</tt> as child number <tt>length</tt> of <tt>h</tt> if it was given a "$name"; returns the new child count. */
- private static int appendElement(JS h, int length, JS kid) throws JSExn {
-     // parseElement() returns before naming the node when it refuses an LI nested
-     // in an open LI (or when input ends inside the name); such a node is not a child
-     if (kid.get("$name") == null) return length;
-     h.put(String.valueOf(length), kid);
-     h.put("$numchildren", new Integer(length + 1));
-     return length + 1;
- }
-
- /**
-  * Parses an element name and returns it in lower case. The
-  * CharStream should be positioned at the first character of the
-  * name; reading stops in front of the first whitespace character
-  * or '>', which is left on the stream.
-  *
-  * @return the lower-cased name, possibly empty
-  */
- private static String parseElementName(CharStream cs) throws IOException, JSExn {
-     StringBuffer name = new StringBuffer();
-     for (char c = cs.peek(); c != '>' && !Character.isSpace(c); c = cs.peek()) {
-         name.append(cs.get());
-     }
-     // Fixed locale: the result is compared with ASCII names such as "li" and "img",
-     // and the default locale's rules (e.g. Turkish dotless i) would break that
-     return name.toString().toLowerCase(java.util.Locale.ENGLISH);
- }
-
- /**
-  * Parses an attribute name and returns it in lower case. The
-  * CharStream should be positioned at the first character of the
-  * name, possibly with intervening whitespace, which is skipped.
-  * Reading stops in front of the first whitespace character, '=' or
-  * '>', which is left on the stream.
-  *
-  * @return the lower-cased name; empty if no name characters were found
-  */
- private static String parseAttributeName(CharStream cs) throws IOException, JSExn {
-     while (Character.isSpace(cs.peek())) cs.get();
-     StringBuffer name = new StringBuffer();
-     for (char c = cs.peek(); !Character.isSpace(c) && c != '=' && c != '>'; c = cs.peek()) {
-         name.append(cs.get());
-     }
-     // Fixed locale so that attribute keys do not depend on the default locale
-     // (e.g. "WIDTH" must become "width" under a Turkish locale as well)
-     return name.toString().toLowerCase(java.util.Locale.ENGLISH);
- }
-
- /** Parses an attribute value and returns it. The CharStream
- * should be positioned at the equals sign, possibly with
- * intervening whitespace.
- */
- private static String parseAttributeValue(CharStream cs) throws IOException, JSExn {
-
- // eat whitespace and equals sign
- while(Character.isSpace(cs.peek())) cs.get();
- if (cs.peek() != '=') return "";
- cs.get();
- while(Character.isSpace(cs.peek())) cs.get();
-
- boolean doublequoted = false;
- boolean singlequoted = false;
- String ret = "";
-
- if (cs.peek() == '\"') { doublequoted = true; cs.get(); }
- else if (cs.peek() == '\'') { singlequoted = true; cs.get(); }
-
- while(true) {
- char c = cs.peek();
- if (!doublequoted && !singlequoted && (Character.isSpace(c) || c == '>')) break;
- if (singlequoted && c == '\'') { cs.get(); break; }
- if (doublequoted && c == '\"') { cs.get(); break; }
- ret += cs.get();
- }
- return ret;
- }
-
- /**
-  * Parses a comment and returns its body. The CharStream should be
-  * positioned immediately after the opening "&lt;!--". Everything up
-  * to and including the terminating '>' is consumed.
-  *
-  * The comment ends at the first '>' that directly follows two or
-  * more dashes; the last two of those dashes are not part of the
-  * returned body.
-  *
-  * @return the text of the comment
-  */
- private static String parseComment(CharStream cs) throws IOException, JSExn {
-     int dashes = 0;                        // dashes seen directly before the current char
-     StringBuffer body = new StringBuffer();
-     while (true) {
-         char c = cs.get();
-         // ">= 2" rather than "== 2": a comment closed with three or more dashes
-         // must end here too, instead of running on to a later terminator
-         if (c == '>' && dashes >= 2) {
-             body.setLength(body.length() - 2);
-             return body.toString();
-         }
-         if (c == '-') dashes++;
-         else dashes = 0;
-         body.append(c);
-     }
- }
-
- /** Expands all SGML entities in string <tt>s</tt> */
- public static String expandEntities(String s) throws IOException, JSExn {
- if (s.indexOf('&') == -1) return s;
- StringBuffer sb = new StringBuffer();
- int i=0;
- int nextamp = 0;
- while(nextamp != -1) {
- nextamp = s.indexOf('&', i);
- sb.append(nextamp == -1 ? s.substring(i) : s.substring(i, nextamp));
- if (nextamp == -1) break;
- if (s.regionMatches(nextamp, "&", 0, 5)) {
- sb.append("&");
- i = nextamp + 5;
- } else if (s.regionMatches(nextamp, ">", 0, 4)) {
- sb.append(">");
- i = nextamp + 4;
- } else if (s.regionMatches(nextamp, "<", 0, 4)) {
- sb.append("<");
- i = nextamp + 4;
- } else if (s.regionMatches(nextamp, """, 0, 6)) {
- sb.append("\"");
- i = nextamp + 6;
- } else if (s.regionMatches(nextamp, " ", 0, 6)) {
- // FEATURE: perhaps we should distinguish this somehow
- sb.append(" ");
- i = nextamp + 6;
- } else {
- sb.append("&");
- i = nextamp + 1;
- }
- }
- return sb.toString();
- }
-
- /** removes all redundant whitespace */
- private static String removeRedundantWhitespace(String s) throws JSExn {
-
- if (s.indexOf(' ') == -1 && s.indexOf('\n') == -1 && s.indexOf('\t') == -1 && s.indexOf('\r') == -1) return s;
-
- int len = s.length();
- if (cbuf == null || cbuf.length < len) {
- cbuf = new char[len * 2];
- sbuf = new StringBuffer(len * 2);
- }
- sbuf.setLength(0);
- s.getChars(0, len, cbuf, 0);
-
- int last = 0;
- boolean lastWasWhitespace = false;
- for(int i=0; i<len; i++) {
- boolean lastlast = lastWasWhitespace;
- switch(cbuf[i]) {
- case '\n': case '\r': case '\t':
- cbuf[i] = ' ';
- case ' ':
- lastWasWhitespace = true;
- break;
- default:
- lastWasWhitespace = false;
- break;
- }
- if (lastWasWhitespace && lastlast) {
- if (last != i) sbuf.append(cbuf, last, i - last);
- last = i+1;
- }
- }
-
- if (last != len) sbuf.append(cbuf, last, len - last);
- return sbuf.toString().trim();
- }
-
- // CharStream /////////////////////////////////////////////////////////////////////
-
- /**
-  * A PushbackReader that hands out single chars, signals the end of
-  * the stream with an EOFException and offers one char of lookahead.
-  */
- private static class CharStream extends PushbackReader {
-
-     /** wraps <tt>r</tt>, allowing up to 1024 chars to be pushed back */
-     public CharStream(Reader r) {
-         super(r, 1024);
-     }
-
-     /** returns the next char without consuming it; throws EOFException at end of stream */
-     public char peek() throws IOException {
-         int next = read();
-         if (next == -1) {
-             throw new EOFException();
-         }
-         unread(next);
-         return (char) next;
-     }
-
-     /** consumes and returns the next char; throws EOFException at end of stream */
-     public char get() throws IOException {
-         int next = read();
-         if (next == -1) {
-             throw new EOFException();
-         }
-         return (char) next;
-     }
- }
-
-}
-