From: megacz Date: Fri, 30 Jan 2004 06:48:10 +0000 (+0000) Subject: 2002/06/14 22:59:18 X-Git-Tag: RC3~1683 X-Git-Url: http://git.megacz.com/?a=commitdiff_plain;h=0a13b4988d4952f60f0b28770e6a2240315aecd3;p=org.ibex.core.git 2002/06/14 22:59:18 darcs-hash:20040130064810-2ba56-f44e3d94d5b470b59adf6bdf6647436e49432550.gz --- diff --git a/CHANGES b/CHANGES index 08308f4..bcee019 100644 --- a/CHANGES +++ b/CHANGES @@ -191,3 +191,5 @@ 05-Jun megacz AWT.java: fix for non-US keyboards +14-Jun megacz HTML.java, XWT.java: introduced xwt.parseHTML() + diff --git a/src/org/xwt/HTML.java b/src/org/xwt/HTML.java new file mode 100644 index 0000000..c8bcc6a --- /dev/null +++ b/src/org/xwt/HTML.java @@ -0,0 +1,324 @@ +// Copyright 2002 Adam Megacz, see the COPYING file for licensing [GPL] +package org.xwt; + +import org.xwt.util.*; +import java.util.*; +import java.net.*; +import java.io.*; + +/* + While entities are limited to a subset of Unicode characters , + numeric character references can specify any character. Numeric + character references may be given in decimal or hexadecimal, though + browser support is stronger for decimal references. Decimal + references are of the form &#number; while hexadecimal references + take the case-insensitive form &#xnumber;. Examples of numeric + character references include © or © for the copyright + symbol, Α or Α for the Greek capital letter alpha, and + ا or ا for the Arabic letter ALEF. + + http://www.htmlhelp.com/reference/html40/entities/special.html + http://www.htmlhelp.com/reference/html40/entities/symbols.html + http://www.htmlhelp.com/reference/html40/entities/latin1.html + + FIXME FIXME FIXME:
  • tags close enclosing
  • tags +*/ + +/** + * This class parses an InputStream containing HTML and returns it + * as an XWT DOM tree. Each HTML Element is returned as a struct, + * with the following members: + * + * Since HTML may have multiple top level elements (unlike XML), + * this class will search all top level elements for one with a tag + * name 'html'. If such a node is found, only it is returned. If no + * top-level element has the tag name 'html', such a node is + * fabricated, and all top level elements become the children of + * that node, which is then returned. + */ +public class HTML { + + // FIXME: fill in + private final static String[] bodylessTags = new String[] { "br", "hr", "input", "img", "isindex" }; + + /** we keep a char[] around for use by removeRedundantWhitespace() */ + private static char[] cbuf = null; + + /** we keep a StringBuffer around for use by removeRedundantWhitespace() */ + private static StringBuffer sbuf = null; + + public static synchronized JSObject parseReader(Reader r) throws IOException { + CharStream cs = new CharStream(r); + JSObject h = new JSObject(); + + h.put("$name", "html"); + + try { + while (true) parseBody(cs, h, null); + } catch (EOFException e) { + // continue until we get an EOFException + } + + Object[] ids = h.getIds(); + for(int i=0; ih. The + * CharStream should be positioned immediately after the + * open bracket. + * + * If a close tag not matching this open tag is found, the + * tagname on the close tag will be returned in order to + * facilitate correcting broken HTML. Otherwise, this returns + * null. + */ + private static String parseElement(CharStream cs, JSObject h) throws IOException { + + // scan element name + while(Character.isSpace(cs.peek())) cs.get(); + String elementName = parseElementName(cs); + + h.put("$name", elementName); + if (elementName.equals("!--")) { + h.put("0", parseComment(cs)); + h.put("$numchildren", new Integer(0)); + return null; + } + + // scan attributes + while (cs.peek() != '>') { + String name = parseAttributeName(cs); + if (name.equals("")) break; + String value = expandEntities(parseAttributeValue(cs)); + h.put(name, value); + } + + // eat the close-angle bracket + cs.get(); + + // bodyless tags return here + for(int i=0; i 0) { + h.put(String.valueOf(length), expanded); + h.put("$numchildren", new Integer(++length)); + } + cdata = ""; + + } catch (EOFException e) { + String expanded = removeRedundantWhitespace(expandEntities(cdata)); + if (expanded.length() > 0) { + h.put(String.valueOf(length), expanded); + h.put("$numchildren", new Integer(++length)); + } + throw e; + } + + try { + // scan subelement + if (cs.peek() != '/') { + JSObject kid = new JSObject(); + closetag = parseElement(cs, kid); + h.put(String.valueOf(length), kid); + h.put("$numchildren", new Integer(++length)); + + // scan close-tag + } else { + cs.get(); // drop the slash + closetag = parseElementName(cs); + while(cs.get() != '>'); + } + } catch (EOFException e) { + throw e; + + } + + if (closetag != null) + return closetag.equals(elementName) ? null : closetag; + } + } + + /** Parses an element name and returns it. The CharStream should + * be positioned at the first character of the name. + */ + private static String parseElementName(CharStream cs) throws IOException { + String ret = ""; + while (cs.peek() != '>' && !Character.isSpace(cs.peek())) ret += cs.get(); + return ret.toLowerCase(); + } + + /** Parses an attribute name and returns it. The CharStream should + * be positioned at the first character of the name, possibly + * with intervening whitespace. + */ + private static String parseAttributeName(CharStream cs) throws IOException { + while(Character.isSpace(cs.peek())) cs.get(); + String ret = ""; + while(!Character.isSpace(cs.peek()) && cs.peek() != '=' && cs.peek() != '>') ret += cs.get(); + return ret.toLowerCase(); + } + + /** Parses an attribute value and returns it. The CharStream + * should be positioned at the equals sign, possibly with + * intervening whitespace. + */ + private static String parseAttributeValue(CharStream cs) throws IOException { + + // eat whitespace and equals sign + while(Character.isSpace(cs.peek())) cs.get(); + if (cs.peek() != '=') return ""; + cs.get(); + while(Character.isSpace(cs.peek())) cs.get(); + + boolean doublequoted = false; + boolean singlequoted = false; + String ret = ""; + + if (cs.peek() == '\"') { doublequoted = true; cs.get(); } + else if (cs.peek() == '\'') { singlequoted = true; cs.get(); } + + while(true) { + char c = cs.peek(); + if (!doublequoted && !singlequoted && (Character.isSpace(c) || c == '>')) break; + if (singlequoted && c == '\'') { cs.get(); break; } + if (doublequoted && c == '\"') { cs.get(); break; } + ret += cs.get(); + } + return ret; + } + + /** Parses a comment and returns its body. The CharStream should + * be positioned immediately after the <!-- + */ + private static String parseComment(CharStream cs) throws IOException { + int dashes = 0; + String ret = ""; + while(true) { + char c = cs.get(); + if (c == '>' && dashes == 2) return ret.substring(0, ret.length() - 2); + if (c == '-') dashes++; + else dashes = 0; + ret += c; + } + } + + /** Expands all SGML entities in string s */ + public static String expandEntities(String s) throws IOException { + if (s.indexOf('&') == -1) return s; + StringBuffer sb = new StringBuffer(); + int i=0; + int nextamp = 0; + while(nextamp != -1) { + nextamp = s.indexOf('&', i); + sb.append(nextamp == -1 ? s.substring(i) : s.substring(i, nextamp)); + if (nextamp == -1) break; + if (s.regionMatches(nextamp, "&", 0, 5)) { + sb.append("&"); + i = nextamp + 5; + } else if (s.regionMatches(nextamp, ">", 0, 4)) { + sb.append(">"); + i = nextamp + 4; + } else if (s.regionMatches(nextamp, "<", 0, 4)) { + sb.append("<"); + i = nextamp + 4; + } else if (s.regionMatches(nextamp, """, 0, 6)) { + sb.append("\""); + i = nextamp + 6; + } else if (s.regionMatches(nextamp, " ", 0, 6)) { + // FIXME: should have a way to indicate this... + sb.append(" "); + i = nextamp + 6; + } else { + sb.append("&"); + i = nextamp + 1; + } + } + return sb.toString(); + } + + // FIXME double check this + /** removes all redundant whitespace */ + private static String removeRedundantWhitespace(String s) { + + if (s.indexOf(' ') == -1 && s.indexOf('\n') == -1 && s.indexOf('\t') == -1 && s.indexOf('\r') == -1) return s; + + int len = s.length(); + if (cbuf == null || cbuf.length < len) { + cbuf = new char[len * 2]; + sbuf = new StringBuffer(len * 2); + } + sbuf.setLength(0); + s.getChars(0, len, cbuf, 0); + + int last = 0; + boolean lastWasWhitespace = false; + for(int i=0; i"); Scriptable s = (Scriptable)o; Object[] keys = s.getIds();