1 // Copyright 2002 Adam Megacz, see the COPYING file for licensing [GPL]
10 While entities are limited to a subset of Unicode characters,
11 numeric character references can specify any character. Numeric
12 character references may be given in decimal or hexadecimal, though
13 browser support is stronger for decimal references. Decimal
14 references are of the form &#number; while hexadecimal references
15 take the case-insensitive form &#xnumber;. Examples of numeric
16 character references include &#169; or &#xA9; for the copyright
17 symbol, &#913; or &#x391; for the Greek capital letter alpha, and
18 &#1575; or &#x627; for the Arabic letter ALEF.
20 http://www.htmlhelp.com/reference/html40/entities/special.html
21 http://www.htmlhelp.com/reference/html40/entities/symbols.html
22 http://www.htmlhelp.com/reference/html40/entities/latin1.html
24 FIXME FIXME FIXME: <li> tags close enclosing <li> tags
28 * This class parses an InputStream containing HTML and returns it
29 * as an XWT DOM tree. Each HTML Element is returned as a struct,
30 * with the following members:
32 * Since HTML may have multiple top level elements (unlike XML),
33 * this class will search all top level elements for one with a tag
34 * name 'html'. If such a node is found, only it is returned. If no
35 * top-level element has the tag name 'html', such a node is
36 * fabricated, and all top level elements become the children of
37 * that node, which is then returned.
// Tag names that parseElement() compares the current element name against
// in its "bodyless tags return here" step.
42 private final static String[] bodylessTags = new String[] { "br", "hr", "input", "img", "isindex" };
44 /** we keep a char[] around for use by removeRedundantWhitespace() */
// Starts out null; removeRedundantWhitespace() allocates it on first use and
// re-allocates it (at twice the input length) whenever it is too small.
45 private static char[] cbuf = null;
47 /** we keep a StringBuffer around for use by removeRedundantWhitespace() */
// Re-created together with cbuf. NOTE(review): this is shared static state;
// the only synchronization visible in this file is on parseReader() -- confirm
// no other entry point reaches removeRedundantWhitespace().
48 private static StringBuffer sbuf = null;
50 public static synchronized JSObject parseReader(Reader r) throws IOException {
// Entry point: wraps the reader in a CharStream and parses its content as the
// children of a fabricated root node whose "$name" is "html".
51 CharStream cs = new CharStream(r);
52 JSObject h = new JSObject();
54 h.put("$name", "html");
// parseBody() is called repeatedly with a null element name (there is no
// enclosing open tag at top level); the loop only ends via EOFException.
57 while (true) parseBody(cs, h, null);
58 } catch (EOFException e) {
59 // continue until we get an EOFException
// Scan every entry of the fabricated root for a JSObject child that is itself
// named "html". Per the class comment such a child is what gets returned;
// the return statements themselves are not visible in this excerpt.
62 Object[] ids = h.getIds();
63 for(int i=0; i<ids.length; i++) {
64 Object el = h.get((String)ids[i]);
65 if (el instanceof JSObject && "html".equals(((JSObject)el).get("$name")))
73 * Parses a single element and stores it in <tt>h</tt>. The
74 * CharStream should be positioned immediately <i>after</i> the
77 * If a close tag not matching this open tag is found, the
78 * tagname on the close tag will be returned in order to
79 * facilitate correcting broken HTML. Otherwise, this returns
82 private static String parseElement(CharStream cs, JSObject h) throws IOException {
// Skip whitespace, then read the tag name (parseElementName() lower-cases it)
// and record it on the node under "$name".
// NOTE(review): Character.isSpace(char) is deprecated in the JDK in favour of
// Character.isWhitespace(char); the two do not classify exactly the same set.
85 while(Character.isSpace(cs.peek())) cs.get();
86 String elementName = parseElementName(cs);
88 h.put("$name", elementName);
// A tag named "!--" is a comment: its text becomes child "0" and the child
// count is set to 0.
89 if (elementName.equals("!--")) {
90 h.put("0", parseComment(cs));
91 h.put("$numchildren", new Integer(0));
// Attribute loop: runs until '>' is the next character; an empty attribute
// name also ends it. Attribute values have their entities expanded.
96 while (cs.peek() != '>') {
97 String name = parseAttributeName(cs);
98 if (name.equals("")) break;
99 String value = expandEntities(parseAttributeValue(cs));
103 // eat the close-angle bracket
106 // bodyless tags return here
// Linear scan of bodylessTags for an exact match on the lower-cased tag name.
107 for(int i=0; i<bodylessTags.length; i++)
108 if (bodylessTags[i].equals(elementName))
// Any other element: parse its content; parseBody()'s result (null, or the
// name of an unmatched close tag) is handed straight back to the caller.
112 return parseBody(cs, h, elementName);
116 * Parses the body of an element. The CharStream should be
117 * positioned at the character immediately after the right
118 * bracket closing the start-tag
120 private static String parseBody(CharStream cs, JSObject h, String elementName) throws IOException {
// Children are stored on h under the keys "0", "1", ... and counted in
// "$numchildren"; start from the existing count if h already has one.
122 int length = h.get("$numchildren") == null ? 0 : Integer.parseInt(h.get("$numchildren").toString());
// Set either from a child's parseElement() result or from a close tag read
// directly below.
124 String closetag = null;
// Every character other than '<' is accumulated as character data.
128 if (c != '<') { cdata += c; continue; }
// At '<': flush the accumulated text (entities expanded, whitespace collapsed)
// as the next numbered child, but only if something is left after collapsing.
129 String expanded = removeRedundantWhitespace(expandEntities(cdata));
130 if (expanded.length() > 0) {
131 h.put(String.valueOf(length), expanded);
132 h.put("$numchildren", new Integer(++length));
// End of input while reading text: flush the pending text the same way.
136 } catch (EOFException e) {
137 String expanded = removeRedundantWhitespace(expandEntities(cdata));
138 if (expanded.length() > 0) {
139 h.put(String.valueOf(length), expanded);
140 h.put("$numchildren", new Integer(++length));
// Not a close tag: parse a child element into a fresh node and append it as
// the next numbered child.
147 if (cs.peek() != '/') {
148 JSObject kid = new JSObject();
149 closetag = parseElement(cs, kid);
150 h.put(String.valueOf(length), kid);
151 h.put("$numchildren", new Integer(++length));
// Close tag: consume the '/', read the tag name, then discard everything up to
// and including the next '>'.
155 cs.get(); // drop the slash
156 closetag = parseElementName(cs);
157 while(cs.get() != '>');
159 } catch (EOFException e) {
// Returns null when the close tag matches this element, otherwise the name of
// the unmatched close tag. When elementName is null (the top-level call from
// parseReader) equals() is false, so the close-tag name is returned.
164 if (closetag != null)
165 return closetag.equals(elementName) ? null : closetag;
169 /** Parses an element name and returns it. The CharStream should
170 * be positioned at the first character of the name.
172 private static String parseElementName(CharStream cs) throws IOException {
// Append characters to the name until the next character is '>' or
// whitespace; the loop tests cs.peek() and consumes only via cs.get().
174 while (cs.peek() != '>' && !Character.isSpace(cs.peek())) ret += cs.get();
// NOTE(review): toLowerCase() without a Locale uses the default locale, which
// can alter ASCII tag names (e.g. 'I' under a Turkish locale) -- confirm
// whether a fixed locale is wanted here.
175 return ret.toLowerCase();
178 /** Parses an attribute name and returns it. The CharStream should
179 * be positioned at the first character of the name, possibly
180 * with intervening whitespace.
182 private static String parseAttributeName(CharStream cs) throws IOException {
// Skip any whitespace in front of the name.
183 while(Character.isSpace(cs.peek())) cs.get();
// The name ends at whitespace, '=' or '>'. If one of those is already next,
// nothing is appended; parseElement() treats an empty name as end of attributes.
185 while(!Character.isSpace(cs.peek()) && cs.peek() != '=' && cs.peek() != '>') ret += cs.get();
// Attribute names are returned lower-cased (default-locale toLowerCase()).
186 return ret.toLowerCase();
189 /** Parses an attribute value and returns it. The CharStream
190 * should be positioned at the equals sign, possibly with
191 * intervening whitespace.
193 private static String parseAttributeValue(CharStream cs) throws IOException {
195 // eat whitespace and equals sign
196 while(Character.isSpace(cs.peek())) cs.get();
// No '=' after the name: the attribute has no value, so return the empty string.
197 if (cs.peek() != '=') return "";
// Whitespace between '=' and the value is skipped as well.
199 while(Character.isSpace(cs.peek())) cs.get();
// Records which quote character (if any) opened the value; the opening quote
// itself is consumed and is not part of the value.
201 boolean doublequoted = false;
202 boolean singlequoted = false;
205 if (cs.peek() == '\"') { doublequoted = true; cs.get(); }
206 else if (cs.peek() == '\'') { singlequoted = true; cs.get(); }
// Termination rules: an unquoted value stops at whitespace or '>' without
// consuming it; a quoted value stops at the matching quote, which is consumed.
210 if (!doublequoted && !singlequoted && (Character.isSpace(c) || c == '>')) break;
211 if (singlequoted && c == '\'') { cs.get(); break; }
212 if (doublequoted && c == '\"') { cs.get(); break; }
218 /** Parses a comment and returns its body. The CharStream should
219 * be positioned immediately after the <!--
221 private static String parseComment(CharStream cs) throws IOException {
// A '>' seen while the dash counter equals 2 ends the comment; the last two
// characters of the accumulated text are cut off before returning.
226 if (c == '>' && dashes == 2) return ret.substring(0, ret.length() - 2);
// NOTE(review): only the increment of the dash counter is visible in this
// excerpt -- confirm how it is reset when a non-dash character is read.
227 if (c == '-') dashes++;
233 /** Expands all SGML entities in string <tt>s</tt> */
234 public static String expandEntities(String s) throws IOException {
235 if (s.indexOf('&') == -1) return s;
236 StringBuffer sb = new StringBuffer();
239 while(nextamp != -1) {
240 nextamp = s.indexOf('&', i);
241 sb.append(nextamp == -1 ? s.substring(i) : s.substring(i, nextamp));
242 if (nextamp == -1) break;
243 if (s.regionMatches(nextamp, "&", 0, 5)) {
246 } else if (s.regionMatches(nextamp, ">", 0, 4)) {
249 } else if (s.regionMatches(nextamp, "<", 0, 4)) {
252 } else if (s.regionMatches(nextamp, """, 0, 6)) {
255 } else if (s.regionMatches(nextamp, " ", 0, 6)) {
256 // FIXME: should have a way to indicate this...
264 return sb.toString();
267 // FIXME double check this
268 /** removes all redundant whitespace */
269 private static String removeRedundantWhitespace(String s) {
// Fast path: a string with no space, newline, tab or carriage return is
// returned unchanged (and untrimmed).
271 if (s.indexOf(' ') == -1 && s.indexOf('\n') == -1 && s.indexOf('\t') == -1 && s.indexOf('\r') == -1) return s;
273 int len = s.length();
// (Re)allocate the shared scratch buffers at twice the input length when the
// char buffer is missing or too small.
// NOTE(review): sbuf is only replaced here when cbuf has to grow; whether it
// is emptied between calls is not visible in this excerpt -- confirm.
274 if (cbuf == null || cbuf.length < len) {
275 cbuf = new char[len * 2];
276 sbuf = new StringBuffer(len * 2);
// Work on a char[] copy of the input.
279 s.getChars(0, len, cbuf, 0);
// lastWasWhitespace tracks the character just examined; lastlast keeps the
// value it had for the character before that.
282 boolean lastWasWhitespace = false;
283 for(int i=0; i<len; i++) {
284 boolean lastlast = lastWasWhitespace;
286 case '\n': case '\r': case '\t':
289 lastWasWhitespace = true;
292 lastWasWhitespace = false;
// Second whitespace character in a row: flush the pending run cbuf[last, i)
// to the output buffer, if it is non-empty.
295 if (lastWasWhitespace && lastlast) {
296 if (last != i) sbuf.append(cbuf, last, i - last);
// Flush whatever remains after the loop, then strip leading and trailing
// whitespace from the result.
301 if (last != len) sbuf.append(cbuf, last, len - last);
302 return sbuf.toString().trim();
305 // CharStream /////////////////////////////////////////////////////////////////////
307 private static class CharStream extends PushbackReader {
308 public CharStream(Reader r) { super(r); }
310 public char peek() throws IOException {
316 public char get() throws IOException {
318 if (i == -1) throw new EOFException();