1 // Copyright 2002 Adam Megacz, see the COPYING file for licensing [GPL]
10 While entities are limited to a subset of Unicode characters,
11 numeric character references can specify any character. Numeric
12 character references may be given in decimal or hexadecimal, though
13 browser support is stronger for decimal references. Decimal
14 references are of the form &#number; while hexadecimal references
15 take the case-insensitive form &#xnumber;. Examples of numeric
16 character references include &#169; or &#xA9; for the copyright
17 symbol, &#913; or &#x391; for the Greek capital letter alpha, and
18 &#1575; or &#x627; for the Arabic letter ALEF.
20 http://www.htmlhelp.com/reference/html40/entities/special.html
21 http://www.htmlhelp.com/reference/html40/entities/symbols.html
22 http://www.htmlhelp.com/reference/html40/entities/latin1.html
24 FIXME FIXME FIXME: <li> tags close enclosing <li> tags
28 * This class parses an InputStream containing HTML and returns it
29 * as an XWT DOM tree. Each HTML Element is returned as a struct,
30 * with the following members:
32 * Since HTML may have multiple top level elements (unlike XML),
33 * this class will search all top level elements for one with a tag
34 * name 'html'. If such a node is found, only it is returned. If no
35 * top-level element has the tag name 'html', such a node is
36 * fabricated, and all top level elements become the children of
37 * that node, which is then returned.
42 private final static String[] bodylessTags = new String[] { "br", "hr", "input", "img", "isindex" };
44 /** we keep a char[] around for use by removeRedundantWhitespace() */
45 private static char[] cbuf = null;
47 /** we keep a StringBuffer around for use by removeRedundantWhitespace() */
48 private static StringBuffer sbuf = null;
50 public static synchronized JSObject parseReader(Reader r) throws IOException {
51 CharStream cs = new CharStream(r);
52 JSObject h = new JSObject();
54 h.put("$name", "html");
57 while (true) parseBody(cs, h, null);
58 } catch (EOFException e) {
59 // continue until we get an EOFException
62 Object[] ids = h.getIds();
63 for(int i=0; i<ids.length; i++) {
64 Object el = h.get((String)ids[i]);
65 if (el instanceof JSObject && "html".equals(((JSObject)el).get("$name")))
73 * Parses a single element and stores it in <tt>h</tt>. The
74 * CharStream should be positioned immediately <i>after</i> the
77 * If a close tag not matching this open tag is found, the
78 * tagname on the close tag will be returned in order to
79 * facilitate correcting broken HTML. Otherwise, this returns
82 private static String parseElement(CharStream cs, JSObject h) throws IOException {
84 while(Character.isSpace(cs.peek())) cs.get();
85 String elementName = parseElementName(cs);
87 h.put("$name", elementName);
88 if (elementName.equals("!--")) {
89 h.put("0", parseComment(cs));
90 h.put("$numchildren", new Integer(0));
95 while (cs.peek() != '>') {
96 String name = parseAttributeName(cs);
97 if (name.equals("")) break;
98 String value = expandEntities(parseAttributeValue(cs));
102 // eat the close-angle bracket
105 // bodyless tags return here
106 for(int i=0; i<bodylessTags.length; i++)
107 if (bodylessTags[i].equals(elementName))
111 return parseBody(cs, h, elementName);
115 * Parses the body of an element. The CharStream should be
116 * positioned at the character immediately after the right
117 * bracket closing the start-tag
119 private static String parseBody(CharStream cs, JSObject h, String elementName) throws IOException {
121 int length = h.get("$numchildren") == null ? 0 : Integer.parseInt(h.get("$numchildren").toString());
123 String closetag = null;
127 if (c != '<') { cdata += c; continue; }
128 String expanded = removeRedundantWhitespace(expandEntities(cdata));
129 if (expanded.length() > 0) {
130 h.put(String.valueOf(length), expanded);
131 h.put("$numchildren", new Integer(++length));
135 } catch (EOFException e) {
136 String expanded = removeRedundantWhitespace(expandEntities(cdata));
137 if (expanded.length() > 0) {
138 h.put(String.valueOf(length), expanded);
139 h.put("$numchildren", new Integer(++length));
146 if (cs.peek() != '/') {
147 JSObject kid = new JSObject();
148 closetag = parseElement(cs, kid);
149 h.put(String.valueOf(length), kid);
150 h.put("$numchildren", new Integer(++length));
154 cs.get(); // drop the slash
155 closetag = parseElementName(cs);
156 while(cs.get() != '>');
158 } catch (EOFException e) {
163 if (closetag != null)
164 return closetag.equals(elementName) ? null : closetag;
168 /** Parses an element name and returns it. The CharStream should
169 * be positioned at the first character of the name.
171 private static String parseElementName(CharStream cs) throws IOException {
173 while (cs.peek() != '>' && !Character.isSpace(cs.peek())) ret += cs.get();
174 return ret.toLowerCase();
177 /** Parses an attribute name and returns it. The CharStream should
178 * be positioned at the first character of the name, possibly
179 * with intervening whitespace.
181 private static String parseAttributeName(CharStream cs) throws IOException {
182 while(Character.isSpace(cs.peek())) cs.get();
184 while(!Character.isSpace(cs.peek()) && cs.peek() != '=' && cs.peek() != '>') ret += cs.get();
185 return ret.toLowerCase();
188 /** Parses an attribute value and returns it. The CharStream
189 * should be positioned at the equals sign, possibly with
190 * intervening whitespace.
192 private static String parseAttributeValue(CharStream cs) throws IOException {
194 // eat whitespace and equals sign
195 while(Character.isSpace(cs.peek())) cs.get();
196 if (cs.peek() != '=') return "";
198 while(Character.isSpace(cs.peek())) cs.get();
200 boolean doublequoted = false;
201 boolean singlequoted = false;
204 if (cs.peek() == '\"') { doublequoted = true; cs.get(); }
205 else if (cs.peek() == '\'') { singlequoted = true; cs.get(); }
209 if (!doublequoted && !singlequoted && (Character.isSpace(c) || c == '>')) break;
210 if (singlequoted && c == '\'') { cs.get(); break; }
211 if (doublequoted && c == '\"') { cs.get(); break; }
217 /** Parses a comment and returns its body. The CharStream should
218 * be positioned immediately after the <!--
220 private static String parseComment(CharStream cs) throws IOException {
225 if (c == '>' && dashes == 2) return ret.substring(0, ret.length() - 2);
226 if (c == '-') dashes++;
232 /** Expands all SGML entities in string <tt>s</tt> */
233 public static String expandEntities(String s) throws IOException {
234 if (s.indexOf('&') == -1) return s;
235 StringBuffer sb = new StringBuffer();
238 while(nextamp != -1) {
239 nextamp = s.indexOf('&', i);
240 sb.append(nextamp == -1 ? s.substring(i) : s.substring(i, nextamp));
241 if (nextamp == -1) break;
242 if (s.regionMatches(nextamp, "&", 0, 5)) {
245 } else if (s.regionMatches(nextamp, ">", 0, 4)) {
248 } else if (s.regionMatches(nextamp, "<", 0, 4)) {
251 } else if (s.regionMatches(nextamp, """, 0, 6)) {
254 } else if (s.regionMatches(nextamp, " ", 0, 6)) {
255 // FIXME: should have a way to indicate this...
263 return sb.toString();
266 // FIXME double check this
267 /** removes all redundant whitespace */
268 private static String removeRedundantWhitespace(String s) {
270 if (s.indexOf(' ') == -1 && s.indexOf('\n') == -1 && s.indexOf('\t') == -1 && s.indexOf('\r') == -1) return s;
272 int len = s.length();
273 if (cbuf == null || cbuf.length < len) {
274 cbuf = new char[len * 2];
275 sbuf = new StringBuffer(len * 2);
278 s.getChars(0, len, cbuf, 0);
281 boolean lastWasWhitespace = false;
282 for(int i=0; i<len; i++) {
283 boolean lastlast = lastWasWhitespace;
285 case '\n': case '\r': case '\t':
288 lastWasWhitespace = true;
291 lastWasWhitespace = false;
294 if (lastWasWhitespace && lastlast) {
295 if (last != i) sbuf.append(cbuf, last, i - last);
300 if (last != len) sbuf.append(cbuf, last, len - last);
301 return sbuf.toString().trim();
304 // CharStream /////////////////////////////////////////////////////////////////////
306 private static class CharStream extends PushbackReader {
307 public CharStream(Reader r) { super(r); }
309 public char peek() throws IOException {
315 public char get() throws IOException {
317 if (i == -1) throw new EOFException();