1 // Copyright 2002 Adam Megacz, see the COPYING file for licensing [GPL]
10 While entities are limited to a subset of Unicode characters,
11 numeric character references can specify any character. Numeric
12 character references may be given in decimal or hexadecimal, though
13 browser support is stronger for decimal references. Decimal
14 references are of the form &#number; while hexadecimal references
15 take the case-insensitive form &#xnumber;. Examples of numeric
16 character references include &#169; or &#xA9; for the copyright
17 symbol, &#913; or &#x391; for the Greek capital letter alpha, and
18 &#1575; or &#x627; for the Arabic letter ALEF.
20 http://www.htmlhelp.com/reference/html40/entities/special.html
21 http://www.htmlhelp.com/reference/html40/entities/symbols.html
22 http://www.htmlhelp.com/reference/html40/entities/latin1.html
26 * This class parses an InputStream containing HTML and returns it
27 * as an XWT DOM tree. Each HTML Element is returned as a struct,
28 * with the following members:
30 * Since HTML may have multiple top level elements (unlike XML),
31 * this class will search all top level elements for one with a tag
32 * name 'html'. If such a node is found, only it is returned. If no
33 * top-level element has the tag name 'html', such a node is
34 * fabricated, and all top level elements become the children of
35 * that node, which is then returned.
40 private final static String[] bodylessTags = new String[] { "br", "hr", "input", "img", "isindex" };
42 /** we keep a char[] around for use by removeRedundantWhitespace() */
43 private static char[] cbuf = null;
45 /** we keep a StringBuffer around for use by removeRedundantWhitespace() */
46 private static StringBuffer sbuf = null;
48 /** true iff we have encountered an LI more recently than the last OL/UL */
49 private static boolean withinLI = false;
51 public static synchronized JSObject parseReader(Reader r) throws IOException {
52 CharStream cs = new CharStream(r);
53 JSObject h = new JSObject();
56 h.put("$name", "html");
59 while (true) parseBody(cs, h, null);
60 } catch (EOFException e) {
61 // continue until we get an EOFException
64 Object[] ids = h.getIds();
65 for(int i=0; i<ids.length; i++) {
66 Object el = h.get((String)ids[i]);
67 if (el instanceof JSObject && "html".equals(((JSObject)el).get("$name")))
75 * Parses a single element and stores it in <tt>h</tt>. The
76 * CharStream should be positioned immediately <i>after</i> the
79 * If a close tag not matching this open tag is found, the
80 * tagname on the close tag will be returned in order to
81 * facilitate correcting broken HTML. Otherwise, this returns
84 private static String parseElement(CharStream cs, JSObject h) throws IOException {
86 while(Character.isSpace(cs.peek())) cs.get();
87 String elementName = parseElementName(cs);
89 // FIXME: this might not deal correctly with EOFExceptions
90 boolean saveWithinLI = withinLI;
91 if (elementName.equals("li")) {
93 cs.unread(new char[] { '<', 'l', 'i', ' ' });
98 } else if (elementName.equals("ol") || elementName.equals("ul")) {
102 h.put("$name", elementName);
103 if (elementName.equals("!--")) {
104 h.put("0", parseComment(cs));
105 h.put("$numchildren", new Integer(0));
110 while (cs.peek() != '>') {
111 String name = parseAttributeName(cs);
112 if (name.equals("")) break;
113 String value = expandEntities(parseAttributeValue(cs));
117 // eat the close-angle bracket
120 // bodyless tags return here
121 for(int i=0; i<bodylessTags.length; i++)
122 if (bodylessTags[i].equals(elementName))
126 String ret = parseBody(cs, h, elementName);
127 withinLI = saveWithinLI;
132 * Parses the body of an element. The CharStream should be
133 * positioned at the character immediately after the right
134 * bracket closing the start-tag
136 private static String parseBody(CharStream cs, JSObject h, String elementName) throws IOException {
138 int length = h.get("$numchildren") == null ? 0 : Integer.parseInt(h.get("$numchildren").toString());
140 String closetag = null;
144 if (c != '<') { cdata += c; continue; }
145 String expanded = removeRedundantWhitespace(expandEntities(cdata));
146 if (expanded.length() > 0) {
147 h.put(String.valueOf(length), expanded);
148 h.put("$numchildren", new Integer(++length));
152 } catch (EOFException e) {
153 String expanded = removeRedundantWhitespace(expandEntities(cdata));
154 if (expanded.length() > 0) {
155 h.put(String.valueOf(length), expanded);
156 h.put("$numchildren", new Integer(++length));
163 if (cs.peek() != '/') {
164 JSObject kid = new JSObject();
165 closetag = parseElement(cs, kid);
166 h.put(String.valueOf(length), kid);
167 h.put("$numchildren", new Integer(++length));
171 cs.get(); // drop the slash
172 closetag = parseElementName(cs);
173 while(cs.get() != '>');
175 } catch (EOFException e) {
180 if (closetag != null)
181 return closetag.equals(elementName) ? null : closetag;
185 /** Parses an element name and returns it. The CharStream should
186 * be positioned at the first character of the name.
188 private static String parseElementName(CharStream cs) throws IOException {
190 while (cs.peek() != '>' && !Character.isSpace(cs.peek())) ret += cs.get();
191 return ret.toLowerCase();
194 /** Parses an attribute name and returns it. The CharStream should
195 * be positioned at the first character of the name, possibly
196 * with intervening whitespace.
198 private static String parseAttributeName(CharStream cs) throws IOException {
199 while(Character.isSpace(cs.peek())) cs.get();
201 while(!Character.isSpace(cs.peek()) && cs.peek() != '=' && cs.peek() != '>') ret += cs.get();
202 return ret.toLowerCase();
205 /** Parses an attribute value and returns it. The CharStream
206 * should be positioned at the equals sign, possibly with
207 * intervening whitespace.
209 private static String parseAttributeValue(CharStream cs) throws IOException {
211 // eat whitespace and equals sign
212 while(Character.isSpace(cs.peek())) cs.get();
213 if (cs.peek() != '=') return "";
215 while(Character.isSpace(cs.peek())) cs.get();
217 boolean doublequoted = false;
218 boolean singlequoted = false;
221 if (cs.peek() == '\"') { doublequoted = true; cs.get(); }
222 else if (cs.peek() == '\'') { singlequoted = true; cs.get(); }
226 if (!doublequoted && !singlequoted && (Character.isSpace(c) || c == '>')) break;
227 if (singlequoted && c == '\'') { cs.get(); break; }
228 if (doublequoted && c == '\"') { cs.get(); break; }
234 /** Parses a comment and returns its body. The CharStream should
235 * be positioned immediately after the <!--
237 private static String parseComment(CharStream cs) throws IOException {
242 if (c == '>' && dashes == 2) return ret.substring(0, ret.length() - 2);
243 if (c == '-') dashes++;
249 /** Expands all SGML entities in string <tt>s</tt> */
250 public static String expandEntities(String s) throws IOException {
251 if (s.indexOf('&') == -1) return s;
252 StringBuffer sb = new StringBuffer();
255 while(nextamp != -1) {
256 nextamp = s.indexOf('&', i);
257 sb.append(nextamp == -1 ? s.substring(i) : s.substring(i, nextamp));
258 if (nextamp == -1) break;
259 if (s.regionMatches(nextamp, "&", 0, 5)) {
262 } else if (s.regionMatches(nextamp, ">", 0, 4)) {
265 } else if (s.regionMatches(nextamp, "<", 0, 4)) {
268 } else if (s.regionMatches(nextamp, """, 0, 6)) {
271 } else if (s.regionMatches(nextamp, " ", 0, 6)) {
272 // FIXME: should have a way to indicate this...
280 return sb.toString();
283 // FIXME double check this
284 /** removes all redundant whitespace */
285 private static String removeRedundantWhitespace(String s) {
287 if (s.indexOf(' ') == -1 && s.indexOf('\n') == -1 && s.indexOf('\t') == -1 && s.indexOf('\r') == -1) return s;
289 int len = s.length();
290 if (cbuf == null || cbuf.length < len) {
291 cbuf = new char[len * 2];
292 sbuf = new StringBuffer(len * 2);
295 s.getChars(0, len, cbuf, 0);
298 boolean lastWasWhitespace = false;
299 for(int i=0; i<len; i++) {
300 boolean lastlast = lastWasWhitespace;
302 case '\n': case '\r': case '\t':
305 lastWasWhitespace = true;
308 lastWasWhitespace = false;
311 if (lastWasWhitespace && lastlast) {
312 if (last != i) sbuf.append(cbuf, last, i - last);
317 if (last != len) sbuf.append(cbuf, last, len - last);
318 return sbuf.toString().trim();
321 // CharStream /////////////////////////////////////////////////////////////////////
323 private static class CharStream extends PushbackReader {
324 public CharStream(Reader r) { super(r, 1024); }
326 public char peek() throws IOException {
332 public char get() throws IOException {
334 if (i == -1) throw new EOFException();