1 // Copyright 2002 Adam Megacz, see the COPYING file for licensing [GPL]
11 * While entities are limited to a subset of Unicode characters,
12 * numeric character references can specify any character. Numeric
13 * character references may be given in decimal or hexadecimal, though
14 * browser support is stronger for decimal references. Decimal
15 * references are of the form &#number; while hexadecimal references
16 * take the case-insensitive form &#xnumber;. Examples of numeric
17 * character references include &#169; or &#xA9; for the copyright
18 * symbol, &#913; or &#x391; for the Greek capital letter alpha, and
19 * &#1575; or &#x627; for the Arabic letter ALEF.
21 * http://www.htmlhelp.com/reference/html40/entities/special.html
22 * http://www.htmlhelp.com/reference/html40/entities/symbols.html
23 * http://www.htmlhelp.com/reference/html40/entities/latin1.html
27 * This class parses an InputStream containing HTML and returns it
28 * as an XWT DOM tree. Each HTML Element is returned as a struct,
29 * with the following members:
31 * Since HTML may have multiple top level elements (unlike XML),
32 * this class will search all top level elements for one with a tag
33 * name 'html'. If such a node is found, only it is returned. If no
34 * top-level element has the tag name 'html', such a node is
35 * fabricated, and all top level elements become the children of
36 * that node, which is then returned.
// element names that parseElement compares against right after the start-tag
// (its "bodyless tags return here" check), i.e. tags parsed without a body
41 private final static String[] bodylessTags = new String[] { "br", "hr", "input", "img", "isindex" };
43 /** we keep a char[] around for use by removeRedundantWhitespace() */
44 private static char[] cbuf = null;
46 /** we keep a StringBuffer around for use by removeRedundantWhitespace() */
47 private static StringBuffer sbuf = null;
49 /** true iff we have encountered an LI more recently than the last OL/UL */
// parseElement saves this flag before parsing an element body and restores it afterwards
50 private static boolean withinLI = false;
// Entry point: wraps the Reader in a CharStream and parses its content into a
// fresh node whose $name is "html".
52 public static synchronized JS parseReader(Reader r) throws IOException {
53 CharStream cs = new CharStream(r);
54 JS.Obj h = new JS.Obj();
57 h.put("$name", "html");
// parse top-level content over and over; this loop has no exit condition of
// its own and is left only when an EOFException is thrown
60 while (true) parseBody(cs, h, null);
61 } catch (EOFException e) {
62 // continue until we get an EOFException
// walk every key of the top-level node looking for a child element named "html"
65 Object[] ids = h.keys();
66 for(int i=0; i<ids.length; i++) {
67 Object el = h.get((String)ids[i]);
// only JS children are elements; the constant-first equals tolerates a null $name
68 if (el instanceof JS && "html".equals(((JS)el).get("$name")))
76 * Parses a single element and stores it in <tt>h</tt>. The
77 * CharStream should be positioned immediately <i>after</i> the
80 * If a close tag not matching this open tag is found, the
81 * tagname on the close tag will be returned in order to
82 * facilitate correcting broken HTML. Otherwise, this returns
// Parses one element into h: reads the element name, then either a comment
// body, or the attribute list followed (for tags not in bodylessTags) by the
// element body via parseBody.
85 private static String parseElement(CharStream cs, JS h) throws IOException {
// skip any whitespace between '<' and the element name
87 while(Character.isSpace(cs.peek())) cs.get();
88 String elementName = parseElementName(cs);
90 // FIXME: this might not deal correctly with EOFExceptions
// remember the LI state so it can be restored once the body has been parsed
91 boolean saveWithinLI = withinLI;
92 if (elementName.equals("li")) {
// push "<li " back onto the stream so it will be read again
// NOTE(review): the surrounding condition is not visible here; presumably this
// lets an enclosing LI be closed implicitly -- confirm
94 cs.unread(new char[] { '<', 'l', 'i', ' ' });
99 } else if (elementName.equals("ol") || elementName.equals("ul")) {
103 h.put("$name", elementName);
// comments: the comment body is stored under key "0" and $numchildren is set to 0
104 if (elementName.equals("!--")) {
105 h.put("0", parseComment(cs));
106 h.put("$numchildren", new Integer(0));
// read attributes until the closing '>' or until no attribute name can be read
111 while (cs.peek() != '>') {
112 String name = parseAttributeName(cs);
113 if (name.equals("")) break;
// attribute values have their SGML entities expanded
114 String value = expandEntities(parseAttributeValue(cs));
118 // eat the close-angle bracket
121 // bodyless tags return here
122 for(int i=0; i<bodylessTags.length; i++)
123 if (bodylessTags[i].equals(elementName))
// parse the children; ret carries whatever parseBody reports about the close tag
127 String ret = parseBody(cs, h, elementName);
128 withinLI = saveWithinLI;
133 * Parses the body of an element. The CharStream should be
134 * positioned at the character immediately after the right
135 * bracket closing the start-tag
// Parses the children of an element into h. Text runs and child elements are
// stored under consecutive numeric string keys, and $numchildren is updated
// after each one. When a close tag is read, returns null if it matches
// elementName and the close tag's name otherwise.
137 private static String parseBody(CharStream cs, JS h, String elementName) throws IOException {
// continue numbering after any children h already holds
139 int length = h.get("$numchildren") == null ? 0 : Integer.parseInt(h.get("$numchildren").toString());
141 String closetag = null;
// accumulate character data until the next '<'
145 if (c != '<') { cdata += c; continue; }
// flush the accumulated text (entities expanded, whitespace collapsed) as a
// child, but only if something is left after cleanup
146 String expanded = removeRedundantWhitespace(expandEntities(cdata));
147 if (expanded.length() > 0) {
148 h.put(String.valueOf(length), expanded);
149 h.put("$numchildren", new Integer(++length));
// end of input while reading text: flush the pending text the same way
153 } catch (EOFException e) {
154 String expanded = removeRedundantWhitespace(expandEntities(cdata));
155 if (expanded.length() > 0) {
156 h.put(String.valueOf(length), expanded);
157 h.put("$numchildren", new Integer(++length));
// no slash after '<': this is a start-tag, so parse a child element into a new node
164 if (cs.peek() != '/') {
165 JS kid = new JS.Obj();
166 closetag = parseElement(cs, kid);
167 h.put(String.valueOf(length), kid);
168 h.put("$numchildren", new Integer(++length));
// close tag: read its name, then discard everything up to and including '>'
172 cs.get(); // drop the slash
173 closetag = parseElementName(cs);
174 while(cs.get() != '>');
176 } catch (EOFException e) {
// null means "closed by its own tag"; any other name is handed back to the caller
181 if (closetag != null)
182 return closetag.equals(elementName) ? null : closetag;
186 /** Parses an element name and returns it. The CharStream should
187 * be positioned at the first character of the name.
// Reads characters up to (not including) the next '>' or whitespace character
// and returns them lower-cased.
189 private static String parseElementName(CharStream cs) throws IOException {
// peek before consuming so the terminating character stays in the stream
// NOTE(review): Character.isSpace is deprecated in favor of Character.isWhitespace
191 while (cs.peek() != '>' && !Character.isSpace(cs.peek())) ret += cs.get();
192 return ret.toLowerCase();
195 /** Parses an attribute name and returns it. The CharStream should
196 * be positioned at the first character of the name, possibly
197 * with intervening whitespace.
// Skips leading whitespace, then reads characters up to (not including) the
// next whitespace, '=' or '>' and returns them lower-cased.
199 private static String parseAttributeName(CharStream cs) throws IOException {
// discard whitespace in front of the name
200 while(Character.isSpace(cs.peek())) cs.get();
// the terminator is only peeked at, so it stays in the stream for the caller
202 while(!Character.isSpace(cs.peek()) && cs.peek() != '=' && cs.peek() != '>') ret += cs.get();
203 return ret.toLowerCase();
206 /** Parses an attribute value and returns it. The CharStream
207 * should be positioned at the equals sign, possibly with
208 * intervening whitespace.
// Reads an attribute value, which may be double-quoted, single-quoted or
// unquoted. Returns "" when no '=' follows the attribute name.
210 private static String parseAttributeValue(CharStream cs) throws IOException {
212 // eat whitespace and equals sign
213 while(Character.isSpace(cs.peek())) cs.get();
// an attribute without '=' has no value
214 if (cs.peek() != '=') return "";
// skip whitespace before the value
216 while(Character.isSpace(cs.peek())) cs.get();
218 boolean doublequoted = false;
219 boolean singlequoted = false;
// consume an opening quote, if any, and remember which kind it was
222 if (cs.peek() == '\"') { doublequoted = true; cs.get(); }
223 else if (cs.peek() == '\'') { singlequoted = true; cs.get(); }
// unquoted values stop at whitespace or '>' without consuming it
227 if (!doublequoted && !singlequoted && (Character.isSpace(c) || c == '>')) break;
// quoted values stop at the matching quote, which is consumed
228 if (singlequoted && c == '\'') { cs.get(); break; }
229 if (doublequoted && c == '\"') { cs.get(); break; }
235 /** Parses a comment and returns its body. The CharStream should
236 * be positioned immediately after the <!--
// Reads a comment body from the stream and returns it.
238 private static String parseComment(CharStream cs) throws IOException {
// a '>' seen while the dash counter is 2 ends the comment; the last two
// characters of ret (presumably the closing dashes) are dropped from the result
243 if (c == '>' && dashes == 2) return ret.substring(0, ret.length() - 2);
// count dashes; NOTE(review): where the counter is reset is not visible here
244 if (c == '-') dashes++;
250 /** Expands all SGML entities in string <tt>s</tt> */
251 public static String expandEntities(String s) throws IOException {
252 if (s.indexOf('&') == -1) return s;
253 StringBuffer sb = new StringBuffer();
256 while(nextamp != -1) {
257 nextamp = s.indexOf('&', i);
258 sb.append(nextamp == -1 ? s.substring(i) : s.substring(i, nextamp));
259 if (nextamp == -1) break;
260 if (s.regionMatches(nextamp, "&", 0, 5)) {
263 } else if (s.regionMatches(nextamp, ">", 0, 4)) {
266 } else if (s.regionMatches(nextamp, "<", 0, 4)) {
269 } else if (s.regionMatches(nextamp, """, 0, 6)) {
272 } else if (s.regionMatches(nextamp, " ", 0, 6)) {
273 // FIXME: should have a way to indicate this...
281 return sb.toString();
284 // FIXME double check this
285 /** removes all redundant whitespace */
// Collapses whitespace in s using the shared cbuf/sbuf scratch buffers and
// returns the result with leading and trailing whitespace trimmed.
286 private static String removeRedundantWhitespace(String s) {
// fast path: a string without space, newline, tab or carriage return is returned as-is
288 if (s.indexOf(' ') == -1 && s.indexOf('\n') == -1 && s.indexOf('\t') == -1 && s.indexOf('\r') == -1) return s;
290 int len = s.length();
// (re)allocate both scratch buffers at twice the needed size when cbuf is
// missing or too small to hold s
291 if (cbuf == null || cbuf.length < len) {
292 cbuf = new char[len * 2];
293 sbuf = new StringBuffer(len * 2);
// copy the characters of s into cbuf for in-place inspection
296 s.getChars(0, len, cbuf, 0);
299 boolean lastWasWhitespace = false;
300 for(int i=0; i<len; i++) {
// remember whether the previous character was whitespace before updating the flag
301 boolean lastlast = lastWasWhitespace;
303 case '\n': case '\r': case '\t':
306 lastWasWhitespace = true;
309 lastWasWhitespace = false;
// current and previous characters are both whitespace: flush the pending
// run cbuf[last, i) to sbuf
312 if (lastWasWhitespace && lastlast) {
313 if (last != i) sbuf.append(cbuf, last, i - last);
// flush whatever remains after the loop, then trim both ends
318 if (last != len) sbuf.append(cbuf, last, len - last);
319 return sbuf.toString().trim();
322 // CharStream /////////////////////////////////////////////////////////////////////
324 private static class CharStream extends PushbackReader {
325 public CharStream(Reader r) { super(r, 1024); }
327 public char peek() throws IOException {
333 public char get() throws IOException {
335 if (i == -1) throw new EOFException();