1 // Copyright 2000-2005 the Contributors, as shown in the revision logs.
2 // Licensed under the GNU General Public License version 2 ("the License").
3 // You may not use this file except in compliance with the License.
5 package org.ibex.graphics;
11 import org.ibex.util.*;
14 * While entities are limited to a subset of Unicode characters,
15 * numeric character references can specify any character. Numeric
16 * character references may be given in decimal or hexadecimal, though
17 * browser support is stronger for decimal references. Decimal
18 * references are of the form &#number; while hexadecimal references
19 * take the case-insensitive form &#xnumber;. Examples of numeric
20 * character references include &#169; or &#xA9; for the copyright
21 * symbol, &#913; or &#x391; for the Greek capital letter alpha, and
22 * &#1575; or &#x627; for the Arabic letter ALEF.
24 * http://www.htmlhelp.com/reference/html40/entities/special.html
25 * http://www.htmlhelp.com/reference/html40/entities/symbols.html
26 * http://www.htmlhelp.com/reference/html40/entities/latin1.html
30 * This class parses an InputStream containing HTML and returns it
31 * as an XWT DOM tree. Each HTML Element is returned as a struct,
32 * with the following members:
34 * Since HTML may have multiple top level elements (unlike XML),
35 * this class will search all top level elements for one with a tag
36 * name 'html'. If such a node is found, only it is returned. If no
37 * top-level element has the tag name 'html', such a node is
38 * fabricated, and all top level elements become the children of
39 * that node, which is then returned.
/** names of the elements that parseElement() treats as bodyless: each parsed (lower-cased) tag name is compared against this list */
43 private final static String[] noEndTag =
44 new String[] { "area", "base", "basefont", "br", "col", "frame", "hr", "img",
45 "input", "isindex", "link", "meta", "param" };
47 /** scratch char[] kept around for use by removeRedundantWhitespace(); allocated there on first use and reallocated (at twice the input length) whenever an input is longer than it */
48 private static char[] cbuf = null;
50 /** scratch StringBuffer kept around for use by removeRedundantWhitespace(); reallocated there together with cbuf */
51 private static StringBuffer sbuf = null;
53 /** true iff we have encountered an LI more recently than the last OL/UL; parseElement() saves the value on entry and restores it after parsing the element's body */
54 private static boolean withinLI = false;
56 // FEATURE: This is ugly
57 private static class JS extends org.ibex.js.JS.Obj {
58 public void put(String key, Object value) throws JSExn {
59 if(value instanceof String) put(JSU.S(key),JSU.S((String)value));
60 else if(value instanceof Number) put(JSU.S(key), JSU.N((Number)value));
61 else if(value == null) put(JSU.S(key), (JS)null);
62 else throw new Error("FIXME");
64 public Object _get(String key) throws JSExn {
65 org.ibex.js.JS js = get(JSU.S(key));
66 if(JSU.isInt(js)) return new Integer(JSU.toInt(js));
67 return JSU.toString(js);
71 public static synchronized JS parseReader(Reader r) throws IOException, JSExn {
72 CharStream cs = new CharStream(r);
76 h.put("$name", "html");
79 while (true) parseBody(cs, h, null);
80 } catch (EOFException e) {
81 // continue until we get an EOFException
85 Object[] ids = h.keys();
86 for(int i=0; i<ids.length; i++) {
87 Object el = h.get((String)ids[i]);
88 if (el instanceof JS && "html".equals(((JS)el).get("$name")))
96 * Parses a single element and stores it in <tt>h</tt>. The
97 * CharStream should be positioned immediately <i>after</i> the
100 * If a close tag not matching this open tag is found, the
101 * tagname on the close tag will be returned in order to
102 * facilitate correcting broken HTML. Otherwise, this returns
105 private static String parseElement(CharStream cs, JS h) throws IOException, JSExn {
107 while(Character.isSpace(cs.peek())) cs.get();
108 String elementName = parseElementName(cs);
110 boolean saveWithinLI = withinLI;
111 if (elementName.equals("li")) {
113 cs.unread(new char[] { '<', 'l', 'i', ' ' });
118 } else if (elementName.equals("ol") || elementName.equals("ul")) {
122 h.put("$name", elementName);
123 if (elementName.equals("!--")) {
124 h.put("0", parseComment(cs));
125 h.put("$numchildren", new Integer(0));
130 while (cs.peek() != '>') {
131 String name = parseAttributeName(cs);
132 if (name.equals("")) break;
133 String value = expandEntities(parseAttributeValue(cs));
137 // eat the close-angle bracket
140 // bodyless tags return here
141 for(int i=0; i<noEndTag.length; i++)
142 if (noEndTag[i].equals(elementName))
146 String ret = parseBody(cs, h, elementName);
147 withinLI = saveWithinLI;
152 * Parses the body of an element. The CharStream should be
153 * positioned at the character immediately after the right
154 * bracket closing the start-tag
156 private static String parseBody(CharStream cs, JS h, String elementName) throws IOException, JSExn {
158 int length = h._get("$numchildren") == null ? 0 : Integer.parseInt(h._get("$numchildren").toString());
160 String closetag = null;
164 if (c != '<') { cdata += c; continue; }
165 String expanded = removeRedundantWhitespace(expandEntities(cdata));
166 if (expanded.length() > 0) {
167 h.put(String.valueOf(length), expanded);
168 h.put("$numchildren", new Integer(++length));
172 } catch (EOFException e) {
173 String expanded = removeRedundantWhitespace(expandEntities(cdata));
174 if (expanded.length() > 0) {
175 h.put(String.valueOf(length), expanded);
176 h.put("$numchildren", new Integer(++length));
183 if (cs.peek() != '/') {
185 closetag = parseElement(cs, kid);
186 h.put(String.valueOf(length), kid);
187 h.put("$numchildren", new Integer(++length));
191 cs.get(); // drop the slash
192 closetag = parseElementName(cs);
193 while(cs.get() != '>');
195 } catch (EOFException e) {
200 if (closetag != null)
201 return closetag.equals(elementName) ? null : closetag;
205 /** Parses an element name and returns it. The CharStream should
206 * be positioned at the first character of the name.
208 private static String parseElementName(CharStream cs) throws IOException, JSExn {
210 while (cs.peek() != '>' && !Character.isSpace(cs.peek())) ret += cs.get();
211 return ret.toLowerCase();
214 /** Parses an attribute name and returns it. The CharStream should
215 * be positioned at the first character of the name, possibly
216 * with intervening whitespace.
218 private static String parseAttributeName(CharStream cs) throws IOException, JSExn {
219 while(Character.isSpace(cs.peek())) cs.get();
221 while(!Character.isSpace(cs.peek()) && cs.peek() != '=' && cs.peek() != '>') ret += cs.get();
222 return ret.toLowerCase();
225 /** Parses an attribute value and returns it. The CharStream
226 * should be positioned at the equals sign, possibly with
227 * intervening whitespace.
229 private static String parseAttributeValue(CharStream cs) throws IOException, JSExn {
231 // eat whitespace and equals sign
232 while(Character.isSpace(cs.peek())) cs.get();
233 if (cs.peek() != '=') return "";
235 while(Character.isSpace(cs.peek())) cs.get();
237 boolean doublequoted = false;
238 boolean singlequoted = false;
241 if (cs.peek() == '\"') { doublequoted = true; cs.get(); }
242 else if (cs.peek() == '\'') { singlequoted = true; cs.get(); }
246 if (!doublequoted && !singlequoted && (Character.isSpace(c) || c == '>')) break;
247 if (singlequoted && c == '\'') { cs.get(); break; }
248 if (doublequoted && c == '\"') { cs.get(); break; }
254 /** Parses a comment and returns its body. The CharStream should
255 * be positioned immediately after the <!--
257 private static String parseComment(CharStream cs) throws IOException, JSExn {
262 if (c == '>' && dashes == 2) return ret.substring(0, ret.length() - 2);
263 if (c == '-') dashes++;
269 /** Expands all SGML entities in string <tt>s</tt> */
270 public static String expandEntities(String s) throws IOException, JSExn {
271 if (s.indexOf('&') == -1) return s;
272 StringBuffer sb = new StringBuffer();
275 while(nextamp != -1) {
276 nextamp = s.indexOf('&', i);
277 sb.append(nextamp == -1 ? s.substring(i) : s.substring(i, nextamp));
278 if (nextamp == -1) break;
279 if (s.regionMatches(nextamp, "&", 0, 5)) {
282 } else if (s.regionMatches(nextamp, ">", 0, 4)) {
285 } else if (s.regionMatches(nextamp, "<", 0, 4)) {
288 } else if (s.regionMatches(nextamp, """, 0, 6)) {
291 } else if (s.regionMatches(nextamp, " ", 0, 6)) {
292 // FEATURE: perhaps we should distinguish this somehow
300 return sb.toString();
303 /** removes all redundant whitespace */
304 private static String removeRedundantWhitespace(String s) throws JSExn {
306 if (s.indexOf(' ') == -1 && s.indexOf('\n') == -1 && s.indexOf('\t') == -1 && s.indexOf('\r') == -1) return s;
308 int len = s.length();
309 if (cbuf == null || cbuf.length < len) {
310 cbuf = new char[len * 2];
311 sbuf = new StringBuffer(len * 2);
314 s.getChars(0, len, cbuf, 0);
317 boolean lastWasWhitespace = false;
318 for(int i=0; i<len; i++) {
319 boolean lastlast = lastWasWhitespace;
321 case '\n': case '\r': case '\t':
324 lastWasWhitespace = true;
327 lastWasWhitespace = false;
330 if (lastWasWhitespace && lastlast) {
331 if (last != i) sbuf.append(cbuf, last, i - last);
336 if (last != len) sbuf.append(cbuf, last, len - last);
337 return sbuf.toString().trim();
340 // CharStream /////////////////////////////////////////////////////////////////////
342 private static class CharStream extends PushbackReader {
343 public CharStream(Reader r) { super(r, 1024); }
345 public char peek() throws IOException {
351 public char get() throws IOException {
353 if (i == -1) throw new EOFException();