2003/09/07 03:04:31

[org.ibex.core.git] / src / org / xwt / HTML.java
diff --git a/src/org/xwt/HTML.java b/src/org/xwt/HTML.java

index b04b9d1..022e7a0 100644 (file)
--- a/src/org/xwt/HTML.java
+++ b/src/org/xwt/HTML.java
@@ -1,28 +1,27 @@
  // Copyright 2002 Adam Megacz, see the COPYING file for licensing [GPL]
  package org.xwt;
  
-import org.xwt.util.*;
  import java.util.*;
  import java.net.*;
  import java.io.*;
+import org.xwt.js.*;
+import org.xwt.util.*;
  
  /* 
-   While entities are limited to a subset of Unicode characters ,
-   numeric character references can specify any character. Numeric
-   character references may be given in decimal or hexadecimal, though
-   browser support is stronger for decimal references. Decimal
-   references are of the form &#number; while hexadecimal references
-   take the case-insensitive form &#xnumber;. Examples of numeric
-   character references include &#169; or &#xA9; for the copyright
-   symbol, &#913; or &#x391; for the Greek capital letter alpha, and
-   &#1575; or &#x627; for the Arabic letter ALEF.
-
-   http://www.htmlhelp.com/reference/html40/entities/special.html
-   http://www.htmlhelp.com/reference/html40/entities/symbols.html
-   http://www.htmlhelp.com/reference/html40/entities/latin1.html
-
-   FIXME FIXME FIXME: <li> tags close enclosing <li> tags
-*/
+ * While entities are limited to a subset of Unicode characters,
+ * numeric character references can specify any character. Numeric
+ * character references may be given in decimal or hexadecimal, though
+ * browser support is stronger for decimal references. Decimal
+ * references are of the form &#number; while hexadecimal references
+ * take the case-insensitive form &#xnumber;. Examples of numeric
+ * character references include &#169; or &#xA9; for the copyright
+ * symbol, &#913; or &#x391; for the Greek capital letter alpha, and
+ * &#1575; or &#x627; for the Arabic letter ALEF.
+ *
+ * http://www.htmlhelp.com/reference/html40/entities/special.html
+ * http://www.htmlhelp.com/reference/html40/entities/symbols.html
+ * http://www.htmlhelp.com/reference/html40/entities/latin1.html
+ */
  
  /**
   *   This class parses an InputStream containing HTML and returns it
@@ -47,10 +46,14 @@ public class HTML {
      /** we keep a StringBuffer around for use by removeRedundantWhitespace() */
      private static StringBuffer sbuf = null;
  
-    public static synchronized JSObject parseReader(Reader r) throws IOException {
+    /** true iff we have encountered an LI more recently than the last OL/UL */
+    private static boolean withinLI = false;
+
+    public static synchronized JS parseReader(Reader r) throws IOException {
          CharStream cs = new CharStream(r);
-        JSObject h = new JSObject();
+        JS.Obj h = new JS.Obj();
  
+        withinLI = false;
          h.put("$name", "html");
  
          try {
@@ -59,11 +62,11 @@ public class HTML {
              // continue until we get an EOFException
          }
          
-        Object[] ids = h.getIds();
+        Object[] ids = h.keys();
          for(int i=0; i<ids.length; i++) {
              Object el = h.get((String)ids[i]);
-            if (el instanceof JSObject && "html".equals(((JSObject)el).get("$name")))
-                return (JSObject)el;
+            if (el instanceof JS && "html".equals(((JS)el).get("$name")))
+                return (JS)el;
          }
          
          return h;
@@ -79,11 +82,24 @@ public class HTML {
       *  facilitate correcting broken HTML. Otherwise, this returns
       *  null.
       */
-    private static String parseElement(CharStream cs, JSObject h) throws IOException {
+    private static String parseElement(CharStream cs, JS h) throws IOException {
          // scan element name
          while(Character.isSpace(cs.peek())) cs.get();
          String elementName = parseElementName(cs);
  
+        // FIXME: this might not deal correctly with EOFExceptions (withinLI is not restored if parseBody throws)
+        boolean saveWithinLI = withinLI;
+        if (elementName.equals("li")) {
+            if (withinLI) {
+                cs.unread(new char[] { '<', 'l', 'i', ' ' });
+                return "li";
+            } else {
+                withinLI = true;
+            }
+        } else if (elementName.equals("ol") || elementName.equals("ul")) {
+            withinLI = false;
+        }
+
          h.put("$name", elementName);
          if (elementName.equals("!--")) {
              h.put("0", parseComment(cs));
@@ -108,7 +124,9 @@ public class HTML {
                  return null;
  
          // scan body
-        return parseBody(cs, h, elementName);
+        String ret = parseBody(cs, h, elementName);
+        withinLI = saveWithinLI;
+        return ret;
      }
  
      /**
@@ -116,7 +134,7 @@ public class HTML {
       *  positioned at the character immediately after the right
       *  bracket closing the start-tag
       */
-    private static String parseBody(CharStream cs, JSObject h, String elementName) throws IOException {
+    private static String parseBody(CharStream cs, JS h, String elementName) throws IOException {
          String cdata = "";
          int length = h.get("$numchildren") == null ? 0 : Integer.parseInt(h.get("$numchildren").toString());
          while(true) {
@@ -144,7 +162,7 @@ public class HTML {
              try {
                  // scan subelement
                  if (cs.peek() != '/') {
-                    JSObject kid = new JSObject();
+                    JS kid = new JS.Obj();
                      closetag = parseElement(cs, kid);
                      h.put(String.valueOf(length), kid); 
                      h.put("$numchildren", new Integer(++length));
@@ -304,7 +322,7 @@ public class HTML {
      // CharStream /////////////////////////////////////////////////////////////////////
  
      private static class CharStream extends PushbackReader {
-        public CharStream(Reader r) { super(r); }
+        public CharStream(Reader r) { super(r, 1024); }
  
          public char peek() throws IOException {
              char c = get();