[Zpt] CVS: Packages/TAL - HTMLTALParser.py:1.4

Wed, 14 Mar 2001 15:14:59 -0500 (EST)

Update of /cvs-repository/Packages/TAL
In directory korak:/tmp/cvs-serv22124

Modified Files:
	HTMLTALParser.py 
Log Message:

Add support for reasonable closing of open elements for which end tags
are considered optional.  This does not always place the closing tags
as a human editor would, but makes the structure correct.

Removed support for empty start & end tags (an SGML feature not used with
HTML), which are no longer reported by nsgmllib.

--- Updated File HTMLTALParser.py in package Packages/TAL --
--- HTMLTALParser.py	2001/03/14 16:35:42	1.3
+++ HTMLTALParser.py	2001/03/14 20:14:59	1.4
@@ -7,16 +7,35 @@
     "compact", "nowrap", "ismap", "declare", "noshade", "checked",
     "disabled", "readonly", "multiple", "selected", "noresize",
     "defer"
-]
+    ]
 
 EMPTY_HTML_TAGS = [
     # List of HTML tags with an empty content model; these are
     # rendered in minimized form, e.g. <img />.
     # From http://www.w3.org/TR/xhtml1/#dtds
     "base", "meta", "link", "hr", "br", "param", "img", "area",
-    "input", "col", "basefont", "isindex", "frame", 
-]
+    "input", "col", "basefont", "isindex", "frame",
+    ]
 
+PARA_LEVEL_HTML_TAGS = [
+    # List of HTML elements that close open paragraph-level elements
+    # and are themselves paragraph-level.
+    "h1", "h2", "h3", "h4", "h5", "h6", "p",
+    ]
+
+CLOSING_BLOCK_LEVEL_HTML_TAGS = [
+    # These are HTML tags that close others in this list, but are not
+    # closed by paragraph-level tags.  They don't close across other
+    # block-level boundaries.
+    "li", "dt", "dd", "td", "th", "tr",
+    ]
+
+BLOCK_LEVEL_HTML_TAGS = [
+    # List of HTML tags that denote larger sections than paragraphs.
+    "blockquote", "table", "tr", "th", "td", "thead", "tfoot", "tbody",
+    "noframe", "ul", "ol", "li", "dl", "dt", "dd", "div",
+    ]
+
 from TALGenerator import TALGenerator
 
 class HTMLTALParser(SGMLParser):
@@ -40,8 +59,8 @@
         self.close()
         while self.tagstack:
             self.finish_endtag(None)
-        assert self.tagstack == []
-        assert self.nsstack == []
+        assert self.tagstack == [], self.tagstack
+        assert self.nsstack == [], self.nsstack
         assert self.nsdict == {}, self.nsdict
 
     def getCode(self):
@@ -68,21 +87,46 @@
 
     def finish_starttag(self, tag, attrs):
         self.scan_xmlns(attrs)
-        if tag not in EMPTY_HTML_TAGS:
+        if tag in EMPTY_HTML_TAGS:
+            print "<%s>" % tag
+            self.pop_xmlns()
+        elif tag in CLOSING_BLOCK_LEVEL_HTML_TAGS:
+            close_to = -1
+            for i in range(len(self.tagstack)):
+                t = self.tagstack[i]
+                if t in CLOSING_BLOCK_LEVEL_HTML_TAGS:
+                    close_to = i
+                elif t in BLOCK_LEVEL_HTML_TAGS:
+                    close_to = -1
+            self._close_to_level(close_to)
+            self.tagstack.append(tag)
+        elif tag in PARA_LEVEL_HTML_TAGS + BLOCK_LEVEL_HTML_TAGS:
+            close_to = -1
+            for i in range(len(self.tagstack)):
+                if self.tagstack[i] in BLOCK_LEVEL_HTML_TAGS:
+                    close_to = -1
+                elif self.tagstack[i] in PARA_LEVEL_HTML_TAGS:
+                    if close_to == -1:
+                        close_to = i
             self.tagstack.append(tag)
+            self._close_to_level(close_to)
         else:
-            self.pop_xmlns()
+            self.tagstack.append(tag)
         self.gen.emitStartTag(tag, attrs)
 
+    def _close_to_level(self, close_to):
+        if close_to > -1:
+            closing = self.tagstack[close_to:]
+            closing.reverse()
+            for t in closing:
+                self.finish_endtag(t)
+
     def finish_endtag(self, tag):
         if tag not in EMPTY_HTML_TAGS:
-            if not tag:
-                tag = self.tagstack.pop()
-            else:
-                assert tag in self.tagstack
-                while self.tagstack[-1] != tag:
-                    self.finish_endtag(None)
-                self.tagstack.pop()
+            assert tag in self.tagstack
+            while self.tagstack[-1] != tag:
+                self.finish_endtag(self.tagstack[-1])
+            self.tagstack.pop()
             self.pop_xmlns()
         self.gen.emitEndTag(tag)