[Zpt] CVS: Packages/TAL - HTMLTALParser.py:1.4
fred@digiciool.com
fred@digiciool.com
Wed, 14 Mar 2001 15:14:59 -0500 (EST)
Update of /cvs-repository/Packages/TAL
In directory korak:/tmp/cvs-serv22124
Modified Files:
HTMLTALParser.py
Log Message:
Add support for reasonable closing of open elements for which end tags
are considered optional. This does not always place the closing tags
as a human editor would, but makes the structure correct.
Remove support for the empty start and end tags, which are no longer
reported by nsgmllib (an SGML feature not used with HTML).
--- Updated File HTMLTALParser.py in package Packages/TAL --
--- HTMLTALParser.py 2001/03/14 16:35:42 1.3
+++ HTMLTALParser.py 2001/03/14 20:14:59 1.4
@@ -7,16 +7,35 @@
"compact", "nowrap", "ismap", "declare", "noshade", "checked",
"disabled", "readonly", "multiple", "selected", "noresize",
"defer"
-]
+ ]
EMPTY_HTML_TAGS = [
# List of HTML tags with an empty content model; these are
# rendered in minimized form, e.g. <img />.
# From http://www.w3.org/TR/xhtml1/#dtds
"base", "meta", "link", "hr", "br", "param", "img", "area",
- "input", "col", "basefont", "isindex", "frame",
-]
+ "input", "col", "basefont", "isindex", "frame",
+ ]
+PARA_LEVEL_HTML_TAGS = [
+ # List of HTML elements that close open paragraph-level elements
+ # and are themselves paragraph-level.
+ "h1", "h2", "h3", "h4", "h5", "h6", "p",
+ ]
+
+CLOSING_BLOCK_LEVEL_HTML_TAGS = [
+ # These are HTML tags that close others in this list, but are not
+ # closed by paragraph-level tags. They don't close across other
+ # block-level boundaries.
+ "li", "dt", "dd", "td", "th", "tr",
+ ]
+
+BLOCK_LEVEL_HTML_TAGS = [
+ # List of HTML tags that denote larger sections than paragraphs.
+ "blockquote", "table", "tr", "th", "td", "thead", "tfoot", "tbody",
+ "noframe", "ul", "ol", "li", "dl", "dt", "dd", "div",
+ ]
+
from TALGenerator import TALGenerator
class HTMLTALParser(SGMLParser):
@@ -40,8 +59,8 @@
self.close()
while self.tagstack:
self.finish_endtag(None)
- assert self.tagstack == []
- assert self.nsstack == []
+ assert self.tagstack == [], self.tagstack
+ assert self.nsstack == [], self.nsstack
assert self.nsdict == {}, self.nsdict
def getCode(self):
@@ -68,21 +87,46 @@
def finish_starttag(self, tag, attrs):
self.scan_xmlns(attrs)
- if tag not in EMPTY_HTML_TAGS:
+ if tag in EMPTY_HTML_TAGS:
+ print "<%s>" % tag
+ self.pop_xmlns()
+ elif tag in CLOSING_BLOCK_LEVEL_HTML_TAGS:
+ close_to = -1
+ for i in range(len(self.tagstack)):
+ t = self.tagstack[i]
+ if t in CLOSING_BLOCK_LEVEL_HTML_TAGS:
+ close_to = i
+ elif t in BLOCK_LEVEL_HTML_TAGS:
+ close_to = -1
+ self._close_to_level(close_to)
+ self.tagstack.append(tag)
+ elif tag in PARA_LEVEL_HTML_TAGS + BLOCK_LEVEL_HTML_TAGS:
+ close_to = -1
+ for i in range(len(self.tagstack)):
+ if self.tagstack[i] in BLOCK_LEVEL_HTML_TAGS:
+ close_to = -1
+ elif self.tagstack[i] in PARA_LEVEL_HTML_TAGS:
+ if close_to == -1:
+ close_to = i
self.tagstack.append(tag)
+ self._close_to_level(close_to)
else:
- self.pop_xmlns()
+ self.tagstack.append(tag)
self.gen.emitStartTag(tag, attrs)
+ def _close_to_level(self, close_to):
+ if close_to > -1:
+ closing = self.tagstack[close_to:]
+ closing.reverse()
+ for t in closing:
+ self.finish_endtag(t)
+
def finish_endtag(self, tag):
if tag not in EMPTY_HTML_TAGS:
- if not tag:
- tag = self.tagstack.pop()
- else:
- assert tag in self.tagstack
- while self.tagstack[-1] != tag:
- self.finish_endtag(None)
- self.tagstack.pop()
+ assert tag in self.tagstack
+ while self.tagstack[-1] != tag:
+ self.finish_endtag(self.tagstack[-1])
+ self.tagstack.pop()
self.pop_xmlns()
self.gen.emitEndTag(tag)