[ZPT] CVS: Packages/TAL - HTMLParser.py:1.12
fred@digicool.com
fred@digicool.com
Fri, 6 Apr 2001 16:09:42 -0400 (EDT)
Update of /cvs-repository/Packages/TAL
In directory korak:/tmp/cvs-serv19345
Modified Files:
HTMLParser.py
Log Message:
locatestarttagend: Completely re-write the expression so as to be much
more strict about matching only what's legal. The expression ends
up being a bit more complex, and needs additional checks to be
done on what follows.
HTMLParser.check_for_whole_start_tag(): Helper method that uses
locatestarttagend, performs the required additional checks, and
determines whether we've actually found the end of the start tag,
are at a buffer boundary, or have encountered a syntactical
error.
HTMLParser.parse_starttag(): Use check_for_whole_start_tag() to see
if we really have the start tag.
HTMLParseError.__init__(): Simplify assertion.
This should close ZPT(18).
--- Updated File HTMLParser.py in package Packages/TAL --
--- HTMLParser.py 2001/03/26 16:48:32 1.11
+++ HTMLParser.py 2001/04/06 20:09:40 1.12
@@ -32,7 +32,20 @@
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')
-locatestarttagend = re.compile("('[^']*'|\"[^\"]*\"|[^'\">]+)*/?>")
+locatestarttagend = re.compile(r"""
+ <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
+ (?:\s+ # whitespace before attribute name
+ (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
+ (?:\s*=\s* # value indicator
+ (?:'[^']*' # LITA-enclosed value
+ |\"[^\"]*\" # LIT-enclosed value
+ |[^'\">\s]+ # bare value
+ )
+ )?
+ )
+ )*
+ \s* # trailing whitespace
+""", re.VERBOSE)
endstarttag = re.compile(r"\s*/?>")
endendtag = re.compile('>')
endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
@@ -45,7 +58,7 @@
"""Exception raised for all parse errors."""
def __init__(self, msg, position=(None, None)):
- assert msg != ""
+ assert msg
self.msg = msg
self.lineno = position[0]
self.offset = position[1]
@@ -255,11 +268,10 @@
# Internal -- handle starttag, return end or -1 if not terminated
def parse_starttag(self, i):
self.__starttag_text = None
+ endpos = self.check_for_whole_start_tag(i)
+ if endpos < 0:
+ return endpos
rawdata = self.rawdata
- m = locatestarttagend.match(rawdata, i) # > outside quotes
- if not m:
- return -1
- endpos = m.end()
self.__starttag_text = rawdata[i:endpos]
# Now parse the data between i+1 and j into a tag and attrs
@@ -301,6 +313,29 @@
else:
self.handle_starttag(tag, attrs)
return endpos
+
+ # Internal -- check to see if we have a complete starttag; return end
+ # or -1 if incomplete.
+ def check_for_whole_start_tag(self, i):
+ rawdata = self.rawdata
+ m = locatestarttagend.match(rawdata, i)
+ if m:
+ j = m.end()
+ next = rawdata[j:j+1]
+ if next == ">":
+ return j + 1
+ if rawdata[j:j+2] == "/>":
+ return j + 2
+ if next == "":
+ # end of input
+ return -1
+ if next in ("abcdefghijklmnopqrstuvwxyz="
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
+ # end of input in or before attribute value
+ return -1
+ self.updatepos(i, j)
+ raise HTMLParseError("malformed start tag", self.getpos())
+ raise AssertionError("we should not gt here!")
# Internal -- parse endtag, return end or -1 if incomplete
def parse_endtag(self, i):