[ZPT] CVS: Packages/TAL - HTMLParser.py:1.2
fred@digicool.com
fred@digicool.com
Mon, 19 Mar 2001 22:22:13 -0500 (EST)
Update of /cvs-repository/Packages/TAL
In directory korak:/tmp/cvs-serv4456
Modified Files:
HTMLParser.py
Log Message:
Removed the case where "self" was passed to self.updatepos() as an extra argument.
Rewrote the parsing of start-tag attributes to be more robust and to detect
substantial errors more reliably. (It now works even if "<" or ">"
occurs in an attribute value!)
--- Updated File HTMLParser.py in package Packages/TAL --
--- HTMLParser.py 2001/03/19 17:18:18 1.1
+++ HTMLParser.py 2001/03/20 03:22:12 1.2
@@ -24,7 +24,6 @@
piopen = re.compile('<\?')
piclose = re.compile('>')
endtagopen = re.compile('</[a-zA-Z]')
-endbracket = re.compile('[<>]')
special = re.compile('<![^<>]*>')
commentopen = re.compile('<!--')
commentclose = re.compile(r'--\s*>')
@@ -33,6 +32,10 @@
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')
+locatestarttagend = re.compile("('[^']*'|\"[^\"]*\"|[^'\">]+)*/?>")
+endstarttag = re.compile(r"\s*/?>")
+endendtag = re.compile('[>]')
+
declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
@@ -215,7 +218,7 @@
if j == n:
break # Really incomplete
self.handle_data(rawdata[i:j])
- i = self.updatepos(self, i, j)
+ i = self.updatepos(i, j)
# end while
if end and i < n:
self.handle_data(rawdata[i:n])
@@ -290,14 +293,13 @@
# Internal -- handle starttag, return length or -1 if not terminated
def parse_starttag(self, i):
self.__starttag_text = None
- start_pos = i
rawdata = self.rawdata
- # XXX The following should skip matching quotes (' or ")
- match = endbracket.search(rawdata, i+1)
- if not match:
+ m = locatestarttagend.match(rawdata, i)
+ if not m:
return -1
- self.__starttag_text = rawdata[i:match.end()]
- j = match.start(0)
+ endpos = m.end(0)
+ self.__starttag_text = rawdata[i:endpos]
+
# Now parse the data between i+1 and j into a tag and attrs
attrs = []
match = tagfind.match(rawdata, i+1)
@@ -305,9 +307,9 @@
raise HTMLParseError('unexpected call to parse_starttag()',
self.getpos())
k = match.end(0)
- tag = string.lower(rawdata[i+1:k])
- self.lasttag = tag
- while k < j:
+ self.lasttag = tag = string.lower(rawdata[i+1:k])
+
+ while k < endpos:
m = attrfind.match(rawdata, k)
if not m:
break
@@ -317,32 +319,38 @@
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
- attrvalue = self.unescape(attrvalue)
+ attrvalue = self.unescape(attrvalue)
attrs.append((string.lower(attrname), attrvalue))
k = m.end(0)
- if rawdata[j:j+1] == '/>':
- explicit_empty = 1
- j = j + 2
- elif rawdata[j] == '>':
- j = j + 1
+
+ end = string.strip(rawdata[k:endpos])
+ if end not in (">", "/>"):
+ lineno, offset = self.getpos()
+ if "\n" in self.__starttag_text:
+ lineno = lineno + string.count(self.__starttag_text, "\n")
+ offset = len(self.__starttag_text) \
+ - string.rfind(self.__starttag_text, "\n")
+ else:
+ offset = offset + len(self.__starttag_text)
+ raise HTMLParseError("junk characters in start tag: %s"
+ % `rawdata[k:endpos][:20]`,
+ (lineno, offset))
self.finish_starttag(tag, attrs)
- if self.__starttag_text[-2:] == '/>':
+ if end[-2:] == '/>':
# XHTML-style empty tag: <span attr="value" />
self.finish_endtag(tag)
- return j
+ return endpos
# Internal -- parse endtag
def parse_endtag(self, i):
rawdata = self.rawdata
- match = endbracket.search(rawdata, i+1)
+ match = endendtag.search(rawdata, i+1)
if not match:
return -1
j = match.start(0)
tag = string.lower(string.strip(rawdata[i+2:j]))
- if rawdata[j] == '>':
- j = j+1
self.finish_endtag(tag)
- return j
+ return j + 1
# Internal -- finish processing of start tag
# Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag