[ZPT] CVS: Packages/TAL - HTMLParser.py:1.7
guido@digicool.com
guido@digicool.com
Thu, 22 Mar 2001 12:16:24 -0500 (EST)
Update of /cvs-repository/Packages/TAL
In directory korak:/tmp/cvs-serv27857
Modified Files:
HTMLParser.py
Log Message:
Yet another major redesign, simplifying (I hope) the logic of
goahead(). This rejects most invalid constructs: a lone & or <
without proper syntax following it is now an error, even at EOF.
--- Updated File HTMLParser.py in package Packages/TAL --
--- HTMLParser.py 2001/03/21 23:13:13 1.6
+++ HTMLParser.py 2001/03/22 17:16:24 1.7
@@ -14,10 +14,7 @@
# Regular expressions used for parsing
interesting = re.compile('[&<]')
-incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
- '<([a-zA-Z][^<>]*|'
- '/([a-zA-Z][^<>]*)?|'
- '![^<>]*)?')
+incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*|#[0-9]*)?')
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#([0-9]+)[^0-9]')
@@ -25,7 +22,8 @@
starttagopen = re.compile('<[a-zA-Z]')
piopen = re.compile(r'<\?')
piclose = re.compile('>')
-endtagopen = re.compile('</[a-zA-Z]')
+endtagopen = re.compile('</')
+declopen = re.compile('<!')
special = re.compile('<![^<>]*>')
commentopen = re.compile('<!--')
commentclose = re.compile(r'--\s*>')
@@ -36,7 +34,7 @@
locatestarttagend = re.compile("('[^']*'|\"[^\"]*\"|[^'\">]+)*/?>")
endstarttag = re.compile(r"\s*/?>")
-endendtag = re.compile('[>]')
+endendtag = re.compile('>')
declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
@@ -83,19 +81,9 @@
self.rawdata = ''
self.stack = []
self.lasttag = '???'
- self.nomoretags = 0
- self.literal = 0
self.lineno = 1
self.offset = 0
- # For derived classes only -- enter literal mode (CDATA) till EOF
- def setnomoretags(self):
- self.nomoretags = self.literal = 1
-
- # For derived classes only -- enter literal mode (CDATA)
- def setliteral(self, *args):
- self.literal = 1
-
# Interface -- feed some data to the parser. Call this as
# often as you want, with as little or as much text as you
# want (may include '\n'). (This just saves the text, all the
@@ -129,6 +117,12 @@
def getpos(self):
return self.lineno, self.offset
+ __starttag_text = None
+
+ # Interface -- return full source of start tag: "<...>"
+ def get_starttag_text(self):
+ return self.__starttag_text
+
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker.
@@ -137,70 +131,41 @@
i = 0
n = len(rawdata)
while i < n:
- if self.nomoretags:
- self.handle_data(rawdata[i:n])
- i = self.updatepos(i, n)
- break
- match = interesting.search(rawdata, i)
- if match: j = match.start(0)
+ match = interesting.search(rawdata, i) # < or &
+ if match: j = match.start()
else: j = n
if i < j: self.handle_data(rawdata[i:j])
i = self.updatepos(i, j)
if i == n: break
- assert rawdata[i] in "<&", "interesting.search() lied"
if rawdata[i] == '<':
- if starttagopen.match(rawdata, i):
- if self.literal:
- self.handle_data(rawdata[i])
- i = self.updatepos(i, i+1)
- continue
+ if starttagopen.match(rawdata, i): # < + letter
k = self.parse_starttag(i)
- if k < 0: break
- i = self.updatepos(i, k)
- continue
- if endtagopen.match(rawdata, i):
+ elif endtagopen.match(rawdata, i): # </
k = self.parse_endtag(i)
- if k < 0: break
- i = self.updatepos(i, k)
- self.literal = 0
- continue
- if commentopen.match(rawdata, i):
- if self.literal:
- self.handle_data(rawdata[i])
- i = self.updatepos(i, i+1)
- continue
+ elif commentopen.match(rawdata, i): # <!--
k = self.parse_comment(i)
- if k < 0: break
- i = self.updatepos(i, i+k)
- continue
- if piopen.match(rawdata, i):
- if self.literal:
- self.handle_data(rawdata[i])
- i = self.updatepos(i, i+1)
- continue
+ elif piopen.match(rawdata, i): # <?
k = self.parse_pi(i)
- if k < 0: break
- i = self.updatepos(i, i+k)
- continue
- match = special.match(rawdata, i)
- if match:
- if self.literal:
- self.handle_data(rawdata[i])
- i = self.updatepos(i, i+1)
- continue
- # This is some sort of declaration; in "HTML as
- # deployed," this should only be the document type
- # declaration ("<!DOCTYPE html...>").
+ elif declopen.match(rawdata, i): # <!
k = self.parse_declaration(i)
- if k < 0: break
- i = self.updatepos(i, k)
- continue
+ else:
+ if i < n-1:
+ raise HTMLParseError(
+ "invalid '<' construct: %s" % `rawdata[i:i+2]`,
+ self.getpos())
+ k = -1
+ if k < 0:
+ if end:
+ raise HTMLParseError("EOF in middle of construct",
+ self.getpos())
+ break
+ i = self.updatepos(i, k)
elif rawdata[i] == '&':
match = charref.match(rawdata, i)
if match:
name = match.group(1)
self.handle_charref(name)
- k = match.end(0)
+ k = match.end()
if rawdata[k-1] != ';':
k = k-1
i = self.updatepos(i, k)
@@ -209,49 +174,51 @@
if match:
name = match.group(1)
self.handle_entityref(name)
- k = match.end(0)
+ k = match.end()
if rawdata[k-1] != ';':
k = k-1
i = self.updatepos(i, k)
continue
- # We get here only if incomplete matches but
- # nothing else
- match = incomplete.match(rawdata, i)
- if not match:
- self.handle_data(rawdata[i])
- i = self.updatepos(i, i+1)
- continue
- j = match.end(0)
- if j == n:
- break # Really incomplete
- self.handle_data(rawdata[i:j])
- i = self.updatepos(i, j)
+ if incomplete.match(rawdata, i):
+ if end:
+ raise HTMLParseError(
+ "EOF in middle of entity or char ref",
+ self.getpos())
+ return -1 # incomplete
+ raise HTMLParseError("'&' not part of entity or char ref",
+ self.getpos())
+ else:
+ assert 0, "interesting.search() lied"
# end while
if end and i < n:
self.handle_data(rawdata[i:n])
i = self.updatepos(i, n)
self.rawdata = rawdata[i:]
- # XXX if end: check for empty stack
- # Internal -- parse comment, return length or -1 if not terminated
+ # Internal -- parse comment, return end or -1 if not terminated
def parse_comment(self, i):
rawdata = self.rawdata
assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()'
match = commentclose.search(rawdata, i+4)
if not match:
return -1
- j = match.start(0)
+ j = match.start()
self.handle_comment(rawdata[i+4: j])
- j = match.end(0)
- return j-i
+ j = match.end()
+ return j
# Internal -- parse declaration.
def parse_declaration(self, i):
+ # This is some sort of declaration; in "HTML as
+ # deployed," this should only be the document type
+ # declaration ("<!DOCTYPE html...>").
rawdata = self.rawdata
j = i + 2
+ assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
# in practice, this should look like: ((name|stringlit) S*)+ '>'
- while 1:
- c = rawdata[j:j+1]
+ n = len(rawdata)
+ while j < n:
+ c = rawdata[j]
if c == ">":
# end of declaration syntax
self.handle_decl(rawdata[i+2:j])
@@ -259,55 +226,46 @@
if c in "\"'":
m = declstringlit.match(rawdata, j)
if not m:
- # incomplete or an error?
- return -1
+ return -1 # incomplete
j = m.end()
elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
m = declname.match(rawdata, j)
if not m:
- # incomplete or an error?
- return -1
+ return -1 # incomplete
j = m.end()
- elif i == len(rawdata):
- # end of buffer between tokens
- return -1
else:
raise HTMLParseError(
- "unexpected char in declaration: %s" % `rawdata[i]`,
+ "unexpected char in declaration: %s" % `rawdata[j]`,
self.getpos())
- assert 0, "can't get here!"
+ return -1 # incomplete
- # Internal -- parse processing instr, return length or -1 if not terminated
+ # Internal -- parse processing instr, return end or -1 if not terminated
def parse_pi(self, i):
rawdata = self.rawdata
assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
- match = piclose.search(rawdata, i+2)
+ match = piclose.search(rawdata, i+2) # >
if not match:
return -1
- j = match.start(0)
+ j = match.start()
self.handle_pi(rawdata[i+2: j])
- j = match.end(0)
- return j-i
-
- __starttag_text = None
- def get_starttag_text(self):
- return self.__starttag_text
+ j = match.end()
+ return j
- # Internal -- handle starttag, return length or -1 if not terminated
+ # Internal -- handle starttag, return end or -1 if not terminated
def parse_starttag(self, i):
self.__starttag_text = None
rawdata = self.rawdata
- m = locatestarttagend.match(rawdata, i)
+ m = locatestarttagend.match(rawdata, i) # > outside quotes
if not m:
return -1
- endpos = m.end(0)
+ endpos = m.end()
self.__starttag_text = rawdata[i:endpos]
# Now parse the data between i+1 and j into a tag and attrs
attrs = []
match = tagfind.match(rawdata, i+1)
assert match, 'unexpected call to parse_starttag()'
- k = match.end(0)
+ k = match.end()
self.lasttag = tag = string.lower(rawdata[i+1:k])
while k < endpos:
@@ -322,7 +280,7 @@
attrvalue = attrvalue[1:-1]
attrvalue = self.unescape(attrvalue)
attrs.append((string.lower(attrname), attrvalue))
- k = m.end(0)
+ k = m.end()
end = string.strip(rawdata[k:endpos])
if end not in (">", "/>"):
@@ -343,16 +301,19 @@
self.finish_starttag(tag, attrs)
return endpos
- # Internal -- parse endtag
+ # Internal -- parse endtag, return end or -1 if incomplete
def parse_endtag(self, i):
rawdata = self.rawdata
- match = endendtag.search(rawdata, i+1)
+ assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
+ match = endendtag.search(rawdata, i+1) # >
if not match:
return -1
- j = match.start(0)
- tag = string.lower(string.strip(rawdata[i+2:j]))
+ j = match.end()
+ tag = string.lower(string.strip(rawdata[i+2:j-1]))
+ if not tag:
+ raise HTMLParseError("empty start tag", self.getpos())
self.finish_endtag(tag)
- return j + 1
+ return j
# Overridable -- finish processing of start+end tag: <tag.../>
def finish_startendtag(self, tag, attrs):
@@ -360,7 +321,6 @@
self.finish_endtag(tag)
# Overridable -- finish processing of start tag
- # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
def finish_starttag(self, tag, attrs):
try:
method = getattr(self, 'start_' + tag)