[ZPT] CVS: Packages/TAL - HTMLParser.py:1.2

fred@digicool.com fred@digicool.com
Mon, 19 Mar 2001 22:22:13 -0500 (EST)


Update of /cvs-repository/Packages/TAL
In directory korak:/tmp/cvs-serv4456

Modified Files:
	HTMLParser.py 
Log Message:
Remove a case where "self" was passed explicitly as an extra argument to self.updatepos().
 
Rewrote the parsing of attributes to be more robust and to detect
substantial errors more reliably.  (It now works even if "<" or ">"
occurs in an attribute value!)



--- Updated File HTMLParser.py in package Packages/TAL --
--- HTMLParser.py	2001/03/19 17:18:18	1.1
+++ HTMLParser.py	2001/03/20 03:22:12	1.2
@@ -24,7 +24,6 @@
 piopen = re.compile('<\?')
 piclose = re.compile('>')
 endtagopen = re.compile('</[a-zA-Z]')
-endbracket = re.compile('[<>]')
 special = re.compile('<![^<>]*>')
 commentopen = re.compile('<!--')
 commentclose = re.compile(r'--\s*>')
@@ -33,6 +32,10 @@
     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
     r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')
 
+locatestarttagend = re.compile("('[^']*'|\"[^\"]*\"|[^'\">]+)*/?>")
+endstarttag = re.compile(r"\s*/?>")
+endendtag = re.compile('[>]')
+
 declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
 declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
 
@@ -215,7 +218,7 @@
             if j == n:
                 break # Really incomplete
             self.handle_data(rawdata[i:j])
-            i = self.updatepos(self, i, j)
+            i = self.updatepos(i, j)
         # end while
         if end and i < n:
             self.handle_data(rawdata[i:n])
@@ -290,14 +293,13 @@
     # Internal -- handle starttag, return length or -1 if not terminated
     def parse_starttag(self, i):
         self.__starttag_text = None
-        start_pos = i
         rawdata = self.rawdata
-        # XXX The following should skip matching quotes (' or ")
-        match = endbracket.search(rawdata, i+1)
-        if not match:
+        m = locatestarttagend.match(rawdata, i)
+        if not m:
             return -1
-        self.__starttag_text = rawdata[i:match.end()]
-        j = match.start(0)
+        endpos = m.end(0)
+        self.__starttag_text = rawdata[i:endpos]
+
         # Now parse the data between i+1 and j into a tag and attrs
         attrs = []
         match = tagfind.match(rawdata, i+1)
@@ -305,9 +307,9 @@
             raise HTMLParseError('unexpected call to parse_starttag()',
                                  self.getpos())
         k = match.end(0)
-        tag = string.lower(rawdata[i+1:k])
-        self.lasttag = tag
-        while k < j:
+        self.lasttag = tag = string.lower(rawdata[i+1:k])
+
+        while k < endpos:
             m = attrfind.match(rawdata, k)
             if not m:
                 break
@@ -317,32 +319,38 @@
             elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                  attrvalue[:1] == '"' == attrvalue[-1:]:
                 attrvalue = attrvalue[1:-1]
-            attrvalue = self.unescape(attrvalue)
+                attrvalue = self.unescape(attrvalue)
             attrs.append((string.lower(attrname), attrvalue))
             k = m.end(0)
-        if rawdata[j:j+1] == '/>':
-            explicit_empty = 1
-            j = j + 2
-        elif rawdata[j] == '>':
-            j = j + 1
+
+        end = string.strip(rawdata[k:endpos])
+        if end not in (">", "/>"):
+            lineno, offset = self.getpos()
+            if "\n" in self.__starttag_text:
+                lineno = lineno + string.count(self.__starttag_text, "\n")
+                offset = len(self.__starttag_text) \
+                         - string.rfind(self.__starttag_text, "\n")
+            else:
+                offset = offset + len(self.__starttag_text)
+            raise HTMLParseError("junk characters in start tag: %s"
+                                 % `rawdata[k:endpos][:20]`,
+                                 (lineno, offset))
         self.finish_starttag(tag, attrs)
-        if self.__starttag_text[-2:] == '/>':
+        if end[-2:] == '/>':
             # XHTML-style empty tag: <span attr="value" />
             self.finish_endtag(tag)
-        return j
+        return endpos
 
     # Internal -- parse endtag
     def parse_endtag(self, i):
         rawdata = self.rawdata
-        match = endbracket.search(rawdata, i+1)
+        match = endendtag.search(rawdata, i+1)
         if not match:
             return -1
         j = match.start(0)
         tag = string.lower(string.strip(rawdata[i+2:j]))
-        if rawdata[j] == '>':
-            j = j+1
         self.finish_endtag(tag)
-        return j
+        return j + 1
 
     # Internal -- finish processing of start tag
     # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag