[ZPT] CVS: Packages/TAL - HTMLParser.py:1.13
fred@digicool.com
fred@digicool.com
Fri, 6 Apr 2001 18:23:32 -0400 (EDT)
Update of /cvs-repository/Packages/TAL
In directory korak:/tmp/cvs-serv7643
Modified Files:
HTMLParser.py
Log Message:
Add general support for elements with CDATA content; this is enabled by default for
<script> and <style>, since those are the only such elements I found in HTML 4.01.
--- Updated File HTMLParser.py in package Packages/TAL --
--- HTMLParser.py 2001/04/06 20:09:40 1.12
+++ HTMLParser.py 2001/04/06 22:23:31 1.13
@@ -13,7 +13,8 @@
# Regular expressions used for parsing
-interesting = re.compile('[&<]')
+interesting_normal = re.compile('[&<]')
+interesting_cdata = re.compile('</')
incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*|#[0-9]*)?')
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
@@ -85,9 +86,11 @@
class HTMLParser:
+ CDATA_CONTENT_ELEMENTS = ("script", "style")
+
+
# Interface -- initialize and reset this instance
- def __init__(self, verbose=0):
- self.verbose = verbose
+ def __init__(self):
self.reset()
# Interface -- reset this instance. Loses all unprocessed data
@@ -97,6 +100,7 @@
self.lasttag = '???'
self.lineno = 1
self.offset = 0
+ self.interesting = interesting_normal
# Interface -- feed some data to the parser. Call this as
# often as you want, with as little or as much text as you
@@ -137,6 +141,9 @@
def get_starttag_text(self):
return self.__starttag_text
+ def set_cdata_mode(self):
+ self.interesting = interesting_cdata
+
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker.
@@ -145,9 +152,12 @@
i = 0
n = len(rawdata)
while i < n:
- match = interesting.search(rawdata, i) # < or &
- if match: j = match.start()
- else: j = n
+ match = self.interesting.search(rawdata, i) # < or &
+ if match:
+ j = match.start()
+ self.interesting = interesting_normal
+ else:
+ j = n
if i < j: self.handle_data(rawdata[i:j])
i = self.updatepos(i, j)
if i == n: break
@@ -312,6 +322,8 @@
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
+ if tag in self.CDATA_CONTENT_ELEMENTS:
+ self.set_cdata_mode()
return endpos
# Internal -- check to see if we have a complete starttag; return end