[Zope-CVS] CVS: Packages/HTMLStructure - Wrapper.py:1.1 Parser.py:1.3 Printer.py:1.2 Validator.py:1.2
Evan Simpson
evan@zope.com
Mon, 28 Jan 2002 14:25:02 -0500
Update of /cvs-repository/Packages/HTMLStructure
In directory cvs.zope.org:/tmp/cvs-serv28646
Modified Files:
Parser.py Printer.py Validator.py
Added Files:
Wrapper.py
Log Message:
Refactored, added sample HTML for tests.
=== Added File Packages/HTMLStructure/Wrapper.py ===
class WrapTags:
    '''Cursor over the 'tag' elements of a parsed text.

    The wrapped object must provide a ``parse`` sequence and an
    ``elementText(element)`` method.  As used here, each parse element
    is indexable: element[0] is its type name and element[3] is the
    list of its sub-elements.

    ``index`` is the parse index of the current tag (-1 before any
    move has succeeded) and ``tag`` is the current tag element.
    The t* accessors require a current tag, so call next(), prev()
    or goto() successfully before using them.
    '''

    index = -1
    tag = None

    def __init__(self, parsed_text):
        self.parsed_text = parsed_text

    def next(self):
        '''Move to next tag.

        Return the tag element, or None (leaving the cursor where it
        was) if there is no tag after the current position.
        '''
        parse = self.parsed_text.parse
        for i in range(self.index + 1, len(parse)):
            elem = parse[i]
            if elem[0] == 'tag':
                self.index = i
                self.tag = elem
                return elem
        return None

    def prev(self):
        '''Move to previous tag.

        An unpositioned cursor (index < 0) starts from the end of the
        parse.  Return the tag element, or None (leaving the cursor
        where it was) if there is no earlier tag.
        '''
        parse = self.parsed_text.parse
        i = self.index
        if i < 0:
            i = len(parse)
        while i:
            i -= 1
            elem = parse[i]
            if elem[0] == 'tag':
                self.index = i
                self.tag = elem
                return elem
        return None

    def goto(self, index):
        '''Move to a specified parse index.

        Return the tag element, or None (leaving the cursor where it
        was) if the index is invalid or the element there is not a tag.
        '''
        parse = self.parsed_text.parse
        try:
            elem = parse[index]
        except (IndexError, TypeError):
            # Out of range or not usable as an index: best-effort no-op.
            return None
        if elem[0] != 'tag':
            return None
        self.index = index
        self.tag = elem
        return elem

    def tName(self):
        '''Tag name, lowercased.

        Return None if the current tag is neither an opening nor a
        closing tag.
        '''
        # The first sub-element of a tag is the type element; the first
        # sub-element of an open/close type element is the name.
        sub = self.tag[3][0]
        if sub[0] in ('open', 'close'):
            return self.parsed_text.elementText(sub[3][0]).lower()
        return None

    def tType(self, is_empty=None):
        '''Get the open/close/empty type of the tag.

        is_empty, if given, is called with the lowercased tag name and
        should return true for tags that are empty by definition.

        Return None for elements that are not one of
        these three types of tag.
        '''
        # The first sub-element of a tag is the type element.
        sub = self.tag[3][0]
        ttype = sub[0]
        # We only care about opening, closing, and empty tags
        if ttype == 'close':
            return ttype
        if ttype == 'open':
            # An open tag is empty if it is explicitly marked so by its
            # last sub-element, or if the caller's predicate says so.
            if (sub[3][-1][0] == 'empty' or
                (is_empty and is_empty(self.tName()))):
                return 'empty'
            return ttype
        return None

    def _attrs(self):
        '''Return a list of (name, value) pairs for the current tag.

        Names keep their original case; value is None for an attribute
        with no value sub-element.  Return None unless the current tag
        is an open tag whose second sub-element is an 'attrs' element.
        '''
        sub = self.tag[3][0]
        if sub[0] != 'open' or len(sub[3]) < 2 or sub[3][1][0] != 'attrs':
            return None
        get_text = self.parsed_text.elementText
        attrs = []
        for attr in sub[3][1][3]:
            parts = attr[3]
            name = get_text(parts[0])
            value = None
            if len(parts) == 2:
                value = get_text(parts[1])
            attrs.append((name, value))
        return attrs

    def tAttrDict(self):
        '''Tag attributes as a dictionary.

        Keys are lowercased attribute names; if a name is repeated the
        last value wins.  Return None if the tag has no attributes.
        '''
        attrs = self._attrs()
        if not attrs:
            return None
        amap = {}
        for name, value in attrs:
            amap[name.lower()] = value
        # The original built this mapping but forgot to return it.
        return amap

    def tAttrNames(self):
        '''Tag attribute names as a list, in source order and case.

        Return None if the tag has no attributes.
        '''
        attrs = self._attrs()
        if not attrs:
            return None
        return [name for name, value in attrs]
=== Packages/HTMLStructure/Parser.py 1.2 => 1.3 ===
if parseTables is None:
parseTables = ParseTables()
- self.parse = tag(text, parseTables.pt_html)
- self.linemap = getLines(text)
+ self.complete, self.parse, self.up_to = tag(text, parseTables.pt_html)
+ self.lines = getLines(text)
def posOfChar(self, index):
'''Convert a character index into a line number and line position.'''
line = self.lineOfChar(index)
if line > 0:
- index -= self.linemap[line - 1]
+ index -= self.lines[line - 1]
return line, index
def lineOfChar(self, index):
'''Convert a character index into a line number.'''
- return bisect(self.linemap, index)
+ return bisect(self.lines, index)
def elementText(self, element):
'''Get the text for a parse element.'''
return self.text[element[1]:element[2]]
-
- def tagName(self, tag):
- '''Get the name of a tag parse element.'''
- return self.elementText(tag[3][0][3][0]).lower()
-
- def tagType(self, tag, is_empty=None):
- '''Get the open/close/empty type of a tag.
-
- Return None for elements that are not tags, or not one of
- these three types of tag.
- '''
- if tag[0] != 'tag':
- return
- sub = tag[3][0]
- # The first sub-element of a tag is the type.
- ttype = sub[0]
- # We only care about opening, closing, and empty tags
- if ttype == 'close':
- return ttype
- if ttype == 'open':
- # Check whether an open tag is empty
- if (sub[3][-1][0] == 'empty' or
- (is_empty and is_empty(self.tagName(tag))) ):
- return 'empty'
- return ttype
# Parse HTML. Needs to be read from bottom to top.
=== Packages/HTMLStructure/Printer.py 1.1 => 1.2 ===
out = []
output = out.append
- linelist = parsedtext.linemap
+ linelist = parsedtext.lines
- #linemap = {}
- #for sm in span_marks:
- # bline = bisect(linelist, sm[0])
- # eline = bisect(linelist, sm[1])
- # if sm[1] == linemap[eline]:
- # eline -= 1
- # linemap.setdefault(bline, [])
- # linemap[bline].append(sm)
- # if bline != eline:
- # linemap.setdefault(eline, [])
- # linemap[eline].append(sm)
+ # for each span mark, get the class, start, and stop.
+ # find out the line number for the start and stop
+ # each line has two lists: starts here, ends here.
+ # Add to starts of start, ends of end.
+
+ startmarks = {}
+ endmarks = {}
+ for sm in span_marks:
+ if sm[2] <= sm[1]:
+ continue
+ bline = bisect(linelist, sm[1])
+ eline = bisect(linelist, sm[2])
+ if sm[2] == linelist[eline]:
+ eline -= 1
+ startmarks.setdefault(bline, []).append(sm)
+ endmarks.setdefault(eline, []).append(sm)
output('<pre class="source">')
text = parsedtext.text
lend = 0
+ marks = []
for n in range(len(linelist)):
output('<a name="%s" class="linenumber">%s</a>' % (n + 1, n + 1))
lbegin = lend
lend = linelist[n]
+
+ for mark in marks:
+ output('<span class="%s">' % mark)
+
+ linemarks = []
+ for mark in startmarks.get(n, []):
+ linemarks.append((mark[1], 1, mark[0]))
+ for mark in endmarks.get(n, []):
+ linemarks.append((mark[2], 0, mark[0]))
+ linemarks.sort()
+
+ for index, is_start, mark in linemarks:
+ output(escape(text[lbegin:index]))
+ if is_start:
+ output('<span class="%s">' % mark)
+ marks.append(mark)
+ else:
+ output('</span>')
+ marks.pop()
+ lbegin = index
output(escape(text[lbegin:lend]))
+
+ for mark in marks:
+ output('</span>')
output('</pre>')
return ''.join(out)
=== Packages/HTMLStructure/Validator.py 1.1 => 1.2 ===
from bisect import bisect
class Validator:
@@ -11,7 +12,7 @@
'''Check all character entity references'''
all = []
errors = []
- self._collect_entities(self.pt.parse[1], all)
+ self._collect_entities(self.pt.parse, all)
for entity in all:
sub = entity[3]
if sub[-1][0] == 'error' or sub[0][2] - sub[0][1] > 32:
@@ -45,14 +46,19 @@
unclosed = []
unopened = []
- parse = self.pt.parse[1]
- tagName = self.pt.tagName
unop_names = []
- # Scan non-empty tags (in reverse, as collect_tags returns them)
- for (i, ttype) in self.collect_tags(empty=0):
- tname = tagName(parse[i])
- if ttype == 'close':
+ # Scan non-empty tags in reverse order
+ tags = Wrapper.WrapTags(self.pt)
+ while tags.prev():
+ tag = tags.tag
+ tname = tags.tName()
+ ttype = tags.tType(self.is_empty)
+ i = tags.index
+
+ if not ttype or ttype == 'empty':
+ pass
+ elif ttype == 'close':
unopened.append(i)
unop_names.append(tname)
elif unopened and unop_names[-1] == tname:
@@ -66,32 +72,6 @@
unclosed.reverse()
return matched, unclosed, unopened
- def collect_tags(self, opening=1, closing=1, empty=1):
- '''Return a list of tags (in reverse order).
-
- Elements of the list are (parse index, tag type) pairs.
- Arguments to the method indicate which types of tag to include.
- '''
- parse = self.pt.parse[1]
- tagName = self.pt.tagName
- tagType = self.pt.tagType
- is_empty = self.is_empty
- octags = []
- # Scan backwards
- i = len(parse)
- while i:
- i = i - 1
- ttype = tagType(parse[i], is_empty)
- # Only bother with open/close/empty tags
- if ttype is None:
- continue
- # Skip tag types that aren't asked for
- if ((opening and ttype == 'open') or
- (closing and ttype == 'close') or
- (empty and ttype == 'empty')):
- octags.append((i, ttype))
- return octags
-
def implicit_match_tags(self, matched, unclosed):
'''Try to find implicit closing tags for unmatched opening tags.
@@ -108,16 +88,15 @@
# Save effort
if not unclosed:
return (), ()
- parse = self.pt.parse[1]
- tagName = self.pt.tagName
impclose_following_map = self.impclose_following_map
impclose_enclosing_map = self.impclose_enclosing_map
new_matched = []
still_unclosed = []
+ tags = Wrapper.WrapTags(self.pt)
for pidx in unclosed:
- tag = parse[pidx]
- tname = tagName(tag)
+ tags.goto(pidx)
+ tname = tags.tName()
enc_closers = impclose_enclosing_map.get(tname)
if enc_closers is None:
still_unclosed.append(pidx)
@@ -127,7 +106,8 @@
while 1:
idx = tag_iter.next()
if idx is None: break
- ftname = tagName(parse[idx])
+ tags.goto(idx)
+ ftname = tags.tName()
if ftname in closers:
break
if idx is None:
@@ -138,7 +118,8 @@
# Implicit close at EOF
idx = -1
else:
- etname = tagName(parse[enc_tag[0]])
+ tags.goto(enc_tag[0])
+ etname = tags.tName()
if etname in enc_closers or "" in enc_closers:
# Implicit close at end of enclosing tag
idx = enc_tag[1]
@@ -152,7 +133,6 @@
stop = 0
enc_tag = None
def __init__(self, matched, unclosed, pidx):
- #import pdb; pdb.set_trace()
self.matched = matched
self.unclosed = unclosed
mpos = bisect(matched, (pidx,))