[Zope-CVS] CVS: Packages/HTMLStructure - Wrapper.py:1.1 Parser.py:1.3 Printer.py:1.2 Validator.py:1.2

Evan Simpson evan@zope.com
Mon, 28 Jan 2002 14:25:02 -0500


Update of /cvs-repository/Packages/HTMLStructure
In directory cvs.zope.org:/tmp/cvs-serv28646

Modified Files:
	Parser.py Printer.py Validator.py 
Added Files:
	Wrapper.py 
Log Message:
Refactored, added sample HTML for tests.


=== Added File Packages/HTMLStructure/Wrapper.py ===
class WrapTags:
    '''Cursor over the 'tag' elements of a parsed text.

    The wrapped object must provide a "parse" sequence and an
    "elementText(element)" method.  Each parse element is indexed
    positionally here: item 0 is the element type name and item 3 is
    the list of sub-elements.

    "index" is the parse index of the current tag (-1 before any
    successful move) and "tag" is the current tag element (None before
    any successful move).  The t* methods read "tag", so the cursor
    must be positioned with next(), prev() or goto() before they are
    called.
    '''
    index = -1
    tag = None

    def __init__(self, parsed_text):
        '''Wrap "parsed_text"; the cursor starts before the first element.'''
        self.parsed_text = parsed_text

    def _select(self, index, elem):
        '''Make "elem", found at parse position "index", the current tag.'''
        self.index = index
        self.tag = elem
        return elem

    def next(self):
        '''Move to next tag

        Return the tag element, or None (leaving the cursor where it
        was) if there is no tag after the current position.
        '''
        i = self.index + 1
        parse = self.parsed_text.parse
        plen = len(parse)
        while i < plen:
            elem = parse[i]
            if elem[0] == 'tag':
                return self._select(i, elem)
            i += 1

    def prev(self):
        '''Move to previous tag

        Return the tag element, or None (leaving the cursor where it
        was) if there is no tag before the current position.  A cursor
        that has not been positioned yet scans from the end of the parse.
        '''
        i = self.index
        parse = self.parsed_text.parse
        if i < 0:
            # Not positioned yet: scan backwards from the very end.
            i = len(parse)
        while i:
            i -= 1
            elem = parse[i]
            if elem[0] == 'tag':
                return self._select(i, elem)

    def goto(self, index):
        '''Move to a specified parse index

        Return the tag element at "index".  Return None, leaving the
        cursor where it was, if "index" cannot be looked up or the
        element found there is not a tag.
        '''
        parse = self.parsed_text.parse
        try:
            elem = parse[index]
        except (IndexError, TypeError):
            # Out of range or not usable as an index: nothing to move to.
            return
        if elem[0] == 'tag':
            return self._select(index, elem)

    def tName(self):
        '''Tag name

        Return the lower-cased name of the current tag, or None if the
        tag is neither an opening nor a closing tag.
        '''
        sub = self.tag[3][0]
        if sub[0] in ('open', 'close'):
            return self.parsed_text.elementText(sub[3][0]).lower()

    def tType(self, is_empty=None):
        '''Get the open/close/empty type of the tag.

        "is_empty", if given, is called with the tag name and should
        return true for names that are to be treated as empty tags.

        Return None for elements that are not one of
        these three types of tag.
        '''
        # The first sub-element of a tag is the type element.
        sub = self.tag[3][0]
        ttype = sub[0]
        # We only care about opening, closing, and empty tags
        if ttype == 'close':
            return ttype
        if ttype == 'open':
            # Check whether an open tag is empty
            if (sub[3][-1][0] == 'empty' or
                (is_empty and is_empty(self.tName())) ):
                return 'empty'
            return ttype

    def _attrs(self):
        '''Return the current tag's attributes as (name, value) pairs.

        Names are returned as written in the text.  The value is None
        for an attribute that has no value sub-element.  Return None if
        the tag is not an opening tag or has no 'attrs' sub-element.
        '''
        sub = self.tag[3][0]
        if sub[0] != 'open' or len(sub[3]) < 2 or sub[3][1][0] != 'attrs':
            return
        get_text = self.parsed_text.elementText
        attrs = []
        for attr in sub[3][1][3]:
            name = get_text(attr[3][0])
            value = None
            if len(attr[3]) == 2:
                value = get_text(attr[3][1])
            attrs.append((name, value))
        return attrs

    def tAttrDict(self):
        '''Tag attributes as a dictionary

        Keys are lower-cased attribute names.  If a name occurs more
        than once the last occurrence wins.  Return None if the tag
        has no attributes.
        '''
        attrs = self._attrs()
        if not attrs:
            return
        amap = {}
        for name, value in attrs:
            amap[name.lower()] = value
        # The mapping was previously built and then dropped.
        return amap

    def tAttrNames(self):
        '''Tag attribute names as a list

        Names keep the case used in the text.  Return None if the tag
        has no attributes.
        '''
        attrs = self._attrs()
        if not attrs:
            return
        return [name for name, value in attrs]


=== Packages/HTMLStructure/Parser.py 1.2 => 1.3 ===
         if parseTables is None:
             parseTables = ParseTables()
-        self.parse = tag(text, parseTables.pt_html)
-        self.linemap = getLines(text)
+        self.complete, self.parse, self.up_to = tag(text, parseTables.pt_html)
+        self.lines = getLines(text)
 
     def posOfChar(self, index):
         '''Convert a character index into a line number and line position.'''
         line = self.lineOfChar(index)
         if line > 0:
-            index -= self.linemap[line - 1]
+            index -= self.lines[line - 1]
         return line, index
 
     def lineOfChar(self, index):
         '''Convert a character index into a line number.'''
-        return bisect(self.linemap, index)
+        return bisect(self.lines, index)
 
     def elementText(self, element):
         '''Get the text for a parse element.'''
         return self.text[element[1]:element[2]]
-
-    def tagName(self, tag):
-        '''Get the name of a tag parse element.'''
-        return self.elementText(tag[3][0][3][0]).lower()
-
-    def tagType(self, tag, is_empty=None):
-        '''Get the open/close/empty type of a tag.
-
-        Return None for elements that are not tags, or not one of
-        these three types of tag.
-        '''
-        if tag[0] != 'tag':
-            return
-        sub = tag[3][0]
-        # The first sub-element of a tag is the type.
-        ttype = sub[0]
-        # We only care about opening, closing, and empty tags
-        if ttype == 'close':
-            return ttype
-        if ttype == 'open':
-            # Check whether an open tag is empty
-            if (sub[3][-1][0] == 'empty' or
-                (is_empty and is_empty(self.tagName(tag))) ):
-                return 'empty'
-            return ttype
 
 # Parse HTML.  Needs to be read from bottom to top.
 


=== Packages/HTMLStructure/Printer.py 1.1 => 1.2 ===
     out = []
     output = out.append
-    linelist = parsedtext.linemap
+    linelist = parsedtext.lines
 
-    #linemap = {}
-    #for sm in span_marks:
-    #    bline = bisect(linelist, sm[0])
-    #    eline = bisect(linelist, sm[1])
-    #    if sm[1] == linemap[eline]:
-    #        eline -= 1
-    #    linemap.setdefault(bline, [])
-    #    linemap[bline].append(sm)
-    #    if bline != eline:
-    #        linemap.setdefault(eline, [])
-    #        linemap[eline].append(sm)
+    # for each span mark, get the class, start, and stop.
+    #   find out the line number for the start and stop
+    #   each line has two lists: starts here, ends here. 
+    #   Add to starts of start, ends of end.
+
+    startmarks = {}
+    endmarks = {}
+    for sm in span_marks:
+        if sm[2] <= sm[1]:
+            continue
+        bline = bisect(linelist, sm[1])
+        eline = bisect(linelist, sm[2])
+        if sm[2] == linelist[eline]:
+            eline -= 1
+        startmarks.setdefault(bline, []).append(sm)
+        endmarks.setdefault(eline, []).append(sm)
 
     output('<pre class="source">')
     text = parsedtext.text
     lend = 0
+    marks = []
     for n in range(len(linelist)):
         output('<a name="%s" class="linenumber">%s</a>' % (n + 1, n + 1))
         lbegin = lend
         lend = linelist[n]
+
+        for mark in marks:
+            output('<span class="%s">' % mark)
+
+        linemarks = []
+        for mark in startmarks.get(n, []):
+            linemarks.append((mark[1], 1, mark[0]))
+        for mark in endmarks.get(n, []):
+            linemarks.append((mark[2], 0, mark[0]))
+        linemarks.sort()
+        
+        for index, is_start, mark in linemarks:
+            output(escape(text[lbegin:index]))
+            if is_start:
+                output('<span class="%s">' % mark)
+                marks.append(mark)
+            else:
+                output('</span>')
+                marks.pop()
+            lbegin = index
         output(escape(text[lbegin:lend]))
+        
+        for mark in marks:
+            output('</span>')
     output('</pre>')
     return ''.join(out)


=== Packages/HTMLStructure/Validator.py 1.1 => 1.2 ===
 from bisect import bisect
 
 class Validator:
@@ -11,7 +12,7 @@
         '''Check all character entity references'''
         all = []
         errors = []
-        self._collect_entities(self.pt.parse[1], all)
+        self._collect_entities(self.pt.parse, all)
         for entity in all:
             sub = entity[3]
             if sub[-1][0] == 'error' or sub[0][2] - sub[0][1] > 32:
@@ -45,14 +46,19 @@
         unclosed = []
         unopened = []
         
-        parse = self.pt.parse[1]
-        tagName = self.pt.tagName
         unop_names = []
 
-        # Scan non-empty tags (in reverse, as collect_tags returns them)
-        for (i, ttype) in self.collect_tags(empty=0):
-            tname = tagName(parse[i])
-            if ttype == 'close':
+        # Scan non-empty tags in reverse order
+        tags = Wrapper.WrapTags(self.pt)
+        while tags.prev():
+            tag = tags.tag
+            tname = tags.tName()
+            ttype = tags.tType(self.is_empty)
+            i = tags.index
+
+            if not ttype or ttype == 'empty':
+                pass
+            elif ttype == 'close':
                 unopened.append(i)
                 unop_names.append(tname)
             elif unopened and unop_names[-1] == tname:
@@ -66,32 +72,6 @@
         unclosed.reverse()
         return matched, unclosed, unopened
 
-    def collect_tags(self, opening=1, closing=1, empty=1):
-        '''Return a list of tags (in reverse order).
-
-        Elements of the list are (parse index, tag type) pairs.
-        Arguments to the method indicate which types of tag to include.
-        '''
-        parse = self.pt.parse[1]
-        tagName = self.pt.tagName
-        tagType = self.pt.tagType
-        is_empty = self.is_empty
-        octags = []
-        # Scan backwards
-        i = len(parse)
-        while i:
-            i = i - 1
-            ttype = tagType(parse[i], is_empty)
-            # Only bother with open/close/empty tags
-            if ttype is None:
-                continue
-            # Skip tag types that aren't asked for
-            if ((opening and ttype == 'open') or
-                (closing and ttype == 'close') or
-                (empty and ttype == 'empty')):
-                octags.append((i, ttype))
-        return octags
-
     def implicit_match_tags(self, matched, unclosed):
         '''Try to find implicit closing tags for unmatched opening tags.
 
@@ -108,16 +88,15 @@
         # Save effort
         if not unclosed:
             return (), ()
-        parse = self.pt.parse[1]
-        tagName = self.pt.tagName
         impclose_following_map = self.impclose_following_map
         impclose_enclosing_map = self.impclose_enclosing_map
 
         new_matched = []
         still_unclosed = []
+        tags = Wrapper.WrapTags(self.pt)
         for pidx in unclosed:
-            tag = parse[pidx]
-            tname = tagName(tag)
+            tags.goto(pidx)
+            tname = tags.tName()
             enc_closers = impclose_enclosing_map.get(tname)
             if enc_closers is None:
                 still_unclosed.append(pidx)
@@ -127,7 +106,8 @@
                 while 1:
                     idx = tag_iter.next()
                     if idx is None: break
-                    ftname = tagName(parse[idx])
+                    tags.goto(idx)
+                    ftname = tags.tName()
                     if ftname in closers:
                         break
                 if idx is None:
@@ -138,7 +118,8 @@
                             # Implicit close at EOF
                             idx = -1
                     else:
-                        etname = tagName(parse[enc_tag[0]])
+                        tags.goto(enc_tag[0])
+                        etname = tags.tName()
                         if etname in enc_closers or "" in enc_closers:
                             # Implicit close at end of enclosing tag
                             idx = enc_tag[1]
@@ -152,7 +133,6 @@
     stop = 0
     enc_tag = None
     def __init__(self, matched, unclosed, pidx):
-        #import pdb; pdb.set_trace()
         self.matched = matched
         self.unclosed = unclosed
         mpos = bisect(matched, (pidx,))